In [None]:
import os
import zipfile
import pandas as pd
import requests
import re
import nltk
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder

In [None]:
# Archive to download.
file_url = "https://static.nhtsa.gov/odi/ffdd/cmpl/COMPLAINTS_RECEIVED_2015-2019.zip"

# Working directories (relative paths, resolved against the current working directory).
DOWNLOAD_DIR, EXTRACT_DIR, DATA_DIR = "downloads", "extracted", "data"

# File name of the CSV written into DATA_DIR by the conversion step.
DOWNLOADED_DATA = "raw_complaints.csv"

In [None]:
# Create each working directory; exist_ok=True makes re-running this cell safe.
for _working_dir in (DOWNLOAD_DIR, EXTRACT_DIR, DATA_DIR):
    os.makedirs(_working_dir, exist_ok=True)

In [None]:
def download_file(file_url, timeout=60):
    """Download ``file_url`` into ``DOWNLOAD_DIR`` and return the local path.

    The response body is streamed to disk in chunks instead of being held in
    memory, and is first written to a ``.part`` file that is renamed only after
    the transfer finishes, so a failed download never leaves a truncated file
    at the final path.

    Args:
        file_url: URL of the file; the text after the last ``/`` is used as
            the local file name.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``. Without
            a timeout the request can block indefinitely.

    Returns:
        The path of the downloaded file, or ``None`` when the server answers
        with a status other than 200 or the request fails.
    """
    file_name = file_url.split("/")[-1]
    file_path = os.path.join(DOWNLOAD_DIR, file_name)
    partial_path = file_path + ".part"

    print(f"Downloading {file_name}...")
    try:
        with requests.get(file_url, stream=True, timeout=timeout) as response:
            if response.status_code != 200:
                print(f"Failed to download {file_name}. Status: {response.status_code}")
                return None
            with open(partial_path, "wb") as file:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    file.write(chunk)
    except requests.RequestException as e:
        # Report network failures the same way as a bad status: message + None.
        if os.path.exists(partial_path):
            os.remove(partial_path)
        print(f"Failed to download {file_name}: {e}")
        return None

    # Publish the file under its final name only once it is complete.
    os.replace(partial_path, file_path)
    print(f"Downloaded: {file_name}")
    return file_path

In [None]:
def extract_file(file_path):
    """Unpack the ZIP archive at ``file_path`` into ``EXTRACT_DIR``.

    Progress is printed to stdout. A ``zipfile.BadZipFile`` error is caught and
    reported with a message instead of being raised. The function returns
    ``None`` in every case.
    """
    print(f"Extracting {file_path}...")
    try:
        with zipfile.ZipFile(file_path, mode='r') as archive:
            archive.extractall(path=EXTRACT_DIR)
    except zipfile.BadZipFile as error:
        print(f"Failed to extract {file_path}: {error}")
    else:
        print("Extraction completed.")

In [None]:
def process_txt_file(file_path):
    """Convert a headerless tab-delimited file into a CSV inside ``DATA_DIR``.

    The output is written to ``DATA_DIR/DOWNLOADED_DATA`` without a header row
    and without the index. The input is read with ``header=None`` because it
    has no header row (this notebook adds the column names in a later step and
    re-reads the CSV with ``header=None``). With pandas' default of treating
    the first line as a header, the first record would be turned into column
    labels, where duplicated values are renamed and blank ones become
    ``Unnamed: N``, so that record would come back altered.

    Args:
        file_path: Path of the tab-delimited input file.

    Returns:
        None. A missing input file or any processing error is reported with a
        printed message rather than an exception.
    """
    if not os.path.exists(file_path):
        print(f"Error: File '{file_path}' does not exist.")
        return

    os.makedirs(DATA_DIR, exist_ok=True)
    output_path = os.path.join(DATA_DIR, DOWNLOADED_DATA)

    print(f"Processing {file_path}...")

    try:
        df = pd.read_csv(file_path, delimiter="\t", header=None, low_memory=False)
        # header=False: the integer column positions are not real column names.
        df.to_csv(output_path, index=False, header=False)
        print(f"File converted and saved as {output_path}")
    except Exception as e:
        # Best-effort step: report the failure and let the notebook continue.
        print(f"Failed to process {file_path}: {e}")

# Downloading dataset

In [None]:
downloaded_file = download_file(file_url)
if downloaded_file:
    extract_file(downloaded_file)

    # Hand process_txt_file the extracted data file rather than the .zip itself:
    # reading the archive directly only works through pandas' compression
    # inference and fails when the archive holds more than one member.
    try:
        with zipfile.ZipFile(downloaded_file) as archive:
            extracted_members = [
                member for member in archive.namelist() if not member.endswith("/")
            ]
    except zipfile.BadZipFile:
        # extract_file has already printed the failure for a corrupt archive.
        extracted_members = []

    if extracted_members:
        # Prefer a .txt member; otherwise fall back to the first file in the archive.
        data_member = next(
            (member for member in extracted_members if member.lower().endswith(".txt")),
            extracted_members[0],
        )
        process_txt_file(os.path.join(EXTRACT_DIR, data_member))
    else:
        print(f"No data file found in {downloaded_file}.")

# Add the missing header to the dataset

In [None]:
# Column names applied to the complaints data, in positional order (49 names).
header = [
    "CMPLID", "ODINO", "MFR_NAME", "MAKETXT", "MODELTXT",
    "YEARTXT", "CRASH", "FAILDATE", "FIRE", "INJURED",
    "DEATHS", "COMPDESC", "CITY", "STATE", "VIN",
    "DATEA", "LDATE", "MILES", "OCCURENCES", "CDESCR",
    "CMPL_TYPE", "POLICE_RPT_YN", "PURCH_DT", "ORIG_OWNER_YN", "ANTI_BRAKES_YN",
    "CRUISE_CONT_YN", "NUM_CYLS", "DRIVE_TRAIN", "FUEL_SYS", "FUEL_TYPE",
    "TRANS_TYPE", "VEH_SPEED", "DOT", "TIRE_SIZE", "LOC_OF_TIRE",
    "TIRE_FAIL_TYPE", "ORIG_EQUIP_YN", "MANUF_DT", "SEAT_TYPE", "RESTRAINT_TYPE",
    "DEALER_NAME", "DEALER_TEL", "DEALER_CITY", "DEALER_STATE", "DEALER_ZIP",
    "PROD_TYPE", "REPAIRED_YN", "MEDICAL_ATTN", "VEHICLES_TOWED_YN",
]

In [None]:
# Read the converted CSV with header=None so that no row is consumed as column names.
raw_data_path = "data/raw_complaints.csv"
raw_data = pd.read_csv(raw_data_path, header=None, low_memory=False)

# Raises ValueError if the number of names does not match the number of columns.
raw_data.columns = header

# Forward slash on purpose: "data\complaints..." contains the invalid escape "\c"
# and, outside Windows, names a file in the working directory instead of a file
# inside data/, which the next step (reading data/complaints_with_header.csv)
# would then fail to find.
output_file = "data/complaints_with_header.csv"
raw_data.to_csv(output_file, index=False)

print(f"Header applied successfully. File saved as {output_file}.")

# Remove unused columns

In [None]:
# Input and output files of the column-pruning step.
input_file = "data/complaints_with_header.csv"
output_file = "data/complains_treated.csv"

# Minimum fraction of non-null values a column must have to be kept.
threshold = 0.8

# Columns dropped regardless of how complete they are.
manual_columns_to_remove = [
    "VIN",
    "PROD_TYPE",
    "POLICE_RPT_YN",
    "VEHICLES_TOWED_YN",
    "ANTI_BRAKES_YN",
    "CRUISE_CONT_YN",
]

In [None]:
df_with_columns = pd.read_csv(input_file, low_memory=False)

# Fraction of non-null cells in each column.
non_null_percentage = df_with_columns.notnull().mean()
columns_to_remove = set(manual_columns_to_remove)

# Keep columns that reach the completeness threshold and are not on the manual drop list.
sufficiently_filled = non_null_percentage[non_null_percentage >= threshold].index
columns_to_keep = [
    name for name in sufficiently_filled if name not in columns_to_remove
]

In [None]:
# Restrict the frame to the selected columns, save it, then print a summary.
df_with_columns = df_with_columns.loc[:, columns_to_keep]
df_with_columns.to_csv(output_file, index=False)
df_with_columns.info()

# Treat data

In [None]:
# The stop-word corpus has to be downloaded before stopwords.words() can read it.
nltk.download('stopwords')
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)

# Load the output of the column-pruning step.
df = pd.read_csv("data/complains_treated.csv", low_memory=False)

In [None]:
def preprocess_text(text, column_name, columns_to_keep_numbers):
    """Normalise one cell of text and strip stop words from it.

    Non-string values are returned unchanged. For strings:

    1. Characters are filtered. When ``column_name`` is listed in
       ``columns_to_keep_numbers``, only ASCII letters, digits, whitespace and
       periods survive; otherwise only ASCII letters and whitespace.
    2. The result is lower-cased and runs of whitespace collapse to a single
       space, with leading and trailing whitespace removed.
    3. Words found in the module-level ``stop_words`` set are dropped.

    Args:
        text: The cell value to clean.
        column_name: Name of the column the value comes from.
        columns_to_keep_numbers: Column names whose digits and periods are kept.

    Returns:
        The cleaned, space-joined text, or ``text`` itself if it is not a string.
    """
    if not isinstance(text, str):
        return text

    keep_digits = column_name in columns_to_keep_numbers
    disallowed = r'[^a-zA-Z0-9\s.]' if keep_digits else r'[^a-zA-Z\s]'

    lowered = re.sub(disallowed, '', text).lower()
    tokens = re.sub(r'\s+', ' ', lowered).strip().split()

    return ' '.join(token for token in tokens if token not in stop_words)

In [None]:
# Columns passed to preprocess_text as the ones whose digits and periods are kept.
columns_to_keep_numbers = [
    'MODELTXT',
    'MAKETXT',
    'MFR_NAME',
    'DATEA',
    'LDATE',
]

# Columns converted to integer labels with LabelEncoder.
columns_to_encode = [
    'CRASH',
    'FIRE',
    'MEDICAL_ATTN',
    'ORIG_OWNER_YN',
]

In [None]:
# One encoder object is refitted for each column, so after the loop it only
# holds the classes of the last column in columns_to_encode.
label_encoder = LabelEncoder()
for column_to_encode in columns_to_encode:
    values_as_text = df[column_to_encode].astype(str)
    df[column_to_encode] = label_encoder.fit_transform(values_as_text)

In [None]:
# Columns with the object dtype are the ones treated as text.
text_columns = df.select_dtypes(include=['object']).columns

# Clean a copy so that `df` keeps its values from before text preprocessing.
new_df = df.copy()

for col in text_columns:
    # Extra positional arguments are forwarded to preprocess_text after each cell value.
    new_df[col] = new_df[col].apply(preprocess_text, args=(col, columns_to_keep_numbers))

In [None]:
float_columns = new_df.select_dtypes(include=['float']).columns
for col in float_columns:
    # Missing values become 0, then the fractional part is dropped by the integer
    # cast. This is the vectorized form of int(x) per element; note that a
    # missing value can no longer be told apart from a real 0 afterwards.
    new_df[col] = new_df[col].fillna(0).astype("int64")

print(new_df.describe())
# info() prints its report itself and returns None, so wrapping it in print()
# would add a stray "None" line to the output.
new_df.info()

new_df.to_csv("data/complains_preprocessed.csv", index=False)