# Combining Complaints and Recall for Information Retrieval

**Author:** Harris Zheng

**Date:** March 2nd, 2025

TODO: Add remainder columns

# Import Packages

In [9]:
import pandas as pd
import pprint
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
import os
import string
import re


# Utilities

In [10]:
def fill_string_spaces(df : pd.DataFrame):
    """Collapse every run of whitespace in the object-dtype columns to one space.

    The frame is modified in place and nothing is returned. Only ``str`` cells
    are rewritten; any other value found in an object column (numbers,
    ``None``/``NaN``) is kept exactly as it is.
    """
    # Raw string: "\s" is not a valid escape sequence in a plain str literal.
    whitespace_run = re.compile(r"\s+")
    for column in df.columns:
        if df[column].dtype == object:
            # `.str.replace` would turn the non-string cells of a mixed-type
            # column into NaN, so the substitution is applied element-wise.
            df[column] = df[column].map(
                lambda value: whitespace_run.sub(" ", value)
                if isinstance(value, str) else value
            ).astype(object)  # keep the column's original object dtype

In [48]:
def fill_string_nulls(df : pd.DataFrame):
    """Replace missing values in every object-dtype column with the empty string.

    Works on ``df`` in place and returns nothing; columns of any other dtype
    are not touched.
    """
    object_columns = [name for name in df.columns if df[name].dtype == object]
    for name in object_columns:
        df[name] = df[name].fillna("")

In [49]:
def trim_strings(df : pd.DataFrame):
    """Strip leading and trailing whitespace from the object-dtype columns.

    The frame is modified in place and nothing is returned. Only ``str`` cells
    are stripped; any other value found in an object column (numbers,
    ``None``/``NaN``) is kept exactly as it is.
    """
    for column in df.columns:
        if df[column].dtype == object:
            # `.str.strip()` would turn the non-string cells of a mixed-type
            # column into NaN, so the strip is applied element-wise.
            df[column] = df[column].map(
                lambda value: value.strip() if isinstance(value, str) else value
            ).astype(object)  # keep the column's original object dtype

In [50]:
def find_duplicate_and_non_duplicate_columns(df : pd.DataFrame, 
                                             column_defining_uniqueness : str):
    '''
    Split the columns of ``df`` by whether they vary inside a group.

    input: dataframe, and the name of the column whose values define the groups
    returns: (duplicated columns, non-duplicated columns), both as lists.
        - duplicated columns hold more than one distinct non-null value in at
          least one group;
        - non-duplicated columns are every other column of ``df``, including
          the grouping column itself, in their original order.
    '''
    # Distinct non-null values per group, for every column except the key.
    distinct_per_group = df.groupby(column_defining_uniqueness).nunique()

    # Look at the worst group of each column. Summing the counts and comparing
    # with the number of groups lets an all-null group (count 0) cancel out a
    # group that holds two distinct values.
    most_distinct = distinct_per_group.max(axis=0)
    duplicated_columns = most_distinct[most_distinct > 1].index.tolist()

    # Keep the frame's own column order so the result is the same on every run.
    varying = set(duplicated_columns)
    non_duplicated_columns = [
        column for column in df.columns if column not in varying
    ]

    return duplicated_columns, non_duplicated_columns

# Ingest Data

## Recall

In [15]:
# The data files are read from a "Datasets" folder located in the parent of the
# current working directory.
# os.path.dirname understands the platform's own path separator; splitting on
# "\\" only found the parent on Windows and returned the working directory
# itself everywhere else.
PARENT_DIR = os.path.dirname(os.getcwd())
DATASET_DIR = os.path.join(PARENT_DIR, "Datasets")

In [16]:
# Column labels for the recall file, which is tab-separated and has no header row.
RECALL_COLUMNS = [
    'RECORD_ID', 'CAMPNO', 'MAKETXT', 'MODELTXT', 'YEARTXT', 'MFGCAMPNO',
    'COMPNAME', 'MFGNAME', 'BGMAN', 'ENDMAN', 'RCLTYPECD', 'POTAFF',
    'ODATE', 'INFLUENCED_BY', 'MFGTXT', 'RCDATE', 'DATEA', 'RPNO', 'FMVSS',
    'DESC_DEFECT', 'CONSEQUENCE_DEFECT', 'CORRECTIVE_ACTION', 'NOTES',
    'RCL_CMPT_ID', 'MFR_COMP_NAME', 'MFR_COMP_DESC', 'MFR_COMP_PTNO',
]

# on_bad_lines='skip' drops malformed lines without raising.
# NOTE(review): the saved run of this cell printed a warning pointing at the
# read_csv call - presumably a mixed-dtype warning; confirm and consider
# passing explicit dtypes.
df_recall = pd.read_csv(
    f"{DATASET_DIR}/FLAT_RCL.txt",
    sep='\t',
    header=None,
    on_bad_lines='skip',
)
df_recall.columns = RECALL_COLUMNS
df_recall.head()

  df_recall = pd.read_csv(f"{DATASET_DIR}/FLAT_RCL.txt", sep='\t', header=None, on_bad_lines='skip')


Unnamed: 0,RECORD_ID,CAMPNO,MAKETXT,MODELTXT,YEARTXT,MFGCAMPNO,COMPNAME,MFGNAME,BGMAN,ENDMAN,...,RPNO,FMVSS,DESC_DEFECT,CONSEQUENCE_DEFECT,CORRECTIVE_ACTION,NOTES,RCL_CMPT_ID,MFR_COMP_NAME,MFR_COMP_DESC,MFR_COMP_PTNO
0,1,02V288000,FORD,FOCUS,2000,02S41,ELECTRICAL SYSTEM:12V/24V/48V BATTERY:CABLES,FORD MOTOR COMPANY,19990719.0,20010531.0,...,,,CERTAIN PASSENGER VEHICLES EQUIPPED WITH ZETEC...,"THIS, IN TURN, COULD CAUSE THE BATTERY CABLES ...",DEALERS WILL INSPECT THE BATTERY CABLES FOR TH...,ALSO CONTACT THE NATIONAL HIGHWAY TRAFFIC SAFE...,000015339000215021000000202,,,
1,2,02V288000,FORD,FOCUS,2001,02S41,ELECTRICAL SYSTEM:12V/24V/48V BATTERY:CABLES,FORD MOTOR COMPANY,19990719.0,20010531.0,...,,,CERTAIN PASSENGER VEHICLES EQUIPPED WITH ZETEC...,"THIS, IN TURN, COULD CAUSE THE BATTERY CABLES ...",DEALERS WILL INSPECT THE BATTERY CABLES FOR TH...,ALSO CONTACT THE NATIONAL HIGHWAY TRAFFIC SAFE...,000015339000215022000000202,,,
2,3,02V236000,JAYCO,FT EAGLE 10 SG,2003,,EQUIPMENT:OTHER:LABELS,"JAYCO, INC.",20020730.0,20020813.0,...,,,"ON CERTAIN FOLDING TENT CAMPERS, THE FEDERAL C...","IF THE TIRES WERE INFLATED TO 80 PSI, THEY COU...",OWNERS WILL BE MAILED CORRECT LABELS FOR INSTA...,"ALSO, CUSTOMERS CAN CONTACT THE NATIONAL HIGHW...",000015210000106403000000349,,,
3,4,02V237000,HOLIDAY RAMBLER,ENDEAVOR,2000,,STRUCTURE,MONACO COACH CORP.,,,...,,,"ON CERTAIN CLASS A MOTOR HOMES, THE FLOOR TRUS...",CONDITIONS CAN RESULT IN THE BOTTOMING OUT THE...,DEALERS WILL INSPECT THE FLOOR TRUSS NETWORK S...,CUSTOMERS CAN ALSO CONTACT THE NATIONAL HIGHWA...,000015211000083965000000272,,,
4,5,02V237000,HOLIDAY RAMBLER,ENDEAVOR,1999,,STRUCTURE,MONACO COACH CORP.,,,...,,,"ON CERTAIN CLASS A MOTOR HOMES, THE FLOOR TRUS...",CONDITIONS CAN RESULT IN THE BOTTOMING OUT THE...,DEALERS WILL INSPECT THE FLOOR TRUSS NETWORK S...,CUSTOMERS CAN ALSO CONTACT THE NATIONAL HIGHWA...,000015211000080938000000272,,,


## Complaints
2025 data

In [17]:
# Column labels for the complaints file (tab-separated, no header row). The
# first field of each line becomes the index, so it is not listed here.
COMPLAINT_COLUMNS = [
    'ODINO', 'MFR_NAME', 'MAKETXT', 'MODELTXT', 'YEARTXT', 'CRASH',
    'FAILDATE', 'FIRE', 'INJURED', 'DEATHS', 'COMPDESC', 'CITY', 'STATE',
    'VIN', 'DATEA', 'LDATE', 'MILES', 'OCCURENCES', 'CDESCR', 'CMPL_TYPE',
    'POLICE_RPT_YN', 'PURCH_DT', 'ORIG_OWNER_YN', 'ANTI_BRAKES_YN',
    'CRUISE_CONT_YN', 'NUM_CYLS', 'DRIVE_TRAIN', 'FUEL_SYS', 'FUEL_TYPE',
    'TRANS_TYPE', 'VEH_SPEED', 'DOT', 'TIRE_SIZE', 'LOC_OF_TIRE',
    'TIRE_FAIL_TYPE', 'ORIG_EQUIP_YN', 'MANUF_DT', 'SEAT_TYPE',
    'RESTRAINT_TYPE', 'DEALER_NAME', 'DEALER_TEL', 'DEALER_CITY',
    'DEALER_STATE', 'DEALER_ZIP', 'PROD_TYPE', 'REPAIRED_YN',
    'MEDICAL_ATTN', 'VEHICLES_TOWED_YN',
]

complaints_path = f"{DATASET_DIR}/COMPLAINTS_RECEIVED_2025-2025.txt"
df_complaints = pd.read_csv(complaints_path, sep='\t', header=None, index_col=0)
df_complaints.columns = COMPLAINT_COLUMNS

  df_complaints = pd.read_csv(f"{DATASET_DIR}/COMPLAINTS_RECEIVED_2025-2025.txt",


In [18]:
# Row count of the complaints table as loaded, before any de-duplication.
len(df_complaints)

17239

In [19]:
# Preview the first rows of the complaints table as loaded.
df_complaints.head()

Unnamed: 0_level_0,ODINO,MFR_NAME,MAKETXT,MODELTXT,YEARTXT,CRASH,FAILDATE,FIRE,INJURED,DEATHS,...,RESTRAINT_TYPE,DEALER_NAME,DEALER_TEL,DEALER_CITY,DEALER_STATE,DEALER_ZIP,PROD_TYPE,REPAIRED_YN,MEDICAL_ATTN,VEHICLES_TOWED_YN
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2051723,11633472,Ford Motor Company,FORD,EXPLORER,2016,N,20240701,N,0,0,...,,,,,,,V,,N,N
2051724,11633473,"Chrysler (FCA US, LLC)",JEEP,GLADIATOR,2022,N,20241231,N,0,0,...,,,,,,,V,,N,N
2051725,11633474,Toyota Motor Corporation,TOYOTA,TUNDRA,2024,N,20241101,N,0,0,...,,,,,,,V,,N,N
2051726,11633475,"General Motors, LLC",BUICK,ENVISTA,2024,N,20241229,N,0,0,...,,,,,,,V,,N,N
2051727,11633475,"General Motors, LLC",BUICK,ENVISTA,2024,N,20241229,N,0,0,...,,,,,,,V,,N,N


## Column Intersection Between Recall and Complaints

In [20]:
# Column names of the recall table, for comparison with the complaints table.
df_recall.columns

Index(['RECORD_ID', 'CAMPNO', 'MAKETXT', 'MODELTXT', 'YEARTXT', 'MFGCAMPNO',
       'COMPNAME', 'MFGNAME', 'BGMAN', 'ENDMAN', 'RCLTYPECD', 'POTAFF',
       'ODATE', 'INFLUENCED_BY', 'MFGTXT', 'RCDATE', 'DATEA', 'RPNO', 'FMVSS',
       'DESC_DEFECT', 'CONSEQUENCE_DEFECT', 'CORRECTIVE_ACTION', 'NOTES',
       'RCL_CMPT_ID', 'MFR_COMP_NAME', 'MFR_COMP_DESC', 'MFR_COMP_PTNO'],
      dtype='object')

In [21]:
# Column names of the complaints table, for comparison with the recall table.
df_complaints.columns

Index(['ODINO', 'MFR_NAME', 'MAKETXT', 'MODELTXT', 'YEARTXT', 'CRASH',
       'FAILDATE', 'FIRE', 'INJURED', 'DEATHS', 'COMPDESC', 'CITY', 'STATE',
       'VIN', 'DATEA', 'LDATE', 'MILES', 'OCCURENCES', 'CDESCR', 'CMPL_TYPE',
       'POLICE_RPT_YN', 'PURCH_DT', 'ORIG_OWNER_YN', 'ANTI_BRAKES_YN',
       'CRUISE_CONT_YN', 'NUM_CYLS', 'DRIVE_TRAIN', 'FUEL_SYS', 'FUEL_TYPE',
       'TRANS_TYPE', 'VEH_SPEED', 'DOT', 'TIRE_SIZE', 'LOC_OF_TIRE',
       'TIRE_FAIL_TYPE', 'ORIG_EQUIP_YN', 'MANUF_DT', 'SEAT_TYPE',
       'RESTRAINT_TYPE', 'DEALER_NAME', 'DEALER_TEL', 'DEALER_CITY',
       'DEALER_STATE', 'DEALER_ZIP', 'PROD_TYPE', 'REPAIRED_YN',
       'MEDICAL_ATTN', 'VEHICLES_TOWED_YN'],
      dtype='object')

In [22]:
# Column names that appear in both tables.
set(df_complaints.columns) & set(df_recall.columns)

{'DATEA', 'MAKETXT', 'MODELTXT', 'YEARTXT'}

Other columns we should merge together aside from intersecting columns: 
- COMPNAME, COMPDESC. 
- MFR_NAME, MFGNAME.

# Preprocessing

### Setting up Text

In [23]:
# Blank out missing text in both tables (each frame is modified in place).
for frame in (df_complaints, df_recall):
    fill_string_nulls(frame)

In [24]:
# Add the shared text columns to both tables:
#   MMYTXT      - make, model and year joined by single spaces
#   CDESCR      - for recalls, DESC_DEFECT and CONSEQUENCE_DEFECT joined by "\r\n"
#                 (the complaints table already has a CDESCR column)
#   CDESCR_CODE - integer code per distinct CDESCR value (pd.factorize)
#
# The year is null-filled *before* the cast to str: casting first turns a
# missing year into the literal text "nan", which fillna("") can no longer match.
df_complaints["MMYTXT"] = (
    df_complaints["MAKETXT"] + " " + df_complaints["MODELTXT"] + " "
    + df_complaints["YEARTXT"].fillna("").astype(str)
)
df_complaints["CDESCR_CODE"] = pd.factorize(df_complaints['CDESCR'])[0]

df_recall["MMYTXT"] = (
    df_recall["MAKETXT"] + " " + df_recall["MODELTXT"] + " "
    + df_recall["YEARTXT"].fillna("").astype(str)
)
df_recall["CDESCR"] = (
    df_recall["DESC_DEFECT"]
    .str.cat(
        df_recall[["CONSEQUENCE_DEFECT"]],
        sep="\r\n"
    )
)
df_recall["CDESCR_CODE"] = pd.factorize(df_recall['CDESCR'])[0]


In [25]:
# Splitting CDESCR on the separator must never give more than the two joined
# parts; more would mean a source field already contained "\r\n".
parts_per_description = df_recall["CDESCR"].str.split("\r\n").str.len()
assert max(parts_per_description) == 2, "Split is not clean"

In [26]:
# df_recall["CDESCR-"] = df_recall["DESC_DEFECT"].str.cat(
#     df_recall[["CONSEQUENCE_DEFECT", "CORRECTIVE_ACTION"]],
#     sep="- "
# )
# df_recall["CDESCR-"].str.split("- ").str.len().describe()

Ideally we want CDESCR to always split into the number of elements we expect. Let's use a different separator.

In [27]:
# df_complaints["MMYTXT"] = (
#     df_complaints["MAKETXT"] + " " + df_complaints["MODELTXT"] + " " + df_complaints["YEARTXT"].astype(str).fillna("")
# )
# df_recall["MMYTXT"] = (
#     df_recall["MAKETXT"] + " " + df_recall["MODELTXT"] + " " + df_recall["YEARTXT"].astype(str).fillna("")
# )
# df_recall["CDESCR"] = df_recall["DESC_DEFECT"].str.cat(
#     df_recall[["CONSEQUENCE_DEFECT", "CORRECTIVE_ACTION"]],
#     sep="\r\n"
# )

Better split :))

In [28]:
# Collapse runs of whitespace in the text columns of both tables (in place).
# NOTE(review): "\r\n" is whitespace, so this pass also turns the separator
# placed inside df_recall["CDESCR"] into a single space, and CDESCR_CODE was
# assigned before this normalisation ran - confirm both are intended.
for frame in (df_complaints, df_recall):
    fill_string_spaces(frame)

In [29]:
# Strip leading/trailing whitespace from the text columns of both tables (in place).
for frame in (df_complaints, df_recall):
    trim_strings(frame)

In [30]:
# df_recall["CDESCR"].str.split("<br>").str.len().describe()

In [31]:
# df_complaints["YEARTXT"] = df_complaints["YEARTXT"].astype(str) # None entries get converted to literal string 'None'
# df_recall["YEARTXT"] = df_recall["YEARTXT"].astype(str)

In [32]:
# Check that MMYTXT, CDESCR and CDESCR_CODE were added to the recall table.
df_recall.columns

Index(['RECORD_ID', 'CAMPNO', 'MAKETXT', 'MODELTXT', 'YEARTXT', 'MFGCAMPNO',
       'COMPNAME', 'MFGNAME', 'BGMAN', 'ENDMAN', 'RCLTYPECD', 'POTAFF',
       'ODATE', 'INFLUENCED_BY', 'MFGTXT', 'RCDATE', 'DATEA', 'RPNO', 'FMVSS',
       'DESC_DEFECT', 'CONSEQUENCE_DEFECT', 'CORRECTIVE_ACTION', 'NOTES',
       'RCL_CMPT_ID', 'MFR_COMP_NAME', 'MFR_COMP_DESC', 'MFR_COMP_PTNO',
       'MMYTXT', 'CDESCR', 'CDESCR_CODE'],
      dtype='object')

In [33]:
# Check that MMYTXT and CDESCR_CODE were added to the complaints table.
df_complaints.columns

Index(['ODINO', 'MFR_NAME', 'MAKETXT', 'MODELTXT', 'YEARTXT', 'CRASH',
       'FAILDATE', 'FIRE', 'INJURED', 'DEATHS', 'COMPDESC', 'CITY', 'STATE',
       'VIN', 'DATEA', 'LDATE', 'MILES', 'OCCURENCES', 'CDESCR', 'CMPL_TYPE',
       'POLICE_RPT_YN', 'PURCH_DT', 'ORIG_OWNER_YN', 'ANTI_BRAKES_YN',
       'CRUISE_CONT_YN', 'NUM_CYLS', 'DRIVE_TRAIN', 'FUEL_SYS', 'FUEL_TYPE',
       'TRANS_TYPE', 'VEH_SPEED', 'DOT', 'TIRE_SIZE', 'LOC_OF_TIRE',
       'TIRE_FAIL_TYPE', 'ORIG_EQUIP_YN', 'MANUF_DT', 'SEAT_TYPE',
       'RESTRAINT_TYPE', 'DEALER_NAME', 'DEALER_TEL', 'DEALER_CITY',
       'DEALER_STATE', 'DEALER_ZIP', 'PROD_TYPE', 'REPAIRED_YN',
       'MEDICAL_ATTN', 'VEHICLES_TOWED_YN', 'MMYTXT', 'CDESCR_CODE'],
      dtype='object')

### Remove Duplicates (If Specified)

In [34]:
# Inspect ODINO: the same value can appear on several rows.
df_complaints["ODINO"]

0
2051723    11633472
2051724    11633473
2051725    11633474
2051726    11633475
2051727    11633475
             ...   
2068958    11645456
2068959    11645457
2068960    11645457
2068961    11645457
2068962    11645458
Name: ODINO, Length: 17239, dtype: int64

In [51]:
def process_columns_accordingly(df : pd.DataFrame, 
                                column_defining_uniqueness : str):
    """Collapse ``df`` to one row per value of ``column_defining_uniqueness``.

    Baseline implementation; a faster function with the same name is defined
    later in this notebook and replaces this one once its cell has run.

    Columns reported as varying within a group are reduced to the group's
    distinct values, rendered as text and joined with ", " in first-seen
    order (missing values become the empty string). All other columns keep
    the group's first value. A NUMRECORDS column holds the number of source
    rows per group.

    returns: a new dataframe indexed by the grouping value.
    """
    duplicate_columns, non_duplicate_columns = find_duplicate_and_non_duplicate_columns(
        df, 
        column_defining_uniqueness
    )

    def join_distinct(values):
        # Fill nulls before the cast: astype(str) first would turn a missing
        # value into the text "nan", which fillna("") can no longer match.
        as_text = values.fillna("").astype(str)
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # joined text is the same on every run (a set has no stable order).
        return ', '.join(dict.fromkeys(as_text))

    df_dup = df.groupby(column_defining_uniqueness).agg(
        {
            duplicate_column : join_distinct
            for duplicate_column in duplicate_columns
        }
    )
    df_no_dup = df.groupby(column_defining_uniqueness).agg(
        {
            non_duplicate_column : "first"
            for non_duplicate_column in non_duplicate_columns
        }
    )
    df_size = df.groupby(column_defining_uniqueness).size()
    df_size.name = "NUMRECORDS"
    df_new = pd.concat(
        [
            df_dup,
            df_no_dup,
            df_size
        ],
        axis=1
    )
    return df_new


    


In [52]:
# One row per distinct complaint description.
df_complaints_new = process_columns_accordingly(
    df=df_complaints,
    column_defining_uniqueness="CDESCR_CODE",
)

In [53]:
# One row per distinct recall description.
df_recall_new = process_columns_accordingly(
    df=df_recall,
    column_defining_uniqueness="CDESCR_CODE",
)

## Try speeding up function

In [54]:
def process_columns_accordingly(df : pd.DataFrame, 
                                column_defining_uniqueness : str):
    """Collapse ``df`` to one row per value of ``column_defining_uniqueness``.

    Columns reported as varying within a group are reduced to the group's
    distinct values, rendered as text and joined with ", " in first-seen
    order (missing values become the empty string). All other columns keep
    the group's first value. A NUMRECORDS column holds the number of source
    rows per group.

    ``df`` itself is left unchanged.

    returns: a new dataframe indexed by the grouping value.
    """
    duplicate_columns, non_duplicate_columns = find_duplicate_and_non_duplicate_columns(
        df, 
        column_defining_uniqueness
    )
    grouper = df.groupby(column_defining_uniqueness) # Avoid group recomputes
    df_size = grouper.size()
    df_size.name = "NUMRECORDS"

    if duplicate_columns:
        # Stringify once, ahead of the aggregation, but on a separate frame:
        # writing the result back into `df` would change the caller's data and
        # make a second call on the same frame behave differently.
        text_values = df[duplicate_columns].fillna("").astype(str)
        df_dup = text_values.groupby(df[column_defining_uniqueness]).agg(
            {
                duplicate_column : lambda x: ', '.join(pd.unique(x))
                for duplicate_column in duplicate_columns
            }
        )
    else:
        # Nothing varies within a group; avoid aggregating with an empty mapping.
        df_dup = pd.DataFrame(index=df_size.index)

    df_no_dup = grouper.agg(
        {
            non_duplicate_column : "first"
            for non_duplicate_column in non_duplicate_columns
        }
    )
    df_new = pd.concat(
        [
            df_dup,
            df_no_dup,
            df_size
        ],
        axis=1
    )
    return df_new


    


### Merge

In [55]:
# One row per distinct complaint description.
df_complaints_new = process_columns_accordingly(
    df=df_complaints,
    column_defining_uniqueness="CDESCR_CODE",
)

In [56]:
# One row per distinct recall description.
df_recall_new = process_columns_accordingly(
    df=df_recall,
    column_defining_uniqueness="CDESCR_CODE",
)

In [59]:
# Give the recall columns the names the complaints table uses for the same
# information, so they land in the same column when the tables are stacked.
recall_to_complaint_names = {
    "COMPNAME": "COMPDESC",
    "MFGNAME": "MFR_NAME",
}
df_recall_new = df_recall_new.rename(columns=recall_to_complaint_names)

# Combine Recall and Complaints

In [60]:
# Tag each row with the table it came from before the two are stacked.
for frame, came_from_complaints in ((df_complaints_new, True), (df_recall_new, False)):
    frame["IS_COMPLAINT"] = came_from_complaints

In [61]:
# Stack complaints on top of recalls, then discard the per-table grouping code
# and replace the index with a fresh 0..n-1 range.
stacked = pd.concat([df_complaints_new, df_recall_new])
df_final = stacked.drop(columns="CDESCR_CODE").reset_index(drop=True)

In [62]:
# Total number of rows in the combined table (complaints plus recalls).
len(df_final)

37422

In [63]:
# Display the combined table: complaint rows first, recall rows after them.
df_final

Unnamed: 0,ODINO,MFR_NAME,MAKETXT,MODELTXT,YEARTXT,CRASH,FAILDATE,FIRE,INJURED,DEATHS,...,RCL_CMPT_ID,MFR_COMP_NAME,MFR_COMP_DESC,MFR_COMP_PTNO,BGMAN,DESC_DEFECT,CONSEQUENCE_DEFECT,RPNO,FMVSS,ENDMAN
0,11633472,Ford Motor Company,FORD,EXPLORER,2016,N,20240701,N,0,0,...,,,,,,,,,,
1,11633473,"Chrysler (FCA US, LLC)",JEEP,GLADIATOR,2022,N,20241231,N,0,0,...,,,,,,,,,,
2,11633474,Toyota Motor Corporation,TOYOTA,TUNDRA,2024,N,20241101,N,0,0,...,,,,,,,,,,
3,11633475,"General Motors, LLC",BUICK,ENVISTA,2024,N,20241229,N,0,0,...,,,,,,,,,,
4,11633476,Honda (American Honda Motor Co.),HONDA,PROLOGUE,2024,N,20241227,N,0,0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37417,,"Tiffin Motorhomes, Inc.",TIFFIN,"PHAETON, ZEPHYR, ALLEGRO BUS","2023, 2022, 2024, 2025",,,,,,...,"000127267005430764000000212, 00012726700495275...",Module Multi Plex IFM,Chassis Control Module,CR0033,20220502.0,"Tiffin Motorhomes, Inc. (Tiffin) is recalling ...",An engine shutdown increases the risk of a crash.,,,20230522.0
37418,,"Maserati North America, Inc.",MASERATI,"GHIBLI, QUATTROPORTE, LEVANTE, MC20, MC20 CIEL...","2021, 2022, 2023, 2025, 2024",,,,,,...,"000127948004696784000001881, 00012794800469678...",Radio Software,Radio Software,"T27.52 for Ghibli, Quattroporte and Levante, U...",20200729.0,"Maserati North America, Inc. (Maserati) is rec...",A rearview camera image that does not display ...,,,20240124.0
37419,,Nova Bus (US) Inc.,NOVA BUS,LFS,"2005, 2006, 2007, 2008",,,,,,...,"000127951002185725000001483, 00012795100218572...",Window,WDO L SLDR 44% TEMP GREY PO,,20051125.0,Nova Bus (US) Inc. (Nova Bus) is recalling cer...,An insufficient number of emergency exits can ...,,,20071112.0
37420,,"Chrysler (FCA US, LLC)","DODGE, ALFA ROMEO","HORNET, TONALE","2025, 2024",,,,,,...,"000127532005643746000000063, 00012753200558519...",Brake Pedal,Brake Pedal,68607352AA,,"Chrysler (FCA US, LLC) is recalling certain 20...",A loss of brake function increases the risk of...,,,


In [144]:
# Alphabetical list of every column in the combined table.
df_final.columns.sort_values()

Index(['ANTI_BRAKES_YN', 'BGMAN', 'CAMPNO', 'CDESCR', 'CITY', 'CMPL_TYPE',
       'COMPDESC', 'CONSEQUENCE_DEFECT', 'CORRECTIVE_ACTION', 'CRASH',
       'CRUISE_CONT_YN', 'DATEA', 'DEALER_CITY', 'DEALER_NAME', 'DEALER_STATE',
       'DEALER_TEL', 'DEALER_ZIP', 'DEATHS', 'DESC_DEFECT', 'DOT',
       'DRIVE_TRAIN', 'ENDMAN', 'FAILDATE', 'FIRE', 'FMVSS', 'FUEL_SYS',
       'FUEL_TYPE', 'INFLUENCED_BY', 'INJURED', 'IS_COMPLAINT', 'LDATE',
       'LOC_OF_TIRE', 'MAKETXT', 'MANUF_DT', 'MEDICAL_ATTN', 'MFGCAMPNO',
       'MFGTXT', 'MFR_COMP_DESC', 'MFR_COMP_NAME', 'MFR_COMP_PTNO', 'MFR_NAME',
       'MILES', 'MMYTXT', 'MODELTXT', 'NOTES', 'NUMRECORDS', 'NUM_CYLS',
       'OCCURENCES', 'ODATE', 'ODINO', 'ORIG_EQUIP_YN', 'ORIG_OWNER_YN',
       'POLICE_RPT_YN', 'POTAFF', 'PROD_TYPE', 'PURCH_DT', 'RCDATE',
       'RCLTYPECD', 'RCL_CMPT_ID', 'RECORD_ID', 'REPAIRED_YN',
       'RESTRAINT_TYPE', 'RPNO', 'SEAT_TYPE', 'STATE', 'TIRE_FAIL_TYPE',
       'TIRE_SIZE', 'TRANS_TYPE', 'VEHICLES_TOWED_YN', '

In [146]:
# Per-column non-null counts and dtypes of the combined table.
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37422 entries, 0 to 37421
Data columns (total 72 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ODINO               11627 non-null  object 
 1   MFR_NAME            37422 non-null  object 
 2   MAKETXT             37422 non-null  object 
 3   MODELTXT            37422 non-null  object 
 4   YEARTXT             37422 non-null  object 
 5   CRASH               11627 non-null  object 
 6   FAILDATE            11627 non-null  object 
 7   FIRE                11627 non-null  object 
 8   INJURED             11627 non-null  object 
 9   DEATHS              11627 non-null  object 
 10  COMPDESC            37422 non-null  object 
 11  CITY                11627 non-null  object 
 12  STATE               11627 non-null  object 
 13  VIN                 11627 non-null  object 
 14  DATEA               37422 non-null  object 
 15  LDATE               11627 non-null  object 
 16  CMPL

In [147]:
# Preview the first rows of the combined table.
df_final.head()

Unnamed: 0,ODINO,MFR_NAME,MAKETXT,MODELTXT,YEARTXT,CRASH,FAILDATE,FIRE,INJURED,DEATHS,...,RCL_CMPT_ID,MFR_COMP_NAME,MFR_COMP_DESC,MFR_COMP_PTNO,RPNO,CONSEQUENCE_DEFECT,DESC_DEFECT,FMVSS,BGMAN,ENDMAN
0,11633472,Ford Motor Company,FORD,EXPLORER,2016,N,20240701,N,0,0,...,,,,,,,,,,
1,11633473,"Chrysler (FCA US, LLC)",JEEP,GLADIATOR,2022,N,20241231,N,0,0,...,,,,,,,,,,
2,11633474,Toyota Motor Corporation,TOYOTA,TUNDRA,2024,N,20241101,N,0,0,...,,,,,,,,,,
3,11633475,"General Motors, LLC",BUICK,ENVISTA,2024,N,20241229,N,0,0,...,,,,,,,,,,
4,11633476,Honda (American Honda Motor Co.),HONDA,PROLOGUE,2024,N,20241227,N,0,0,...,,,,,,,,,,


In [148]:
# Save the combined table; the index is written too, as the first (unnamed) CSV column.
combined_csv_path = f"{DATASET_DIR}/complaints_and_recalls.csv"
df_final.to_csv(combined_csv_path)

# Check our Python pipeline outputs

In [65]:
# Load test_agg.csv from the datasets folder.
# NOTE(review): presumably the aggregated output of the Python pipeline - confirm.
test_agg = pd.read_csv(f"{DATASET_DIR}/test_agg.csv")

  test_agg = pd.read_csv(f"{DATASET_DIR}/test_agg.csv")


In [71]:
# Number of rows per IS_COMPLAINT value in test_agg.
test_agg["IS_COMPLAINT"].value_counts()

IS_COMPLAINT
False    25742
True     11616
Name: count, dtype: int64

In [66]:
# Load test_no_agg.csv from the datasets folder.
# NOTE(review): presumably the non-aggregated output of the Python pipeline - confirm.
test_no_agg = pd.read_csv(f"{DATASET_DIR}/test_no_agg.csv")

  test_no_agg = pd.read_csv(f"{DATASET_DIR}/test_no_agg.csv")


In [72]:
# Number of rows per IS_COMPLAINT value in test_no_agg.
test_no_agg["IS_COMPLAINT"].value_counts()

IS_COMPLAINT
False    293171
True      17239
Name: count, dtype: int64

In [75]:
# Non-null count per column of test_agg, ascending, so empty columns come first.
test_agg.notnull().sum(axis=0).sort_values()

OCCURENCES         0
FUEL_SYS           0
DRIVE_TRAIN        0
NUM_CYLS           0
TRANS_TYPE         0
               ...  
MAKETXT        37358
Unnamed: 0     37358
YEARTXT        37358
MODELTXT       37358
DATEA          37358
Length: 73, dtype: int64

`Unnamed: 0` is the index.

In [68]:
# Row count of test_no_agg.
len(test_no_agg)

310410

In [74]:
# Non-null count per column of test_no_agg, ascending, so empty columns come first.
test_no_agg.notnull().sum(axis=0).sort_values()

DRIVE_TRAIN         0
NUM_CYLS            0
TRANS_TYPE          0
FUEL_SYS            0
OCCURENCES          0
                ...  
COMPDESC       310410
MAKETXT        310410
Unnamed: 0     310410
YEARTXT        310410
MODELTXT       310410
Length: 72, dtype: int64

`Unnamed: 0` is the index.

Some of these columns shouldn't all be null.

I looked at the previous complaints dataset, looks like occurrences is actually null....