In [2]:
import pandas as pd

In [43]:
# Login using e.g. `huggingface-cli login` to access this dataset
splits = {'train': 'data/train-00000-of-00001.parquet', 'eval': 'data/eval-00000-of-00001.parquet', 'test': 'data/test-00000-of-00001.parquet'}
# df = pd.read_parquet("hf://datasets/vic35get/nhtsa_complaints_dataset/" + splits["train"])

# df = pd.concat(
#     [pd.read_parquer(splits[split]) for split in splits],
#         keys=splits.keys()

# )

df = pd.concat(
    [
        pd.read_parquet(f"hf://datasets/vic35get/nhtsa_complaints_dataset/{splits[s]}")
        for s in ['train', 'test', 'eval']
    ],
    ignore_index = True
)
df.head ()

Unnamed: 0,odiNumber,dateComplaintFiled,components,summary,label
0,10682290,2015-02-09,AIR BAGS,letter from senator nelson on behalf of consti...,AIR BAGS
1,10726262,2015-06-19,ELECTRICAL SYSTEM,tl* the contact owns a 1999 gmc sonoma. while ...,ELECTRICAL SYSTEM
2,10957250,2017-02-27,STEERING,tl* the contact owns a 2005 chevrolet malibu m...,OTHER
3,10725806,2015-06-17,ELECTRICAL SYSTEM,tl* the contact owns a 1997 saturn sw. the con...,ELECTRICAL SYSTEM
4,11057964,2018-01-02,AIR BAGS,tl* the contact owns a 2004 ford ranger. while...,AIR BAGS


In [10]:
df.isnull().sum()

Unnamed: 0,0
odiNumber,0
dateComplaintFiled,0
components,0
summary,0
label,0


In [11]:
df.head()

Unnamed: 0,odiNumber,dateComplaintFiled,components,summary,label
0,10682290,2015-02-09,AIR BAGS,letter from senator nelson on behalf of consti...,AIR BAGS
1,10726262,2015-06-19,ELECTRICAL SYSTEM,tl* the contact owns a 1999 gmc sonoma. while ...,ELECTRICAL SYSTEM
2,10957250,2017-02-27,STEERING,tl* the contact owns a 2005 chevrolet malibu m...,OTHER
3,10725806,2015-06-17,ELECTRICAL SYSTEM,tl* the contact owns a 1997 saturn sw. the con...,ELECTRICAL SYSTEM
4,11057964,2018-01-02,AIR BAGS,tl* the contact owns a 2004 ford ranger. while...,AIR BAGS


In [46]:
df = df[['summary', 'label']]
df.head()

Unnamed: 0,summary,label
0,letter from senator nelson on behalf of consti...,AIR BAGS
1,the contact owns a 1999 gmc sonoma. while dri...,ELECTRICAL SYSTEM
2,the contact owns a 2005 chevrolet malibu maxx...,OTHER
3,the contact owns a 1997 saturn sw. the contac...,ELECTRICAL SYSTEM
4,the contact owns a 2004 ford ranger. while dr...,AIR BAGS


In [66]:
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
OTHER,3790
ELECTRICAL SYSTEM,2248
STRUCTURE,2202
AIR BAGS,2188
SERVICE BRAKES,2106


In [70]:
df = df[df['label'] != 'OTHER']

In [71]:
import re


def clean_text(text):
    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()
    text = str(text).replace('tl*', '')
    return text

In [72]:
df['summary'] = df['summary'].apply(clean_text)
df.head()

Unnamed: 0,summary,label
0,letter from senator nelson on behalf of consti...,AIR BAGS
1,the contact owns a 1999 gmc sonoma. while driv...,ELECTRICAL SYSTEM
3,the contact owns a 1997 saturn sw. the contact...,ELECTRICAL SYSTEM
4,the contact owns a 2004 ford ranger. while dri...,AIR BAGS
5,the issue with my toyota is a rusted frame.rus...,STRUCTURE


In [73]:
from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split(df['summary'], df['label'], test_size=0.25)

In [74]:
# TF IDF, figures out important words in a string

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [75]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [76]:
rf = RandomForestClassifier(n_estimators=10)
rf.fit(X_train_tfidf, y_train)

y_pred = rf.predict(X_test_tfidf)

In [77]:
print(confusion_matrix(y_test, y_pred))

[[539  12   1   1]
 [ 27 501  15  27]
 [  4  18 498   7]
 [ 25  31   8 472]]


In [78]:
print(classification_report(y_test, y_pred))

                   precision    recall  f1-score   support

         AIR BAGS       0.91      0.97      0.94       553
ELECTRICAL SYSTEM       0.89      0.88      0.89       570
   SERVICE BRAKES       0.95      0.94      0.95       527
        STRUCTURE       0.93      0.88      0.91       536

         accuracy                           0.92      2186
        macro avg       0.92      0.92      0.92      2186
     weighted avg       0.92      0.92      0.92      2186



In [61]:
!pip install -q gradio

In [62]:
import gradio as gr

In [81]:
def predict(text):
  text=clean_text(text)
  text=vectorizer.transform([text])
  return rf.predict(text)[0]

In [82]:
predict('the car engine stopped randomly')

'ELECTRICAL SYSTEM'

In [83]:
predict('the car engine stopped')

'ELECTRICAL SYSTEM'

In [84]:
predict('the wheel fell off')

'STRUCTURE'

In [87]:
interface =gr.Interface(
    fn =predict,
    inputs =gr.Textbox(lines =3, placeholder= 'Enter Complaint'),
    outputs ='text',
    title = 'Tobiko'
)



In [88]:
interface.launch(share=True, debug=False)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://ef2f8f0d9d69b3bc6e.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


