In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import joblib
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import string

### Loading the Dataset

In [100]:
df = pd.read_csv('./preprocessed_dataset.csv')
df.sample(5)

Unnamed: 0,category,rating,label,text_,cleaned_reviews,tokenised_reviews
28218,Kindle_Store_5,5.0,CG,"['recieved', 'arc', 'copy', 'book', 'exchange'...",recieved arc copy book exchange honest reviewt...,[0. 0. 0. ... 0. 0. 0.]
36343,Toys_and_Games_5,4.0,CG,"['son', 'toy', 'material', 'good', 'love', 'fi...",son toy material good love figure color bright...,[0. 0. 0. ... 0. 0. 0.]
10113,Electronics_5,5.0,OR,"['say', 'thisit', 'basically', 'new', 'came', ...",say thisit basically new came orginal box conn...,[0. 0. 0. ... 0. 0. 0.]
4968,Sports_and_Outdoors_5,4.0,CG,"['sugary', 'tasty', 'similar', 'ice', 'cream',...",sugary tasty similar ice cream cone also love ...,[0. 0. 0. ... 0. 0. 0.]
31371,Books_5,5.0,CG,"['loved', 'book', 'usual', 'character', 'well'...",loved book usual character well developed beli...,[0. 0. 0. ... 0. 0. 0.]


In [101]:
def text_processor(review):
  nopunc = [char for char in review if char not in string.punctuation]
  nopunc = ''.join(nopunc)
  return [word for word in nopunc.split()]

In [98]:
bow = CountVectorizer(analyzer=text_processor)
bow.fit(df['text_'])
print("Total Vocabulary:",len(bow.vocabulary_))

Total Vocabulary: 43454


### Train-Test Split

In [102]:
X = df['text_']
y = df['label']
X.sample(5), y.sample(5)

(15838    ['easy', 'install', 'easy', 'hook', 'bike', 'g...
 7673     ['purchased', 'one', 'navy', 'speedo', 'outdon...
 22911    ['great', 'idea', 'design', 'isnt', 'good', 'e...
 32135    ['first', 'title', 'book', 'misleading', 'titl...
 11064    ['slim', 'mouse', 'sleek', 'feel', 'comfortabl...
 Name: text_, dtype: object,
 23176    OR
 33766    CG
 6856     OR
 22249    OR
 26202    OR
 Name: label, dtype: object)

In [103]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Create Pipeline to train models using Random Forest, SVM, and Logistic Regression

In [104]:
pipelines = {
  'randomForest': Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', RandomForestClassifier(random_state=42))
  ]),
  'svc': Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', SVC(random_state=42))
  ]),
  'logistic': Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression(random_state=42))
  ])
}

### Model Fitting and Saving them

In [105]:
for model, pipeline in pipelines.items():
  print(f'Training a {model} classifier')
  pipeline.fit(X_train, y_train)
  joblib.dump(pipeline, f'{model}_model.pkl')

Training a randomForest classifier
Training a svc classifier
Training a logistic classifier


### Evaluation using Accuracy, Precision, Recall, and F1 score

In [110]:
metrics = {
  'accuracy': accuracy_score,
  'precision': lambda y_true, y_pred: precision_score(y_true, y_pred, average='weighted', zero_division=0),
  'recall': lambda y_true, y_pred: recall_score(y_true, y_pred, average='weighted', zero_division=0),
  'f1': lambda y_true, y_pred: f1_score(y_true, y_pred, average='weighted', zero_division=0)
}

results = {}

for model, pipeline in pipelines.items():
  y_pred = pipeline.predict(X_test)
  y_test_np = y_test.to_numpy()
  model_metrics = {}  
  for metric_name, value in metrics.items():
    model_metrics[metric_name] = value(y_test_np, y_pred)
  
  results[model] = model_metrics

### Printing Accuracy, Precision, Recall, and F1 score

In [128]:
for model, metrics in results.items():
  print(f'{model[0].upper() + model[1:]}')
  for metric_name, value in metrics.items():
    print(f'{metric_name} {value: .4f}')
  print()

RandomForest
accuracy  0.8522
precision  0.8549
recall  0.8522
f1  0.8519

Svc
accuracy  0.8926
precision  0.8930
recall  0.8926
f1  0.8926

Logistic
accuracy  0.8711
precision  0.8714
recall  0.8711
f1  0.8711



### Computing the Best Model

In [129]:
model_performance_measure = {}

for model, metrics in results.items():
  metricSum = 0
  for metric_name, value in metrics.items():
    metricSum += value
  metricSum /= 4
  model_performance_measure[model] = metricSum

model_performance_measure

{'randomForest': 0.8527816857969919,
 'svc': 0.892723299096074,
 'logistic': 0.8711802105152182}

Since SVC has the highest weighted performance metric sum, it can be said as the best model we can use to fit our data.

### Testing our Model

In [131]:
sample_data = [
  "This product is great, I loved it!",
  "Terrible experience, wouldn't recommend it."
]

for model_name in pipelines:
  model = joblib.load(f"{model_name}_model.pkl")
  predictions = model.predict(sample_data)
  print(f"{model_name} predictions: {predictions}")

randomForest predictions: ['OR' 'OR']
svc predictions: ['CG' 'OR']
logistic predictions: ['CG' 'OR']
