In [None]:
!pip install wandb -qU

In [1]:
import pickle
import pandas as pd
import random
import json
import wandb
import collections
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier

# Authenticate with Weights & Biases without embedding a secret in the notebook.
# With no `key` argument, wandb falls back to the WANDB_API_KEY environment
# variable / stored credentials, or prompts interactively.
# NOTE(review): an API key was previously committed in this cell; it should be
# revoked and rotated in the W&B account settings.
wandb.login()

In [2]:
# Load the pickled train and validation splits from disk.
# NOTE: pickle.load can execute arbitrary code embedded in the file; only use
# it on files from a trusted source.
train_path = '../../data/dataset/reviews/train.pkl'
val_path = '../../data/dataset/reviews/val.pkl'

with open(train_path, 'rb') as train_file:
    train_set = pickle.load(train_file)

with open(val_path, 'rb') as val_file:
    val_set = pickle.load(val_file)

In [3]:
# Turn the loaded splits into DataFrames and drop every row with a missing value.
train = pd.DataFrame.from_dict(train_set).dropna()
val = pd.DataFrame.from_dict(val_set).dropna()

# Column "x" is the input passed to the vectorizer later on; column "y" is the
# target passed to the classifiers and to classification_report.
X_train, Y_train = train["x"], train["y"]
X_val, Y_val = val["x"], val["y"]

# Feature Extraction
__TF-IDF__: It is a method of extracting the features from the text data. TF stands for Term Frequency and IDF stands for Inverse Document Frequency.

__Term Frequency__: The number of times a word occurs in a review. For example, consider two reviews where w1, w2, … represent the words appearing in them; a term-frequency table then lists, for each review, how often each of these words occurs in that particular review.

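IDF is computed as follows, where $n$ is the total number of documents and $\mathrm{df}(t)$ is the number of documents containing the term $t$.

If `smooth_idf=False`:

$$\mathrm{idf}(t) = \ln\frac{n}{\mathrm{df}(t)} + 1$$

If `smooth_idf=True` (the scikit-learn default, which is what `TfidfVectorizer()` uses below):

$$\mathrm{idf}(t) = \ln\frac{1 + n}{1 + \mathrm{df}(t)} + 1$$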


TF-IDF is implemented using sklearn : [link](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html)




In [8]:
# Build TF-IDF features: the vectorizer is fitted on the training inputs only
# and the fitted instance is then applied to the validation inputs.
vectorizer = TfidfVectorizer()
print("TFIDF Vectorizer")
tf_x_train = vectorizer.fit_transform(X_train)  # fit + transform on train
tf_x_val = vectorizer.transform(X_val)          # transform only on val

TFIDF Vectorizer……


# Random

## Simple Random

In [None]:
# Baseline: assign each validation sample a label drawn uniformly at random.
run = wandb.init(project="Base-Lines", name="Simple-Random")

# `label` is also read by the weighted-random baseline cell below.
label = ["Fresh","Rotten"]

# Use a dedicated, seeded generator so the baseline numbers are reproducible
# across kernel restarts without touching the global `random` state.
rng = random.Random(42)
y_pred = [rng.choice(label) for _ in range(len(Y_val))]

metrics = classification_report(Y_val, y_pred, output_dict=True)

# Keep only the headline numbers: overall accuracy plus the
# support-weighted averages of precision / recall / F1.
weighted = metrics["weighted avg"]
performance = {"accuracy": metrics["accuracy"],
               "precision": weighted['precision'],
               "recall": weighted['recall'],
               "f1": weighted['f1-score']}
print(json.dumps(performance, indent=2))

wandb.log(performance)

## Weighted Random

We made the assumption that there is an equal probability for every class. Let's use the train split to figure out what the true probability is.

In [None]:
# Baseline: draw each validation label at random, weighted by the class
# frequencies observed in the training split.
run = wandb.init(project="Base-Lines", name="Weighted-Random")

# Estimate the class priors from the TRAIN split (not the validation split,
# which would leak evaluation data into the baseline). The Counter is built
# once instead of once per label. `label` comes from the Simple-Random cell.
train_counts = collections.Counter(Y_train)
p = [train_counts[name] / len(Y_train) for name in label]

# random.choices returns a list of length k, so request all predictions in a
# single call; appending the result of k=1 calls would produce a list of
# one-element lists instead of a flat list of labels.
rng = random.Random(42)  # seeded for reproducibility
y_pred = rng.choices(label, weights=p, k=len(Y_val))

metrics = classification_report(Y_val, y_pred, output_dict=True)

weighted = metrics["weighted avg"]
performance_weighted_random = {"accuracy": metrics["accuracy"],
               "precision": weighted['precision'],
               "recall": weighted['recall'],
               "f1": weighted['f1-score']}
print (json.dumps(performance_weighted_random, indent=2))

# Log this baseline's own metrics (not the Simple-Random `performance` dict).
wandb.log(performance_weighted_random)

Counter({'Fresh': 60733, 'Rotten': 34179})

# SVM

## Linear SVM

In [19]:
# Linear SVM on the TF-IDF features, evaluated on the validation split.
run = wandb.init(project="Base-Lines", name="Linear-SVM")

clf = LinearSVC(random_state=42, tol=1e-5, verbose=1, C=5.0)
clf.fit(tf_x_train, Y_train)
y_pred = clf.predict(tf_x_val)

metrics_SVM = classification_report(Y_val, y_pred, output_dict=True)

# Overall accuracy plus support-weighted precision / recall / F1.
weighted = metrics_SVM["weighted avg"]
performance_SVM = {"accuracy": metrics_SVM["accuracy"],
               "precision": weighted['precision'],
               "recall": weighted['recall'],
               "f1": weighted['f1-score']}
print(json.dumps(performance_SVM, indent=2))

# Log the metrics computed in this cell; `performance` holds the
# Simple-Random baseline's numbers and must not be logged here.
wandb.log(performance_SVM)

## RBF SVM

In [None]:
# RBF-kernel SVM on the TF-IDF features, evaluated on the validation split.
run = wandb.init(project="Base-Lines", name="RBF-SVM")

# max_iter=1000 caps the solver iterations, so the fit may stop before
# convergence on a large training set.
clf = SVC(C=5.0, kernel='rbf', random_state=42, max_iter=1000, verbose=1)
# Y_train is train['y']; use the shared name for consistency with other cells.
clf.fit(tf_x_train, Y_train)
y_pred = clf.predict(tf_x_val)

metrics_SVM = classification_report(Y_val, y_pred, output_dict=True)

# Overall accuracy plus support-weighted precision / recall / F1.
weighted = metrics_SVM["weighted avg"]
performance_SVM = {"accuracy": metrics_SVM["accuracy"],
               "precision": weighted['precision'],
               "recall": weighted['recall'],
               "f1": weighted['f1-score']}
print(json.dumps(performance_SVM, indent=2))

# Log the metrics computed in this cell; `performance` holds the
# Simple-Random baseline's numbers and must not be logged here.
wandb.log(performance_SVM)

## Poly SVM

In [None]:
# Degree-5 polynomial-kernel SVM on the TF-IDF features, evaluated on the
# validation split.
run = wandb.init(project="Base-Lines", name="Poly-SVM")

# max_iter=1000 caps the solver iterations, so the fit may stop before
# convergence on a large training set.
clf = SVC(C=5.0, kernel='poly', degree=5, random_state=42, max_iter=1000, verbose=1)
# Y_train is train['y']; use the shared name for consistency with other cells.
clf.fit(tf_x_train, Y_train)
y_pred = clf.predict(tf_x_val)

metrics_SVM = classification_report(Y_val, y_pred, output_dict=True)

# Overall accuracy plus support-weighted precision / recall / F1.
weighted = metrics_SVM["weighted avg"]
performance_SVM = {"accuracy": metrics_SVM["accuracy"],
               "precision": weighted['precision'],
               "recall": weighted['recall'],
               "f1": weighted['f1-score']}
print(json.dumps(performance_SVM, indent=2))

# Log the metrics computed in this cell; `performance` holds the
# Simple-Random baseline's numbers and must not be logged here.
wandb.log(performance_SVM)

# Logistic Regression

In [14]:

# Logistic regression on the TF-IDF features, evaluated on the validation split.
# Start a dedicated run; without it the metrics would be logged into whichever
# run happens to still be active from a previous cell.
run = wandb.init(project="Base-Lines", name="Logistic-Regression")

# The saga solver shuffles the data, so fix random_state for reproducibility
# (same seed as the other models in this notebook).
clf = LogisticRegression(max_iter=1000, solver="saga", random_state=42)
clf.fit(tf_x_train, Y_train)
y_pred = clf.predict(tf_x_val)

# NOTE: the *_SVM names are kept as in the original cell in case later cells
# read them, even though this model is not an SVM.
metrics_SVM = classification_report(Y_val, y_pred, output_dict=True)

# Overall accuracy plus support-weighted precision / recall / F1.
weighted = metrics_SVM["weighted avg"]
performance_SVM = {"accuracy": metrics_SVM["accuracy"],
               "precision": weighted['precision'],
               "recall": weighted['recall'],
               "f1": weighted['f1-score']}
print(json.dumps(performance_SVM, indent=2))

# Log the metrics computed in this cell; `performance` holds the
# Simple-Random baseline's numbers and must not be logged here.
wandb.log(performance_SVM)

# Random Forest

In [None]:
# Random forest (10 trees) on the TF-IDF features, evaluated on the
# validation split.
run = wandb.init(project="Base-Lines", name="RandomForest")

clf = RandomForestClassifier(n_estimators=10, random_state=42)
# Y_train is train['y']; use the shared name for consistency with other cells.
clf.fit(tf_x_train, Y_train)
y_pred = clf.predict(tf_x_val)

# NOTE: the *_SVM names are kept as in the original cell in case later cells
# read them, even though this model is not an SVM.
metrics_SVM = classification_report(Y_val, y_pred, output_dict=True)

# Overall accuracy plus support-weighted precision / recall / F1.
weighted = metrics_SVM["weighted avg"]
performance_SVM = {"accuracy": metrics_SVM["accuracy"],
               "precision": weighted['precision'],
               "recall": weighted['recall'],
               "f1": weighted['f1-score']}
print(json.dumps(performance_SVM, indent=2))

# Log the metrics computed in this cell; `performance` holds the
# Simple-Random baseline's numbers and must not be logged here.
wandb.log(performance_SVM)

# Gradient Boosting (scikit-learn; logged as "xgboost")

In [None]:
# Gradient-boosted decision stumps on the TF-IDF features, evaluated on the
# validation split.
# NOTE: the run is named "xgboost", but the model is scikit-learn's
# GradientBoostingClassifier, not the XGBoost library.
run = wandb.init(project="Base-Lines", name="xgboost")

clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=42)
# Y_train is train['y']; use the shared name for consistency with other cells.
clf.fit(tf_x_train, Y_train)
y_pred = clf.predict(tf_x_val)

# NOTE: the *_SVM names are kept as in the original cell in case later cells
# read them, even though this model is not an SVM.
metrics_SVM = classification_report(Y_val, y_pred, output_dict=True)

# Overall accuracy plus support-weighted precision / recall / F1.
weighted = metrics_SVM["weighted avg"]
performance_SVM = {"accuracy": metrics_SVM["accuracy"],
               "precision": weighted['precision'],
               "recall": weighted['recall'],
               "f1": weighted['f1-score']}
print(json.dumps(performance_SVM, indent=2))

# Log the metrics computed in this cell; `performance` holds the
# Simple-Random baseline's numbers and must not be logged here.
wandb.log(performance_SVM)