# Imports

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk import WordNetLemmatizer
from collections import defaultdict
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from datetime import datetime, timedelta

# Load Data

In [77]:
train_X_original = pd.read_csv("../data/original_data_X_train.csv")
train_y_original = pd.read_csv("../data/original_data_y_train.csv")
augmented_data = pd.read_csv("../data/balanced_augmentation_dataset.csv")
X_test = pd.read_csv("../data/original_data_X_test.csv")
y_test = pd.read_csv("../data/original_data_y_test.csv")

augmented_data=augmented_data[["Reviews", "Useful?"]]
augmented_data.rename(columns={"Reviews":"reviews", "Useful?":"Judgement"}, inplace=True)
X_train_augmented = pd.concat([train_X_original, augmented_data[["reviews"]]], axis=0, ignore_index=True)
y_train_augmented = pd.concat([train_y_original, augmented_data[["Judgement"]]], axis=0, ignore_index=True)

In [102]:
augmented_data

Unnamed: 0,reviews,Judgement
0,I give it 5 stars it almost had everything I ...,1
1,I like this app and I segues to make stronger ...,1
2,So far does everything I want it to. \n,0
3,This app provides a lot of value for my phone ...,0
4,This is a wonderful app for figuring out what ...,0
...,...,...
1727,Nowhere close to Android version that I've bee...,1
1728,I was at one point happy with this map until N...,1
1729,I somewhat agree with one other review about t...,1
1730,Calibration does not work right on the update ...,1


# Utility functions

In [78]:
def clean_text(text, remove_stop = False):
    tokens = word_tokenize(text)
    # Remove the punctuations
    tokens = [word for word in tokens if word.isalpha()]
    # Lower the tokens
    tokens = [word.lower() for word in tokens]
    if remove_stop:
        # Remove stopword
        tokens = [word for word in tokens if not word in stopwords.words("english")]
    # Lemmatize
    lemma = WordNetLemmatizer()
    tokens = [lemma.lemmatize(word, pos = "v") for word in tokens]
    tokens = [lemma.lemmatize(word, pos = "n") for word in tokens]
    return tokens

In [79]:
def get_vector(total_doc):
    text_corpus = []
    tokens = defaultdict(int)
    for doc in total_doc:
        tk = clean_text(doc)
        for t in tk:
            tokens[t]+=1
        text_corpus.append(tk)
    text_vectors = []
    token_count = len(tokens)
    token_index = {}
    idx = 0
    for token in tokens:
        token_index[token] = idx
        idx+=1

    for text in text_corpus:
        tmp_vector = np.zeros(token_count)
        for tok in text:
            tmp_vector[token_index[tok]]=1
        text_vectors.append(tmp_vector)
    return text_vectors
    

# Models

## Naive Bayes

### Task P1

#### Before Augmentation

In [80]:
# x = np.array(get_vector(golden_1['reviews']))
# y = np.array(golden_1['Judgement'])

# X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.05, random_state=0)
# X_train = np.array(get_vector(X_train['reviews']))
# X_test = np.array(get_vector(X_test['reviews']))
X = pd.concat([train_X_original, X_test])
X = np.array(get_vector(X['reviews']))
X_train = X[:train_X_original.shape[0]]
X_test = X[-X_test.shape[0]:]
learner = GaussianNB()
learner = learner.fit(X_train, train_y_original)

start_time = datetime.now()
y_pred = learner.predict(X_test)
acc =  (y_test["Judgement"] == y_pred).sum()/X_test.shape[0]

print("Number of mislabeled points out of a total %d points : %d, accuracy = %f"% (X_test.shape[0], (y_test["Judgement"]!= y_pred).sum(),acc))
print(classification_report(y_test["Judgement"], y_pred))
print(f"Processing Time: {(datetime.now() - start_time).total_seconds()}")

Number of mislabeled points out of a total 200 points : 39, accuracy = 0.805000
              precision    recall  f1-score   support

           0       0.86      0.75      0.80       105
           1       0.76      0.86      0.81        95

    accuracy                           0.81       200
   macro avg       0.81      0.81      0.80       200
weighted avg       0.81      0.81      0.80       200

Processing Time: 0.013959


  y = column_or_1d(y, warn=True)


#### After Augmentation

In [83]:
train_X_original = pd.read_csv("../data/original_data_X_train.csv")
train_y_original = pd.read_csv("../data/original_data_y_train.csv")
augmented_data = pd.read_csv("../data/balanced_augmentation_dataset.csv")
X_test = pd.read_csv("../data/original_data_X_test.csv")
y_test = pd.read_csv("../data/original_data_y_test.csv")

augmented_data=augmented_data[["Reviews", "Useful?"]]
augmented_data.rename(columns={"Reviews":"reviews", "Useful?":"Judgement"}, inplace=True)
X_train_augmented = pd.concat([train_X_original, augmented_data[["reviews"]]], axis=0, ignore_index=True)
y_train_augmented = pd.concat([train_y_original, augmented_data[["Judgement"]]], axis=0, ignore_index=True)

In [84]:
# x = np.array(get_vector(golden_1['reviews']))
# y = np.array(golden_1['Judgement'])

# X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.05, random_state=0)
# X_train = np.array(get_vector(X_train['reviews']))
# X_test = np.array(get_vector(X_test['reviews']))
X = pd.concat([X_train_augmented, X_test])
X = np.array(get_vector(X['reviews']))
X_train = X[:X_train_augmented.shape[0]]
X_test = X[-X_test.shape[0]:]
learner = GaussianNB()
learner = learner.fit(X_train, y_train)

start_time = datetime.now()
y_pred = learner.predict(X_test)
acc =  (y_test["Judgement"] == y_pred).sum()/X_test.shape[0]

print("Number of mislabeled points out of a total %d points : %d, accuracy = %f"% (X_test.shape[0], (y_test["Judgement"]!= y_pred).sum(),acc))
print(classification_report(y_test["Judgement"], y_pred))
print(f"Processing Time: {(datetime.now() - start_time).total_seconds()}")

  y = column_or_1d(y, warn=True)


Number of mislabeled points out of a total 200 points : 58, accuracy = 0.710000
              precision    recall  f1-score   support

           0       0.67      0.90      0.76       105
           1       0.81      0.51      0.62        95

    accuracy                           0.71       200
   macro avg       0.74      0.70      0.69       200
weighted avg       0.74      0.71      0.70       200

Processing Time: 0.024956


### Task P2

In [50]:
# x = np.array(get_vector(golden_2['reviews']))
# y = np.array(golden_2['Judgement'])

# X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.05, random_state=0)
# learner = GaussianNB()
# learner = learner.fit(X_train, y_train)

# start_time = datetime.now()
# y_pred = learner.predict(X_test)
# acc =  (y_test== y_pred).sum()/X_test.shape[0]

# print("Number of mislabeled points out of a total %d points : %d, accuracy = %f"% (X_test.shape[0], (y_test!= y_pred).sum(),acc))
# print(classification_report(y_test, y_pred))
# print(f"Processing Time: {(datetime.now() - start_time).total_seconds()}")

## TF-IDF + Sklearn learners approach

#### Before Augmentations

In [96]:
train_X_original = pd.read_csv("../data/original_data_X_train.csv")
train_y_original = pd.read_csv("../data/original_data_y_train.csv")
augmented_data = pd.read_csv("../data/balanced_augmentation_dataset.csv")
X_test = pd.read_csv("../data/original_data_X_test.csv")
y_test = pd.read_csv("../data/original_data_y_test.csv")

augmented_data=augmented_data[["Reviews", "Useful?"]]
augmented_data.rename(columns={"Reviews":"reviews", "Useful?":"Judgement"}, inplace=True)
X_train_augmented = pd.concat([train_X_original, augmented_data[["reviews"]]], axis=0, ignore_index=True)
y_train_augmented = pd.concat([train_y_original, augmented_data[["Judgement"]]], axis=0, ignore_index=True)

### Task P1

In [97]:
vectorizer = TfidfVectorizer()

X_train = vectorizer.fit_transform(train_X_original["reviews"]).toarray()
X_test = vectorizer.transform(X_test["reviews"]).toarray()
learner = SVC(gamma=2, C=1 ,random_state=0)
learner = learner.fit(X_train, train_y_original["Judgement"])

start_time = datetime.now()
y_pred = learner.predict(X_test)
acc =  (y_test["Judgement"]== y_pred).sum()/X_test.shape[0]

print("Number of mislabeled points out of a total %d points : %d, accuracy = %f"% (X_test.shape[0], (y_test["Judgement"]!= y_pred).sum(),acc))
print(classification_report(y_test["Judgement"], y_pred))
print(f"Processing Time: {(datetime.now() - start_time).total_seconds()}")

Number of mislabeled points out of a total 200 points : 34, accuracy = 0.830000
              precision    recall  f1-score   support

           0       0.90      0.76      0.82       105
           1       0.77      0.91      0.83        95

    accuracy                           0.83       200
   macro avg       0.84      0.83      0.83       200
weighted avg       0.84      0.83      0.83       200

Processing Time: 0.343426


#### After Augmentation

In [93]:
train_X_original = pd.read_csv("../data/original_data_X_train.csv")
train_y_original = pd.read_csv("../data/original_data_y_train.csv")
augmented_data = pd.read_csv("../data/balanced_augmentation_dataset.csv")
X_test = pd.read_csv("../data/original_data_X_test.csv")
y_test = pd.read_csv("../data/original_data_y_test.csv")

augmented_data=augmented_data[["Reviews", "Useful?"]]
augmented_data.rename(columns={"Reviews":"reviews", "Useful?":"Judgement"}, inplace=True)
X_train_augmented = pd.concat([train_X_original, augmented_data[["reviews"]]], axis=0, ignore_index=True)
y_train_augmented = pd.concat([train_y_original, augmented_data[["Judgement"]]], axis=0, ignore_index=True)

In [95]:
vectorizer = TfidfVectorizer()

X_train = vectorizer.fit_transform(X_train_augmented["reviews"]).toarray()
X_test = vectorizer.transform(X_test["reviews"]).toarray()
learner = SVC(gamma=2, C=1, random_state=0)
learner = learner.fit(X_train, y_train_augmented["Judgement"])

start_time = datetime.now()
y_pred = learner.predict(X_test)
acc =  (y_test["Judgement"]== y_pred).sum()/X_test.shape[0]

print("Number of mislabeled points out of a total %d points : %d, accuracy = %f"% (X_test.shape[0], (y_test["Judgement"]!= y_pred).sum(),acc))
print(classification_report(y_test["Judgement"], y_pred))
print(f"Processing Time: {(datetime.now() - start_time).total_seconds()}")

Number of mislabeled points out of a total 200 points : 38, accuracy = 0.810000
              precision    recall  f1-score   support

           0       0.78      0.89      0.83       105
           1       0.85      0.73      0.78        95

    accuracy                           0.81       200
   macro avg       0.82      0.81      0.81       200
weighted avg       0.81      0.81      0.81       200

Processing Time: 2.360059


### Task P2

In [9]:
vectorizer = TfidfVectorizer()
corpus = golden_2['reviews']
x = vectorizer.fit_transform(corpus).toarray()
y = np.array(golden_2['Judgement'])
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
learner = SVC(gamma=2, C=1)
learner = learner.fit(X_train, y_train)

start_time = datetime.now()
y_pred = learner.predict(X_test)
acc =  (y_test== y_pred).sum()/X_test.shape[0]

print("Number of mislabeled points out of a total %d points : %d, accuracy = %f"% (X_test.shape[0], (y_test!= y_pred).sum(),acc))
print(classification_report(y_test, y_pred))
print(f"Processing Time: {(datetime.now() - start_time).total_seconds()}")

Number of mislabeled points out of a total 249 points : 59, accuracy = 0.763052
              precision    recall  f1-score   support

           0       0.73      0.92      0.82       142
           1       0.83      0.56      0.67       107

    accuracy                           0.76       249
   macro avg       0.78      0.74      0.74       249
weighted avg       0.78      0.76      0.75       249

Processing Time: 0.453092
