# Imports

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk import WordNetLemmatizer
from collections import defaultdict
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from datetime import datetime, timedelta

# Load Data

In [2]:
golden_1 = pd.read_excel("./data/P1-Golden.xlsx")
golden_1.head()

Unnamed: 0,reviews,Judgement
0,Well done Microsoft!! OneNote is a fantastic p...,1
1,This works on a limited basis. Works well for...,1
2,This app was great for the first month or two....,1
3,I can't update it in my phone it takes way too...,1
4,Why is it so big?!!,1


In [3]:
golden_2 = pd.read_excel("./data/P2-Golden.xlsx")
golden_2.head()

Unnamed: 0,reviews,Judgement
0,Now I bought an iPhone six running iOS 8.,0
1,It installed fine.,0
2,"However, it won't let me sign on. it doesn't s...",1
3,I've used Viber for a very long time and it's ...,0
4,The phone calls and the chatting is what I uti...,1


# Utility functions

In [4]:
def clean_text(text, remove_stop = False):
    tokens = word_tokenize(text)
    # Remove the punctuations
    tokens = [word for word in tokens if word.isalpha()]
    # Lower the tokens
    tokens = [word.lower() for word in tokens]
    if remove_stop:
        # Remove stopword
        tokens = [word for word in tokens if not word in stopwords.words("english")]
    # Lemmatize
    lemma = WordNetLemmatizer()
    tokens = [lemma.lemmatize(word, pos = "v") for word in tokens]
    tokens = [lemma.lemmatize(word, pos = "n") for word in tokens]
    return tokens

In [5]:
def get_vector(total_doc):
    text_corpus = []
    tokens = defaultdict(int)
    for doc in total_doc:
        tk = clean_text(doc)
        for t in tk:
            tokens[t]+=1
        text_corpus.append(tk)
    text_vectors = []
    token_count = len(tokens)
    token_index = {}
    idx = 0
    for token in tokens:
        token_index[token] = idx
        idx+=1

    for text in text_corpus:
        tmp_vector = np.zeros(token_count)
        for tok in text:
            tmp_vector[token_index[tok]]=1
        text_vectors.append(tmp_vector)
    return text_vectors
    

# Models

## Naive Bayes

### Task P1

In [6]:
x = np.array(get_vector(golden_1['reviews']))
y = np.array(golden_1['Judgement'])

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.05, random_state=0)
learner = GaussianNB()
learner = learner.fit(X_train, y_train)

start_time = datetime.now()
y_pred = learner.predict(X_test)
acc =  (y_test == y_pred).sum()/X_test.shape[0]

print("Number of mislabeled points out of a total %d points : %d, accuracy = %f"% (X_test.shape[0], (y_test!= y_pred).sum(),acc))
print(classification_report(y_test, y_pred))
print(f"Processing Time: {(datetime.now() - start_time).total_seconds()}")

Number of mislabeled points out of a total 50 points : 9, accuracy = 0.820000
              precision    recall  f1-score   support

           0       0.83      0.79      0.81        24
           1       0.81      0.85      0.83        26

    accuracy                           0.82        50
   macro avg       0.82      0.82      0.82        50
weighted avg       0.82      0.82      0.82        50

Processing Time: 0.0


### Task P2

In [7]:
x = np.array(get_vector(golden_2['reviews']))
y = np.array(golden_2['Judgement'])

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.05, random_state=0)
learner = GaussianNB()
learner = learner.fit(X_train, y_train)

start_time = datetime.now()
y_pred = learner.predict(X_test)
acc =  (y_test== y_pred).sum()/X_test.shape[0]

print("Number of mislabeled points out of a total %d points : %d, accuracy = %f"% (X_test.shape[0], (y_test!= y_pred).sum(),acc))
print(classification_report(y_test, y_pred))
print(f"Processing Time: {(datetime.now() - start_time).total_seconds()}")

Number of mislabeled points out of a total 63 points : 18, accuracy = 0.714286
              precision    recall  f1-score   support

           0       0.75      0.82      0.79        40
           1       0.63      0.52      0.57        23

    accuracy                           0.71        63
   macro avg       0.69      0.67      0.68        63
weighted avg       0.71      0.71      0.71        63

Processing Time: 0.01564


## TF-IDF + Sklearn learners approach

### Task P1

In [8]:
vectorizer = TfidfVectorizer()
corpus = golden_1['reviews']
x = vectorizer.fit_transform(corpus).toarray()
y = np.array(golden_1['Judgement'])
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.05, random_state=0)

learner = SVC(gamma=2, C=1)
learner = learner.fit(X_train, y_train)

start_time = datetime.now()
y_pred = learner.predict(X_test)
acc =  (y_test== y_pred).sum()/X_test.shape[0]

print("Number of mislabeled points out of a total %d points : %d, accuracy = %f"% (X_test.shape[0], (y_test!= y_pred).sum(),acc))
print(classification_report(y_test, y_pred))
print(f"Processing Time: {(datetime.now() - start_time).total_seconds()}")

Number of mislabeled points out of a total 50 points : 7, accuracy = 0.860000
              precision    recall  f1-score   support

           0       0.90      0.79      0.84        24
           1       0.83      0.92      0.87        26

    accuracy                           0.86        50
   macro avg       0.87      0.86      0.86        50
weighted avg       0.86      0.86      0.86        50

Processing Time: 0.093747


### Task P2

In [9]:
vectorizer = TfidfVectorizer()
corpus = golden_2['reviews']
x = vectorizer.fit_transform(corpus).toarray()
y = np.array(golden_2['Judgement'])
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
learner = SVC(gamma=2, C=1)
learner = learner.fit(X_train, y_train)

start_time = datetime.now()
y_pred = learner.predict(X_test)
acc =  (y_test== y_pred).sum()/X_test.shape[0]

print("Number of mislabeled points out of a total %d points : %d, accuracy = %f"% (X_test.shape[0], (y_test!= y_pred).sum(),acc))
print(classification_report(y_test, y_pred))
print(f"Processing Time: {(datetime.now() - start_time).total_seconds()}")

Number of mislabeled points out of a total 249 points : 59, accuracy = 0.763052
              precision    recall  f1-score   support

           0       0.73      0.92      0.82       142
           1       0.83      0.56      0.67       107

    accuracy                           0.76       249
   macro avg       0.78      0.74      0.74       249
weighted avg       0.78      0.76      0.75       249

Processing Time: 0.453092
