In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer as sa
from nltk.stem import WordNetLemmatizer 
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
pd.options.mode.chained_assignment = None
import pickle

In [18]:
training = pd.read_csv('yelp_review_polarity_csv/train.csv', sep = ',', names =  ['attitude','text'])
test = pd.read_csv('yelp_review_polarity_csv/test.csv', sep = ',', names =  ['attitude','text'])

In [26]:
def unification(data_cleaned):
    print("uni")
    #Making all letters lowercase
    data_cleaned['text'] = data_cleaned['text'].apply(lambda x: x.lower())
    #Removing Punctuation, Symbols
    data_cleaned['text'] = data_cleaned['text'].str.replace('[^\w\s]',' ')
    #Removing Stop Words
    stop_words = stopwords.words('english')
    for i in range(len(data_cleaned)):
        if i%10000==0:
            print(i)
        text = ''
        for word in data_cleaned.loc[i]['text'].split():
            if word not in stop_words:
                text+=' '+word
        data_cleaned.loc[i]['text'] = text.strip(' ')
    
    return data_cleaned

def lemmatisation(data_cleaned):
    print("lemma")
    lemmatizer = WordNetLemmatizer()
    for i in range(len(data_cleaned)):
        if i%10000==0:
            print(i)
        text = ''
        for word in data_cleaned.loc[i]['text'].split():
            text+=' '+lemmatizer.lemmatize(lemmatizer.lemmatize(lemmatizer.lemmatize(word),pos = 'a'), pos = 'v')
        data_cleaned.loc[i]['text'] = text.strip(' ')
    return data_cleaned

word_dict = {}
def removing_rare_words(data_cleaned, num_to_ignore = 5000):
    print("removing")
    word_dict = {}
    for i in range(len(data_cleaned)):
        for word in data_cleaned.loc[i]['text'].split():
            if word in word_dict:
                word_dict[word]+=1
            else:
                word_dict[word] = 1
    rarest_words = sorted(list(word_dict.keys()), key = lambda x:word_dict[x])[:num_to_ignore]

    for i in range(len(data_cleaned)):
        if i%10000==0:
            print(i)
        text = ''
        for word in data_cleaned.loc[i]['text'].split():
            if word not in rarest_words:
                text+=' '+word
        data_cleaned.loc[i]['text'] = text.strip(' ')
    return data_cleaned

def removing_rare_words_test(data_cleaned, num_to_ignore = 5000):
    rarest_words = sorted(list(word_dict.keys()), key = lambda x:word_dict[x])[:num_to_ignore]
    for i in range(len(data_cleaned)):
        if i%10000==0:
            print(i)
        text = ''
        for word in data_cleaned.loc[i]['text'].split():
            if word not in rarest_words:
                text+=' '+word
        data_cleaned.loc[i]['text'] = text.strip(' ')
    return data_cleaned
    
def data_cleaning(data_cleaned, num_to_ignore = 5000):
    return removing_rare_words(lemmatisation(unification(data_cleaned)), num_to_ignore)

def data_cleaning_test(data_cleaned, num_to_ignore = 5000):
    return removing_rare_words_test(lemmatisation(unification(data_cleaned)), num_to_ignore)
    
analyzer = sa()

In [24]:
data_cleaned = data_cleaning(training)

uni
0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
330000
340000
350000
360000
370000
380000
390000
400000
410000
420000
430000
440000
450000
460000
470000
480000
490000
500000
510000
520000
530000
540000
550000
lemma
0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
330000
340000
350000
360000
370000
380000
390000
400000
410000
420000
430000
440000
450000
460000
470000
480000
490000
500000
510000
520000
530000
540000
550000
removing
0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
330000
3

In [27]:
test_data_cleaned = data_cleaning_test(test)

uni
0
10000
20000
30000
lemma
0
10000
20000
30000
0
10000
20000
30000


In [30]:
data_cleaned.to_csv("cleaned_data/cleaned_train.csv")

In [31]:
test_data_cleaned.to_csv("cleaned_data/cleaned_test.csv")

In [36]:
count_vect = CountVectorizer(analyzer='word')
count_vect.fit(data_cleaned['text'])

X_train_count = count_vect.transform(data_cleaned['text'])
y_train = data_cleaned["attitude"]
X_val_count = count_vect.transform(test_data_cleaned['text']) #validation
y_val =test_data_cleaned["attitude"]

# Model 1: Multinomial Naive Bayes Classifier

nb = MultinomialNB()
nb.fit(X_train_count, y_train)
y_pred = nb.predict(X_val_count)
print('naive bayes count vectors accuracy %s' % accuracy_score(y_pred, y_val))

# Model 2: Linear SVM

lsvm = SGDClassifier(alpha=0.001, random_state=5, max_iter=15, tol=None)
lsvm.fit(X_train_count, y_train)
y_pred = lsvm.predict(X_val_count)
print('lsvm using count vectors accuracy %s' % accuracy_score(y_pred, y_val))

# Model 3: Logistic Regression

logreg = LogisticRegression(C=1)
logreg.fit(X_train_count, y_train)
y_pred = logreg.predict(X_val_count)
print('log reg count vectors accuracy %s' % accuracy_score(y_pred, y_val))

naive bayes count vectors accuracy 0.8679210526315789
lsvm using count vectors accuracy 0.9308684210526316




log reg count vectors accuracy 0.9353421052631579


In [39]:
tfidf = TfidfVectorizer(max_features=500, analyzer='word',ngram_range=(1,3))
tfidf.fit(data_cleaned['text'])

X_train_tfidf = tfidf.transform(data_cleaned['text'])
y_train = data_cleaned["attitude"]
X_val_tfidf = tfidf.transform(test_data_cleaned['text']) #validation
y_val =test_data_cleaned["attitude"]
# Model 1: Multinomial Naive Bayes Classifier

nb = MultinomialNB()
nb.fit(X_train_tfidf, y_train)
y_pred = nb.predict(X_val_tfidf)
print("naive bayes tfidf accuracy %s" %  accuracy_score(y_pred, y_val))

# Model 2: Linear SVM

lsvm = SGDClassifier(alpha=0.001, random_state=5, max_iter=15, tol=None)
lsvm.fit(X_train_tfidf, y_train)
y_pred = lsvm.predict(X_val_tfidf)
print('svm using tfidf accuracy %s' % accuracy_score(y_pred, y_val))

# Model 3: logistic regression

logreg = LogisticRegression(C=1)
logreg.fit(X_train_tfidf, y_train)
y_pred = logreg.predict(X_val_tfidf)
print('log reg tfidf accuracy %s' % accuracy_score(y_pred, y_val))


naive bayes tfidf accuracy 0.8182105263157895
svm using tfidf accuracy 0.8583157894736843




log reg tfidf accuracy 0.8723947368421052
