In [None]:
import os
import re,string,unicodedata

import nltk
import numpy as np
import pandas as pd
import sklearn as sk
# Attribute access such as `sk.metrics` only works once that submodule has
# been imported, so import every scikit-learn submodule the notebook uses.
import sklearn.feature_extraction.text
import sklearn.linear_model
import sklearn.metrics
import sklearn.naive_bayes
import sklearn.preprocessing
from bs4 import BeautifulSoup

nltk.download('stopwords')

In [None]:
# Load the review dataset from a path relative to the working directory.
data = pd.read_csv("../res/Dataset.csv")

# Fail early, with a readable message, if the columns used by the following
# cells are not present.
missing_columns = {'review', 'sentiment'} - set(data.columns)
assert not missing_columns, "Dataset.csv is missing columns: %s" % sorted(missing_columns)

print(data.shape)
print(data['sentiment'].value_counts())

# The preview has to be the cell's last expression to be rendered: a bare
# expression in the middle of a cell is evaluated and then thrown away.
data.head(10)


In [None]:
# Built once and reused for every review instead of once per call.
# Raw strings keep `\[` and `\s` from being parsed as (invalid) string escapes.
_BRACKETED_RE = re.compile(r'\[[^]]*\]')
_NON_ALNUM_RE = re.compile(r'[^a-zA-Z0-9\s]')
_STEMMER = nltk.stem.PorterStemmer()


def cleanup(text):
    """Normalise one raw review string before vectorisation.

    Steps, in order: strip HTML markup, drop ``[...]`` bracketed spans, delete
    every character that is not an ASCII letter, digit or whitespace, then
    Porter-stem each whitespace-separated word.

    Args:
        text: Raw review text, possibly containing HTML tags.

    Returns:
        The stemmed words joined by single spaces.
    """
    # separator=" " keeps the words on either side of a tag apart. With the
    # default empty separator "end.<br />Start" becomes "end.Start", which the
    # punctuation stripping below would fuse into the bogus token "endStart".
    text = BeautifulSoup(text, "html.parser").get_text(separator=" ")
    text = _BRACKETED_RE.sub('', text)
    text = _NON_ALNUM_RE.sub('', text)
    return ' '.join(_STEMMER.stem(word) for word in text.split())
    
# One Toktok tokenizer instance, reused by the stop-word filtering step.
tokenizer = nltk.tokenize.toktok.ToktokTokenizer()
# Run cleanup over every review and store the result back in the same column.
data['review'] = data['review'].map(cleanup)

In [None]:
stops = nltk.corpus.stopwords.words("english")

# The reviews reaching this cell have already been Porter-stemmed by
# cleanup(), so a stop word such as "this" arrives as "thi" and would slip
# past a lookup against the raw list. Match both the plain and the stemmed
# spelling, and keep them in a frozenset so each membership test is a hash
# lookup rather than a scan of the whole list.
_stop_stemmer = nltk.stem.PorterStemmer()
_stop_lookup = frozenset(stops) | frozenset(_stop_stemmer.stem(word) for word in stops)


def remove_stopwords(text):
    """Return ``text`` with English stop words removed.

    Args:
        text: A review string that has already been through ``cleanup``.

    Returns:
        The remaining tokens joined by single spaces.
    """
    tokens = (token.strip() for token in tokenizer.tokenize(text))
    return ' '.join(token for token in tokens if token.lower() not in _stop_lookup)


# Apply the filter to every review.
data['review'] = data['review'].apply(remove_stopwords)

In [None]:
# Shuffle the rows. A fixed random_state gives the same train/test partition
# on every kernel restart, so downstream results can be reproduced.
data_norm_shuffle = data.sample(frac=1, random_state=42).reset_index(drop=True)

# The first `split` rows are used for training, the remainder for testing.
split = 30000
assert 0 < split < len(data_norm_shuffle), "split must leave rows on both sides"

train_reviews = data_norm_shuffle['review'][:split]
train_sentiments = data_norm_shuffle['sentiment'][:split]

# Re-index the test slices from 0 so that `test_reviews[0]` below is valid.
test_reviews = data_norm_shuffle['review'][split:].reset_index(drop=True)
test_sentiments = data_norm_shuffle['sentiment'][split:].reset_index(drop=True)

assert len(train_reviews) + len(test_reviews) == len(data_norm_shuffle)

print(train_reviews.shape, train_sentiments.shape)
print(train_reviews[0][0:50]," : ", train_sentiments[0])
print(test_reviews.shape, test_sentiments.shape)
print(test_reviews[0][0:50]," : ", test_sentiments[0])

In [None]:
# Bag-of-words counts over uni-, bi- and tri-grams.
# min_df / max_df change meaning with their type: an int is an absolute number
# of documents, a float is a proportion of documents. An int max_df=1 would
# keep only terms that occur in a single training document, and an int
# min_df=0 is outside the accepted range on recent scikit-learn releases.
# min_df=1 with max_df=1.0 expresses "no document-frequency filtering".
cv = sk.feature_extraction.text.CountVectorizer(min_df=1, max_df=1.0, binary=False, ngram_range=(1,3))
# Learn the vocabulary on the training reviews only, then reuse it for test.
train_reviews_vector = cv.fit_transform(train_reviews)
test_reviews_vector = cv.transform(test_reviews)

print(train_reviews_vector.shape)
print(test_reviews_vector.shape)

In [None]:
# TF-IDF weights over uni-, bi- and tri-grams.
# min_df / max_df change meaning with their type: an int is an absolute number
# of documents, a float is a proportion of documents. An int max_df=1 would
# keep only terms that occur in a single training document, and an int
# min_df=0 is outside the accepted range on recent scikit-learn releases.
# min_df=1 with max_df=1.0 expresses "no document-frequency filtering".
tv = sk.feature_extraction.text.TfidfVectorizer(min_df=1, max_df=1.0, use_idf=True, ngram_range=(1,3))
# Learn vocabulary and IDF on the training reviews only, then reuse for test.
train_reviews_tvector = tv.fit_transform(train_reviews)
test_reviews_tvector = tv.transform(test_reviews)

print(train_reviews_tvector.shape)
print(test_reviews_tvector.shape)

In [None]:
# Binarize the sentiment labels; the binarizer is fitted on the whole shuffled
# dataset and the encoded array is cut afterwards.
lb = sk.preprocessing.LabelBinarizer()
train_sentiments_vector = lb.fit_transform(data_norm_shuffle['sentiment'])

# Cut the encoded labels at the same row boundary as the reviews. This rebinds
# train_sentiments / test_sentiments to the encoded arrays.
train_sentiments, test_sentiments = train_sentiments_vector[:split], train_sentiments_vector[split:]
print(train_sentiments.shape, test_sentiments.shape, sep='\n')

In [None]:
def learingmethod(string, model, train_reviews_vector, train_sentiments, test_reviews_vector, test_sentiments):
    """Fit ``model`` on the training split and print its test-set metrics.

    Args:
        string: Heading printed before the metrics. Inside this function the
            name shadows the imported ``string`` module.
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is
            fitted in place.
        train_reviews_vector: Feature matrix used for fitting.
        train_sentiments: Training labels; flattened to 1-D before use.
        test_reviews_vector: Feature matrix used for prediction.
        test_sentiments: Ground-truth test labels; flattened to 1-D before use.

    Returns:
        None. Accuracy, precision, recall, F1, the confusion matrix and the
        classification report are printed.
    """
    # The label arrays built with LabelBinarizer are (n, 1) columns, while
    # estimators expect 1-D targets and warn about column vectors. ravel() is
    # a no-op for labels that are already 1-D.
    train_sentiments = np.ravel(train_sentiments)
    test_sentiments = np.ravel(test_sentiments)

    model.fit(train_reviews_vector, train_sentiments)
    predictions = model.predict(test_reviews_vector)
    print(string)
    print("Accuracy: ", sk.metrics.accuracy_score(test_sentiments, predictions))
    print("Precision: ", sk.metrics.precision_score(test_sentiments, predictions))
    print("Recall: ", sk.metrics.recall_score(test_sentiments, predictions))
    print("F1: ", sk.metrics.f1_score(test_sentiments, predictions))
    print("Confusion Matrix: ", sk.metrics.confusion_matrix(test_sentiments, predictions))
    print("Classification Report: ", sk.metrics.classification_report(test_sentiments, predictions))

In [None]:
# L2-penalised logistic regression, evaluated on both feature sets.
# learingmethod() refits the estimator on each call, so one instance is reused.
lr = sk.linear_model.LogisticRegression(penalty='l2', max_iter=500, C=1, solver='lbfgs', random_state=42)
# The headings name the estimator actually used (LogisticRegression, a
# classifier) rather than "LinearRegression".
learingmethod("LogisticRegression BOW", lr, train_reviews_vector, train_sentiments, test_reviews_vector, test_sentiments)
learingmethod("LogisticRegression TFIDF", lr, train_reviews_tvector, train_sentiments, test_reviews_tvector, test_sentiments)

In [None]:
# Linear SVM trained with SGD (hinge loss), evaluated on both feature sets.
svm = sk.linear_model.SGDClassifier(loss='hinge', max_iter=500, random_state=42)
# Pass `svm` itself: handing over `lr` here would simply re-run the logistic
# regression under an SGDClassifier heading.
learingmethod("SGDClassifier BOW", svm, train_reviews_vector, train_sentiments, test_reviews_vector, test_sentiments)
learingmethod("SGDClassifier TFIDF", svm, train_reviews_tvector, train_sentiments, test_reviews_tvector, test_sentiments)


In [None]:
# Multinomial naive Bayes, evaluated on both feature sets.
# GaussianNB requires dense input and so cannot be fitted on the sparse
# matrices the vectorizers produce; MultinomialNB accepts sparse count and
# TF-IDF features and matches the `mnb` variable name.
mnb = sk.naive_bayes.MultinomialNB()
# Pass `mnb` itself: handing over `lr` here would simply re-run the logistic
# regression under a naive Bayes heading.
learingmethod("MultinomialNB BOW", mnb, train_reviews_vector, train_sentiments, test_reviews_vector, test_sentiments)
learingmethod("MultinomialNB TFIDF", mnb, train_reviews_tvector, train_sentiments, test_reviews_tvector, test_sentiments)
