In [None]:
pip install --no-cache-dir --ignore-installed numpy pandas bs4 nltk scikit-learn matplotlib graphviz

# Part I: Bag of Words

In [None]:
#adapted from:
#https://github.com/sahilee26/IMDB-Movie-Reviews-Sentiment-Analysis/blob/master/Bag-of-words-random-forest.ipynb
#https://github.com/shiaoligreen/practical-data-science/tree/master/Bag%20of%20Words%20Meets%20Bags%20of%20Popcorn

# Load packages

#Generic tools
import re
import numpy as np

#Data pre-preprocessing
import pandas as pd  
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords as nltkstopwords
import nltk.data

#Data split and featurization
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

#Random forest classifier and evaluation
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc, accuracy_score
import matplotlib.pyplot as plt

#Random forest visualization
import graphviz
from sklearn.tree import export_graphviz

#Multilayer Perceptron classifier
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification



In [None]:
# Load data
# quoting=3 is csv.QUOTE_NONE: quote characters inside the reviews are kept as
# literal text instead of being treated as field quoting.
train = pd.read_csv("data/labeledTrainData.tsv", 
                    header=0, delimiter="\t", 
                    quoting=3)

test = pd.read_csv("data/testData.tsv", 
                   header=0, delimiter="\t",
                   quoting=3 )

unlabeled_train = pd.read_csv("data/unlabeledTrainData.tsv", header=0, 
                              delimiter="\t", quoting=3 )

# Split data to train and test partitions (80% / 20%).
# A fixed random_state makes the split, and therefore every score reported
# further down, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(train.drop(columns=['sentiment']), 
                                                    train.sentiment, test_size=0.2,
                                                    random_state=42)

# Renumber the rows 0..n-1 after the shuffle
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)

In [None]:
# Fetch the NLTK stop-word corpus; nltkstopwords.words(...) reads from it
nltk.download("stopwords")

In [None]:
from nltk.corpus import stopwords as nltkstopwords
#print(nltkstopwords.words('english'))
#print(nltkstopwords.words('german'))
#print(nltkstopwords.words('chinese'))
#print(nltkstopwords.words.__dir__)

In [None]:
# Stop-word sets already loaded from NLTK, keyed by language name.
_STOPWORD_CACHE = {}


def _stopword_set(language="english"):
    '''Return the NLTK stop words for `language` as a frozenset, loading them only once.'''
    if language not in _STOPWORD_CACHE:
        _STOPWORD_CACHE[language] = frozenset(nltkstopwords.words(language))
    return _STOPWORD_CACHE[language]


def review_to_words(review, string=True, remove_stopwords=False):
    '''
    Convert one raw review into lower-case, letters-only words,
    optionally removing English stop words.

    Parameters:
        review: raw review text, may contain HTML markup.
        string: if True (default) return the words joined by single spaces;
            if False return them as a list.
        remove_stopwords: if True drop NLTK English stop words (False by default).

    Returns:
        A space-separated string when `string` is True, otherwise a list of words.
    '''
    # Remove HTML. Naming the parser keeps the result independent of which
    # optional parsers happen to be installed and avoids bs4's
    # GuessedAtParserWarning.
    review_text = BeautifulSoup(review, "html.parser").get_text()
    
    # Remove non-letters
    review_text = re.sub(r"[^a-zA-Z]", " ", review_text)
    
    # Convert words to lower case and split them
    words = review_text.lower().split()
    
    # Optionally remove stop words (false by default).
    # The stop-word set is cached so it is not re-read for every review.
    if remove_stopwords:
        stops = _stopword_set("english")
        words = [w for w in words if w not in stops]

    if string:
        return " ".join(words)
    return words

In [None]:
def review_to_bigrams(review, remove_stopwords=False, add_startend_tokens=True):
    '''
    Convert one raw review into a sequence of word bigrams,
    optionally removing stop words.

    E.g., "I liked this movie" -> ["i liked", "liked this", "this movie"]
    (plus "START i" and "movie END" when add_startend_tokens is True).

    Parameters:
        review: raw review text, may contain HTML markup.
        remove_stopwords: if True drop NLTK English stop words (False by default).
        add_startend_tokens: if True (default) wrap the word sequence in
            "START" / "END" markers before pairing.

    Returns:
        A list of bigrams, each a string of two words joined by a space.
    '''
    # Reuse the shared cleaning step (HTML removal, letters only, lower-casing,
    # optional stop-word removal) so both featurizers tokenize identically.
    words = review_to_words(review, string=False, remove_stopwords=remove_stopwords)

    # Optionally add START and END tokens (True by default). The markers are
    # upper-case while every cleaned word is lower-case, so they cannot collide
    # with real words, and adding them after stop-word filtering gives the same
    # result as adding them before.
    if add_startend_tokens:
        words = ["START"] + words + ["END"]

    # Pair each word with its successor
    return [first + " " + second for first, second in zip(words, words[1:])]

# Quick check on a one-sentence review (pass the raw text, not a token list).
# With the default arguments this prints:
# ['START i', 'i liked', 'liked this', 'this movie', 'movie END']
my_bigrams = review_to_bigrams(review="I liked this movie")
print(my_bigrams)

In [None]:
# Get list of reviews
# Iterate over the column values directly instead of looking rows up by label,
# so the cleaning does not depend on the frames having a 0..n-1 index.
clean_train_reviews = [review_to_words(review, remove_stopwords=True) for review in X_train["review"]]
clean_test_reviews = [review_to_words(review, remove_stopwords=True) for review in X_test["review"]]

print(clean_train_reviews[0])

In [None]:
# Get lists of reviews using the bigram function instead of the review_to_words function
# Iterate over the column values directly instead of looking rows up by label,
# so the conversion does not depend on the frames having a 0..n-1 index.

clean_train_bigram_reviews = [review_to_bigrams(review) for review in X_train["review"]]
clean_test_bigram_reviews = [review_to_bigrams(review) for review in X_test["review"]]

print(clean_train_bigram_reviews[0])

In [None]:
# Bag-of-words featurizer.
# max_features sets the vocabulary size: only the most frequent terms are kept
# (here the 5k most common). How does model performance change if you
# increase/decrease this value?
vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=5000,
)

# Learn the vocabulary from the training reviews only, then encode both
# partitions as dense count arrays.
train_feat = vectorizer.fit_transform(clean_train_reviews).toarray()
test_feat = vectorizer.transform(clean_test_reviews).toarray()

In [None]:
# Shape of the dense training count matrix produced by the vectorizer:
# one row per training review, one column per vocabulary term.
train_feat.shape

In [None]:
# Shape of the dense test count matrix; it is encoded with the same fitted
# vectorizer as the training matrix, so the column count should match.
test_feat.shape

In [None]:
# Take a look at the vocabulary
# vocab holds the terms the fitted vectorizer uses as feature columns; it is
# reused later as feature_names when the decision trees are drawn.
vocab = vectorizer.get_feature_names_out()
# Show only the first 100 terms to keep the output short
print(vocab[:100])

In [None]:
# Get predictions
def _plot_roc_curve(model, test_feat, y_test, title):
    '''Print the AUC and plot the ROC curve of a fitted binary classifier.'''
    if len(model.classes_) != 2:
        print("ROC curve skipped: it is only drawn for binary classifiers.")
        return

    # A ROC curve needs continuous scores, not hard class predictions.
    # Column 1 of predict_proba and positive decision_function values both
    # refer to model.classes_[1], which is therefore passed as the positive label.
    if hasattr(model, "predict_proba"):
        scores = model.predict_proba(test_feat)[:, 1]
    else:
        scores = model.decision_function(test_feat)

    fpr, tpr, _ = roc_curve(y_test, scores, pos_label=model.classes_[1])
    roc_auc = auc(fpr, tpr)
    print('AUC:', roc_auc)

    plt.plot(fpr, tpr)
    plt.title(title)
    plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.show()


def get_preds(test_feat, train_feat, y_test, y_train, model, title='Random Forest', plot_roc=False):
    '''
    Fit `model` on the training features, predict the test features and
    print the test accuracy.

    Note the argument order: the test partition comes before the train
    partition for both the features and the labels.

    Parameters:
        test_feat: feature matrix of the test partition.
        train_feat: feature matrix of the training partition.
        y_test: true labels of the test partition.
        y_train: true labels of the training partition.
        model: unfitted estimator exposing fit() and predict().
        title: plot title, only used when plot_roc is True.
        plot_roc: if True, also print the AUC and draw the ROC curve
            (False by default, so existing calls behave as before).

    Returns:
        (y_preds, model): the predicted test labels and the fitted model.
    '''
    print("Training model, this may take some time...")
    model.fit(train_feat, y_train)
    
    print("Evaluating model...")
    y_preds = model.predict(test_feat)
    
    accuracy = accuracy_score(y_test, y_preds)
    print("Accuracy:", accuracy)

    #F1 doesn't matter because of class balance here

    if plot_roc:
        _plot_roc_curve(model, test_feat, y_test, title)
    
    return y_preds, model

In [None]:
# random_state is fixed, like for the other models in this notebook, so the
# forest (and its accuracy) is the same on every run.
preds_rf, model = get_preds(test_feat, train_feat, 
                  y_test, y_train, 
                  RandomForestClassifier(n_estimators = 100, random_state = 42)) #How does performance changes if you increase/decrease the number of estimators (trees)?

## Try training a LogisticRegression model

In [None]:
from sklearn.linear_model import LogisticRegression, SGDClassifier

In [None]:
# L2-regularised logistic regression with C=1.
# Keyword arguments make get_preds' test-before-train parameter order explicit.
preds_rf, model = get_preds(
    test_feat=test_feat,
    train_feat=train_feat,
    y_test=y_test,
    y_train=y_train,
    model=LogisticRegression(penalty='l2', max_iter=500, C=1, random_state=42),
)

In [None]:
# Same model with stronger regularisation (C=0.1).
# Keyword arguments make get_preds' test-before-train parameter order explicit.
preds_rf, model = get_preds(
    test_feat=test_feat,
    train_feat=train_feat,
    y_test=y_test,
    y_train=y_train,
    model=LogisticRegression(penalty='l2', max_iter=500, C=0.1, random_state=42),
)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Define a list of models with different parameters
models = [
    LogisticRegression(penalty='l2', max_iter=500, C=0.1, random_state=42),
    LogisticRegression(penalty='l2', max_iter=500, C=1, solver='liblinear', random_state=42),
    LogisticRegression(penalty='l2', max_iter=500, C=1, class_weight='balanced', random_state=42),
    LogisticRegression(penalty='l2', max_iter=1000, C=0.5, solver='saga', class_weight='balanced', random_state=42)
]

# Initialize variables to keep track of the best model and its score
best_model = None
best_score = 0

# Iterate through the list of models.
# The loop variable is deliberately not called `model`: that name is the
# notebook-level classifier returned by get_preds and must not be overwritten
# as a side effect of this comparison.
# NOTE: the candidates are compared on the same test partition that is used for
# reporting, so the best accuracy below is an optimistic estimate.
for candidate in models:
    # Fit the candidate on the training data
    candidate.fit(train_feat, y_train)
    
    # Predict on the test data
    preds = candidate.predict(test_feat)
    
    # Calculate accuracy
    score = accuracy_score(y_test, preds)
    
    # Check if this candidate is the best so far
    if score > best_score:
        best_model = candidate
        best_score = score

# Output the best model and its score
print(f'Best Model: {best_model}')
print(f'Best Accuracy: {best_score}')


### Linear SVM model

In [None]:
# Linear SVM: SGDClassifier trained with the hinge loss.
# Keyword arguments make get_preds' test-before-train parameter order explicit.
preds_rf, model = get_preds(
    test_feat=test_feat,
    train_feat=train_feat,
    y_test=y_test,
    y_train=y_train,
    model=SGDClassifier(loss='hinge', max_iter=500, random_state=42),
)

In [None]:
import graphviz


In [None]:
 # visualize decision tree from classifier
# `model` is reassigned by the logistic-regression and SVM cells that run after
# the random forest was trained, so at this point it usually is not a forest and
# has no `estimators_`. Reuse it only if it really is a fitted forest; otherwise
# fit one for the visualization.
if isinstance(model, RandomForestClassifier) and hasattr(model, "estimators_"):
    forest = model
else:
    forest = RandomForestClassifier(n_estimators=100, random_state=42).fit(train_feat, y_train)

# Draw the top levels (max_depth=2) of the first 10 trees; slicing instead of
# indexing keeps this working for forests with fewer than 10 trees.
for tree in forest.estimators_[:10]:
    dot_data = export_graphviz(tree,
                               feature_names=vocab,  
                               filled=True,  
                               max_depth=2, 
                               impurity=False, 
                               proportion=True)
    graph = graphviz.Source(dot_data)
    display(graph)

In [None]:
# Inspect the first test review: predicted label, features, cleaned text, true label.
# NOTE(review): despite its name, preds_rf holds the predictions of whichever
# get_preds cell ran last, which is not necessarily the random forest.
print(preds_rf[0]) #predicted label for the first test review
print(test_feat[0]) #bag-of-words count vector of the first test review
print(clean_test_reviews[0]) #cleaned text of the first test review
print(type(y_test))
print(y_test.iloc[0]) #true label of the first test review (positional lookup)

In [None]:
pip install tensorflow

# Part II: Multilayer Perceptron

In [None]:
# MLPClassifier reference:
# https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html
# Baseline network: a single hidden layer of 100 units.
print("Training MLP classifier... this may take some time")
mlp_clf = MLPClassifier(
    hidden_layer_sizes=(100,),
    random_state=1,
    max_iter=300,
    verbose=True,
)
mlp_clf.fit(train_feat, y_train)
print("Done!")

In [None]:
# Mean accuracy of the baseline MLP on the held-out test partition
accuracy = mlp_clf.score(X=test_feat, y=y_test)
print("Accuracy: {}".format(accuracy))

In [None]:
# Show the full hyper-parameter set of the baseline MLP, including the values
# that were left at their defaults.
mlp_clf.get_params()

In [None]:
# MLPClassifier reference:
# https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html
# Deeper network: three hidden layers of 500, 250 and 500 units.
print("Training deeper MLP classifier... this may take some time")
mlp_clf_deeper = MLPClassifier(
    hidden_layer_sizes=(500, 250, 500,),
    random_state=1,
    max_iter=300,
    verbose=True,
)
mlp_clf_deeper.fit(train_feat, y_train)
print("Done!")

In [None]:
# Mean accuracy of the deeper MLP on the held-out test partition
accuracy = mlp_clf_deeper.score(X=test_feat, y=y_test)
print("Accuracy: {}".format(accuracy))

In [None]:
# MLPClassifier reference:
# https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html
# NOTE(review): this cell trains the same configuration (same layers, same
# random_state) as the earlier deeper-MLP cell and overwrites mlp_clf_deeper;
# confirm whether the repeat is intentional.
print("Training deeper MLP classifier... this may take some time")
mlp_clf_deeper = MLPClassifier(
    hidden_layer_sizes=(500, 250, 500,),
    random_state=1,
    max_iter=300,
    verbose=True,
)
mlp_clf_deeper.fit(train_feat, y_train)
print("Done!")