# Sentiment Analysis using Random Forest and Naive Bayes

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
import tqdm
import bz2
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re

In [2]:
# Load data from bz2 file
data = bz2.BZ2File('data/train.ft.txt.bz2')

In [3]:
# Read lines and decode them using utf-8
data = data.readlines()
data = [i.decode('utf-8') for i in data]

In [4]:
# See how data looks like
data[:5]

['__label__2 Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^\n',
 "__label__2 The best soundtrack ever to anything.: I'm reading a lot of reviews saying that this is the best 'game soundtrack' and I figured that I'd write a review to disagree a bit. This in my opinino is Yasunori Mitsuda's ultimate masterpiece. The music is timeless and I'm been listening to it for years now and its beauty simply refuses to fade.The price tag on this is pretty staggering I must say, but if you are going to buy any cd for this much money, this is the only one that I feel would be worth every penny.\n",
 '__label__2 Amazing!: This s

In [5]:
# Split the data into 2 parts: 1) Labels and 2) Text
def split_data(data):
    text = []
    labels=[]
    
    for x in tqdm.tnrange(100000):
        x = data[x]
        # extract labels 
        label = x[:10]
        # Remove labels from x
        x = x[11:]
        # Remove punctuations
        review = re.sub("<.*?>", "", x)
        # Filter words only, remove numbers
        review = re.sub('[^a-zA-Z]', ' ', review)
        # Convert all words into lower case
        review = review.lower()
        review = review.split()
        
        # Create a onject for wordnet lemmatizer
        lemmatizer = WordNetLemmatizer()
        review = [lemmatizer.lemmatize(x) for x in review if not x in set(stopwords.words('english'))]
        review = ' '.join(review)
        
        # Append cleaned text and label into list
        text.append(review)
        labels.append(label)
    return text, labels
        

In [6]:
cleaned_text, labels = split_data(data)

  


HBox(children=(FloatProgress(value=0.0, max=100000.0), HTML(value='')))




In [7]:
new_labels = []
for i in labels:
    if i=="__label__2": # Positive
        new_labels.append(0)
    elif i=="__label__1": # Negative
        new_labels.append(1)

In [11]:
print("Count of negative labels", new_labels.count(1))
print("Count of positive labels", new_labels.count(0))

Count of negative labels 48733
Count of positive labels 51267


In [12]:
# Create an dataframe using cleaned text and labels
dataset = pd.DataFrame(cleaned_text, columns=["Text"])
dataset["Label"] = new_labels

In [13]:
dataset.head()

Unnamed: 0,Text,Label
0,stuning even non gamer sound track beautiful p...,0
1,best soundtrack ever anything reading lot revi...,0
2,amazing soundtrack favorite music time hand in...,0
3,excellent soundtrack truly like soundtrack enj...,0
4,remember pull jaw floor hearing played game kn...,0


In [14]:
# View cleaned text
dataset.iloc[1,0]

'best soundtrack ever anything reading lot review saying best game soundtrack figured write review disagree bit opinino yasunori mitsuda ultimate masterpiece music timeless listening year beauty simply refuse fade price tag pretty staggering must say going buy cd much money one feel would worth every penny'

In [23]:
# Split the data into train and test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(dataset.iloc[:50000,0], dataset.iloc[:50000, 1:], test_size=0.25, random_state=0)

In [24]:
print(len(X_train))
print(len(X_test))

37500
12500


In [25]:
# Create an TF-IDF vectorizer for the cleaned text
from sklearn.feature_extraction.text import TfidfVectorizer
tf_idf = TfidfVectorizer(max_features=6000) # Limiting features because of computation limitations (using CPU)

X_train_idf = tf_idf.fit_transform(X_train).toarray()
X_test_idf = tf_idf.transform(X_test).toarray()

In [26]:
# Shape of X_train
X_test_idf.shape

(12500, 6000)

In [27]:
# Lets do feature scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train_idf = sc.fit_transform(X_train_idf)
X_test_idf = sc.transform(X_test_idf)

I think the reason why performing the feature scaling is important because we do not want 0 to affect the calculation. 
Because many of the columns will contain 0 as they are not present in all documents (i.e. rows). 
It is the reason why word embeddings were created in first place. 
To learn more about word embeddings please go to : <a href=https://towardsdatascience.com/what-the-heck-is-word-embedding-b30f67f01c81> Click Here </a>
<br/>

For this experiment, I am using BOW model using Tf-IDF vectorizer. 

In [30]:
# Create an Random Forest Tree Classifier 
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=100, criterion='gini', random_state=0, max_depth=100)
rfc.fit(X_train_idf, y_train)

  after removing the cwd from sys.path.


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=100, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [31]:
# Predicting the Test set results
y_pred = rfc.predict(X_test_idf)

In [32]:
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

In [33]:
cm

array([[5462,  939],
       [1075, 5024]], dtype=int64)

In [41]:
# Print Accuracy
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test , y_pred)
print("Accuracy of Random Forest Tree is: {:.2f}%".format(accuracy*100))

Accuracy of Random Forest Tree is: 83.89%


## Using Naive Bayes Algorithm

In [43]:
from sklearn.naive_bayes import GaussianNB
naive = GaussianNB()
naive.fit(X_train_idf, y_train)

  y = column_or_1d(y, warn=True)


GaussianNB(priors=None, var_smoothing=1e-09)

In [44]:
# Predicting the Test Set Results
y_pred_naive = naive.predict(X_test_idf)

In [48]:
# Confusion matrix
cm_naive = confusion_matrix(y_test, y_pred_naive)
print(cm)

[[5462  939]
 [1075 5024]]


In [52]:
accuracy_naive = accuracy_score(y_test , y_pred_naive)
print("Accuracy of Naive Bayes is: {:.2f}%".format(accuracy_naive*100))

Accuracy of Naive Bayes is: 76.95%
