In [4]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Location of the labelled training reviews.
# NOTE(review): this is an absolute, machine-specific path - the notebook will
# not run on another machine without editing it.
TRAIN_CSV_PATH = 'C:\\Users\\kp\\Pictures\\Assignments\\Text Mining\\train_data_20000.csv'

# header=0: the first row of the file supplies the column names.
raw_data_frame = pd.read_csv(TRAIN_CSV_PATH, header=0)

In [5]:
# --- Structural cleaning of the raw review table ---------------------------

# Remove spaces from the column headers so the columns can be addressed as
# Review_Rating / SentimentClass / Review_Text below.
raw_data_frame.columns = raw_data_frame.columns.str.replace(" ", "", regex=False)

required_columns = ['Review_Rating', 'SentimentClass', 'Review_Text']

# Discard rows with a missing rating, label or text BEFORE the string casts:
# astype('str') turns NaN into the literal text 'nan', which neither fillna(0)
# nor an "== 0" comparison can detect afterwards.
raw_data_frame = raw_data_frame.dropna(subset=required_columns)

raw_data_frame = raw_data_frame.astype({'Review_Rating': 'float64',
                                        'SentimentClass': 'str',
                                        'Review_Text': 'str'})

# Gaps in any remaining column are filled with 0.
raw_data_frame = raw_data_frame.fillna(0)

# Rows whose rating is 0 are dropped.
raw_data_frame = raw_data_frame[raw_data_frame['Review_Rating'] != 0]

# Keep only the first occurrence of each distinct review text.
raw_data_frame = raw_data_frame.drop_duplicates(subset='Review_Text', keep='first')

# Renumber the rows 0..n-1; reset_index() without drop=True keeps the previous
# index as an extra 'index' column.
raw_data_frame = raw_data_frame.reset_index()
# --- Text normalisation: lower-case, strip "'s", tokenise, drop stop words --

raw_data_frame['Review_Text'] = raw_data_frame['Review_Text'].str.lower()

# Strip the "'s" suffix (e.g. "phone's" -> "phone"). The word boundary keeps
# an opening quote followed by a word that starts with "s" (e.g. "'so") intact.
raw_data_frame['Review_Text'] = raw_data_frame['Review_Text'].str.replace(r"'s\b", "", regex=True)

# Each review becomes a list of tokens.
raw_data_frame['Review_Text'] = raw_data_frame['Review_Text'].apply(word_tokenize)

# Words listed here are removed from the English stop-word list and therefore
# stay in the reviews (presumably because negations carry sentiment).
exclude_stop_words = ['no', 'not', "don't", "aren't", 'ain', "shouldn't", "haven't", "hadn't"]
stop_words = sorted(set(stopwords.words("english")) - set(exclude_stop_words))

# stop_words stays a list; membership tests go through a frozenset so each
# lookup is O(1) instead of a linear scan per token.
_stop_word_lookup = frozenset(stop_words)
raw_data_frame['Review_Text'] = raw_data_frame['Review_Text'].apply(
    lambda record: [word for word in record if word not in _stop_word_lookup])
    
def apply_lemmatization(string_list, lemmatizer=None, pos="v"):
    """Lemmatize every token in ``string_list`` and return the results as a new list.

    Parameters
    ----------
    string_list : iterable of str
        Tokens to lemmatize; iterated once, in order.
    lemmatizer : object, optional
        Any object exposing ``lemmatize(word, pos)``. When omitted, a new
        ``WordNetLemmatizer`` is created for this call. Passing a shared
        instance avoids re-creating it for every row.
    pos : str, optional
        Part-of-speech tag forwarded to ``lemmatize``; defaults to ``"v"``.

    Returns
    -------
    list
        One lemmatized value per input token, in input order (empty for
        empty input).
    """
    if lemmatizer is None:
        lemmatizer = WordNetLemmatizer()

    # Build the result with a comprehension; no local is named ``list``, so the
    # builtin stays usable inside this function.
    return [lemmatizer.lemmatize(word, pos) for word in string_list]

# --- Finalise the text column and encode the sentiment labels ---------------

# Lemmatize every token list (one list per review).
raw_data_frame['Review_Text'] = raw_data_frame['Review_Text'].apply(apply_lemmatization)

# Re-assemble each token list into a single space-separated string.
raw_data_frame['Review_Text'] = raw_data_frame['Review_Text'].apply(lambda tokens: " ".join(tokens))

# Keep only ASCII letters and spaces. The character class must not contain a
# double quote: inside [...] it would be treated as one more allowed character.
_non_letter_pattern = re.compile('[^A-Za-z ]+')
raw_data_frame['Review_Text'] = raw_data_frame['Review_Text'].apply(
    lambda text: _non_letter_pattern.sub("", text))

# Numeric encoding of the sentiment labels.
class_codes = {'Negative': -1, 'Neutral': 0, 'Positive': 1}

# Series.map yields numeric codes directly and does not rely on replace()'s
# implicit object -> int downcast, which newer pandas versions deprecate.
raw_data_frame['Class_Code'] = raw_data_frame['SentimentClass'].map(class_codes)

# Labels outside class_codes map to NaN; fail loudly rather than passing a
# partially encoded target column on to the classifier.
_unmapped_labels = raw_data_frame.loc[raw_data_frame['Class_Code'].isna(), 'SentimentClass'].unique()
if len(_unmapped_labels) > 0:
    raise ValueError(f"Unmapped SentimentClass values: {sorted(_unmapped_labels)}")
raw_data_frame['Class_Code'] = raw_data_frame['Class_Code'].astype('int64')

# processed_data_frame is another name for the same frame, not a copy.
processed_data_frame = raw_data_frame

# --- Train/test split and TF-IDF feature extraction -------------------------

# Hold out 15% of the reviews for evaluation; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    processed_data_frame['Review_Text'],
    processed_data_frame['Class_Code'],
    test_size=0.15,
    random_state=8,
)

# The labels pass through unchanged under the names used by the model cell.
class_labels_train, class_labels_test = y_train, y_test

# Vectoriser settings: unigrams and bigrams, terms must occur in at least 10
# documents, vocabulary capped at 800 features, L2-normalised rows with
# sublinear term-frequency scaling. Text is neither lower-cased nor
# stop-word-filtered here.
tfidf_settings = dict(
    encoding='utf-8',
    ngram_range=(1, 2),
    stop_words=None,
    lowercase=False,
    max_df=1.0,
    min_df=10,
    max_features=800,
    norm='l2',
    sublinear_tf=True,
)
tfidf = TfidfVectorizer(**tfidf_settings)

# The vocabulary and IDF weights are learned from the training text only; the
# test text is transformed with that fitted vectoriser. Both sparse results
# are converted to dense arrays.
predictors_train = tfidf.fit_transform(X_train).toarray()
predictors_test = tfidf.transform(X_test).toarray()

In [6]:
# --- Random forest: fit, predict, report -------------------------------------

# 100 gini trees with bootstrap sampling and a fixed seed. ``bootstrap`` must
# be a real boolean: a string such as 'True' is rejected by scikit-learn's
# parameter validation (and any non-empty string, even 'False', is truthy).
rfc = RandomForestClassifier(n_estimators=100, criterion='gini', bootstrap=True, random_state=42)

rfc.fit(predictors_train, class_labels_train)

# Predict once per data set and reuse the results in the reports below.
rfc_train_pred = rfc.predict(predictors_train)
rfc_pred = rfc.predict(predictors_test)

print("The Training Accuracy is: ")
print(accuracy_score(class_labels_train, rfc_train_pred))

print("The Test Accuracy is: ")
print(accuracy_score(class_labels_test, rfc_pred))

# Confusion matrix: rows are the true classes, columns the predicted classes,
# both in sorted label order.
conf_matrix = confusion_matrix(class_labels_test, rfc_pred)
print(conf_matrix)
print(classification_report(class_labels_test, rfc_pred))

The Training Accuracy is: 
0.9897571345745023
The Test Accuracy is: 
0.833941605839416
[[ 370   24  127]
 [  67   39  128]
 [  82   27 1876]]
              precision    recall  f1-score   support

          -1       0.71      0.71      0.71       521
           0       0.43      0.17      0.24       234
           1       0.88      0.95      0.91      1985

    accuracy                           0.83      2740
   macro avg       0.68      0.61      0.62      2740
weighted avg       0.81      0.83      0.82      2740

