In [None]:
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import nltk
import re
import matplotlib.pyplot as plt
import html
import string
import torch
from torch import nn
import torch.nn.functional as F
from torch import utils
nltk.download("stopwords")
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.probability import FreqDist
from html.parser import HTMLParser
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report 
from sklearn.metrics import f1_score, confusion_matrix
from wordcloud import WordCloud
from collections import Counter

# Fix the PyTorch and NumPy random seeds so repeated runs draw the same numbers.
SEED = 0
torch.manual_seed(SEED)
np.random.seed(SEED)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Part 1 Read in Data





In [None]:
# Read both CSV files from the working directory into DataFrames.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [None]:
# Preview the first five rows of the training file.
train.head()

Unnamed: 0,rating,reviewText,summary
0,3,Plot Storyline: 5 StarsThis novel accomplished...,3 1/4 Stars
1,3,I did not like how EL ended this one. I don't ...,"It was going great, then just.... ended"
2,5,I love how old fashioned this family is - they...,LOVED ALL 4!
3,5,I loved this story - It's about two friends wh...,friends make the best lovers
4,1,"In the Dark Lands, a virus killed all possibil...",Blatantly sexist and homophobic


In [None]:
# Preview the first six rows of the test file.
test.head(6)

Unnamed: 0,Id,reviewText,summary
0,0,"So much of this is so dated, it's not worth re...",Dated
1,1,"I liked this book overall, though I spent most...",Review
2,2,Since other reviewers gave info about the plot...,Quick and easy read
3,3,"I love, love, love Kiera and Gabriel's story! ...",WOW!
4,4,"This was a good twist, on a already great stor...",OMG... So good!
5,5,These stories put into perspectivewhat most do...,Truly remarkable.


# Part 2 Data Cleaning

## 2.1 Split data into train and test

In [None]:
# Train test split (0.8:0.2)
# Shuffle once with a fixed seed, then slice by position. This yields the same
# partition as np.split on the shuffled frame, without going through the
# deprecated DataFrame.swapaxes path that np.split relies on.
shuffled = train.sample(frac=1, random_state=42)
n_train = int(.8 * len(train))

# .copy() gives each part its own data, so the column assignments made in the
# following cells act on real frames rather than on slices of `shuffled`.
df_train = shuffled.iloc[:n_train].copy()
df_test = shuffled.iloc[n_train:].copy()

## 2.2 clean up text in training data

In [None]:
# Combine summary and reviewText into one text field.
# Missing values are treated as empty strings; otherwise a single NaN in either
# column would make the whole combined text NaN and break tokenisation later.
df_train['combine'] = df_train['summary'].fillna('') + " " + df_train['reviewText'].fillna('')

In [None]:
# Load NLTK's English stop-word list; the cleaning cells use it to filter tokens.
stop_words = stopwords.words("english")

In [None]:
# Clean the combined training text: decode HTML entities, strip markup, URLs,
# @mentions and punctuation, lower-case, tokenise, drop stop words, lemmatise.
html_rem = re.compile("<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});")
text = df_train['combine'].fillna('')

# Decode HTML character references (e.g. &amp;, &#39;) first.
# html.unescape works on one string at a time, so it is applied per element;
# passing the whole Series to it returns the Series unchanged.
text = text.apply(html.unescape)

# Remove HTML tags and any entity that could not be decoded
text = text.str.replace(html_rem, " ", regex=True)

# Remove URLs (http and https). regex=True is explicit because pandas >= 2.0
# treats string patterns as literal text by default.
text = text.str.replace(r'https?://\S+', '', regex=True)

# Remove @mentions
text = text.str.replace(r'@\w+', '', regex=True)

# Remove punctuation, then convert to lower case
text = text.str.translate(str.maketrans('', '', string.punctuation)).str.lower()

# Tokenize
tokens = text.apply(word_tokenize)

# Remove stop words (a set makes each membership test O(1))
stop_set = set(stop_words)
tokens = tokens.apply(lambda words: [word for word in words if word not in stop_set])

# Lemmatisation
lemmatizer = WordNetLemmatizer()
tokens = tokens.apply(lambda words: [lemmatizer.lemmatize(word) for word in words])

# 'combine' keeps the token lists; 'text_cleaned_v' is the space-joined string
# that the vectorizers consume.
df_train['combine'] = tokens
df_train['text_cleaned_v'] = tokens.str.join(' ')

In [None]:
# Preview the cleaned, space-joined training text.
df_train[['text_cleaned_v']].head()

Unnamed: 0,text_cleaned_v
7940,good found book interesting intriguging underl...
1162,great romance good read loved storyline 2 main...
582,grerat love story series book 4 fantastic love...
4081,book religous rock used book information rock ...
8412,seducing jane porter story hot right start jan...


## 2.3 Clean up text in test data

In [None]:
# Combine summary and reviewText into one text field.
# Missing values are treated as empty strings; otherwise a single NaN in either
# column would make the whole combined text NaN and break tokenisation later.
df_test['text'] = df_test['summary'].fillna('') + " " + df_test['reviewText'].fillna('')

In [None]:
# Clean the combined hold-out text with the same steps as the training text:
# decode HTML entities, strip markup, URLs, @mentions and punctuation,
# lower-case, tokenise, drop stop words, lemmatise.
html_rem = re.compile("<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});")
text = df_test['text'].fillna('')

# Decode HTML character references (e.g. &amp;, &#39;) first.
# html.unescape works on one string at a time, so it is applied per element;
# passing the whole Series to it returns the Series unchanged.
text = text.apply(html.unescape)

# Remove HTML tags and any entity that could not be decoded
text = text.str.replace(html_rem, " ", regex=True)

# Remove URLs (http and https). regex=True is explicit because pandas >= 2.0
# treats string patterns as literal text by default.
text = text.str.replace(r'https?://\S+', '', regex=True)

# Remove @mentions
text = text.str.replace(r'@\w+', '', regex=True)

# Remove punctuation, then convert to lower case
text = text.str.translate(str.maketrans('', '', string.punctuation)).str.lower()

# Tokenize
tokens = text.apply(word_tokenize)

# Remove stop words (a set makes each membership test O(1))
stop_set = set(stop_words)
tokens = tokens.apply(lambda words: [word for word in words if word not in stop_set])

# Lemmatisation
lemmatizer = WordNetLemmatizer()
tokens = tokens.apply(lambda words: [lemmatizer.lemmatize(word) for word in words])

# 'text' and 'text_cleaned' both hold the token lists
df_test['text'] = tokens
df_test['text_cleaned'] = df_test['text']

# ready for vectorize: space-joined string consumed by the vectorizers
df_test['text_cleaned_v'] = df_test['text'].str.join(' ')

In [None]:
# Preview the cleaned token lists of the hold-out split.
df_test[['text_cleaned']].head()

Unnamed: 0,text_cleaned
520,"[autobiography, lt, cupcake, read, bubba, book..."
1763,"[wolf, ring, story, started, promising, ended,..."
1875,"[great, series, loved, series, atlanta, westmo..."
5355,"[disappointing, reading, bid, love, gave, 5, s..."
3965,"[good, book, look, book, author, hope, find, b..."


## 2.4 Check whether the target column contains null values

In [None]:
# training set: report whether any rating is missing, then show the class counts
rating_has_missing = df_train['rating'].isnull().values.any()
if not rating_has_missing:
  print('Does rating in training set contains missing value: No')
else:
  print('Does rating in training set contains missing value: Yes')

df_train['rating'].value_counts()

Does rating in training set contains missing value: No


4    1924
5    1787
1    1355
2    1199
3     935
Name: rating, dtype: int64

## 2.5 Change the range of rating

In [None]:
# Shift the ratings down by one so the classes start at 0.
# The shifted values are derived from the untouched `train` frame (matched on
# the row index), so re-running this cell cannot shift the labels a second time.
df_train['rating'] = train.loc[df_train.index, 'rating'] - 1
df_test['rating'] = train.loc[df_test.index, 'rating'] - 1

# Part 3 Model preparation for Task A

## 3.1 BOW

In [None]:
# Bag-of-words counts over unigrams and bigrams; terms must appear in at least
# 2 documents (min_df) and in at most 90% of them (max_df).
bow_options = dict(ngram_range=(1, 2), min_df=2, max_df=0.9)
count_vectorizer = CountVectorizer(**bow_options)
vectorizer1 = count_vectorizer.fit_transform(df_train['text_cleaned_v'])

In [None]:
# show the BOW matrix
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# and fall back to the old name only on releases that predate it.
_bow_feature_names = (getattr(count_vectorizer, 'get_feature_names_out', None)
                      or count_vectorizer.get_feature_names)
word_names1 = list(_bow_feature_names())

# NOTE: toarray() materialises the full dense document-term matrix, which is
# memory-hungry when the vocabulary is large.
BOW_matrix = pd.DataFrame(vectorizer1.toarray(), columns=word_names1)
BOW_matrix

Unnamed: 0,00,000,05,099,099 worth,10,10 book,10 minute,10 page,10 star,...,zoomed,zorn,zorn warrior,zsadist,zsadist bella,zsadist bellas,zsadist one,zsadist story,zsadists,zyra
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7195,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7196,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7197,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7198,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## 3.2 TF-IDF

In [None]:
# TF-IDF weights over unigrams and bigrams, with the same document-frequency
# bounds as the bag-of-words vectorizer.
tfidf_options = dict(ngram_range=(1, 2), min_df=2, max_df=0.9)
tfidf_vectorizer = TfidfVectorizer(**tfidf_options)
vectorizer2 = tfidf_vectorizer.fit_transform(df_train['text_cleaned_v'])


In [None]:
# show the TF-IDF matrix
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# and fall back to the old name only on releases that predate it.
_tfidf_feature_names = (getattr(tfidf_vectorizer, 'get_feature_names_out', None)
                        or tfidf_vectorizer.get_feature_names)
word_names2 = list(_tfidf_feature_names())

# NOTE: toarray() materialises the full dense document-term matrix, which is
# memory-hungry when the vocabulary is large.
tfidf_matrix = pd.DataFrame(vectorizer2.toarray(), columns=word_names2)
tfidf_matrix

Unnamed: 0,00,000,05,099,099 worth,10,10 book,10 minute,10 page,10 star,...,zoomed,zorn,zorn warrior,zsadist,zsadist bella,zsadist bellas,zsadist one,zsadist story,zsadists,zyra
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7195,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7196,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7197,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7198,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 3.3 Split data


In [None]:
# Split the BOW features, the TF-IDF features and the labels into train and
# validation parts in a single call: all three are partitioned with the same
# shuffled row indices (seeded by random_state=12).
(bow_train, bow_val,
 tfidf_train, tfidf_val,
 y_train, y_val) = train_test_split(vectorizer1, vectorizer2, df_train['rating'],
                                    test_size=0.2, random_state=12)

In [None]:
# Print the shape of each split: TF-IDF train, TF-IDF validation, then the labels.
for split_part in (tfidf_train, tfidf_val, y_train, y_val):
    print(split_part.shape)

(5760, 49583)
(1440, 49583)
(5760,)
(1440,)


# Part 4 Random Forest

## 4.1 Classification

### 4.1.1 with BOW

In [None]:
# Hyper-parameter grid for the random forest classifier.
# Kept as a plain dict: a trailing comma after the closing brace would silently
# turn it into a one-element tuple.
param = {'n_estimators': [80, 100, 120],
         'max_features': [200, 250, 300],
         'max_depth': [20, 60, 100]}

# 6-fold cross-validated grid search on the BOW training features
grid = GridSearchCV(RandomForestClassifier(random_state=10),
                    param, cv=6, return_train_score=True)
grid.fit(bow_train, y_train)

GridSearchCV(cv=6, estimator=RandomForestClassifier(random_state=10),
             param_grid=({'max_depth': [20, 60, 100],
                          'max_features': [200, 250, 300],
                          'n_estimators': [80, 100, 120]},),
             return_train_score=True)

In [None]:
# Best parameter combination found by the grid search on the BOW features (classifier).
grid.best_params_

{'max_depth': 100, 'max_features': 200, 'n_estimators': 120}

In [None]:
# Refit a random forest classifier on the BOW training features with the
# selected hyper-parameters, then predict the validation split.
rf_class_bow_params = dict(n_estimators=120, max_features=200, max_depth=100, random_state=10)
rdf_class = RandomForestClassifier(**rf_class_bow_params)
rdf_class.fit(bow_train, y_train)
y_pred_rdfclass = rdf_class.predict(bow_val)

In [None]:
# Per-class precision / recall / F1 of the BOW classifier on the validation split.
print(classification_report(y_val,y_pred_rdfclass))

              precision    recall  f1-score   support

           0       0.56      0.63      0.60       301
           1       0.46      0.17      0.25       250
           2       0.73      0.07      0.12       161
           3       0.44      0.62      0.51       393
           4       0.49      0.65      0.56       335

    accuracy                           0.49      1440
   macro avg       0.54      0.43      0.41      1440
weighted avg       0.51      0.49      0.45      1440



In [None]:
# Macro-averaged F1 on the validation split: Classification (BOW)
f1_rf_class1 = f1_score(y_true=y_val, y_pred=y_pred_rdfclass, average='macro')
f1_rf_class1

0.408482408236309

### 4.1.2 with TF-IDF

In [None]:
# Hyper-parameter grid for the random forest classifier.
# Kept as a plain dict: a trailing comma after the closing brace would silently
# turn it into a one-element tuple.
param = {'n_estimators': [80, 100, 120],
         'max_features': [200, 250, 300],
         'max_depth': [20, 60, 100]}

# 6-fold cross-validated grid search on the TF-IDF training features
grid = GridSearchCV(RandomForestClassifier(random_state=10),
                    param, cv=6, return_train_score=True)
grid.fit(tfidf_train, y_train)

GridSearchCV(cv=6, estimator=RandomForestClassifier(random_state=10),
             param_grid=({'max_depth': [20, 60, 100],
                          'max_features': [200, 250, 300],
                          'n_estimators': [80, 100, 120]},),
             return_train_score=True)

In [None]:
# Best parameter combination found by the grid search on the TF-IDF features (classifier).
grid.best_params_

{'max_depth': 100, 'max_features': 250, 'n_estimators': 120}

In [None]:
# Refit a random forest classifier on the TF-IDF training features with the
# selected hyper-parameters, then predict the validation split.
rf_class_tfidf_params = dict(n_estimators=120, max_features=250, max_depth=100, random_state=10)
rdf_class = RandomForestClassifier(**rf_class_tfidf_params)
rdf_class.fit(tfidf_train, y_train)
y_pred_rdfclass2 = rdf_class.predict(tfidf_val)

In [None]:
# Per-class precision / recall / F1 of the TF-IDF classifier on the validation split.
print(classification_report(y_val,y_pred_rdfclass2))

              precision    recall  f1-score   support

           0       0.56      0.60      0.58       301
           1       0.43      0.16      0.23       250
           2       0.60      0.07      0.13       161
           3       0.44      0.65      0.52       393
           4       0.52      0.64      0.58       335

    accuracy                           0.49      1440
   macro avg       0.51      0.43      0.41      1440
weighted avg       0.50      0.49      0.45      1440



In [None]:
# Macro-averaged F1 on the validation split: Classification (TF-IDF)
f1_rf_class2 = f1_score(y_true=y_val, y_pred=y_pred_rdfclass2, average='macro')
f1_rf_class2

0.4091041988178379

In [None]:
# Report both classification scores rounded to four decimals.
bow_class_score = round(f1_rf_class1, 4)
tfidf_class_score = round(f1_rf_class2, 4)
print('The F1-score for Random Forest (Classification) with BOW is', bow_class_score)
print('The F1-score for Random Forest (Classification) with TF-IDF is', tfidf_class_score)

The F1-score for Random Forest (Classification) with BOW is 0.4085
The F1-score for Random Forest (Classification) with TF-IDF is 0.4091


## 4.2 Regression

### 4.2.1 with BOW

In [None]:
# Hyper-parameter grid for the random forest regressor.
# Kept as a plain dict: a trailing comma after the closing brace would silently
# turn it into a one-element tuple.
param = {'n_estimators': [80, 100, 120],
         'max_features': [200, 250, 300],
         'max_depth': [20, 60, 100]}

# 6-fold cross-validated grid search on the BOW training features
grid = GridSearchCV(RandomForestRegressor(random_state=10),
                    param, cv=6, return_train_score=True)
grid.fit(bow_train, y_train)

GridSearchCV(cv=6, estimator=RandomForestRegressor(random_state=10),
             param_grid=({'max_depth': [20, 60, 100],
                          'max_features': [200, 250, 300],
                          'n_estimators': [80, 100, 120]},),
             return_train_score=True)

In [None]:
# Best parameter combination found by the grid search on the BOW features (regressor).
grid.best_params_

{'max_depth': 100, 'max_features': 300, 'n_estimators': 120}

In [None]:
# Refit a random forest regressor on the BOW training features with the
# selected hyper-parameters, then predict the validation split.
rf_reg_bow_params = dict(n_estimators=120, max_features=300, max_depth=100, random_state=10)
rdf_reg = RandomForestRegressor(**rf_reg_bow_params)
rdf_reg.fit(bow_train, y_train)
y_pred_rdfreg = rdf_reg.predict(bow_val)

In [None]:
# List the distinct class labels obtained when the BOW regression outputs are rounded.
rounded_bow_preview = np.round(y_pred_rdfreg, decimals=0)
np.unique(rounded_bow_preview)

array([0., 1., 2., 3., 4.])

In [None]:
# Round the continuous regression outputs to the nearest rating class.
y_pred_rdfreg = np.round(y_pred_rdfreg, decimals=0)

In [None]:
# Per-class precision / recall / F1 of the rounded BOW regression predictions on the validation split.
print(classification_report(y_val,y_pred_rdfreg))

              precision    recall  f1-score   support

           0       1.00      0.00      0.01       301
           1       0.30      0.25      0.27       250
           2       0.17      0.73      0.27       161
           3       0.41      0.54      0.47       393
           4       0.64      0.02      0.04       335

    accuracy                           0.28      1440
   macro avg       0.50      0.31      0.21      1440
weighted avg       0.54      0.28      0.22      1440



In [None]:
# Macro-averaged F1 on the validation split: Regression (BOW)
f1_rf_reg1 = f1_score(y_true=y_val, y_pred=y_pred_rdfreg, average='macro')
f1_rf_reg1

0.21194375908279023

### 4.2.2 with TF-IDF

In [None]:
# Hyper-parameter grid for the random forest regressor.
# Kept as a plain dict: a trailing comma after the closing brace would silently
# turn it into a one-element tuple.
param = {'n_estimators': [80, 100, 120],
         'max_features': [200, 250, 300],
         'max_depth': [20, 60, 100]}

# 6-fold cross-validated grid search on the TF-IDF training features
grid = GridSearchCV(RandomForestRegressor(random_state=10),
                    param, cv=6, return_train_score=True)
grid.fit(tfidf_train, y_train)

GridSearchCV(cv=6, estimator=RandomForestRegressor(random_state=10),
             param_grid=({'max_depth': [20, 60, 100],
                          'max_features': [200, 250, 300],
                          'n_estimators': [80, 100, 120]},),
             return_train_score=True)

In [None]:
# Best parameter combination found by the grid search on the TF-IDF features (regressor).
grid.best_params_

{'max_depth': 100, 'max_features': 300, 'n_estimators': 120}

In [None]:
# Refit a random forest regressor on the TF-IDF training features with the
# selected hyper-parameters, then predict the validation split.
rf_reg_tfidf_params = dict(n_estimators=120, max_features=300, max_depth=100, random_state=10)
rdf_reg = RandomForestRegressor(**rf_reg_tfidf_params)
rdf_reg.fit(tfidf_train, y_train)
y_pred_rdfreg2 = rdf_reg.predict(tfidf_val)

In [None]:
# List the distinct class labels obtained when the TF-IDF regression outputs are rounded.
rounded_tfidf_preview = np.round(y_pred_rdfreg2, decimals=0)
np.unique(rounded_tfidf_preview)

array([1., 2., 3., 4.])

In [None]:
# Round the continuous regression outputs to the nearest rating class.
y_pred_rdfreg2 = np.round(y_pred_rdfreg2, decimals=0)

In [None]:
# Per-class precision / recall / F1 of the rounded TF-IDF regression predictions on the validation split.
print(classification_report(y_val,y_pred_rdfreg2))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       301
           1       0.29      0.22      0.25       250
           2       0.16      0.68      0.26       161
           3       0.42      0.59      0.49       393
           4       0.75      0.02      0.03       335

    accuracy                           0.28      1440
   macro avg       0.32      0.30      0.21      1440
weighted avg       0.36      0.28      0.21      1440



In [None]:
# Macro-averaged F1 on the validation split: Regression (TF-IDF)
f1_rf_reg2 = f1_score(y_true=y_val, y_pred=y_pred_rdfreg2, average='macro')
f1_rf_reg2

0.20677710740049138

In [None]:
# Report both regression scores rounded to four decimals.
bow_reg_score = round(f1_rf_reg1, 4)
tfidf_reg_score = round(f1_rf_reg2, 4)
print('The F1-score for Random Forest (Regression) with BOW is', bow_reg_score)
print('The F1-score for Random Forest (Regression) with TF-IDF is', tfidf_reg_score)

The F1-score for Random Forest (Regression) with BOW is 0.2119
The F1-score for Random Forest (Regression) with TF-IDF is 0.2068


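Therefore, the best-performing Random Forest variant is the classifier with TF-IDF features (macro F1-score 0.4091), only marginally ahead of the classifier with BOW features (0.4085); both regression variants score far lower (about 0.21).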

## 4.3 Test for the chosen type of RF

In [None]:
# combine train and validation data
def _as_dense(features):
    """Return `features` as a dense array, whether it is still sparse or already dense."""
    # Calling .toarray() unconditionally fails when this cell is re-run,
    # because by then the name already holds a plain ndarray.
    return features.toarray() if hasattr(features, "toarray") else np.asarray(features)


# NOTE: densifying allocates rows x vocabulary floats, which is memory-hungry
# when the vocabulary is large.
tfidf_train = _as_dense(tfidf_train)
tfidf_val = _as_dense(tfidf_val)

# Labels become (n, 1) column arrays, whether they start as a Series or are
# already arrays from an earlier run of this cell.
y_train = np.asarray(y_train).reshape(-1, 1)
y_val = np.asarray(y_val).reshape(-1, 1)

# Stack the train rows on top of the validation rows
tfidf_combine = np.concatenate([tfidf_train, tfidf_val], axis=0)
y_combine = np.concatenate([y_train, y_val], axis=0)

In [None]:
# Test the chosen Random Forest (Classification) with TF-IDF.
# The hold-out text is transformed with the vectorizer already fitted on the
# training text, so its columns line up with the training features.
tfidf_test = tfidf_vectorizer.transform(df_test['text_cleaned_v'])

rdf_class = RandomForestClassifier(n_estimators=120, max_features=250, max_depth=100, random_state=10)
# y_combine may be an (n, 1) column array; flatten it because fit() expects
# 1-D class labels for a single-output problem.
rdf_class.fit(tfidf_combine, np.ravel(y_combine))
y_pred_rdfclass3 = rdf_class.predict(tfidf_test)

In [None]:
# Macro F1-score for testing data in RF (Classification) with TF-IDF
f1_rf_class33 = f1_score(df_test['rating'], y_pred_rdfclass3, average='macro')
# Use the built-in round(), as the other score cells do: it works whether the
# metric comes back as a NumPy scalar or as a plain Python float.
round(f1_rf_class33, 4)

0.4014