# Load

In [1]:
pip install lightgbm

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import matplotlib.pyplot as plt

# Read both raw competition files in one pass, then report their dimensions.
trainingSet, testingSet = (
    pd.read_csv(csv_path) for csv_path in ("./data/train.csv", "./data/test.csv")
)

print("train.csv shape is ", trainingSet.shape)
print("test.csv shape is ", testingSet.shape)

train.csv shape is  (139753, 9)
test.csv shape is  (13976, 2)


In [26]:
# Class balance of the target: number of rows per distinct Score value
# (rows with a missing Score are not counted by value_counts).
trainingSet['Score'].value_counts()

5.0    67188
4.0    28572
3.0    14857
1.0     7593
2.0     7567
Name: Score, dtype: int64

# Feature Extraction

In [3]:
def process(df):
    """Add the engineered columns 'Helpfulness' and 'ReviewLength' to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'HelpfulnessNumerator', 'HelpfulnessDenominator' and
        'Text'. The frame is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with:
        * 'Helpfulness'  - numerator / denominator, or 0 when the denominator
          is 0 or the ratio is otherwise undefined;
        * 'ReviewLength' - number of whitespace-separated tokens in 'Text',
          or 0 when 'Text' is not a string (e.g. NaN).
    """
    # Mask zero denominators *before* dividing: x/0 with x > 0 yields inf,
    # which fillna() would not catch, whereas x/NaN yields NaN.
    denominator = df['HelpfulnessDenominator'].where(df['HelpfulnessDenominator'] != 0)
    df['Helpfulness'] = (df['HelpfulnessNumerator'] / denominator).fillna(0)

    # Element-wise map over a single column instead of a row-wise
    # DataFrame.apply: same result without building a Series per row.
    df['ReviewLength'] = df['Text'].map(
        lambda text: len(text.split()) if isinstance(text, str) else 0
    )
    return df

# Load the dataset
trainingSet = pd.read_csv("./data/train.csv")

# Process the DataFrame (adds Helpfulness / ReviewLength in place)
train_processed = process(trainingSet)

# Load test set
submissionSet = pd.read_csv("./data/test.csv")

# Merge on Id so that the test set can have feature columns as well.
# train's own Score column is dropped before the merge so the only Score in
# the result is the one coming from test.csv (no _x/_y suffix clean-up needed).
testX = pd.merge(train_processed.drop(columns=['Score']), submissionSet, on='Id')

# The training set is where the score is not null.
# .copy() makes trainX an independent frame, so later cells that write into
# it (or into frames derived from it) do not write through a filtered slice
# of train_processed.
trainX = train_processed[train_processed['Score'].notnull()].copy()

# Save the datasets with the new features for easy access later
testX.to_csv("./data/X_test.csv", index=False)
trainX.to_csv("./data/X_train.csv", index=False)

In [4]:
# Preview the first rows of the merged test frame: feature columns taken from
# the processed training file plus the Score column taken from test.csv.
testX.head()

Unnamed: 0,Id,ProductId,UserId,HelpfulnessNumerator,HelpfulnessDenominator,Time,Summary,Text,Helpfulness,ReviewLength,Score
0,1224650,B001OQCV6A,A3HUN3KP383B23,0,1,1395619200,Sherlock Holmes,I bought this thinking it was Sherlock Holmes ...,0.0,47,
1,1019381,B000I8OFLO,A2VJ80PM1G00QV,2,2,1222300800,"FINE SINGING, DESTRUCTIVE STAGING",1. Scenic design and the Directors use of it p...,1.0,160,
2,504719,B00000IQCC,A2EGK0YRDF4ZZB,11,11,1018310400,Moving story of Jesus's message in a modern re...,J&eacute;sus de Montr&eacute;al was a stunni...,1.0,999,
3,1622425,B00B5UBDA0,A21I62TCDL4754,0,0,1372118400,Paul Anka has aged well and performs well,I was looking for Blu Ray concerts for my new ...,0.0,31,
4,482286,6305892946,A1IQ9E6I3PIUFF,1,2,969321600,Great end to the series.Or is it???????????,Despite what everyone else says this movie is ...,0.5,30,


# Text Data Preprocessing + TF-IDF

In [5]:
import nltk
# Fetch the WordNet corpus into the local nltk_data directory (skipped when
# already up to date); the WordNetLemmatizer used further down relies on it.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\NEE\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
import nltk
# Fetch the stop-word lists into the local nltk_data directory (skipped when
# already up to date); stopwords.words('english') is read further down.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\NEE\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# Work on independent copies so nothing below writes through to trainX/testX.
data = trainX.copy()
test = testX.loc[:, ['Summary', 'Text', 'Score']].copy()

# A missing Summary or Text makes `Summary + ' ' + Text` NaN instead of a
# string. Blank out every missing value in both frames rather than patching
# individual row labels by hand (previously only rows 30004 and 100011).
text_columns = ['Summary', 'Text']
data[text_columns] = data[text_columns].fillna(' ')
test[text_columns] = test[text_columns].fillna(' ')

# Preprocess the text data
nltk.download('wordnet')  # no-op when the corpus is already present
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
def preprocess_text(text):
    """Normalise one review string for bag-of-words / TF-IDF vectorisation.

    Steps: strip every character that is neither a word character nor
    whitespace, lower-case, split on whitespace, drop English stop words and
    tokens that are not purely alphabetic, lemmatise each remaining token.

    Parameters
    ----------
    text : str or any
        Raw review text. Non-string input (e.g. the NaN produced by
        concatenating a missing Summary/Text) is treated as empty.

    Returns
    -------
    str
        Space-joined cleaned tokens; '' when nothing survives the filters.
    """
    # Guard: re.sub would raise TypeError on NaN/None.
    if not isinstance(text, str):
        return ''
    cleaned = re.sub(r'[^\w\s]', '', text).lower()
    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in cleaned.split()
        if token not in stop_words and token.isalpha()
    )
# One free-text field per row: summary and body joined by a space, then cleaned.
data['Review'] = (data['Summary'] + ' ' + data['Text']).apply(preprocess_text)
test['Review'] = (test['Summary'] + ' ' + test['Text']).apply(preprocess_text)

# Hold out 20% of the labelled rows for validation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    data,
    data["Score"],
    test_size=0.2,
    random_state=42,
)
print(X_test.shape)

# TF-IDF over unigrams and bigrams, capped at 9000 features. The vocabulary is
# fitted on the training split only and then reused for the hold-out split
# and for the submission rows.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=9000)
Xtrain = vectorizer.fit_transform(X_train['Review'])
Xtest = vectorizer.transform(X_test['Review'])
sample = vectorizer.transform(test['Review'])

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\NEE\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


(25156, 12)


In [8]:
# Show the repr of the TF-IDF matrix built from the submission rows.
sample

<13976x9000 sparse matrix of type '<class 'numpy.float64'>'
	with 913424 stored elements in Compressed Sparse Row format>

# Sentiment Analysis Model: LGBM + Naive Bayes

In [9]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
# Fit a LightGBM regressor on the TF-IDF features, treating the star rating
# as a continuous target, and score it on the hold-out split.
lgb_regressor = lgb.LGBMRegressor(num_leaves=200, random_state=42).fit(Xtrain, y_train)
y_lgb = lgb_regressor.predict(Xtest)

# Hold-out mean squared error
mse = mean_squared_error(y_test, y_lgb)
print(f"Mean squared error: {mse:.2f}")

Mean squared error: 0.70


In [10]:
# Inspect the LightGBM hold-out predictions (continuous values, not classes).
y_lgb

array([4.38982116, 3.12718382, 4.69584868, ..., 3.07332093, 4.3725948 ,
       4.52553703])

In [28]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import confusion_matrix
import numpy as np
# Inputs (defined in earlier cells): Xtrain, Xtest, y_train, y_test

# Naive Bayes base learner: very light smoothing, uniform class prior.
nb = MultinomialNB(alpha=0.0005, fit_prior=False)

# AdaBoost over the Naive Bayes learner: 100 estimators, learning rate 0.18.
# The base learner is passed positionally because its keyword was renamed
# from `base_estimator` to `estimator` in newer scikit-learn releases; the
# positional form works with both spellings.
boost_clf = AdaBoostClassifier(nb, n_estimators=100, learning_rate=0.18)

# train the AdaBoost classifier on the training data
boost_clf.fit(Xtrain, y_train)

# predict the classes of the hold-out data
y_pred = boost_clf.predict(Xtest)

# print metrics
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
report = classification_report(y_test, y_pred)
print(f"Classification Report:\n{report}")
print(confusion_matrix(y_test, y_pred))
# MSE of the predicted class labels, for comparison with the regressor
print('MSE: ', np.mean(np.square(y_pred - y_test)))

Accuracy: 0.63
Classification Report:
              precision    recall  f1-score   support

         1.0       0.62      0.52      0.57      1517
         2.0       0.38      0.25      0.30      1539
         3.0       0.44      0.35      0.39      2964
         4.0       0.44      0.37      0.40      5654
         5.0       0.74      0.86      0.79     13482

    accuracy                           0.63     25156
   macro avg       0.52      0.47      0.49     25156
weighted avg       0.61      0.63      0.61     25156

[[  789   284   129    65   250]
 [  228   382   445   213   271]
 [  108   225  1031   879   721]
 [   52    69   510  2083  2940]
 [   94    58   236  1474 11620]]
MSE:  0.8907218953728733


In [12]:
# Inspect the AdaBoost hold-out predictions (discrete class labels).
y_pred

array([4., 4., 5., ..., 5., 4., 5.])

In [13]:
# Combine the two models: scan integer weights i = 1..5 for the LightGBM
# prediction (the classifier keeps weight 1) and record the hold-out MSE of
# each blend.
# NOTE(review): the weight is selected on the same hold-out split that is
# later used to report the blended MSE, so that figure is optimistic.
para_mse = {}
for i in range(1, 6):
    blended = (y_pred + y_lgb * i) / (i + 1)
    # Vectorised mean, consistent with the other MSE computations in this
    # notebook (builtin sum() over a Series iterates element by element).
    para_mse[str(i)] = np.mean(np.square(y_test - blended))
para_mse

{'1': 0.6652495937233721,
 '2': 0.6470976683091553,
 '3': 0.6487102384399707,
 '4': 0.6530981110266044,
 '5': 0.657448497129416}

In [14]:
# Blended prediction used as the baseline for the error model:
# (classifier + 2 * regressor) / 3, i.e. the weight i = 2 from para_mse.
# NOTE(review): the training-side blend is predicted on the very rows both
# models were fitted on, so residuals derived from it are in-sample; consider
# out-of-fold predictions if the error model should see realistic residuals.
clf_on_train = boost_clf.predict(Xtrain)
reg_on_train = lgb_regressor.predict(Xtrain)
train_predict = (clf_on_train + reg_on_train * 2) / 3
test_predict = (y_pred + y_lgb * 2) / 3
print(train_predict.shape, test_predict.shape)

(100621,) (25156,)


In [27]:
# Hold-out MSE of the blended prediction.
np.mean(np.square(test_predict - y_test))

0.6470976683091553

# Error Model: LGBM

In [15]:
# Add the residual of the blended prediction (truth - blend) as a new column;
# it is the target of the error model.
# assign() returns a new frame instead of writing a column into the frames
# returned by train_test_split, which avoids SettingWithCopyWarning.
X_train = X_train.assign(predict_error=y_train - train_predict)
X_test = X_test.assign(predict_error=y_test - test_predict)
X_train.head()

Unnamed: 0,Id,ProductId,UserId,HelpfulnessNumerator,HelpfulnessDenominator,Time,Summary,Text,Score,Helpfulness,ReviewLength,Review,predict_error
14587,748290,B00008WFHE,A3Q3DPQ38DXI4C,0,0,1215302400,Fascinating,I have watched this DVD several times with my ...,5.0,0.0,173,fascinating watched dvd several time four mont...,0.236299
124883,225509,6300247120,A1ARZLZRS4JZX7,5,5,1172793600,They can't make 'em like this anymore.,"""Breaking Away"" is a true rarity: a feel-good ...",5.0,1.0,125,cant make em like anymore breaking away true r...,0.473899
44292,1027963,B000JLTR8Q,AXMVY1CH5125B,14,54,1164672000,I'll never trust Amazon's reviewers again.,I honestly don't know how this waste of cellul...,1.0,0.259259,63,ill never trust amazon reviewer honestly dont ...,-0.353568
52064,1010387,B000H5V8H2,A31ARSC1LGY8WK,2,2,1205884800,Paying the ultimate price for your principles,This film chronicles the final few days - lite...,4.0,1.0,145,paying ultimate price principle film chronicle...,0.0524
81112,80876,0788812483,A1DD15RXIXWENM,0,0,1292544000,Disney and Dogs..,in few words:- a true story- beautiful and hea...,4.0,0.0,26,disney dog word true story beautiful heartbrea...,-0.526201


In [16]:
# Inputs of the error model: the two non-text features as predictors and the
# blend residual as target, for the training and hold-out splits.
error_features = ['Helpfulness', 'ReviewLength']
new_Xtrain, new_ytrain = X_train[error_features], X_train['predict_error']
new_Xtest, new_ytest = X_test[error_features], X_test['predict_error']

In [17]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
# Fit a default-parameter LightGBM regressor (fixed seed) that predicts the
# blend residual from the non-text features, then score it on the hold-out.
err_lgb = lgb.LGBMRegressor(random_state=42).fit(new_Xtrain, new_ytrain)
y_err = err_lgb.predict(new_Xtest)

# Hold-out mean squared error of the residual prediction
mse = mean_squared_error(new_ytest, y_err)
print(f"Mean squared error: {mse:.2f}")

Mean squared error: 0.61


In [18]:
# Hold-out MSE of the full pipeline: blended prediction + predicted residual.
final_pred = y_err + test_predict
# Scores only take values 1..5, so clamp to that range (the lower bound was
# previously 0, which let impossible predictions below 1 through).
final = np.clip(final_pred, 1, 5)
mean_squared_error(y_test, final)

0.607657110181691

In [19]:
# Final prediction for the submission rows: blended model output plus the
# residual predicted by the error model.
result = (boost_clf.predict(sample) + lgb_regressor.predict(sample) * 2) / 3
err = err_lgb.predict(testX[['Helpfulness','ReviewLength']])
# Clamp to the valid score range 1..5, as is done when the pipeline is
# evaluated on the hold-out split; unclamped values can only add error.
result = np.clip(result + err, 1, 5)

In [20]:
# Check how many predictions were produced.
result.shape

(13976,)

# Submission

In [23]:
# Reload the saved test features, overwrite Score with the predictions and
# write the two-column (Id, Score) submission file. The rows of X_test.csv
# are in the same order as testX, from which `result` was predicted.
X_submission = pd.read_csv("./data/X_test.csv").assign(Score=result)
submission = X_submission.loc[:, ['Id', 'Score']]
submission.to_csv("./data/submission.csv", index=False)