## Part IV bag of words NLP rating model

### 4.1 modeling using tfidf transformed data

In [1]:
import re
import nltk
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import RandomizedSearchCV

In [3]:
#load data and scale data
X_train=pd.read_csv('data/tfidf_train_X.csv')
y_train=pd.read_csv('data/y_train.csv')
X_test=pd.read_csv('data/tfidf_test_X.csv')
y_test=pd.read_csv('data/y_test.csv')

In [4]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate
st_scaler=StandardScaler()
X_train_s=st_scaler.fit_transform(X_train)
X_test_s=st_scaler.transform(X_test)

In [6]:
X_train_s.shape

(41191, 298)

In [7]:
y_train.shape

(41191, 1)

#### 4.1.1 Regularized Linear regression model

 ElasticNet regularization applies both L1-norm and L2-norm regularization to penalize the coefficients in a regression model.  ElasticNetCV is a cross-validation class that can search multiple alpha values and applies the best one. We'll define the model with alphas value and fit it with xtrain and ytrain data. More details about ElasticNet model can be found [here](https://machinelearningmastery.com/elastic-net-regression-in-python/).

In [8]:
import warnings
warnings.filterwarnings("ignore")
import time
from numpy import arange
from sklearn.linear_model import ElasticNet, ElasticNetCV
alphas = [0.0001, 0.001, 0.01, 0.1, 0.4, 0.8, 1, 10, 100]
l1_ratios = arange(0, 1, 0.01)
start=time.time()
en_cv=ElasticNetCV(alphas=alphas, l1_ratio=l1_ratios, cv=5, n_jobs=-1)
best_m = en_cv.fit(X_train_s, y_train)
end=time.time()
print(best_m.alpha_)
print(best_m.l1_ratio_)
print(f"Runtime of the program is {end - start}")

0.01
0.28
Runtime of the program is 211.95075511932373


In [10]:
y_pre = best_m.predict(X_test_s)
mse=mean_squared_error(y_test, y_pre)
mae=mean_absolute_error(y_test, y_pre)
r2 = r2_score(y_test, y_pre)
print('MSE: %f' % mse)
print('MAE:%f'% mae)
print('R^2: %f' % r2)

MSE: 6.117673
MAE:1.993788
R^2: 0.429545


#### 4.1.2 Decision tree model

In [11]:
param_grid = {'max_depth':[4,8,12, 20, 100, 200]}
start=time.time()
tree_cv = RandomizedSearchCV(DecisionTreeRegressor(criterion='mse'), param_grid , cv=5, random_state=42)
tree_cv.fit(X_train_s,y_train)
end=time.time()
print("BEST PARAMS", tree_cv.best_params_)
print(f"Runtime of the program is {end - start}")

BEST PARAMS {'max_depth': 100}
Runtime of the program is 140.427734375


In [12]:
def DecisionTreeRegressor_R(train_x, train_y, test_x, test_y, max_depth):
    decision = DecisionTreeRegressor(criterion='mse', max_depth=max_depth)
    decision.fit(train_x, train_y)
    train_pred = decision.predict(train_x)
    test_pred = decision.predict(test_x)
    test_mse = mean_squared_error(test_y, test_pred)
    test_mae = mean_absolute_error(test_y, test_pred)
    test_r2 = r2_score(test_y, test_pred)      
    print("test set results: MSE: %f, MAE: %f, R^2: %f" % (test_mse, test_mae, test_r2))     

In [13]:
DecisionTreeRegressor_R(X_train_s, y_train, X_test_s, y_test, 100)

test set results: MSE: 5.030418, MAE: 0.970715, R^2: 0.530928


#### 4.1.3 support vector machine for regression

In [None]:
from sklearn.svm import SVR
regressor = SVR(kernel = 'rbf')

### 4.2 model using tfidf LSA transformed data

#### 4.2.1 regularized linear regression

In [15]:
#load data and scale data
X_train=pd.read_csv('data/tfidf_lsa_train_X.csv')
X_test=pd.read_csv('data/tfidf_lsa_test_X.csv')
X_train_s=st_scaler.fit_transform(X_train)
X_test_s=st_scaler.transform(X_test)

In [16]:
start=time.time()
en_cv=ElasticNetCV(alphas=alphas, l1_ratio=l1_ratios, cv=5, n_jobs=-1)
best_m = en_cv.fit(X_train_s, y_train)
end=time.time()
print(best_m.alpha_)
print(best_m.l1_ratio_)
print(f"Runtime of the program is {end - start}")

0.001
0.86
Runtime of the program is 57.23660707473755


In [17]:
y_pre = best_m.predict(X_test_s)
mse=mean_squared_error(y_test, y_pre)
mae=mean_absolute_error(y_test, y_pre)
r2 = r2_score(y_test, y_pre)
print('MSE: %f' % mse)
print('MAE:%f'% mae)
print('R^2: %f' % r2)

MSE: 6.382285
MAE:2.043237
R^2: 0.404871


#### 4.2.2 Decision tree

In [18]:
param_grid = {'max_depth':[4,8,12, 20, 100, 200]}
start=time.time()
tree_cv = RandomizedSearchCV(DecisionTreeRegressor(criterion='mse'), param_grid , cv=5, random_state=42)
tree_cv.fit(X_train_s,y_train)
end=time.time()
print("BEST PARAMS", tree_cv.best_params_)
print(f"Runtime of the program is {end - start}")

BEST PARAMS {'max_depth': 100}
Runtime of the program is 139.65390825271606


In [19]:
DecisionTreeRegressor_R(X_train_s, y_train, X_test_s, y_test, 100)

test set results: MSE: 5.282316, MAE: 1.010309, R^2: 0.507440


#### 4.2.3 support vector machine

There are three important hyperparameters in SVM:  “kernel”, “gamma” and “C”
kernel:rbf, poly, sigmoid
gamma: Higher the value of gamma, higher possibility of overfitting.
C: Penalty parameter of the error term. It helps to controls the trade-off between accuracy and overfitting

In [22]:
from sklearn.svm import SVR
start=time.time()
regressor = SVR(kernel = 'rbf')
regressor.fit(X_train_s, y_train)
y_pred = regressor.predict(X_test_s)
end=time.time()
print(f"Runtime of the program is {end - start}")

Runtime of the program is 904.2150647640228


In [23]:
mse=mean_squared_error(y_test, y_pre)
mae=mean_absolute_error(y_test, y_pre)
r2 = r2_score(y_test, y_pre)
print('MSE: %f' % mse)
print('MAE:%f'% mae)
print('R^2: %f' % r2)

MSE: 6.382285
MAE:2.043237
R^2: 0.404871


SVM is very time-consuming.

I need give up developing NLP regression model using ensembled algorithms like random forest or gradient boosting because they are really time-consuming. Let's try convert the problem into classification. We will convert the rating data into three levels, positive and negative. Hopefully, it can make our life easier.

In [30]:
y_train.loc[(y_train['rating'] >= 5), 'rating_class'] = 1
y_train.loc[(y_train['rating'] < 5), 'rating_class'] = 0
y_train.head()

Unnamed: 0,rating,rating_class
0,10.0,1.0
1,5.0,1.0
2,5.0,1.0
3,9.0,1.0
4,9.0,1.0


In [31]:
y_test.loc[(y_test['rating'] >= 5), 'rating_class'] = 1
y_test.loc[(y_test['rating'] < 5), 'rating_class'] = 0
y_test.head()

Unnamed: 0,rating,rating_class
0,10.0,1.0
1,8.0,1.0
2,3.0,0.0
3,1.0,0.0
4,9.0,1.0


In [32]:
y_train_c=y_train.drop(columns=['rating'])
y_test_c=y_test.drop(columns=['rating'])
y_train_c.head()

Unnamed: 0,rating_class
0,1.0
1,1.0
2,1.0
3,1.0
4,1.0


In [38]:
y_train_c.to_csv('data/y_train_c', index=False)
y_test_c.to_csv('data/y_test_c', index=False)