<a href="https://colab.research.google.com/github/Lyu-Sterkh/Sentiment-analysis/blob/main/Scores_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import  TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error, mean_absolute_error, confusion_matrix
import re
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()
%matplotlib inline

import plotly.offline as py
import plotly.graph_objs as go
import plotly.tools as tls
import plotly.express as px

# Read in data

In [None]:
# Mount Google Drive at /content/drive so the CSV under MyDrive can be read below.
# The `google.colab` package is only available inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the preprocessed dataset; the CSV's 'Unnamed: 0' column becomes the row index.
df = pd.read_csv(
    '/content/drive/MyDrive/df_preprocessed.csv',
    index_col='Unnamed: 0',
)

In [None]:
# Preview the first rows to check the loaded columns.
df.head()

Unnamed: 0,Score,Text,Sentiment
0,5,bought several vitality canned dog food produc...,1
1,1,product arrived labeled jumbo salted peanut pe...,0
2,4,confection around century light pillowy citrus...,1
3,2,looking secret ingredient robitussin believe f...,0
4,5,great taffy great price wide assortment yummy ...,1


# Train/test split

In [None]:
# Features are the 'Text' column, the target is the 'Score' column.
# 75% of the rows go to training and 25% to testing; the fixed seed keeps
# the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['Text'],
    df['Score'],
    train_size=0.75,
    test_size=0.25,
    random_state=73,
)

In [None]:
# Check which container type the split returned for the text column.
type(X_train)

pandas.core.series.Series

In [None]:
# Inspect the training texts; note that the index labels are shuffled by the split.
X_train

3576    delicious unique flavor great like little tart...
119     started buying noticed year old cat already st...
3488    let face going get succulent chocolate world g...
2138    happybaby happymunchies baked organic cheese v...
5246    seed exactly advertised bought paks seed far u...
                              ...                        
353     sugar sweet denied advertised melt baked eithe...
4815    viewer pointed dog food contains pesticide gra...
428     title say great flavor hint spice excellent cr...
4364    extremely happy bisquick gluten free idea expe...
160     best tasting tuna pack make opinion make great...
Name: Text, Length: 3738, dtype: object

In [None]:
# Inspect the training targets; they carry the same index labels as X_train.
y_train

3576    5
119     5
3488    5
2138    5
5246    5
       ..
353     5
4815    1
428     5
4364    5
160     5
Name: Score, Length: 3738, dtype: int64

In [None]:
# Training texts with a leading constant column of 1.0.
# The previous assignment (`sm.add_constant(X_train)`) was commented out and
# `sm` is never imported, so `X_train_full` was undefined on a fresh kernel
# and this cell raised NameError. Build the same two-column frame
# ('const', 'Text') with pandas instead; the index of X_train is preserved.
X_train_full = pd.DataFrame({'const': 1.0, 'Text': X_train})
X_train_full

Unnamed: 0,const,Text
3576,1.0,delicious unique flavor great like little tart...
119,1.0,started buying noticed year old cat already st...
3488,1.0,let face going get succulent chocolate world g...
2138,1.0,happybaby happymunchies baked organic cheese v...
5246,1.0,seed exactly advertised bought paks seed far u...
...,...,...
353,1.0,sugar sweet denied advertised melt baked eithe...
4815,1.0,viewer pointed dog food contains pesticide gra...
428,1.0,title say great flavor hint spice excellent cr...
4364,1.0,extremely happy bisquick gluten free idea expe...


# Vectorization

## BOW

In [None]:
# Bag-of-words term counts, using scikit-learn's built-in English stop-word list.
vectorizer = CountVectorizer(stop_words="english")

In [None]:
# Learn the vocabulary from the training texts only, then encode the test
# texts with that same vocabulary so both matrices share their columns.
X_train_BOW = vectorizer.fit_transform(X_train)
X_test_BOW = vectorizer.transform(X_test)

In [None]:
# Sanity check: (rows, vocabulary size) for the train and test matrices.
print(f"{X_train_BOW.shape} {X_test_BOW.shape}")

(3738, 9761) (1247, 9761)


In [None]:
# Text of the training document at *position* 100.
# X_train keeps the shuffled index labels from the split, so `X_train[100]`
# would look up the row *labelled* 100 (a different document, or a KeyError
# if that label went to the test set). `.iloc` selects by position, which is
# what lines up with row 100 of the vectorized training matrices.
X_train.iloc[100]

'plan making pitcher tea forget take pack oz still weak must consumed hour say much better brewing however idea stash purse lady pocket men would weird okay one packet little flavor oz water use two drink oz water eat something nature also packet raspberry pom box give oz drink sure worth rather drink lemon water equally bland free'

In [None]:
# Sparse BOW row at position 100 of the training matrix.
X_train_BOW[100]

<1x9761 sparse matrix of type '<class 'numpy.int64'>'
	with 38 stored elements in Compressed Sparse Row format>

In [None]:
# Summary repr of the training BOW matrix: shape, dtype and number of stored entries.
X_train_BOW

<3738x9761 sparse matrix of type '<class 'numpy.int64'>'
	with 99116 stored elements in Compressed Sparse Row format>

## TF-IDF

In [None]:
# TF-IDF weighted features, using scikit-learn's built-in English stop-word list.
# NOTE(review): this rebinds `vectorizer`, so the count vectorizer fitted in the
# BOW section is no longer reachable under this name.
vectorizer = TfidfVectorizer(stop_words="english")

In [None]:
# Fit on the training texts only; the test texts are transformed with the
# statistics learned from the training set.
X_train_TFIDF = vectorizer.fit_transform(X_train)
X_test_TFIDF = vectorizer.transform(X_test)

In [None]:
# Sanity check: (rows, vocabulary size) for the train and test matrices.
print(f"{X_train_TFIDF.shape} {X_test_TFIDF.shape}")

(3738, 9761) (1247, 9761)


In [None]:
# Summary repr of the training TF-IDF matrix: shape, dtype and number of stored entries.
X_train_TFIDF

<3738x9761 sparse matrix of type '<class 'numpy.float64'>'
	with 99116 stored elements in Compressed Sparse Row format>

## BOW with unigrams and bigrams

In [None]:
# Term counts over unigrams and bigrams (ngram_range=(1, 2)).
# NOTE(review): unlike the BOW and TF-IDF vectorizers, no stop-word list is
# passed here -- confirm that is intentional. This also rebinds `vectorizer`.
vectorizer = CountVectorizer(ngram_range=(1, 2))

# Vocabulary is learned from the training texts and reused for the test texts.
X_train_BOW_bi = vectorizer.fit_transform(X_train)
X_test_BOW_bi = vectorizer.transform(X_test)

# Sanity check: (rows, vocabulary size) for the train and test matrices.
print(f"{X_train_BOW_bi.shape} {X_test_BOW_bi.shape}")

(3738, 104468) (1247, 104468)


# Score prediction

## Linear Regression

In [None]:
# Ordinary least-squares regressor; the same instance is refit on each feature set below.
# NOTE(review): the printed shapes show more feature columns than training rows
# for every feature set -- presumably an unregularized fit overfits here; consider
# comparing against a regularized model.
model = LinearRegression()

In [None]:
# Fit on the unigram BOW counts with the 'Score' column as a numeric target.
model.fit(X_train_BOW, y_train)

In [None]:
# Learned coefficients, one per column of the matrix the model was last fit on.
model.coef_

array([ 0.26445135,  0.13263933, -0.46583789, ..., -0.02799353,
       -0.2194422 ,  0.05660239])

In [None]:
# Predict scores for the held-out texts (BOW features).
y_pred = model.predict(X_test_BOW)

In [None]:
# Test-set errors for the current predictions; keyword arguments make the
# true/predicted order explicit.
MAE = metrics.mean_absolute_error(y_true=y_test, y_pred=y_pred)
MSE = metrics.mean_squared_error(y_true=y_test, y_pred=y_pred)

In [None]:
# Mean absolute error on the test split (BOW features).
MAE

2.0996605794608785

In [None]:
# Mean squared error on the test split (BOW features).
MSE

9.210956513839108

In [None]:
# MAE expressed as a fraction of the mean training score.
# NOTE(review): MAE is measured on the test split but divided by the training
# mean -- confirm this is intended rather than y_test.mean().
MAE / y_train.mean()

0.4929672285676002

In [None]:
# Refit the same LinearRegression instance on the TF-IDF features; this
# replaces the coefficients learned from the BOW features.
model.fit(X_train_TFIDF, y_train)

In [None]:
# Predict scores for the held-out texts (TF-IDF features); overwrites the previous y_pred.
y_pred = model.predict(X_test_TFIDF)

In [None]:
# Test-set errors for the current predictions; keyword arguments make the
# true/predicted order explicit. Overwrites the previous MAE/MSE values.
MAE = metrics.mean_absolute_error(y_true=y_test, y_pred=y_pred)
MSE = metrics.mean_squared_error(y_true=y_test, y_pred=y_pred)

In [None]:
# Mean absolute error on the test split (TF-IDF features).
MAE

1.2785389282894544

In [None]:
# Mean squared error on the test split (TF-IDF features).
MSE

2.6856523244404165

In [None]:
# MAE expressed as a fraction of the mean training score.
# NOTE(review): MAE is measured on the test split but divided by the training
# mean -- confirm this is intended rather than y_test.mean().
MAE / y_train.mean()

0.30018079982073864

In [None]:
# Refit the same LinearRegression instance on the unigram+bigram counts; this
# replaces the coefficients learned from the TF-IDF features.
model.fit(X_train_BOW_bi, y_train)

In [None]:
# Predict scores for the held-out texts (unigram+bigram counts); overwrites the previous y_pred.
y_pred = model.predict(X_test_BOW_bi)

In [None]:
# Test-set errors for the current predictions; keyword arguments make the
# true/predicted order explicit. Overwrites the previous MAE/MSE values.
MAE = metrics.mean_absolute_error(y_true=y_test, y_pred=y_pred)
MSE = metrics.mean_squared_error(y_true=y_test, y_pred=y_pred)

In [None]:
# Mean absolute error on the test split (unigram+bigram counts).
MAE

0.8489022511306452

In [None]:
# Mean squared error on the test split (unigram+bigram counts).
MSE

1.3500937877372494

In [None]:
# MAE expressed as a fraction of the mean training score.
# NOTE(review): MAE is measured on the test split but divided by the training
# mean -- confirm this is intended rather than y_test.mean().
MAE / y_train.mean()

0.19930887599562538