In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords

In [2]:
dataset = pd.read_csv('./dataset.csv')
dataset.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [3]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Harsh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Harsh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
stop_words = set(stopwords.words('english'))

In [5]:
def clean_text(text):

    text = text.lower()

    text = re.sub(r'[^\w\s]', '', text)

    text = re.sub(r'\d+', '', text)

    text = ' '.join([word for word in text.split() if word not in stop_words])

    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [6]:
df = dataset[['Text']]

In [7]:
df.head()

Unnamed: 0,Text
0,I have bought several of the Vitality canned d...
1,Product arrived labeled as Jumbo Salted Peanut...
2,This is a confection that has been around a fe...
3,If you are looking for the secret ingredient i...
4,Great taffy at a great price. There was a wid...


In [8]:
df['Cleaned_text'] = df['Text'].apply(clean_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Cleaned_text'] = df['Text'].apply(clean_text)


In [9]:
df.head()

Unnamed: 0,Text,Cleaned_text
0,I have bought several of the Vitality canned d...,bought several vitality canned dog food produc...
1,Product arrived labeled as Jumbo Salted Peanut...,product arrived labeled jumbo salted peanutsth...
2,This is a confection that has been around a fe...,confection around centuries light pillowy citr...
3,If you are looking for the secret ingredient i...,looking secret ingredient robitussin believe f...
4,Great taffy at a great price. There was a wid...,great taffy great price wide assortment yummy ...


In [14]:
def map_ratings(score):

    if score in [1 , 2] :
        return 0

    elif score in [3] :
        return 1
    elif score in [4 , 5]:
         return 2
    else :
         return NULL

In [15]:
df['Sentiment'] = dataset['Score'].apply(map_ratings)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Sentiment'] = dataset['Score'].apply(map_ratings)


In [19]:
df.head()

Unnamed: 0,Text,Cleaned_text,Sentiment
0,I have bought several of the Vitality canned d...,bought several vitality canned dog food produc...,2
1,Product arrived labeled as Jumbo Salted Peanut...,product arrived labeled jumbo salted peanutsth...,0
2,This is a confection that has been around a fe...,confection around centuries light pillowy citr...,2
3,If you are looking for the secret ingredient i...,looking secret ingredient robitussin believe f...,0
4,Great taffy at a great price. There was a wid...,great taffy great price wide assortment yummy ...,2


In [20]:
df['Sentiment'].value_counts()

Sentiment
2    443777
0     82037
1     42640
Name: count, dtype: int64

In [183]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [184]:
vectorizer = TfidfVectorizer(max_features=50000 , ngram_range=(1, 2) , min_df=5 , max_df=0.8 , sublinear_tf=True )

In [185]:
from sklearn.model_selection import train_test_split

Features = df['Cleaned_text']
Labels = df['Sentiment']

x_train , x_test , y_train , y_test  = train_test_split(Features , Labels , test_size = 0.2)

In [186]:
x_train.shape , y_train.shape , x_test.shape , y_test.shape

((454763,), (454763,), (113691,), (113691,))

In [187]:
train_vec = vectorizer.fit_transform(x_train)

In [188]:
train_vec

<454763x50000 sparse matrix of type '<class 'numpy.float64'>'
	with 21179567 stored elements in Compressed Sparse Row format>

In [119]:
from sklearn.naive_bayes import MultinomialNB

In [201]:
model = MultinomialNB(alpha=0.1)
model.fit(train_vec, y_train)

In [202]:
training_preds = model.predict(train_vec)
training_preds

array([2, 2, 2, ..., 2, 2, 2], dtype=int64)

In [203]:
from sklearn.metrics import classification_report, accuracy_score

In [204]:
train_acc = accuracy_score(y_train , training_preds)
train_acc

0.8704995789015377

In [189]:
test_vec = vectorizer.transform(x_test)

In [205]:
test_preds = model.predict(test_vec)
test_preds

array([2, 2, 2, ..., 0, 2, 2], dtype=int64)

In [206]:
test_acc = accuracy_score(y_test , test_preds)
test_acc

0.8623285924127679

In [229]:
txt = "i did not like this at all , should not have bpugh this!!!"

cleaned_txt = clean_text(txt)

vec_txt = vectorizer.transform([cleaned_txt])

pred = model.predict(vec_txt)

In [230]:
pred

array([2], dtype=int64)

In [208]:
from sklearn.linear_model import LogisticRegression


In [226]:
model = LogisticRegression(multi_class='multinomial' , solver='lbfgs', max_iter=1000)

In [227]:
model.fit(train_vec, y_train)



In [228]:
training_preds = model.predict(train_vec)
training_preds

array([2, 2, 0, ..., 2, 2, 2], dtype=int64)

In [212]:
train_acc = accuracy_score(y_train , training_preds)
train_acc

0.915485648568595

In [213]:
test_preds = model.predict(test_vec)
test_preds

array([2, 2, 2, ..., 1, 2, 2], dtype=int64)

In [214]:
test_acc = accuracy_score(y_test , test_preds)
test_acc

0.897925077622679

In [215]:
txt = "Not the worst thing I’ve ever had, but definitely far from the best. It exists somewhere in that forgettable middle ground."

cleaned_txt = clean_text(txt)

vec_txt = vectorizer.transform([cleaned_txt])

pred = model.predict(vec_txt)

In [216]:
pred

array([2], dtype=int64)

In [217]:
import joblib

In [231]:
joblib.dump(model, 'sentiment_model.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')

['vectorizer.pkl']