In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import math
import warnings
warnings.filterwarnings('ignore') # Hides warning
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore",category=UserWarning)
sns.set_style("whitegrid") # Plotting style
np.random.seed(7) # seeding random number generator

In [54]:
train = pd.read_csv('/content/drive/MyDrive/Reviews.csv')
train.shape

(568454, 10)

In [55]:
train.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [57]:
train.dropna(inplace=True)

In [58]:
train.shape

(568411, 10)

In [59]:
import re
def  clean_text(df, text_field):
    df[text_field] = df[text_field].str.lower()
    df[text_field] = df[text_field].apply(lambda elem: re.sub(r"(<br />)|(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", elem))  
    return df
train_clean = clean_text(train, "Text")

In [61]:
train_clean_df = train_clean[['Text','Score']]

In [63]:
train_clean_df.head()

Unnamed: 0,Text,Score
0,i have bought several of the vitality canned d...,5
1,product arrived labeled as jumbo salted peanut...,1
2,this is a confection that has been around a fe...,4
3,if you are looking for the secret ingredient i...,2
4,great taffy at a great price there was a wide...,5


In [69]:
sentiment = train_clean_df['Score'].apply(lambda x: 1 if x > 3 else 0)
train_clean_df['label'] = sentiment
train_clean_df.head()

Unnamed: 0,Text,Score,Sentiment,label
0,i have bought several of the vitality canned d...,5,1,1
1,product arrived labeled as jumbo salted peanut...,1,0,0
2,this is a confection that has been around a fe...,4,1,1
3,if you are looking for the secret ingredient i...,2,0,0
4,great taffy at a great price there was a wide...,5,1,1


In [70]:

#balancing the data
from sklearn.utils import resample
train_majority = train_clean_df[train_clean_df.label==1]
train_minority = train_clean_df[train_clean_df.label==0]
train_minority_upsampled = resample(train_minority, 
                                 replace=True,    
                                 n_samples=len(train_majority),   
                                 random_state=123)
train_upsampled = pd.concat([train_minority_upsampled, train_majority])
train_upsampled['label'].value_counts()

1    443766
0    443766
Name: label, dtype: int64

In [71]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
pipeline_sgd = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf',  TfidfTransformer()),
    ('nb', SGDClassifier()),])

In [72]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_upsampled['Text'],train_upsampled['label'],random_state = 0)

In [73]:
model = pipeline_sgd.fit(X_train, y_train)
y_predict = model.predict(X_test)
from sklearn.metrics import f1_score
f1_score(y_test, y_predict)

0.8604748790197951

In [156]:
text = ("I love eating them and they are good for watching TV and looking at movies! It is not too sweet. I like to transfer them to a zip lock baggie so they stay fresh so I can take my time eating them.")
import re
def  cleanText(text):
     text= text.lower()
     text =re.sub(r"(<br />)|(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", text)
     return text
cleanText(text)


'i love eating them and they are good for watching tv and looking at movies it is not too sweet i like to transfer them to a zip lock baggie so they stay fresh so i can take my time eating them'

In [157]:
doc = [text]
doc


['I love eating them and they are good for watching TV and looking at movies! It is not too sweet. I like to transfer them to a zip lock baggie so they stay fresh so I can take my time eating them.']

In [158]:
model.predict(doc)

array([1])