# Analyse sentiment :regression linéaire

import dependencies


In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk import pos_tag


In [2]:
import nltk
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/cardinal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [4]:
#load the data from csv file to pandas dataframe
df = pd.read_excel(r"./DisneylandReviews.xlsx")


In [5]:
df.head()

Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch
0,670772142,4,2019-4,Australia,If you've ever been to Disneyland anywhere you...,Disneyland_HongKong
1,670682799,4,2019-5,Philippines,Its been a while since d last time we visit HK...,Disneyland_HongKong
2,670623270,4,2019-4,United Arab Emirates,Thanks God it wasn t too hot or too humid wh...,Disneyland_HongKong
3,670607911,4,2019-4,Australia,HK Disneyland is a great compact park. Unfortu...,Disneyland_HongKong
4,670607296,4,2019-4,United Kingdom,"the location is not in the city, took around 1...",Disneyland_HongKong


In [6]:
#check number rows and columns
df.shape

(42656, 6)

In [7]:
#counting number of missing values
df.isnull().sum()

Review_ID            0
Rating               0
Year_Month           0
Reviewer_Location    0
Review_Text          0
Branch               0
dtype: int64

In [8]:
#checking distribution of traget column
df['Rating'].value_counts()

Rating
5    23146
4    10775
3     5109
2     2127
1     1499
Name: count, dtype: int64

Stemming is the process of reducing a word to its Root word
example:actor,actress,acting =act


In [9]:
port_stem=PorterStemmer()

In [10]:
def stemming(content):
    
    stemmed_content = re.sub('[^a-zA-Z]',' ',content)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content = ' '.join(stemmed_content)
    return stemmed_content
    
    

In [11]:
df['stemmed'] = df['Review_Text'].apply(stemming)

In [12]:
print (df['stemmed'])

0        ever disneyland anywher find disneyland hong k...
1        sinc last time visit hk disneyland yet time st...
2        thank god hot humid visit park otherwis would ...
3        hk disneyland great compact park unfortun quit...
4        locat citi took around hour kowlon kid like di...
                               ...                        
42651    went disneyland pari juli thought brilliant vi...
42652    adult child visit disneyland pari begin feb ab...
42653    eleven year old daughter went visit son london...
42654    hotel part disneyland pari complex wonder plac...
42655    went disneypari resort small child minut enter...
Name: stemmed, Length: 42656, dtype: object


In [13]:
x= df['stemmed'].values
y = df['Rating'].values

In [14]:
print (x)

['ever disneyland anywher find disneyland hong kong similar layout walk main street familiar feel one ride small world absolut fabul worth day visit fairli hot rel busi queue move fairli well'
 'sinc last time visit hk disneyland yet time stay tomorrowland aka marvel land iron man experi n newli open ant man n wasp ironman great featur n excit especi whole sceneri hk hk central area kowloon antman chang previou buzz lightyear less expect someth howev boy like space mountain turn star war great cast member staff felt bit minu point dun feel like disney brand seem local like ocean park even worst got smile face wanna u enter n attract n leav hello suppos happiest place earth brand realli dont feel bakeri main street attract delicaci n disney theme sweet good point last also starbuck insid theme park'
 'thank god hot humid visit park otherwis would big issu lot shade arriv around left pm unfortun last even parad hour much plenti everyon find someth interest enjoy extrem busi longest time 

In [15]:
print (y)

[4 4 4 ... 5 4 4]


Splitting the data to training data and test data

In [16]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y,random_state=2)

In [17]:
print(x.shape, x_train.shape,x_test.shape)

(42656,) (34124,) (8532,)


In [18]:
#convert textual data to numerical data

vectorizer = TfidfVectorizer()
x_train =vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)

In [19]:
print (x_train)

  (0, 8654)	0.10678431495613067
  (0, 9080)	0.07893801542608998
  (0, 7289)	0.16447065025722712
  (0, 24070)	0.10017739035800663
  (0, 1675)	0.1537728441996045
  (0, 23234)	0.09464469766908681
  (0, 20133)	0.11468514638259764
  (0, 21396)	0.13816292856979437
  (0, 2067)	0.1225558665142813
  (0, 23453)	0.1688111293011756
  (0, 12136)	0.2041824747872766
  (0, 12288)	0.10546528655124612
  (0, 1566)	0.16491856795819518
  (0, 18380)	0.18434574113086483
  (0, 11570)	0.19083571526514292
  (0, 16676)	0.25085283190976154
  (0, 23879)	0.14402698770453787
  (0, 13516)	0.20729562394368023
  (0, 21626)	0.14001895535041797
  (0, 23680)	0.1858361794703714
  (0, 13907)	0.38643783061886056
  (0, 9280)	0.35866905920993686
  (0, 21044)	0.18972600649476198
  (0, 5135)	0.14187644493065305
  (0, 21350)	0.12106539851868067
  :	:
  (34123, 7163)	0.10448806963802935
  (34123, 12554)	0.10733924881635372
  (34123, 8511)	0.18522427634819372
  (34123, 23895)	0.11866496283276441
  (34123, 287)	0.13708381142191947
 

In [20]:
print(x_test)

  (0, 21396)	0.09563567886271504
  (0, 14669)	0.2809763002523209
  (0, 14017)	0.28510830317605446
  (0, 13067)	0.250360762291643
  (0, 12539)	0.19675305506377017
  (0, 11861)	0.192440113197033
  (0, 9080)	0.10928098834465554
  (0, 8928)	0.11516684353548579
  (0, 8340)	0.3354691984812789
  (0, 8271)	0.13180505667661171
  (0, 7984)	0.2832996240360636
  (0, 7255)	0.3226393965865864
  (0, 7181)	0.21760390760074638
  (0, 5652)	0.2888035679322954
  (0, 3768)	0.2770425273577704
  (0, 3164)	0.25508735967711427
  (0, 2067)	0.16966509922784015
  (0, 80)	0.22661385217324292
  (1, 23895)	0.21641229116834954
  (1, 21238)	0.2577733770305555
  (1, 19267)	0.2247314122029599
  (1, 19125)	0.3584957575574484
  (1, 17733)	0.1172300582327197
  (1, 16820)	0.20373661332099277
  (1, 16778)	0.33877934403259746
  :	:
  (8530, 10108)	0.13144968685431266
  (8530, 9319)	0.08001658291979714
  (8530, 9141)	0.08791276659373806
  (8530, 5135)	0.06262245373517497
  (8530, 3506)	0.19842230534978028
  (8530, 2675)	0.2657

training the machine learning model 

logistic regresion:classification model

In [21]:
model =LogisticRegression(max_iter=1000)

In [22]:
model.fit(x_train, y_train)

Model evaluation

In [23]:
#accuracy score on the training data
x_train_prediction = model.predict(x_train)
training_data_accuracy = accuracy_score(y_train, x_train_prediction)

In [24]:
print('accuracy prdiction:',training_data_accuracy )

accuracy prdiction: 0.7196108310866253


In [25]:
#accuracy score on the test data
x_test_prediction = model.predict(x_test)
test_data_accuracy = accuracy_score(y_test, x_test_prediction)

In [26]:
print('accuracy prdiction:',test_data_accuracy )

accuracy prdiction: 0.6274027191748711



save the model

In [27]:
import pickle

In [28]:
filename ='sentiment.sav'
pickle.dump(model, open(filename,'wb'))

using the saved model for predictions

In [29]:
#loading saved model
loa_model = pickle.load(open('sendedtiment.sav','rb'))

In [35]:
x_new = "i love this movie"
vectorizer = TfidfVectorizer()
x_train =vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)
print(y_test[9])

prediction = model.predict(x_new)
print(prediction)

if(prediction[0]==2):
    print('negative')
else:
    print('positive')

AttributeError: lower not found