In [1]:
# read data
import pandas as pd


In [4]:
df=pd.read_csv('DisneylandReviews.csv',encoding='latin1')

In [5]:
df.shape

(42656, 6)

In [6]:
df.head()

Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch
0,670772142,4,2019-4,Australia,If you've ever been to Disneyland anywhere you...,Disneyland_HongKong
1,670682799,4,2019-5,Philippines,Its been a while since d last time we visit HK...,Disneyland_HongKong
2,670623270,4,2019-4,United Arab Emirates,Thanks God it wasn t too hot or too humid wh...,Disneyland_HongKong
3,670607911,4,2019-4,Australia,HK Disneyland is a great compact park. Unfortu...,Disneyland_HongKong
4,670607296,4,2019-4,United Kingdom,"the location is not in the city, took around 1...",Disneyland_HongKong


In [7]:
df.isna()

Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch
0,False,False,False,False,False,False
1,False,False,False,False,False,False
2,False,False,False,False,False,False
3,False,False,False,False,False,False
4,False,False,False,False,False,False
...,...,...,...,...,...,...
42651,False,False,False,False,False,False
42652,False,False,False,False,False,False
42653,False,False,False,False,False,False
42654,False,False,False,False,False,False


In [8]:
df.isnull().sum

<bound method NDFrame._add_numeric_operations.<locals>.sum of        Review_ID  Rating  Year_Month  Reviewer_Location  Review_Text  Branch
0          False   False       False              False        False   False
1          False   False       False              False        False   False
2          False   False       False              False        False   False
3          False   False       False              False        False   False
4          False   False       False              False        False   False
...          ...     ...         ...                ...          ...     ...
42651      False   False       False              False        False   False
42652      False   False       False              False        False   False
42653      False   False       False              False        False   False
42654      False   False       False              False        False   False
42655      False   False       False              False        False   False

[42656 rows x

# now we clean the data

In [9]:
import string

In [10]:
punc=string.punctuation

In [11]:
punc

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [12]:
from spacy.lang.en.stop_words import STOP_WORDS
# list of stop words
stopwords=list(STOP_WORDS)

In [13]:
stopwords

['might',
 'thereupon',
 'everything',
 'less',
 'anyone',
 '‘re',
 'though',
 'to',
 'either',
 'because',
 'others',
 'often',
 'should',
 'here',
 'otherwise',
 'within',
 'namely',
 'yours',
 'being',
 'never',
 'anything',
 'someone',
 'more',
 'nine',
 'nor',
 'may',
 'just',
 'one',
 'third',
 'ours',
 'my',
 'else',
 'their',
 'seems',
 'itself',
 'this',
 'whether',
 'put',
 'into',
 'cannot',
 'be',
 'onto',
 'been',
 'get',
 'without',
 'due',
 'same',
 'yourself',
 'down',
 'front',
 'whither',
 'other',
 'afterwards',
 'eight',
 'all',
 '’s',
 "'m",
 'next',
 'that',
 'last',
 'latter',
 'so',
 'towards',
 'mostly',
 '‘ve',
 'but',
 'those',
 'am',
 'using',
 'what',
 'if',
 'hundred',
 'therein',
 'whom',
 'hereafter',
 'between',
 'besides',
 'it',
 'themselves',
 'there',
 'seem',
 'fifty',
 'whence',
 'two',
 "'s",
 'toward',
 "'ll",
 'the',
 'give',
 'he',
 "'re",
 'whereby',
 'myself',
 'or',
 'whole',
 'about',
 'could',
 'nothing',
 'hereby',
 'go',
 'anyway',
 'no

In [14]:
# creating function for data cleaning
import spacy
nlp = spacy.load(name = 'en_core_web_sm')
def text_data_cleaning(sentence):
  doc=nlp(sentence)

  tokens=[]
  for token in doc:
    # most non-pronoun words benefit from lemmatization. Converting the word to its lemma form, making it lowercase, and stripping any extra spaces helps in standardizing the text data.
        if token.lemma_ != "-PRON-":
            temp = token.lemma_.lower().strip()
        else:
          # Pronouns, however, are already fairly standardized and converting them to their lemma form might not be necessary or might even be undesirable depending on the context.
            temp = token.lower_
        tokens.append(temp)

  cleaned_tokens=[]
  for token in tokens:
    if token not in stopwords and token not in punc:
      cleaned_tokens.append(token)

  return cleaned_tokens  # Return statement outside the loops

In [15]:
text_data_cleaning("Hello all, It's a beautiful day!")

['hello', 'beautiful', 'day']

## **Vectorisation feature engineering (TF-IDF)**
# New Section
 Frequency-Inverse Document Frequency (TF-IDF), are important in natural language processing (NLP) and machine learning tasks involving textual data. TF-IDF is a numerical statistic that reflects the importance of a word in a document relative to a collection of documents

In [16]:
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

In [17]:
# to convert all our data to numbers, but before we convert them to number we must clean them first by text_data_cleaning function
tfidf=TfidfVectorizer(tokenizer=text_data_cleaning)

In [18]:
# LinearSVC is a type of Support Vector Machine (SVM) classifier specifically designed for linear classification tasks
classifier=LinearSVC()

# Train data

In [20]:
x=df['Review_Text']
y=df['Rating']

In [21]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=0)

In [22]:
x_train.shape,x_test.shape

((34124,), (8532,))

In [23]:
x_train.head

<bound method NDFrame.head of 25840    This was my third trip to Disneyland as a pare...
35528    Pirates of the Caribbean ride broke down and w...
10913    We travelled with people ranging in ages from ...
8268     With all the hype that surrounds Disney, I exp...
31902    Purchasing the tickets for this attraction was...
                               ...                        
32103    What an amazing family holiday to remember for...
30403    I visited Disney park with my grandson and had...
21243    We've been to Disney World twice so we decided...
42613    I was very disappointed with Disneyland Paris....
2732     Pros:    Easily accessible via MTR    Plenty o...
Name: Review_Text, Length: 34124, dtype: object>

In [24]:
# pipline object first will do vactorization then classification
clf=Pipeline([('tfidf',tfidf),('clf',classifier)])

In [25]:
clf.fit(x_train,y_train)



# Predict

In [26]:
from sklearn.metrics import accuracy_score

In [27]:
y_predict=clf.predict(x_test)

In [28]:
accuracy_score(y_test,y_predict)

0.5985700890764182

In [29]:
clf.predict(['This place is great ! '])

array([5])

In [32]:
clf.predict(['bad park '])

array([1])

In [36]:
clf.predict(['my rating is zero'])

array([1])