In [1]:
import pandas as pd
import nltk

import warnings
warnings.filterwarnings('ignore') # We can suppress the warnings

df = pd.read_csv('Sentiment_Analysis_Ryanair_Reviews.csv')

df.head()

Unnamed: 0,Number,Rating,Review
0,1,Good,"Flight was on time, aircraft clean, very good ..."
1,2,Good,Great value flight at the right time from the ...
2,3,Good,We fly with this airline regularly and couldn’...
3,4,Good,"I recently flew with Ryanair, and I was impres..."
4,5,Good,"The cabin crew was professional and friendly, ..."


In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Number  300 non-null    int64 
 1   Rating  300 non-null    object
 2   Review  300 non-null    object
dtypes: int64(1), object(2)
memory usage: 7.2+ KB


In [3]:
df.groupby('Rating').describe()

Unnamed: 0_level_0,Number,Number,Number,Number,Number,Number,Number,Number
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
Rating,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Bad,100.0,250.5,29.011492,201.0,225.75,250.5,275.25,300.0
Good,100.0,50.5,29.011492,1.0,25.75,50.5,75.25,100.0
Neutral,100.0,150.5,29.011492,101.0,125.75,150.5,175.25,200.0


In [4]:
# Store the column of the dataframe named as "text"
X = df['Review']

# Display the value "X"
print(X)

0      Flight was on time, aircraft clean, very good ...
1      Great value flight at the right time from the ...
2      We fly with this airline regularly and couldn’...
3      I recently flew with Ryanair, and I was impres...
4      The cabin crew was professional and friendly, ...
                             ...                        
295    Another late departure, why do you recommend m...
296    This airline is cheap, but they will find ways...
297    Don't have priority by right for pregnants is ...
298    Paid for a 10kg cabin bag online arrived at ga...
299    Appalling...this company should go bankrupt. T...
Name: Review, Length: 300, dtype: object


In [5]:
# Store the column if the dataframe named as  "airline_sentiment"
y = df['Rating']

# Display the column of the dataframe named as "airline_sentiment"
print(y)

0      Good
1      Good
2      Good
3      Good
4      Good
       ... 
295     Bad
296     Bad
297     Bad
298     Bad
299     Bad
Name: Rating, Length: 300, dtype: object


# Cleaning Reviews

In [6]:
df.groupby('Rating').describe()

Unnamed: 0_level_0,Number,Number,Number,Number,Number,Number,Number,Number
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
Rating,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Bad,100.0,250.5,29.011492,201.0,225.75,250.5,275.25,300.0
Good,100.0,50.5,29.011492,1.0,25.75,50.5,75.25,100.0
Neutral,100.0,150.5,29.011492,101.0,125.75,150.5,175.25,200.0


In [7]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
df['Rating'] = label_encoder.fit_transform(df['Rating'])

In [8]:
df.groupby('Rating').describe()

Unnamed: 0_level_0,Number,Number,Number,Number,Number,Number,Number,Number
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
Rating,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0,100.0,250.5,29.011492,201.0,225.75,250.5,275.25,300.0
1,100.0,50.5,29.011492,1.0,25.75,50.5,75.25,100.0
2,100.0,150.5,29.011492,101.0,125.75,150.5,175.25,200.0


In [9]:
from nltk.corpus import stopwords
nltk.download('stopwords')

import string
from nltk.stem import PorterStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jose\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
# Store the stopwords into the object named as "stop_words"
stop_words = stopwords.words('english')

# Store the string.punctuation into an object punct
punct = string.punctuation

# Initialise an object using a method PorterStemmer
stemmer = PorterStemmer()

In [11]:
import re

cleaned_data=[]

# For loop from first value to length(X), ^a-zA-Z means include small and capital case letters

for i in range(len(X)):
    review = re.sub('[^a-zA-Z]', ' ', X.iloc[i])
    review = review.lower().split()
    review = [stemmer.stem(word) for word in review if (word not in stop_words) and (word not in punct)]
    review = ' '.join(review)
    cleaned_data.append(review)

In [12]:
cleaned_data

['flight time aircraft clean good crew good leg room hour flight smooth good inform cockpit cabin crew paid return suck pay window seat front commun ryanair travel classic ancillari revenu email work fare permit one cabin bag one check bag look cost ba ei save massiv honest hour care food drink buy well done ryanair def use impress',
 'great valu flight right time local airport staff alway friendli help suit need noth wrong',
 'fli airlin regularli fault staff courteou friendli flight usual time never issu get compens coupl delay flight recent flight madeira arriv time felt safe skill captain madeira airport tricki howev land smooth',
 'recent flew ryanair impress overal experi airlin offer frill cost effect option travel without compromis effici book process straightforward commit punctual evid throughout journey',
 'cabin crew profession friendli make flight enjoy despit afford price seat comfort enough durat flight airlin dedic keep cost low reflect transpar price model allow passen

In [13]:
print(y)

0      Good
1      Good
2      Good
3      Good
4      Good
       ... 
295     Bad
296     Bad
297     Bad
298     Bad
299     Bad
Name: Rating, Length: 300, dtype: object


In [14]:
# Collect all columns into dataframe named as sentiment_ordering
sentiment_ordering = ['Bad', 'Good', 'Neutral']

# store all values into column named as "y"
y = y.apply(lambda x: sentiment_ordering.index(x))

In [15]:
y.head()

0    1
1    1
2    1
3    1
4    1
Name: Rating, dtype: int64

# Bag of Words using CountVectorizer

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

# Instantiate an object cv by calling a method named as CountVectorzer()
cv    = CountVectorizer( stop_words = ['ryanair', 'flight'])
# max_features = 3000,

# Train the dataset by calling a fit_transform() method
X_fin = cv.fit_transform(cleaned_data).toarray()

# Display the rows and colums
X_fin.shape

(300, 1323)

In [39]:
print(cv)

CountVectorizer(stop_words=['ryanair', 'flight'])


In [38]:
X_fin

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [17]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

# Instantiate an object model by calling a method MultinomialNB()
model = MultinomialNB()

In [18]:
# Split the dataset into training and testing parts
X_train, X_test, y_train, y_test = train_test_split(X_fin, y, test_size = 0.3, random_state= 42)

In [19]:
# Train the model by calling a method fit()
model.fit(X_train,y_train)

In [20]:
# Call predict() method
y_pred = model.predict(X_test)

In [21]:
from sklearn.metrics import classification_report

# Instantiate a mthod named as Cla
cf = classification_report(y_test, y_pred)

# Display the values of an object cf
print(cf)

              precision    recall  f1-score   support

           0       0.77      0.63      0.69        27
           1       0.56      0.66      0.61        35
           2       0.41      0.39      0.40        28

    accuracy                           0.57        90
   macro avg       0.58      0.56      0.57        90
weighted avg       0.58      0.57      0.57        90



In [22]:
review = 'Good experience flying with them I recommend great friendly.'
model.predict_proba(cv.transform([review]))[0]

array([0.0011935 , 0.95580088, 0.04300562])

In [23]:
review = 'Excellent experience flying with them I recommend great friendly.'
model.predict_proba(cv.transform([review]))[0]

array([0.00809985, 0.93435613, 0.05754403])

In [24]:
review = 'poor experience with them I do not.'
model.predict_proba(cv.transform([review]))[0]

array([0.44488663, 0.11000383, 0.44510954])

In [25]:
review = 'bad experience avoid at all costs.'
model.predict_proba(cv.transform([review]))[0]

array([0.46653692, 0.19525542, 0.33820766])

In [26]:
review = 'What you pay is what you get low cost airline'
model.predict_proba(cv.transform([review]))[0]

array([0.27818698, 0.13863413, 0.5831789 ])

In [27]:
review = 'terrible experience flying with them I do not recommend.'
model.predict_proba(cv.transform([review]))[0][2]

0.1121153517628804

In [28]:
review = 'terrible and bad experience travelling with ryanair.'
model.predict_proba(cv.transform([review]))[0]

array([0.25086324, 0.33082187, 0.41831489])

In [29]:
review = 'ryanair is a great company to fly with, great value for your money'
model.predict_proba(cv.transform([review]))[0]

array([0.00186711, 0.92278187, 0.07535102])

# Feature Generation using TF-IDF

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create an object 'tf' by calling a method TfidfVectorizer()
tfidf = TfidfVectorizer(max_features = 3000, stop_words = ['ryanair', 'flight']) 

# Train the dataset by calling a method fit_tranform() 
X_tfidf = tfidf.fit_transform(cleaned_data).toarray()

In [31]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

# Instantiate an object model by calling a method MultinomialNB()
model_tdidf = MultinomialNB()

In [32]:
# Split the dataset into training and testing parts
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size = 0.3)

In [33]:
# Train the model by calling a method fit()
model_tdidf.fit(X_train, y_train)

In [34]:
# Call predict() method
y_pred = model_tdidf.predict(X_test)

In [35]:
from sklearn.metrics import classification_report

# Instantiate a mthod named as Cla
cf = classification_report(y_test, y_pred)

# Display the values of an object cf
print(cf)

              precision    recall  f1-score   support

           0       0.76      0.47      0.58        34
           1       0.45      0.67      0.54        27
           2       0.38      0.38      0.38        29

    accuracy                           0.50        90
   macro avg       0.53      0.51      0.50        90
weighted avg       0.55      0.50      0.50        90



In [36]:
review = 'terrible and bad experience travelling with ryanair.'
model.predict_proba(cv.transform([review]))[0]

array([0.25086324, 0.33082187, 0.41831489])