In [31]:
# Imports 
import re 
import string
import pandas as pd 
from matplotlib import pyplot as plt
import seaborn as sns 
import numpy as np
from nltk.tokenize import word_tokenize 
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer

In [3]:
# load data: read the reviews CSV (the path is relative to the working directory)
reviews = pd.read_csv("data/BA_Reviews.csv")
# show a sample of reviews (the last expression of the cell is rendered as a table)
reviews.head()

Unnamed: 0,Review
0,Not Verified | I was meant to fly in January t...
1,✅ Trip Verified | We have flown repeatedly wi...
2,✅ Trip Verified | I was horrified by the extr...
3,✅ Trip Verified | \r\nThe worst cabin experie...
4,✅ Trip Verified | First time flying with Briti...


In [4]:
# summarise the frame: index range, columns, non-null counts and dtypes
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3542 entries, 0 to 3541
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  3542 non-null   object
dtypes: object(1)
memory usage: 27.8+ KB


### Text Cleaning and Preprocessing

The review texts at hand are very messy, so before we proceed with any analysis we have to strip out punctuation, digits, tags, HTTP links, special characters, emojis, etc. After that, we need to perform some preprocessing tasks such as tokenization, stop-word removal, lemmatization, etc.

In [66]:
# define a function to clean the text
def clean(text):
    """Normalise a raw review string before tokenisation.

    Steps, in order: drop the leading "... |" status prefix, lower-case,
    remove bracketed spans, URLs, angle-bracket tags, punctuation and any
    other non-word/non-space symbol, turn newlines into spaces, and remove
    every token that contains a digit.

    Parameters
    ----------
    text : str
        Raw review text, optionally prefixed with a status followed by '|'.

    Returns
    -------
    str
        The cleaned, lower-cased text. Whitespace is not collapsed, so
        removed tokens may leave consecutive spaces behind.
    """
    if '|' in text:
        # maxsplit=1 keeps the whole review body even if it contains '|' itself
        text = text.split('|', 1)[1]
    text = text.lower()
    # raw strings keep the regex escapes out of Python's string-escape handling
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # catches symbols outside string.punctuation (e.g. emojis)
    text = re.sub(r'[^\w\s]', '', text)
    # replace with a space so words on adjacent lines are not glued together
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text
# Stop-word set and lemmatizer are created on first use and then reused, so
# applying preprocess() row by row does not rebuild them for every review.
_PREPROCESS_CACHE = {}


def _preprocess_resources():
    """Return the cached (stop_words, lemmatizer) pair, building it on first call."""
    if not _PREPROCESS_CACHE:
        _PREPROCESS_CACHE['stop_words'] = frozenset(stopwords.words("english"))
        _PREPROCESS_CACHE['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_CACHE['stop_words'], _PREPROCESS_CACHE['lemmatizer']


def preprocess(text):
    """Tokenize, drop English stop words and lemmatize a cleaned review.

    Parameters
    ----------
    text : str
        Text to process (the stop-word comparison is case-sensitive, so the
        input is expected to be lower-cased already).

    Returns
    -------
    str
        The remaining lemmatized tokens joined by single spaces.
    """
    stop_words, lemmatizer = _preprocess_resources()
    # tokenize text
    word_tokens = word_tokenize(text)
    # remove stop words, then lemmatize what is left (this is lemmatization, not stemming)
    lemmas = [lemmatizer.lemmatize(word) for word in word_tokens if word not in stop_words]
    # join text again
    return " ".join(lemmas).strip()

In [67]:
# Work on a copy so the raw `reviews` frame stays untouched, then push every
# review through clean() followed by preprocess() in a single pass.
df = reviews.copy()
df['Preprocessed_review'] = df['Review'].apply(lambda raw: preprocess(clean(raw)))
df.head()

Unnamed: 0,Review,Preprocessed_review
0,Not Verified | I was meant to fly in January t...,meant fly january algeria paid ticket day mean...
1,✅ Trip Verified | We have flown repeatedly wi...,flown repeatedly british airway one world alli...
2,✅ Trip Verified | I was horrified by the extr...,horrified extremely small seat poor training c...
3,✅ Trip Verified | \r\nThe worst cabin experie...,worst cabin experience ever cramped seat low c...
4,✅ Trip Verified | First time flying with Briti...,first time flying british airway first time fl...


In [68]:
# compare text and cleaned text for one randomly chosen review
from random import randrange
n = randrange(len(df))
# n is a row position, so select with .iloc; label lookup (df.Review[n]) only
# works while the frame still has its default 0..len-1 index
sample = df.iloc[n]
print(" Review:\n", sample['Review'])
print('\n')
print(" Preprocessed Review:\n", sample['Preprocessed_review'])

 Review:
 ✅ Trip Verified |  Prague to Denver via London. I almost missed my flight because at check-in I had to complete some really important papers that took 40 minutes -  later, no one asked me for those papers. Flight to Heathrow was average, what you expect from a 2 hour flight. At Heathrow the security lasted me  30 minutes. When I boarded the next the plane a lack of leg room, it was less than the flight before.  The flight then was pretty much okay, the food was delicious, had pasta with tomato sauce and several times drinks, the flight staff was kind and helpful. But the plane had no paid Wi-Fi, no outlets or USB ports, the screens were low quality.


 Preprocessed Review:
 prague denver via london almost missed flight checkin complete really important paper took minute later one asked paper flight heathrow average expect hour flight heathrow security lasted minute boarded next plane lack leg room le flight flight pretty much okay food delicious pasta tomato sauce several tim

### Sentiment Analysis 

In [69]:
# define a function that maps a text to a sentiment label
_SIA_CACHE = []  # holds the lazily created default analyzer so it is built only once


def analyzer(text, sia=None):
    """Label a text as positive, negative or neutral from its compound score.

    Parameters
    ----------
    text : str
        Text to score.
    sia : object, optional
        Any object exposing ``polarity_scores(text)`` that returns a mapping
        with a ``'compound'`` entry. When omitted, a shared
        ``SentimentIntensityAnalyzer`` is created on first use and reused.

    Returns
    -------
    str
        ``'positive'`` if the compound score is above 0, ``'negative'`` if it
        is below 0, otherwise ``'neutral'``.
    """
    if sia is None:
        if not _SIA_CACHE:
            _SIA_CACHE.append(SentimentIntensityAnalyzer())
        sia = _SIA_CACHE[0]
    # polarity_scores returns a dict, so the score is read by key, and only once
    compound = sia.polarity_scores(text)["compound"]
    if compound > 0:
        return 'positive'
    if compound < 0:
        return 'negative'
    return 'neutral'
    


# Text analysis example
# Create the analyzer here so the cell also runs on a fresh kernel instead of
# relying on a `sia` variable left over from an earlier session
sia = SentimentIntensityAnalyzer()
example = "I'm having a wonderful time"
print(example)
print(sia.polarity_scores(example))

I'm having a wonderful time
{'neg': 0.0, 'neu': 0.448, 'pos': 0.552, 'compound': 0.5719}
