In [1]:

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os


import re

In [2]:
#create a dataframe from csv file

cwd = os.getcwd()

df = pd.read_csv(cwd+"/BA_reviews.csv", index_col=0)

In [3]:
df.head()

Unnamed: 0,reviews,stars,date,country
0,Not Verified | Worst experience ever. Outbound...,5.0,7th November 2022,Italy
1,✅ Trip Verified | Check in was a shambles at ...,1.0,7th November 2022,Malaysia
2,✅ Trip Verified | Beyond disgusted with the fa...,5.0,5th November 2022,United Arab Emirates
3,✅ Trip Verified | On July 19th 2022 I had subm...,1.0,31st October 2022,United States
4,✅ Trip Verified | I booked the flight on Oct ...,1.0,31st October 2022,United States


In [4]:
df['verified'] = df.reviews.str.contains("Trip Verified")

In [5]:
df['verified']

0       False
1        True
2        True
3        True
4        True
        ...  
3413    False
3414    False
3415    False
3416    False
3417    False
Name: verified, Length: 3418, dtype: bool

In [6]:
#for lemmatization of words we will use nltk library
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
lemma = WordNetLemmatizer()



reviews_data = df.reviews.str.strip("✅ Trip Verified |")

#create an empty list to collect cleaned data corpus
corpus =[]

#loop through each review, remove punctuations, small case it, join it and add it to corpus
for rev in reviews_data:
    rev = re.sub('[^a-zA-Z]',' ', rev)
    rev = rev.lower()
    rev = rev.split()
    rev = [lemma.lemmatize(word) for word in rev if word not in set(stopwords.words("english"))]
    rev = " ".join(rev)
    corpus.append(rev)

In [7]:
# add the corpus to the original dataframe

df['corpus'] = corpus

In [11]:
df.head()

Unnamed: 0,reviews,stars,date,country,verified,corpus
0,Not Verified | Worst experience ever. Outbound...,5.0,7th November 2022,Italy,False,verified worst experience ever outbound flight...
1,✅ Trip Verified | Check in was a shambles at ...,1.0,7th November 2022,Malaysia,True,check shamble bwi counter open full flight bag...
2,✅ Trip Verified | Beyond disgusted with the fa...,5.0,5th November 2022,United Arab Emirates,True,beyond disgusted fact baggage yet delivered we...
3,✅ Trip Verified | On July 19th 2022 I had subm...,1.0,31st October 2022,United States,True,july th submitted complaint form regard fact b...
4,✅ Trip Verified | I booked the flight on Oct ...,1.0,31st October 2022,United States,True,booked flight oct cancel flight day learning g...


In [12]:
df.dtypes

reviews      object
stars       float64
date         object
country      object
verified       bool
corpus       object
dtype: object

In [14]:
# Function to remove ordinal suffixes
def remove_ordinal_suffix(date_str):
    return re.sub(r'(\d+)(st|nd|rd|th)', r'\1', date_str)

# Apply the function to the 'date' column
df['date'] = df['date'].apply(remove_ordinal_suffix)

# Now convert to datetime
df['date'] = pd.to_datetime(df['date'], format='%d %B %Y')

# Check the result
print(df)

                                                reviews  stars       date  \
0     Not Verified | Worst experience ever. Outbound...    5.0 2022-11-07   
1     ✅ Trip Verified |  Check in was a shambles at ...    1.0 2022-11-07   
2     ✅ Trip Verified | Beyond disgusted with the fa...    5.0 2022-11-05   
3     ✅ Trip Verified | On July 19th 2022 I had subm...    1.0 2022-10-31   
4     ✅ Trip Verified |  I booked the flight on Oct ...    1.0 2022-10-31   
...                                                 ...    ...        ...   
3413  This was a bmi Regional operated flight on a R...    1.0 2012-08-29   
3414  LHR to HAM. Purser addresses all club passenge...   10.0 2012-08-28   
3415  My son who had worked for British Airways urge...   10.0 2011-10-12   
3416  London City-New York JFK via Shannon on A318 b...    8.0 2011-10-11   
3417  SIN-LHR BA12 B747-436 First Class. Old aircraf...    9.0 2011-10-09   

                   country  verified  \
0                    Italy     Fals

In [15]:
df.date.head()

0   2022-11-07
1   2022-11-07
2   2022-11-05
3   2022-10-31
4   2022-10-31
Name: date, dtype: datetime64[ns]

In [16]:
#check for unique values
df.stars.unique()

array([ 5.,  1.,  6.,  9.,  3.,  2.,  8.,  7., 10.,  4., nan])

In [18]:
df['stars'] = df['stars'].fillna('').astype(str).str.strip("\n\t")

In [19]:
df.stars.value_counts()

stars
1.0     735
2.0     382
3.0     379
8.0     349
10.0    306
7.0     300
9.0     294
5.0     259
4.0     227
6.0     182
          5
Name: count, dtype: int64

In [20]:
# drop the rows where the value of ratings is None
df.drop(df[df.stars == "None"].index, axis=0, inplace=True)

In [21]:
#check the unique values again
df.stars.unique()

array(['5.0', '1.0', '6.0', '9.0', '3.0', '2.0', '8.0', '7.0', '10.0',
       '4.0', ''], dtype=object)

In [22]:
df.isnull().value_counts()

reviews  stars  date   country  verified  corpus
False    False  False  False    False     False     3416
                       True     False     False        2
Name: count, dtype: int64

In [23]:
df.country.isnull().value_counts()

country
False    3416
True        2
Name: count, dtype: int64

In [24]:
#drop the rows using index where the country value is null
df.drop(df[df.country.isnull() == True].index, axis=0, inplace=True)

In [25]:
df.shape

(3416, 6)

In [26]:
#resetting the index
df.reset_index(drop=True)

Unnamed: 0,reviews,stars,date,country,verified,corpus
0,Not Verified | Worst experience ever. Outbound...,5.0,2022-11-07,Italy,False,verified worst experience ever outbound flight...
1,✅ Trip Verified | Check in was a shambles at ...,1.0,2022-11-07,Malaysia,True,check shamble bwi counter open full flight bag...
2,✅ Trip Verified | Beyond disgusted with the fa...,5.0,2022-11-05,United Arab Emirates,True,beyond disgusted fact baggage yet delivered we...
3,✅ Trip Verified | On July 19th 2022 I had subm...,1.0,2022-10-31,United States,True,july th submitted complaint form regard fact b...
4,✅ Trip Verified | I booked the flight on Oct ...,1.0,2022-10-31,United States,True,booked flight oct cancel flight day learning g...
...,...,...,...,...,...,...
3411,This was a bmi Regional operated flight on a R...,1.0,2012-08-29,United Kingdom,False,bmi regional operated flight rj manchester hea...
3412,LHR to HAM. Purser addresses all club passenge...,10.0,2012-08-28,United Kingdom,False,lhr ham purser address club passenger name boa...
3413,My son who had worked for British Airways urge...,10.0,2011-10-12,United Kingdom,False,son worked british airway urged fly british ai...
3414,London City-New York JFK via Shannon on A318 b...,8.0,2011-10-11,United States,False,london city new york jfk via shannon really ni...


*****

In [117]:
# export the cleaned data

df.to_csv(cwd + "/cleaned-BA-reviews.csv")