In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re

In [2]:
cwd = os.getcwd()

df = pd.read_csv(cwd+"/BA_reviews.csv", index_col=0)

In [3]:
df.head()

Unnamed: 0,reviews,stars,date,country
0,Not Verified | I booked two business class sea...,5.0,27th August 2024,United States
1,Not Verified | Although all four flights were ...,1.0,26th August 2024,United Kingdom
2,✅ Trip Verified | Flight changed just one da...,9.0,26th August 2024,United Kingdom
3,Not Verified | Delayed flight 1 hour. Arbitrar...,1.0,25th August 2024,United Kingdom
4,Not Verified | Flown with BA four times. As an...,2.0,25th August 2024,United Kingdom


In [4]:
df['verified'] = df.reviews.str.contains("Trip Verified")

In [5]:
df['verified']

0        False
1        False
2         True
3        False
4        False
         ...  
14246    False
14247    False
14248    False
14249    False
14250    False
Name: verified, Length: 14251, dtype: bool

In [6]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords


In [7]:
import nltk
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jaini\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jaini\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\jaini\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [8]:
#for lemmatization of words we will use nltk library
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
lemma = WordNetLemmatizer()


reviews_data = df.reviews.str.strip("✅ Trip Verified |")

#create an empty list to collect cleaned data corpus
corpus =[]

#loop through each review, remove punctuations, small case it, join it and add it to corpus
for rev in reviews_data:
    rev = re.sub('[^a-zA-Z]',' ', rev)
    rev = rev.lower()
    rev = rev.split()
    rev = [lemma.lemmatize(word) for word in rev if word not in set(stopwords.words("english"))]
    rev = " ".join(rev)
    corpus.append(rev)

In [9]:
df['corpus'] = corpus

In [10]:
df.head()

Unnamed: 0,reviews,stars,date,country,verified,corpus
0,Not Verified | I booked two business class sea...,5.0,27th August 2024,United States,False,verified booked two business class seat medica...
1,Not Verified | Although all four flights were ...,1.0,26th August 2024,United Kingdom,False,verified although four flight delayed service ...
2,✅ Trip Verified | Flight changed just one da...,9.0,26th August 2024,United Kingdom,True,flight changed one day british airway titan fl...
3,Not Verified | Delayed flight 1 hour. Arbitrar...,1.0,25th August 2024,United Kingdom,False,verified delayed flight hour arbitrary cancell...
4,Not Verified | Flown with BA four times. As an...,2.0,25th August 2024,United Kingdom,False,verified flown ba four time anxious traveller ...


In [11]:
df.dtypes

reviews      object
stars       float64
date         object
country      object
verified       bool
corpus       object
dtype: object

In [14]:
invalid_dates = df[~pd.to_datetime(df['date'], errors='coerce').notna()]
print(invalid_dates)


                                                 reviews  stars  \
5      ✅ Trip Verified |   You may never see your ref...   10.0   
6      Not Verified | Cargo service: that's how I fel...    1.0   
7      ✅ Trip Verified | Inefficient and slow airport...    5.0   
19     ✅ Trip Verified | BA, after subsequent delays ...    1.0   
20     Not Verified |   It is embarrassing to have th...    1.0   
...                                                  ...    ...   
14246  I recently flew on British Airways long haul o...    4.0   
14247  We have just flown economy class London to Los...    3.0   
14248  October 16th Ibiza to London Heathrow. Well a ...    8.0   
14249  LHR-IAD-LHR 29/9 and return 13/10. We booked o...    3.0   
14250  OSL-LHR in Business. BA has over the last few ...    6.0   

                    date         country  verified  \
5       22nd August 2024   United States      True   
6       21st August 2024   United States     False   
7       21st August 2024  United 

In [15]:
df['date'] = pd.to_datetime(df['date'], errors='coerce')
df = df.dropna(subset=['date'])


In [16]:
df['date'] = pd.to_datetime(df['date'], errors='coerce').fillna(pd.Timestamp('1900-01-01'))


In [17]:
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')  # Adjust the format as necessary


In [18]:
df.date = pd.to_datetime(df.date)

In [19]:
df.date.head()

0   2024-08-27
1   2024-08-26
2   2024-08-26
3   2024-08-25
4   2024-08-25
Name: date, dtype: datetime64[ns]

In [20]:
df.stars.unique()

array([ 5.,  1.,  9.,  2.,  4.,  8.,  3., 10.,  7.,  6., nan])

In [21]:
df.stars.value_counts()

stars
1.0     2775
2.0     1301
3.0     1287
8.0     1112
10.0     940
7.0      903
9.0      856
5.0      789
4.0      738
6.0      580
Name: count, dtype: int64

In [23]:
df.stars.unique()

array([ 5.,  1.,  9.,  2.,  4.,  8.,  3., 10.,  7.,  6., nan])

In [24]:
df.isnull().value_counts()

reviews  stars  date   country  verified  corpus
False    False  False  False    False     False     11276
         True   False  False    False     False         8
         False  False  True     False     False         5
Name: count, dtype: int64

In [25]:
df.country.isnull().value_counts()

country
False    11284
True         5
Name: count, dtype: int64

In [26]:
df.drop(df[df.country.isnull() == True].index, axis=0, inplace=True)

In [27]:
df.shape

(11284, 6)

In [28]:
df.reset_index(drop=True)

Unnamed: 0,reviews,stars,date,country,verified,corpus
0,Not Verified | I booked two business class sea...,5.0,2024-08-27,United States,False,verified booked two business class seat medica...
1,Not Verified | Although all four flights were ...,1.0,2024-08-26,United Kingdom,False,verified although four flight delayed service ...
2,✅ Trip Verified | Flight changed just one da...,9.0,2024-08-26,United Kingdom,True,flight changed one day british airway titan fl...
3,Not Verified | Delayed flight 1 hour. Arbitrar...,1.0,2024-08-25,United Kingdom,False,verified delayed flight hour arbitrary cancell...
4,Not Verified | Flown with BA four times. As an...,2.0,2024-08-25,United Kingdom,False,verified flown ba four time anxious traveller ...
...,...,...,...,...,...,...
11279,Flew from London to Calgary. The seats in the ...,5.0,2014-10-28,Canada,False,flew london calgary seat new dreamliner awful ...
11280,BA285 LHR-SFO seat 13a in WT+ on a pretty old ...,3.0,2014-10-28,United Kingdom,False,ba lhr sfo seat wt pretty old check security q...
11281,Flight 103 on October 14. We flew from London ...,4.0,2014-10-28,Canada,False,flight october flew london calgary alberta lon...
11282,Traveled with British Airways ORD-LHR last mon...,3.0,2014-10-28,United States,False,aveled british airway ord lhr last month world...


In [29]:
df.to_csv(cwd + "/cleaned-BA-reviews.csv")