In [69]:
#imports

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

#regex
import re

In [73]:
df = pd.read_csv(r"C:\Users\Yasir09\BA_Reviews.csv")


In [78]:
df.columns
df.head()

Unnamed: 0,reviews,stars,date,country,verified
0,Not Verified | The worst airline I have ever f...,5,13th November 2024,Australia,False
1,"✅ Trip Verified | Disappointing service, pla...",1,8th November 2024,Iceland,True
2,✅ Trip Verified | A transit visa is required...,3,8th November 2024,Singapore,True
3,✅ Trip Verified | I was surprised to receive...,1,7th November 2024,Turkey,True
4,"✅ Trip Verified | Flight cancelled, not refu...",3,7th November 2024,Canada,True


In [75]:
df['verified'] = df.reviews.str.contains("Trip Verified")

In [77]:
#df.head()
df['verified']

0       False
1        True
2        True
3        True
4        True
        ...  
3495    False
3496    False
3497    False
3498    False
3499    False
Name: verified, Length: 3500, dtype: bool

### Cleaning Reviews

We will extract the column of reviews into a separate dataframe and clean it for semantic analysis

In [79]:
#for lemmatization of words we will use nltk library
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
lemma = WordNetLemmatizer()


reviews_data = df.reviews.str.strip("✅ Trip Verified |")

#create an empty list to collect cleaned data corpus
corpus =[]

#loop through each review, remove punctuations, small case it, join it and add it to corpus
for rev in reviews_data:
    rev = re.sub('[^a-zA-Z]',' ', rev)
    rev = rev.lower()
    rev = rev.split()
    rev = [lemma.lemmatize(word) for word in rev if word not in set(stopwords.words("english"))]
    rev = " ".join(rev)
    corpus.append(rev)

In [80]:
# add the corpus to the original dataframe

df['corpus'] = corpus

In [87]:
df.head()

Unnamed: 0,reviews,stars,date,country,verified,corpus
0,Not Verified | The worst airline I have ever f...,5,13th November 2024,Australia,False,verified worst airline ever flown cancelled fl...
1,"✅ Trip Verified | Disappointing service, pla...",1,8th November 2024,Iceland,True,disappointing service plane interior terribly ...
2,✅ Trip Verified | A transit visa is required...,3,8th November 2024,Singapore,True,transit visa required partner transit via lond...
3,✅ Trip Verified | I was surprised to receive...,1,7th November 2024,Turkey,True,surprised receive confirmation pre flight kg l...
4,"✅ Trip Verified | Flight cancelled, not refu...",3,7th November 2024,Canada,True,flight cancelled refunding money saying took f...


Cleaning/Fromat date

In [88]:
df.dtypes

reviews     object
stars       object
date        object
country     object
verified      bool
corpus      object
dtype: object

In [89]:
# convert the date to datetime format

df.date = pd.to_datetime(df.date)

0      2024-11-13
1      2024-11-08
2      2024-11-08
3      2024-11-07
4      2024-11-07
          ...    
3495   2014-11-12
3496   2014-11-12
3497   2014-11-12
3498   2014-11-11
3499   2014-11-06
Name: date, Length: 3500, dtype: datetime64[ns]

In [92]:
#df.date.head()
df.date.head()

0   2024-11-13
1   2024-11-08
2   2024-11-08
3   2024-11-07
4   2024-11-07
Name: date, dtype: datetime64[ns]

Cleaning ratings with stars

In [93]:
#check for unique values
df.stars.unique()

array(['5', '1', '3', '6', '7', '9', '2', '8', '4', '10', 'None'],
      dtype=object)

In [94]:
# remove the \t and \n from the ratings
df.stars = df.stars.str.strip("\n\t\t\t\t\t\t\t\t\t\t\t\t\t")

In [95]:
df.stars.value_counts()

1       883
2       408
3       401
8       339
10      281
7       272
9       264
5       246
4       233
6       170
None      3
Name: stars, dtype: int64

In [96]:
# drop the rows where the value of ratings is None
df.drop(df[df.stars == "None"].index, axis=0, inplace=True)

In [97]:
#check the unique values again
df.stars.unique()

array(['5', '1', '3', '6', '7', '9', '2', '8', '4', '10'], dtype=object)

Check for null Values

In [98]:
df.isnull().value_counts()

reviews  stars  date   country  verified  corpus
False    False  False  False    False     False     3496
                       True     False     False        1
dtype: int64

In [99]:
df.country.isnull().value_counts()

False    3496
True        1
Name: country, dtype: int64

In [100]:
#drop the rows using index where the country value is null
df.drop(df[df.country.isnull() == True].index, axis=0, inplace=True)

In [101]:
df.shape

(3496, 6)

In [102]:
#resetting the index
df.reset_index(drop=True)

Unnamed: 0,reviews,stars,date,country,verified,corpus
0,Not Verified | The worst airline I have ever f...,5,2024-11-13,Australia,False,verified worst airline ever flown cancelled fl...
1,"✅ Trip Verified | Disappointing service, pla...",1,2024-11-08,Iceland,True,disappointing service plane interior terribly ...
2,✅ Trip Verified | A transit visa is required...,3,2024-11-08,Singapore,True,transit visa required partner transit via lond...
3,✅ Trip Verified | I was surprised to receive...,1,2024-11-07,Turkey,True,surprised receive confirmation pre flight kg l...
4,"✅ Trip Verified | Flight cancelled, not refu...",3,2024-11-07,Canada,True,flight cancelled refunding money saying took f...
...,...,...,...,...,...,...
3491,BA 83 from LHR to LAX first time on Airbus A38...,9,2014-11-12,United States,False,ba lhr lax first time airbus comfortable quiet...
3492,London Gatwick - Barbados in premium economy. ...,1,2014-11-12,Spain,False,london gatwick barbados premium economy saturd...
3493,LHR-ZRH. A320 was used on this route. I was no...,10,2014-11-12,United States,False,lhr zrh used route expecting much since flight...
3494,I flew from MIA-LHR-DXB. The 747 like most of ...,7,2014-11-11,United Arab Emirates,False,flew mia lhr dxb like crew well passed sell da...


Now our data is all cleaned and ready for data visualization and data analysis

In [103]:
# export the cleaned data

df.to_csv(cwd + "/cleaned-BA-reviews.csv")