This jupyter source notebook will be about the cleaning the data that we scraped in the file "Ba-Reviews".

In [72]:
#imports

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

#regex
import re

In [73]:
df = pd.read_csv("D:\BA-FEEDBACKLENS\BA_reviews.csv", index_col=0)

  df = pd.read_csv("D:\BA-FEEDBACKLENS\BA_reviews.csv", index_col=0)


In [74]:
df.head()

Unnamed: 0,reviews,stars,date,country
0,Not Verified | LHR T5 BA Gold Wing worked wel...,5.0,3rd April 2025,United Kingdom
1,Not Verified | Very good service on this rout...,9.0,31st March 2025,United Kingdom
2,✅ Trip Verified | Flight mainly let down by ...,10.0,19th March 2025,United Kingdom
3,✅ Trip Verified | Another awful experience b...,7.0,16th March 2025,United States
4,"✅ Trip Verified | The service was rude, full...",1.0,16th March 2025,United States


In [75]:
df['verified'] = df.reviews.str.contains("Trip Verified")

In [76]:
df.head()

Unnamed: 0,reviews,stars,date,country,verified
0,Not Verified | LHR T5 BA Gold Wing worked wel...,5.0,3rd April 2025,United Kingdom,False
1,Not Verified | Very good service on this rout...,9.0,31st March 2025,United Kingdom,False
2,✅ Trip Verified | Flight mainly let down by ...,10.0,19th March 2025,United Kingdom,True
3,✅ Trip Verified | Another awful experience b...,7.0,16th March 2025,United States,True
4,"✅ Trip Verified | The service was rude, full...",1.0,16th March 2025,United States,True


<h1>CLEANING THE REVIEWS

In [78]:
import pandas as pd
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk

# Download NLTK resources (only needed once)
nltk.download('stopwords')
nltk.download('wordnet')

# Replace NaN with empty string
df['reviews'] = df['reviews'].fillna("")

# Remove the prefix text
df['clean_reviews'] = df['reviews'].str.replace("✅ Trip Verified |", "", regex=False)

# Initialize lemmatizer and stopwords
lemma = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

# Generate corpus
corpus = []
for rev in df['clean_reviews']:
    rev = re.sub('[^a-zA-Z]', ' ', rev)
    rev = rev.lower()
    rev = rev.split()
    rev = [lemma.lemmatize(word) for word in rev if word not in stop_words]
    rev = " ".join(rev)
    corpus.append(rev)

# Add corpus to the DataFrame
df['corpus'] = corpus


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ragep\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ragep\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [79]:
df.head()

Unnamed: 0,reviews,stars,date,country,verified,clean_reviews,corpus
0,Not Verified | LHR T5 BA Gold Wing worked wel...,5.0,3rd April 2025,United Kingdom,False,Not Verified | LHR T5 BA Gold Wing worked wel...,verified lhr ba gold wing worked well pleasant...
1,Not Verified | Very good service on this rout...,9.0,31st March 2025,United Kingdom,False,Not Verified | Very good service on this rout...,verified good service route ba th march cabin ...
2,✅ Trip Verified | Flight mainly let down by ...,10.0,19th March 2025,United Kingdom,True,Flight mainly let down by a disagreeable fl...,flight mainly let disagreeable flight attendan...
3,✅ Trip Verified | Another awful experience b...,7.0,16th March 2025,United States,True,Another awful experience by British Airways...,another awful experience british airway flight...
4,"✅ Trip Verified | The service was rude, full...",1.0,16th March 2025,United States,True,"The service was rude, full of attitude to m...",service rude full attitude food poorly service...


<h2> Formatting the data

In [81]:
df.dtypes

reviews           object
stars            float64
date              object
country           object
verified            bool
clean_reviews     object
corpus            object
dtype: object

In [82]:
df['date'] #Currently as object datatype

0           3rd April 2025
1          31st March 2025
2          19th March 2025
3          16th March 2025
4          16th March 2025
               ...        
3495    25th November 2014
3496    25th November 2014
3497    25th November 2014
3498    25th November 2014
3499    25th November 2014
Name: date, Length: 3500, dtype: object

In [83]:
#Converting the given date into date-time format
def clean_date(date_str):
    return re.sub(r'(\d+)(st|nd|rd|th)', r'\1', date_str)

# Clean the date strings
df['date'] = df['date'].apply(clean_date)

# Convert to datetime
df['date'] = pd.to_datetime(df['date'], format='%d %B %Y')

In [99]:
df['date'].head()

0   2025-04-03
1   2025-03-31
2   2025-03-19
3   2025-03-16
4   2025-03-16
Name: date, dtype: datetime64[ns]

<h2>CLEANING THE RATINGS WITH UNIQUE STARS COUNTS

In [106]:
#Checking for possible unique star values
df.stars.unique()

array([ 5.,  9., 10.,  7.,  1.,  3.,  4.,  2.,  8.,  6., nan])

In [108]:
df.stars.value_counts()

stars
1.0     896
2.0     406
3.0     401
8.0     335
10.0    275
7.0     273
9.0     266
5.0     245
4.0     234
6.0     168
Name: count, dtype: int64

<h2>CHECKING FOR NULL VALUES

In [115]:
df.isnull().value_counts()

reviews  stars  date   country  verified  clean_reviews  corpus
False    False  False  False    False     False          False     3498
                       True     False     False          False        1
         True   False  False    False     False          False        1
Name: count, dtype: int64

In [121]:
df.stars.isnull().value_counts()

stars
False    3498
True        1
Name: count, dtype: int64

In [125]:
#drop the rows using index where the country value is null
df.drop(df[df.country.isnull() == True].index, axis=0, inplace=True)
#drop the rows using index where the stars value is null
df.drop(df[df.stars.isnull() == True].index, axis=0, inplace=True)

In [127]:
df.isnull().value_counts()

reviews  stars  date   country  verified  clean_reviews  corpus
False    False  False  False    False     False          False     3498
Name: count, dtype: int64

In [129]:
df

Unnamed: 0,reviews,stars,date,country,verified,clean_reviews,corpus
0,Not Verified | LHR T5 BA Gold Wing worked wel...,5.0,2025-04-03,United Kingdom,False,Not Verified | LHR T5 BA Gold Wing worked wel...,verified lhr ba gold wing worked well pleasant...
1,Not Verified | Very good service on this rout...,9.0,2025-03-31,United Kingdom,False,Not Verified | Very good service on this rout...,verified good service route ba th march cabin ...
2,✅ Trip Verified | Flight mainly let down by ...,10.0,2025-03-19,United Kingdom,True,Flight mainly let down by a disagreeable fl...,flight mainly let disagreeable flight attendan...
3,✅ Trip Verified | Another awful experience b...,7.0,2025-03-16,United States,True,Another awful experience by British Airways...,another awful experience british airway flight...
4,"✅ Trip Verified | The service was rude, full...",1.0,2025-03-16,United States,True,"The service was rude, full of attitude to m...",service rude full attitude food poorly service...
...,...,...,...,...,...,...,...
3495,There are many negative reviews on this site -...,3.0,2014-11-25,United Kingdom,False,There are many negative reviews on this site -...,many negative review site almost embarrassing ...
3496,"British Airways from Gatwick to Catania, Sicil...",6.0,2014-11-25,Thailand,False,"British Airways from Gatwick to Catania, Sicil...",british airway gatwick catania sicily return p...
3497,Outbound flight with British Airways from Gatw...,4.0,2014-11-25,United States,False,Outbound flight with British Airways from Gatw...,outbound flight british airway gatwick orlando...
3498,✅ Verified Review | Family of 4 adults flying...,10.0,2014-11-25,United States,False,✅ Verified Review | Family of 4 adults flying...,verified review family adult flying british ai...


In [131]:
df.reset_index(drop=True)

Unnamed: 0,reviews,stars,date,country,verified,clean_reviews,corpus
0,Not Verified | LHR T5 BA Gold Wing worked wel...,5.0,2025-04-03,United Kingdom,False,Not Verified | LHR T5 BA Gold Wing worked wel...,verified lhr ba gold wing worked well pleasant...
1,Not Verified | Very good service on this rout...,9.0,2025-03-31,United Kingdom,False,Not Verified | Very good service on this rout...,verified good service route ba th march cabin ...
2,✅ Trip Verified | Flight mainly let down by ...,10.0,2025-03-19,United Kingdom,True,Flight mainly let down by a disagreeable fl...,flight mainly let disagreeable flight attendan...
3,✅ Trip Verified | Another awful experience b...,7.0,2025-03-16,United States,True,Another awful experience by British Airways...,another awful experience british airway flight...
4,"✅ Trip Verified | The service was rude, full...",1.0,2025-03-16,United States,True,"The service was rude, full of attitude to m...",service rude full attitude food poorly service...
...,...,...,...,...,...,...,...
3493,There are many negative reviews on this site -...,3.0,2014-11-25,United Kingdom,False,There are many negative reviews on this site -...,many negative review site almost embarrassing ...
3494,"British Airways from Gatwick to Catania, Sicil...",6.0,2014-11-25,Thailand,False,"British Airways from Gatwick to Catania, Sicil...",british airway gatwick catania sicily return p...
3495,Outbound flight with British Airways from Gatw...,4.0,2014-11-25,United States,False,Outbound flight with British Airways from Gatw...,outbound flight british airway gatwick orlando...
3496,✅ Verified Review | Family of 4 adults flying...,10.0,2014-11-25,United States,False,✅ Verified Review | Family of 4 adults flying...,verified review family adult flying british ai...


In [139]:
df.to_csv("D:/BA-FEEDBACKLENS/CLEANED.csv")