Data Cleaning:-


Data cleaning means fixing bad data in your data set.
Bad data could be:-
Empty cells,
Data in wrong format,
Wrong data,
Duplicates

In [1]:

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re

In [3]:
# read data
df = pd.read_csv('British_Airway_Analysis.csv')

In [43]:
df.head()

Unnamed: 0.1,Unnamed: 0,reviews,stars,created_at,country
0,0,Not Verified | Although all four flights were ...,5,26th August 2024,United Kingdom
1,1,✅ Trip Verified | Flight changed just one da...,9,26th August 2024,United Kingdom
2,2,Not Verified | Delayed flight 1 hour. Arbitrar...,1,25th August 2024,United Kingdom
3,3,Not Verified | Flown with BA four times. As an...,2,25th August 2024,United Kingdom
4,4,✅ Trip Verified | You may never see your ref...,10,22nd August 2024,United States


In [44]:
df['verified'] = df.reviews.str.contains("Trip Verified")

In [45]:
print(df['verified'])

0      False
1       True
2      False
3      False
4       True
       ...  
995     True
996     True
997     True
998     True
999     True
Name: verified, Length: 1000, dtype: bool


In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,reviews,stars,created_at,country
0,0,Not Verified | Although all four flights were ...,5,26th August 2024,United Kingdom
1,1,✅ Trip Verified | Flight changed just one da...,9,26th August 2024,United Kingdom
2,2,Not Verified | Delayed flight 1 hour. Arbitrar...,1,25th August 2024,United Kingdom
3,3,Not Verified | Flown with BA four times. As an...,2,25th August 2024,United Kingdom
4,4,✅ Trip Verified | You may never see your ref...,10,22nd August 2024,United States


In [5]:
df.dtypes

Unnamed: 0     int64
reviews       object
stars          int64
created_at    object
country       object
dtype: object

Cleaning Dates and formatting them

In [6]:
# convert the date to datetime format and treating missing dates as NaT

df['created_at'] = pd.to_datetime(df['created_at'], errors='coerce')


In [49]:
df.created_at

0     2024-08-26
1     2024-08-26
2     2024-08-25
3     2024-08-25
4            NaT
         ...    
995   2019-06-27
996   2019-06-26
997   2019-06-26
998   2019-06-24
999   2019-06-24
Name: created_at, Length: 1000, dtype: datetime64[ns]

In [7]:
df.head()

Unnamed: 0.1,Unnamed: 0,reviews,stars,created_at,country
0,0,Not Verified | Although all four flights were ...,5,2024-08-26,United Kingdom
1,1,✅ Trip Verified | Flight changed just one da...,9,2024-08-26,United Kingdom
2,2,Not Verified | Delayed flight 1 hour. Arbitrar...,1,2024-08-25,United Kingdom
3,3,Not Verified | Flown with BA four times. As an...,2,2024-08-25,United Kingdom
4,4,✅ Trip Verified | You may never see your ref...,10,NaT,United States


Cleaning ratings with stars

In [8]:
df['stars']

0       5
1       9
2       1
3       2
4      10
       ..
995     3
996     9
997     2
998    10
999     1
Name: stars, Length: 1000, dtype: int64

In [9]:
df.stars.unique()

array([ 5,  9,  1,  2, 10,  4,  8,  3,  6,  7], dtype=int64)

In [29]:
df.stars.value_counts()

stars
1     372
2     114
3      95
10     81
8      77
9      76
4      56
5      55
7      39
6      35
Name: count, dtype: int64

Check for null Values

In [10]:
df.isnull().value_counts()

Unnamed: 0  reviews  stars  created_at  country
False       False    False  False       False      794
                            True        False      206
Name: count, dtype: int64

In [33]:
df.country.isnull().value_counts()

country
False    1000
Name: count, dtype: int64

In [11]:
#drop the rows using index where the created date  value is null
df.drop(df[df.created_at.isnull() == True].index, axis=0,inplace=True)

In [12]:
df.shape

(794, 5)

In [56]:
df.head()

Unnamed: 0.1,Unnamed: 0,reviews,stars,created_at,country,verified
0,0,Not Verified | Although all four flights were ...,5,2024-08-26,United Kingdom,False
1,1,✅ Trip Verified | Flight changed just one da...,9,2024-08-26,United Kingdom,True
2,2,Not Verified | Delayed flight 1 hour. Arbitrar...,1,2024-08-25,United Kingdom,False
3,3,Not Verified | Flown with BA four times. As an...,2,2024-08-25,United Kingdom,False
7,7,Not Verified | The customer service is one of...,4,2024-08-18,United States,False


In [13]:
#resetting the index
df.reset_index(drop=True)

Unnamed: 0.1,Unnamed: 0,reviews,stars,created_at,country
0,0,Not Verified | Although all four flights were ...,5,2024-08-26,United Kingdom
1,1,✅ Trip Verified | Flight changed just one da...,9,2024-08-26,United Kingdom
2,2,Not Verified | Delayed flight 1 hour. Arbitrar...,1,2024-08-25,United Kingdom
3,3,Not Verified | Flown with BA four times. As an...,2,2024-08-25,United Kingdom
4,7,Not Verified | The customer service is one of...,4,2024-08-18,United States
...,...,...,...,...,...
789,995,✅ Trip Verified | London Heathrow to Larnaca....,3,2019-06-27,United Kingdom
790,996,✅ Trip Verified | BA 2616 and 2617 return tri...,9,2019-06-26,United Kingdom
791,997,✅ Trip Verified | Heathrow to Kalamata. Heath...,2,2019-06-26,United Kingdom
792,998,✅ Trip Verified | London to Gothenburg. Uniqu...,10,2019-06-24,United Kingdom


In [14]:
# remove unnamed column
df = df.drop(columns=['Unnamed: 0'])

In [60]:
df.shape

(794, 5)

In [15]:
df.head()

Unnamed: 0,reviews,stars,created_at,country
0,Not Verified | Although all four flights were ...,5,2024-08-26,United Kingdom
1,✅ Trip Verified | Flight changed just one da...,9,2024-08-26,United Kingdom
2,Not Verified | Delayed flight 1 hour. Arbitrar...,1,2024-08-25,United Kingdom
3,Not Verified | Flown with BA four times. As an...,2,2024-08-25,United Kingdom
7,Not Verified | The customer service is one of...,4,2024-08-18,United States


In [16]:
# export the cleaned data
df.to_csv("cleaned_British_Airway_reviews.csv")