# Data Cleaning

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re

Create a dataframe from csv file

In [4]:
cwd = os.getcwd()
df = pd.read_csv(cwd+"/BA_reviews.csv", index_col = 0)

In [5]:
df.head()

Unnamed: 0,reviews,stars,date,country
0,✅ Trip Verified | We chose Rotterdam and Lond...,5,12th April 2024,(Netherlands)
1,✅ Trip Verified | The entire booking and ticke...,6,10th April 2024,(United States)
2,Not Verified | British Airways cancelled my ...,1,10th April 2024,(United States)
3,Not Verified | I wanted to write this review s...,1,7th April 2024,(United States)
4,✅ Trip Verified | Check in fast and friendly....,10,3rd April 2024,(United Kingdom)


Create a column to check if the user is verified or not

In [7]:
df['verified'] = df.reviews.str.contains("Trip Verified")
df['verified']

0       True
1       True
2      False
3      False
4       True
       ...  
345     True
346     True
347     True
348    False
349     True
Name: verified, Length: 350, dtype: bool

Cleaning Reviews (NLP)

In [21]:
!pip install nltk
import nltk
nltk.download('omw-1.4')





[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\FAZIL\AppData\Roaming\nltk_data...
[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\FAZIL\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\abc.zip.
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     C:\Users\FAZIL\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     C:\Users\FAZIL\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping taggers\averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     C:\Users\FAZIL\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers\averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     C:\Users\FAZIL

KeyboardInterrupt: 

In [25]:
#for lemmatization of words we will use nltk library

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

nltk.download('stopwords')
nltk.download('wordnet')
lemma = WordNetLemmatizer()


reviews_data = df.reviews.str.strip("✅ Trip Verified |")

#create an empty list to collect cleaned data corpus
corpus =[]

#loop through each review, remove punctuations, small case it, join it and add it to corpus
for rev in reviews_data:
    rev = re.sub('[^a-zA-Z]',' ', rev)
    rev = rev.lower()
    rev = rev.split()
    rev = [lemma.lemmatize(word) for word in rev if word not in set(stopwords.words("english"))]
    rev = " ".join(rev)
    corpus.append(rev)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\FAZIL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\FAZIL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [23]:
df['corpus'] = corpus

In [24]:
df.head()

Unnamed: 0,reviews,stars,date,country,verified,corpus
0,✅ Trip Verified | We chose Rotterdam and Lond...,5,12th April 2024,(Netherlands),True,chose rotterdam london city airport convenienc...
1,✅ Trip Verified | The entire booking and ticke...,6,10th April 2024,(United States),True,entire booking ticketing experience stressful ...
2,Not Verified | British Airways cancelled my ...,1,10th April 2024,(United States),False,verified british airway cancelled flight le ho...
3,Not Verified | I wanted to write this review s...,1,7th April 2024,(United States),False,verified wanted write review could give huge t...
4,✅ Trip Verified | Check in fast and friendly....,10,3rd April 2024,(United Kingdom),True,check fast friendly security breeze lounge bus...


Cleaning/Format Date

In [26]:
df.dtypes

reviews     object
stars        int64
date        object
country     object
verified      bool
corpus      object
dtype: object

Convert the date to datetome format

In [27]:
df.date = pd.to_datetime(df.date)

In [28]:
df.date.head()

0   2024-04-12
1   2024-04-10
2   2024-04-10
3   2024-04-07
4   2024-04-03
Name: date, dtype: datetime64[ns]

Cleaning the ratings with stars

In [31]:
df.stars.unique()

array([ 5,  6,  1, 10,  7,  3,  4,  8], dtype=int64)

In [33]:
df.stars.value_counts()

1     95
8     63
5     32
6     32
10    32
7     32
3     32
4     32
Name: stars, dtype: int64

Dropping the None ratings

In [34]:
df.drop(df[df.stars == "None"].index, axis = 0, inplace = True)

In [35]:
df.stars.unique()

array([ 5,  6,  1, 10,  7,  3,  4,  8], dtype=int64)

Check for Null Values

In [36]:
df.isnull().value_counts()

reviews  stars  date   country  verified  corpus
False    False  False  False    False     False     350
dtype: int64

In [37]:
df.shape

(350, 6)

Resetting the Index

In [38]:
df.reset_index(drop = True)

Unnamed: 0,reviews,stars,date,country,verified,corpus
0,✅ Trip Verified | We chose Rotterdam and Lond...,5,2024-04-12,(Netherlands),True,chose rotterdam london city airport convenienc...
1,✅ Trip Verified | The entire booking and ticke...,6,2024-04-10,(United States),True,entire booking ticketing experience stressful ...
2,Not Verified | British Airways cancelled my ...,1,2024-04-10,(United States),False,verified british airway cancelled flight le ho...
3,Not Verified | I wanted to write this review s...,1,2024-04-07,(United States),False,verified wanted write review could give huge t...
4,✅ Trip Verified | Check in fast and friendly....,10,2024-04-03,(United Kingdom),True,check fast friendly security breeze lounge bus...
...,...,...,...,...,...,...
345,✅ Trip Verified | This is the first time I ha...,10,2024-04-02,(United Kingdom),True,first time seen new club world suite seat comf...
346,✅ Trip Verified | Flew business class from Do...,7,2024-04-02,(United Kingdom),True,flew business class doha london st march servi...
347,✅ Trip Verified | Starting off at Heathrow Te...,3,2024-03-28,(United Kingdom),True,starting heathrow terminal check fairly easy f...
348,Not Verified | We have flown this route with ...,4,2024-03-28,(United Kingdom),False,verified flown route easyjet regularly twenty ...


Exporting the clean data

In [39]:
df.to_csv(cwd + "/cleaned-BA-reviews.csv")