In [1]:
"""......................................................................................................"""
# This library is used for structured data operations, like import CSV files, create data frames, and data preparation
import pandas as pd
#This is a mathematical library. Has a powerful N-dimensional array object, linear algebra, Fourier transform, etc.
import numpy as np
# SimpleImputer from sklearn.impute fills in missing values in a dataset using a chosen strategy.
from sklearn.impute import SimpleImputer
import warnings
# Silence every warning category so library notices do not clutter cell output.
# The blanket "ignore" filter already covers DeprecationWarning, so no separate
# category-specific filter is required.
warnings.filterwarnings("ignore")


# Lift pandas' display limits so printed frames and sequences are never truncated.
# Option names are written out in full: pandas resolves abbreviated names by
# pattern matching, which can stop working once a similarly named option exists.
pd.set_option('display.max_columns', None)  # No column limit
pd.set_option('display.max_rows', None)  # No row limit
pd.set_option('display.max_seq_items', None)  # No item truncation for sequences (like lists)



In [2]:
# Read the reviews CSV into a DataFrame
file_path = '/kaggle/input/british-airways-reviews/british_airways_reviews.csv'  # Update this with the correct file path
df = pd.read_csv(file_path)

# Report per-column null counts and the frame's dimensions before any rows are removed
null_counts = df.isna().sum()
print("Missing values in data frame:\n", null_counts)
print("shape of  Actual data")
print(df.shape)

# Discard every row that has a null in any of the columns listed here
required_columns = ['Route', 'Date Flown', 'Type of Traveller', 'Seat Type', 'Trip Status']
df = df.dropna(subset=required_columns)

# Dimensions again, so the number of removed rows can be read off
print("shape of data after applying dropna")
print(df.shape)



Missing values in data frame:
 User Name                           0
Overall Rating                      5
Review                              0
Trip Status                      1523
Detail Review                       0
Type of Traveller                 771
Seat Type                           2
Route                             776
Date Flown                        778
Seat Comfort Rating               127
Staff Service Rating              142
Food & Beverages Rating           438
Inflight Entertainment Rating    1241
Ground Service Rating             850
Value For Money Rating              0
Recommended                         0
dtype: int64
shape of  Actual data
(3920, 16)
shape of data after applying dropna
(2389, 16)


In [3]:
print("Missing values before imputation:\n", df.isnull().sum())

# Rating columns whose missing entries are filled in below
columns_to_impute = ['Seat Comfort Rating', 'Staff Service Rating', 'Food & Beverages Rating',
                     'Inflight Entertainment Rating', 'Ground Service Rating']

# Most-frequent (mode) imputation of NaN entries
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

# Impute within each 'Type of Traveller' group separately, so every group is
# filled with its own most frequent rating rather than a dataset-wide one
for traveller_type in df['Type of Traveller'].unique():
    # Row mask for the current traveller type (built once, reused for read and write)
    group_mask = df['Type of Traveller'] == traveller_type
    subset = df.loc[group_mask]

    # SimpleImputer discards columns that have no observed value at fit time,
    # which would make its output narrower than the assignment target and
    # raise a shape error. Only impute columns with at least one observed
    # rating in this group; columns with nothing to learn from stay as they are.
    fillable_columns = [column for column in columns_to_impute
                        if subset[column].notna().any()]
    if not fillable_columns:
        continue

    # Write the imputed values back onto exactly the rows/columns they came from
    df.loc[group_mask, fillable_columns] = imputer.fit_transform(
        subset[fillable_columns])

print("Missing values after imputation:\n", df.isnull().sum())





Missing values before imputation:
 User Name                          0
Overall Rating                     0
Review                             0
Trip Status                        0
Detail Review                      0
Type of Traveller                  0
Seat Type                          0
Route                              0
Date Flown                         0
Seat Comfort Rating              114
Staff Service Rating             129
Food & Beverages Rating          398
Inflight Entertainment Rating    987
Ground Service Rating             66
Value For Money Rating             0
Recommended                        0
dtype: int64
Missing values after imputation:
 User Name                        0
Overall Rating                   0
Review                           0
Trip Status                      0
Detail Review                    0
Type of Traveller                0
Seat Type                        0
Route                            0
Date Flown                       0
Seat Comfor

In [4]:
# describe() returns summary statistics such as count, mean, standard deviation, minimum, quartiles and maximum.
print("Descriptive Statistics of the Data Frame:","\n",df.describe())

# info() writes its summary (column names, non-null counts, dtypes) straight to
# stdout and returns None, so it is called directly; wrapping it in print()
# would append a stray "None" line to the output.
print("Generate the summary of the Data Frame:")
df.info()

# df.shape is an attribute (not a method) holding the (rows, columns) tuple
print("Total no of rows and columns Data Frame:",df.shape)



Descriptive Statistics of the Data Frame: 
        Overall Rating  Seat Comfort Rating  Staff Service Rating  \
count     2389.000000          2389.000000           2389.000000   
mean         4.116367             2.656342              3.114692   
std          3.114082             1.350423              1.536351   
min          1.000000             1.000000              1.000000   
25%          1.000000             1.000000              2.000000   
50%          3.000000             3.000000              3.000000   
75%          7.000000             4.000000              5.000000   
max         10.000000             5.000000              5.000000   

       Food & Beverages Rating  Inflight Entertainment Rating  \
count              2389.000000                    2389.000000   
mean                  2.245291                       1.955630   
std                   1.422519                       1.334897   
min                   1.000000                       1.000000   
25%               

In [5]:
# Frequency table (distinct value -> number of rows) for each column of interest
seat_type_counts, route_counts, traveller_type_counts = (
    df[column_name].value_counts()
    for column_name in ('Seat Type', 'Route', 'Type of Traveller')
)

# Print every table under its own heading, in the order they were computed
for heading_text, count_table in (
    ("Unique values and their counts for 'Seat Type':\n", seat_type_counts),
    ("\nUnique values and their counts for 'Route':\n", route_counts),
    ("\nUnique values and their counts for 'Type of Traveller':\n", traveller_type_counts),
):
    print(heading_text, count_table)

Unique values and their counts for 'Seat Type':
 Seat Type
Economy Class      1323
Business Class      739
Premium Economy     226
First Class         101
Name: count, dtype: int64

Unique values and their counts for 'Route':
 Route
London to Johannesburg                            19
Johannesburg to London                            17
Vancouver to London                               15
London to Hong Kong                               15
London to Cape Town                               13
Singapore to London                               11
London to Athens                                  11
Hong Kong to London                               11
Cape Town to London                               11
London to Boston                                  10
Los Angeles to London                             10
Toronto to London                                 10
San Francisco to London                           10
London to Istanbul                                 9
London to Dubai          