# NAO Project: Linear Modelling in the Pooling Framework

## Package Imports

In [1]:
!pip install ftfy

Collecting ftfy
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Downloading ftfy-6.3.1-py3-none-any.whl (44 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/44.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ftfy
Successfully installed ftfy-6.3.1


In [2]:
!pip install mord

Collecting mord
  Downloading mord-0.7.tar.gz (8.6 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: mord
  Building wheel for mord (setup.py) ... [?25l[?25hdone
  Created wheel for mord: filename=mord-0.7-py3-none-any.whl size=9885 sha256=6ad868e35fa15777989b7364de4e25230059526d4709e74b1d14294f8cfea5bb
  Stored in directory: /root/.cache/pip/wheels/d1/fc/57/f2a2ad4ed0491ab6d5bb8642a90f1da9469397641e914743da
Successfully built mord
Installing collected packages: mord
Successfully installed mord-0.7


In [3]:
import kagglehub
from ftfy import fix_text
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import mord
import re
from textblob import TextBlob
import matplotlib.dates as mdates
import warnings
warnings.filterwarnings('ignore')

## Data Summary

We consider the [`skytrax-airline-reviews`](https://www.kaggle.com/datasets/efehandanisman/skytrax-airline-reviews) dataset from Kaggle. After downloading and cleaning it, we obtain the following data:

In [15]:
# Download the latest version of the Skytrax airline-reviews dataset.
path = kagglehub.dataset_download("efehandanisman/skytrax-airline-reviews")

# Full path to the Excel workbook inside the downloaded dataset directory.
excel_file = os.path.join(path, "capstone_airline_reviews3.xlsx")
reviews_raw = pd.read_excel(excel_file)

# Number of rows that repeat an earlier row. This count was previously computed
# and silently discarded; it is now reported so the reader can see it.
n_duplicate_rows = reviews_raw.duplicated(keep='first').sum()
print(f"Duplicate rows in raw data: {n_duplicate_rows}")

# Columns not carried into the cleaned table (including 'aircraft', which had 17k NaN values).
columns_to_drop = ['author', 'customer_review', 'route', 'review_date', 'date_flown', 'aircraft']

# The ftfy `fix_text` pass over 'customer_review' was removed: that column is
# dropped below, so the text repair never reached the saved data.
survey_df = (
    reviews_raw
    # keep=False removes EVERY member of a duplicated group, not just the repeats.
    # NOTE(review): the original comment labelled this step "Dropping the Empty rows";
    # presumably the blank rows are identical to each other and vanish here, but
    # genuinely duplicated reviews lose all of their copies too. Confirm this is
    # intended before switching to keep='first'.
    .drop_duplicates(keep=False)
    .reset_index(drop=True)
    # Keep only rows with at most 3 missing fields.
    .loc[lambda df: df.isna().sum(axis=1) <= 3]
    .drop(columns=columns_to_drop)
    # Encode 'recommended' as binary (1 for 'yes', 0 for 'no'); any other value becomes NaN.
    .assign(recommended=lambda df: df['recommended'].map({'yes': 1, 'no': 0}))
    # Drop every row that still has a missing value in the remaining columns.
    .dropna()
)

# Persist the cleaned table, then reload it so the rest of the notebook works
# from exactly what was saved to disk.
survey_df.to_csv('capstone_airline_reviews3_cleaned.csv', index=False)
survey_df = pd.read_csv("capstone_airline_reviews3_cleaned.csv", encoding="utf-8-sig")
survey_df.head()

Unnamed: 0,airline,overall,traveller_type,cabin,seat_comfort,cabin_service,food_bev,entertainment,ground_service,value_for_money,recommended
0,Turkish Airlines,7.0,Business,Economy Class,4.0,5.0,4.0,4.0,2.0,4.0,1
1,Turkish Airlines,2.0,Family Leisure,Economy Class,4.0,1.0,1.0,1.0,1.0,1.0,0
2,Turkish Airlines,3.0,Business,Economy Class,1.0,4.0,1.0,3.0,1.0,2.0,0
3,Turkish Airlines,10.0,Solo Leisure,Economy Class,4.0,5.0,5.0,5.0,5.0,5.0,1
4,Turkish Airlines,1.0,Solo Leisure,Economy Class,1.0,1.0,1.0,1.0,1.0,1.0,0
