In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from pathlib import Path
from sklearn.preprocessing import StandardScaler


In [None]:
# Read the original air-passenger CSV from the `datasets` directory located one
# level above the current working directory.
raw_data_path = Path.cwd() / '..' / 'datasets' / 'air_passenger_original.csv'
raw_df = pd.read_csv(raw_data_path)

In [None]:
# Dimensions of the freshly loaded frame as a (rows, columns) tuple.
raw_df.shape

In [None]:
# Column names, non-null counts and dtypes of the raw data.
raw_df.info()

In [None]:
# Remove columns that do not add anything to the learning process of the model
# (the exported row counter 'Unnamed: 0' and the 'id' column).
#
# The frame is rebound instead of mutated with inplace=True, and
# errors='ignore' makes the cell idempotent: re-running it after the columns
# are already gone is a no-op rather than a KeyError.
raw_df = raw_df.drop(columns=['Unnamed: 0', 'id'], errors='ignore')

In [None]:
# Re-inspect columns and dtypes after dropping 'Unnamed: 0' and 'id'.
raw_df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
raw_df.describe()

In [None]:
# Separate the target from the predictors. The label is selected with a list so
# that it stays a one-column DataFrame rather than becoming a Series.
raw_df_labels = raw_df.loc[:, ["satisfaction"]]
raw_df_features = raw_df.drop("satisfaction", axis=1)

# Show both shapes as a quick sanity check.
(raw_df_features.shape, raw_df_labels.shape)

In [None]:
# Apply stratified sampling to lower the number of rows to n samples: the
# "test" side of the split is the 50,000-row sample that the rest of the
# notebook preprocesses and exports. Stratifying on the label keeps the class
# proportions of 'satisfaction' in the sample.
X_train, X_test, y_train, y_test = train_test_split(
    raw_df_features,
    raw_df_labels,
    test_size=50000,
    random_state=42,  # fixed seed so the same rows are drawn on every run
    stratify=raw_df_labels.to_numpy(),
)

# Later cells assign columns on X_test / y_test in place. Take explicit copies so
# those writes target independent frames instead of objects pandas may still
# track as slices of the source data (which can raise SettingWithCopyWarning).
X_test = X_test.copy()
y_test = y_test.copy()

X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# Columns that are one-hot encoded further down (list order is preserved
# because it determines the order of the generated dummy columns).
categorical_features = [
    'Class',
    'Inflight wifi service', 'Departure/Arrival time convenient',
    'Ease of Online booking', 'Gate location',
    'Food and drink', 'Online boarding',
    'Seat comfort', 'Inflight entertainment',
    'On-board service', 'Leg room service',
    'Baggage handling', 'Checkin service',
    'Inflight service', 'Cleanliness',
]

# Columns that are label-encoded further down.
binary_features = ['Gender', 'Customer Type', 'Type of Travel']

# Every remaining column is treated as numerical; the target name is excluded
# as well in case it is present among the columns.
non_numerical = [*categorical_features, *binary_features, 'satisfaction']
numerical_features = X_test.columns.difference(non_numerical)
numerical_features

In [None]:
# Number of missing values in each column of the sample.
X_test.isna().sum()

In [None]:
# Fill gaps in 'Arrival Delay in Minutes' with the column's most frequent value
# (the first entry returned by .mode()).
arrival_delay = X_test['Arrival Delay in Minutes']
mode_value = arrival_delay.mode().iloc[0]
X_test['Arrival Delay in Minutes'] = arrival_delay.fillna(mode_value)

In [None]:
# Total number of missing values left across all columns after the imputation.
X_test.isna().sum().sum()

In [None]:
# Row counts per distinct label value in the sample (before encoding).
y_test.value_counts()

In [None]:
# Encode the target as integers: 'neutral or dissatisfied' -> 0, 'satisfied' -> 1.
satisfaction_codes = {'neutral or dissatisfied': 0, 'satisfied': 1}

# Values that are not keys of the mapping pass through unchanged. This keeps the
# cell idempotent: a plain dict .map() would turn an already-encoded column
# into all-NaN when the cell is executed a second time.
encoded_satisfaction = y_test['satisfaction'].map(
    lambda label: satisfaction_codes.get(label, label)
)

# Fail loudly on unknown or missing labels instead of silently exporting NaN.
unexpected_labels = encoded_satisfaction[~encoded_satisfaction.isin([0, 1])]
if not unexpected_labels.empty:
    raise ValueError(
        f"Unexpected satisfaction labels: {unexpected_labels.unique().tolist()}"
    )

y_test['satisfaction'] = encoded_satisfaction.astype('int64')
y_test['satisfaction'].value_counts()

In [None]:
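# NOTE: disabled alternative, kept for reference only. It would standard-scale the
# numerical features of the full raw_df with a single scaler; scaling is instead
# applied to the sampled X_test in the next cell.
# scaler = StandardScaler()
# raw_df[numerical_features] = scaler.fit_transform(raw_df[numerical_features])

# raw_df.head()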

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the numerical columns one at a time, each with its own freshly
# fitted scaler.
for col in numerical_features:
    _scaler = StandardScaler()
    # The scaler expects 2-D input, so select the column as an (n_rows, 1) array.
    column_2d = X_test[[col]].to_numpy()
    X_test[col] = _scaler.fit(column_2d).transform(column_2d)

In [None]:
# Label-encode the columns listed in `binary_features` into integer codes.
# Columns are visited in X_test column order; names from `binary_features` that
# are not present in the frame are skipped.
for col in [name for name in X_test.columns.tolist() if name in binary_features]:
    _encoder = LabelEncoder()
    # fit_transform learns the classes and encodes in a single pass, so no
    # separate fit() call is needed beforehand.
    encoded_values = _encoder.fit_transform(X_test[col])

    # Some logging (classes_ already holds each distinct label exactly once)
    print(f"Number Unique Classes of LabelEncoding in column {col}: {len(_encoder.classes_)}")

    X_test[col] = encoded_values

In [None]:
# One-hot encode the categorical columns. Each generated column is named
# "<original column>__<category value>".
X_test = pd.get_dummies(
    data=X_test,
    columns=categorical_features,
    prefix_sep='__',
)

In [None]:
# Display the feature frame after scaling, label encoding and one-hot encoding.
X_test

In [None]:
# Re-attach the encoded target to the processed features, side by side
# (rows are aligned on the index).
merged_df = pd.concat(objs=[X_test, y_test], axis='columns')
merged_df

In [None]:
# Print the final column names as a quoted bullet list, one per line.
for column_name in merged_df.columns:
    print(f'- "{column_name}"')

In [None]:
# Column names, non-null counts and dtypes of the processed feature frame.
X_test.info()

In [None]:
# Row counts per value of the 'satisfaction' column; the first positional
# argument of DataFrame.value_counts is `subset`, spelled out here.
y_test.value_counts(subset='satisfaction')

In [None]:
# Write features plus target to the `datasets` directory one level above the
# current working directory, without the index column.
preprocessed_path = Path.cwd() / '..' / 'datasets' / 'air_passenger_preprocessed.csv'
merged_df.to_csv(preprocessed_path, index=False)