# 1. Importing Libraries

In [5]:
import pandas as pd
import matplotlib.pyplot as plt
import joblib
import sklearn
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from feature_engine.datetime import DatetimeFeatures


# 2. Display Settings

In [6]:
pd.set_option("display.max_columns", None)

In [7]:
# Convert all the output of sklearn from numpy to pandas
sklearn.set_config(transform_output="pandas")

# 3. Getting the Data

In [8]:
train_df = pd.read_csv("data/train.csv")
val_df = pd.read_csv("data/val.csv")
test_df = pd.read_csv("data/test.csv")

In [9]:
train_df

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Jet Airways,2019-06-21,Mumbai,Hyderabad,10:20:00,11:50:00,90,0.0,In-flight meal not included,4995
1,Air India,2019-05-18,Delhi,Cochin,09:00:00,07:40:00,1360,1.0,No Info,8372
2,Air India,2019-06-12,Kolkata,Banglore,09:10:00,11:05:00,1555,2.0,No Info,6117
3,Vistara,2019-04-01,Kolkata,Banglore,20:20:00,22:55:00,1595,1.0,No Info,7770
4,Vistara,2019-06-06,Kolkata,Banglore,17:00:00,10:45:00,1065,1.0,No Info,9187
...,...,...,...,...,...,...,...,...,...,...
635,Air Asia,2019-04-12,Banglore,Delhi,04:55:00,07:45:00,170,0.0,No Info,4282
636,Jet Airways,2019-05-09,Kolkata,Banglore,09:35:00,21:05:00,690,1.0,No Info,13067
637,Indigo,2019-05-15,Banglore,Delhi,06:05:00,08:50:00,165,0.0,No Info,4423
638,Multiple Carriers,2019-05-15,Delhi,Cochin,08:45:00,21:00:00,735,1.0,No Info,7670


## 3.1 Split the data

In [10]:
X = train_df.loc[:, train_df.columns != 'price']
y = train_df[['price']]
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y, test_size=0.20, random_state=42)

In [11]:
X

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info
0,Jet Airways,2019-06-21,Mumbai,Hyderabad,10:20:00,11:50:00,90,0.0,In-flight meal not included
1,Air India,2019-05-18,Delhi,Cochin,09:00:00,07:40:00,1360,1.0,No Info
2,Air India,2019-06-12,Kolkata,Banglore,09:10:00,11:05:00,1555,2.0,No Info
3,Vistara,2019-04-01,Kolkata,Banglore,20:20:00,22:55:00,1595,1.0,No Info
4,Vistara,2019-06-06,Kolkata,Banglore,17:00:00,10:45:00,1065,1.0,No Info
...,...,...,...,...,...,...,...,...,...
635,Air Asia,2019-04-12,Banglore,Delhi,04:55:00,07:45:00,170,0.0,No Info
636,Jet Airways,2019-05-09,Kolkata,Banglore,09:35:00,21:05:00,690,1.0,No Info
637,Indigo,2019-05-15,Banglore,Delhi,06:05:00,08:50:00,165,0.0,No Info
638,Multiple Carriers,2019-05-15,Delhi,Cochin,08:45:00,21:00:00,735,1.0,No Info


In [12]:
X_val = val_df.loc[:, val_df.columns != 'price']
y_val = val_df[['price']]

In [13]:
X_test = test_df.loc[:, test_df.columns != 'price']
y_test = test_df[['price']]

## 3.2 Meta-info()

In [14]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 512 entries, 409 to 102
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   airline          512 non-null    object 
 1   date_of_journey  512 non-null    object 
 2   source           512 non-null    object 
 3   destination      512 non-null    object 
 4   dep_time         512 non-null    object 
 5   arrival_time     512 non-null    object 
 6   duration         512 non-null    int64  
 7   total_stops      512 non-null    float64
 8   additional_info  512 non-null    object 
dtypes: float64(1), int64(1), object(7)
memory usage: 40.0+ KB


# 4. Data Preprocessing


In [36]:
# Extracting the name of the number columns
num_cols = list(X_train.select_dtypes(include=np.number).columns)
# Extracting time columns
dt_cols = ["date_of_journey", "arrival_time", "dep_time"]
# Extracting categorical column
cat_cols = list(set(X_train.columns) - set(num_cols) - set(dt_cols))

In [37]:
num_cols

['duration', 'total_stops']

In [38]:
cat_cols

['destination', 'airline', 'source', 'additional_info']

In [39]:
# Creating a Pipeline for Categorical, Numerical Columns
num_transformer = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy="median")),
    ('scaler', StandardScaler())]
                   )

cat_transformer = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy="most_frequent")),
    ('encoder', OneHotEncoder(sparse_output=False))]
                   )
doj_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="most_frequent")),
    ("extractor", DatetimeFeatures(features_to_extract=["month", "week", "day_of_week", "day_of_month"], format = "mixed")),
    ('scaler', StandardScaler())]
                          )
time_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="most_frequent")),
    ("extractor", DatetimeFeatures(features_to_extract=["hour", "minute"], format = "mixed")),
    ('scaler', StandardScaler())]
                          )

In [34]:
cat_cols

['destination', 'airline', 'source', 'additional_info']

In [40]:
preprocessor = ColumnTransformer(transformers = [
    ("num", num_transformer, num_cols),
    ("cat", cat_transformer, cat_cols),
    ("doj", doj_transformer, ["date_of_journey"]),
    ("time", time_transformer, ["dep_time", "arrival_time"])
])

In [41]:
preprocessor.fit_transform(X_train)

Unnamed: 0,num__duration,num__total_stops,cat__destination_Banglore,cat__destination_Cochin,cat__destination_Delhi,cat__destination_Hyderabad,cat__destination_Kolkata,cat__destination_New Delhi,cat__airline_Air Asia,cat__airline_Air India,cat__airline_Goair,cat__airline_Indigo,cat__airline_Jet Airways,cat__airline_Multiple Carriers,cat__airline_Spicejet,cat__airline_Vistara,cat__source_Banglore,cat__source_Chennai,cat__source_Delhi,cat__source_Kolkata,cat__source_Mumbai,cat__additional_info_In-flight meal not included,cat__additional_info_No Info,cat__additional_info_No check-in baggage included,doj__date_of_journey_month,doj__date_of_journey_week,doj__date_of_journey_day_of_week,doj__date_of_journey_day_of_month,time__dep_time_hour,time__dep_time_minute,time__arrival_time_hour,time__arrival_time_minute
409,-0.981393,-1.228118,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,-0.553058,-0.406511,1.541962,0.966145,1.732450,1.503450,-1.860022,-0.295628
250,0.626632,0.305532,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.287196,0.164762,0.092480,-0.477433,-1.489152,1.776805,1.125342,-1.501694
429,0.754879,1.839181,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-1.393311,-1.168208,-1.357002,0.605250,0.837561,0.136677,-0.516608,0.608922
432,-0.981393,-1.228118,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.127450,1.307307,0.575641,0.966145,0.300627,0.136677,0.379001,1.814988
470,0.597036,0.305532,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,-1.393311,-1.358632,-0.873842,-0.116539,1.374495,0.683386,-0.218072,0.005889
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71,-0.320426,0.305532,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-1.393311,-1.739481,1.541962,-1.199223,0.479605,0.683386,1.423879,0.910439
106,1.652611,0.305532,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.127450,0.926459,0.092480,-0.838328,1.016539,-0.410032,0.826806,-1.501694
270,1.445442,1.839181,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.127450,1.497731,0.092480,1.687934,1.374495,-0.410032,0.826806,-0.597144
435,-0.576921,0.305532,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.287196,0.355186,-0.390681,0.244356,-0.952218,-1.230095,-0.218072,1.814988
