In [117]:
import pandas as pd
import numpy as np
from warnings import filterwarnings

# Apply One-Hot Encoding on Certain Columns

In [None]:
data_location = '/home/goldenmeta/Documents/GitHub/Projects/Airline-Ticket-Prediction/data/modified/cleaned_data.csv'
data = pd.read_csv(data_location)
data.head(3)

Unnamed: 0,Airline,Source,Destination,Route,Total_Stops,Additional_Info,Price,Date_of_Journey_Day,Date_of_Journey_Month,Date_of_Journey_Year,Dep_Time_Hour,Dep_Time_Minute,Arrival_Time_Hour,Arrival_Time_Minute,Duration_Hour,Duration_Minute
0,IndiGo,Banglore,New Delhi,BLR → DEL,non-stop,No info,3897,24,3,2019,22,20,1,10,2,50
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,2 stops,No info,7662,1,5,2019,5,50,13,15,7,25
2,Jet Airways,Delhi,Cochin,DEL → LKO → BOM → COK,2 stops,No info,13882,9,6,2019,9,25,4,25,19,0


Updating all new delhi with delhi values (as they are historical the same)

In [119]:
data['Destination'].replace('New Delhi','Delhi',inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Destination'].replace('New Delhi','Delhi',inplace=True)


We can apply one hot encoding on which are based on categories, or have common chances of appearing repetitively. In most cases it would be string based values (which are set categories). In this case the following columns will be used one-hot encoded.
* Airline
* Source
* Destination
* Route
* Total_Stops
* Additional_info


Target Guided (assigning each value in the column with a set category instead of one hot encoding.) Furthermore, it differs from label encoding as it encoded numerical values are chosen from the average price column and then sorted creating it's numerical value. 

In [120]:
def target_guided_encoding(data: pd.DataFrame, column: str): 
    sorted_mean_column = data.groupby([column])['Price'].mean().sort_values().index
    key_pair_column = {key: index for index, key in enumerate(sorted_mean_column, 0)}
    data[column] = data[column].map(key_pair_column)

target_guided_encoding(data, 'Airline')
target_guided_encoding(data, 'Source')
target_guided_encoding(data, 'Destination')

Label Encoding using unique labels in the column, start from the first instance of the unique value with 0 and then encode progressively if seen another instances in the same row. else create a new number (normally ordinally)

In [121]:
def label_encoding(data, column):
    unique_values = list(data[column].unique())
    mapping = {category: i for i, category in enumerate(unique_values,0)}
    data[column] = data[column].map(mapping)

label_encoding(data, 'Total_Stops')
label_encoding(data, 'Additional_Info')

Due to the route containing too many unique values, I will not be encoding it and removing the column entirely. 

In [122]:
data.drop(columns=["Route"], axis=1, inplace=True)

In [123]:
data

Unnamed: 0,Airline,Source,Destination,Total_Stops,Additional_Info,Price,Date_of_Journey_Day,Date_of_Journey_Month,Date_of_Journey_Year,Dep_Time_Hour,Dep_Time_Minute,Arrival_Time_Hour,Arrival_Time_Minute,Duration_Hour,Duration_Minute
0,3,2,2,0,0,3897,24,3,2019,22,20,1,10,2,50
1,7,3,3,1,0,7662,1,5,2019,5,50,13,15,7,25
2,10,4,4,1,0,13882,9,6,2019,9,25,4,25,19,0
3,3,3,3,2,0,6218,12,5,2019,18,5,23,30,5,25
4,3,2,2,2,0,13302,1,3,2019,16,50,21,35,4,45
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10677,2,3,3,0,0,4107,9,4,2019,19,55,22,25,2,30
10678,7,3,3,0,0,4145,27,4,2019,20,45,23,20,2,35
10679,10,2,2,0,0,7229,27,4,2019,8,20,11,20,3,0
10680,5,2,2,0,0,12648,1,3,2019,11,30,14,10,2,40
