In [None]:
#import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the training data from the Excel workbook and preview the first rows.
# `Price` is later read from this frame as the regression target.

df_train=pd.read_excel('./Flight_Ticket_Participant_Datasets/Data_Train.xlsx')
df_train.head()

In [None]:
# Load the test data from the Excel workbook and preview the first rows.
# Every feature-engineering step below is applied to this frame as well.

df_test=pd.read_excel('./Flight_Ticket_Participant_Datasets/Test_set.xlsx')
df_test.head()

In [None]:
# Column names, non-null counts and dtypes of the training frame.
df_train.info()

In [None]:
# Summary statistics of the training frame
# (count, mean, std, min, quartiles, max).
df_train.describe()

In [None]:
# Show the frame's shape together with the number of null values in each column.
df_train.shape,df_train.isnull().sum()

In [None]:
# Drop every row that contains at least one null value, then show the new shape.
# NOTE(review): the original note says only one record is affected - confirm
# against the null counts displayed just before this step.
df_train=df_train.dropna(axis=0)
df_train.shape

In [None]:
# Derive calendar features from Date_of_Journey (day/month/year strings):
#   Day_of_Travel   - day of the month
#   Month_of_Travel - month number
#   Weekday         - 1 for Monday-Friday, 0 for Saturday/Sunday
# The year is not extracted; the original note says it is the same for every row.

# Parse each frame once with an explicit day-first format. Letting pandas guess
# the format can read ambiguous dates (e.g. 01/03/2019) month-first, which would
# make the weekday flag disagree with the day and month columns.
journey_train = pd.to_datetime(df_train['Date_of_Journey'], format="%d/%m/%Y")
journey_test = pd.to_datetime(df_test['Date_of_Journey'], format="%d/%m/%Y")

df_train['Day_of_Travel'] = journey_train.dt.day
df_test['Day_of_Travel'] = journey_test.dt.day

df_train['Month_of_Travel'] = journey_train.dt.month
df_test['Month_of_Travel'] = journey_test.dt.month

# dayofweek is 0 (Monday) .. 6 (Sunday); values below 5 are working days.
df_train['Weekday'] = (journey_train.dt.dayofweek < 5).astype(int)
df_test['Weekday'] = (journey_test.dt.dayofweek < 5).astype(int)

# The raw date string is no longer needed once the features exist.
df_train = df_train.drop(columns=['Date_of_Journey'])
df_test = df_test.drop(columns=['Date_of_Journey'])

In [None]:
# Split the departure and arrival clock times into hour / minute features,
# then discard the raw text columns.

for frame in (df_train, df_test):
    # Parse each time column once and reuse the parsed values for both parts.
    departure = pd.to_datetime(frame['Dep_Time'])
    frame['Depart_Hour'] = departure.dt.hour
    frame['Depart_Minute'] = departure.dt.minute

    arrival = pd.to_datetime(frame['Arrival_Time'])
    frame['Arrival_Hour'] = arrival.dt.hour
    frame['Arrival_Minute'] = arrival.dt.minute

df_train = df_train.drop(columns=['Dep_Time', 'Arrival_Time'])
df_test = df_test.drop(columns=['Dep_Time', 'Arrival_Time'])

In [None]:
# Add two binary flags derived from the departure hour:
#   Night_Journey         - departure at 20:00 or later
#   Early_Morning_Journey - departure between 04:00 and 08:59
# The original note observes that early-morning fares tended to be lower and
# late-night fares higher, hence the dedicated flags.

df_train['Night_Journey'] = (df_train['Depart_Hour'] >= 20).astype(int)
df_train['Early_Morning_Journey'] = ((df_train['Depart_Hour'] >= 4) & (df_train['Depart_Hour'] <= 8)).astype(int)

# The test flags must come from the test set's own departure hours. Using the
# training frame here would align rows by index and attach unrelated values.
df_test['Night_Journey'] = (df_test['Depart_Hour'] >= 20).astype(int)
df_test['Early_Morning_Journey'] = ((df_test['Depart_Hour'] >= 4) & (df_test['Depart_Hour'] <= 8)).astype(int)

In [None]:
#Flight duration
#convert each duration string into separate hour and minute values
#the same function is used for both the training and the test set

def extract_hour_minute(duration):
    """Split duration strings such as '2h 50m', '19h' or '5m' into hours and minutes.

    Parameters
    ----------
    duration : iterable of str
        Duration texts made of whitespace-separated tokens, each an integer
        followed by 'h' (hours) or 'm' (minutes). A missing part counts as 0.

    Returns
    -------
    tuple of (list of int, list of int)
        ``(hour, minute)``; both lists have one entry per input element, in
        input order, so they can be assigned straight to DataFrame columns.

    Raises
    ------
    ValueError
        If an element is empty or contains a token that is not ``<int>h`` or
        ``<int>m``. Failing loudly keeps the two lists aligned with the input
        instead of silently skipping the row.
    """
    hour = []
    minute = []

    for time in duration:
        hours_part = 0
        minutes_part = 0

        # split() without an argument tolerates repeated or surrounding
        # whitespace, which split(' ') would turn into empty tokens.
        tokens = str(time).split()
        if not tokens:
            raise ValueError(f"Empty duration value: {time!r}")

        for token in tokens:
            number, unit = token[:-1], token[-1]
            if unit == 'h' and number.isdigit():
                hours_part = int(number)
            elif unit == 'm' and number.isdigit():
                minutes_part = int(number)
            else:
                raise ValueError(f"Unrecognised duration value: {time!r}")

        hour.append(hours_part)
        minute.append(minutes_part)

    return hour, minute

In [None]:
# Apply extract_hour_minute to both frames, store the two parts as
# Travel_Hour / Travel_Minute, and drop the raw Duration text.

train_hours, train_minutes = extract_hour_minute(df_train['Duration'])
df_train['Travel_Hour'] = train_hours
df_train['Travel_Minute'] = train_minutes

test_hours, test_minutes = extract_hour_minute(df_test['Duration'])
df_test['Travel_Hour'] = test_hours
df_test['Travel_Minute'] = test_minutes

df_train = df_train.drop(columns=['Duration'])
df_test = df_test.drop(columns=['Duration'])

In [None]:
# Count plots of the categorical columns in the training frame.

# Airline and Total_Stops: horizontal bars, side by side.
plt.figure(figsize=(10, 4))
for position, column in enumerate(['Airline', 'Total_Stops'], start=1):
    plt.subplot(1, 2, position)
    sns.countplot(y=df_train[column])
plt.show()

# Source and Destination: vertical bars with rotated tick labels.
plt.figure(figsize=(10, 4))
for position, column in enumerate(['Source', 'Destination'], start=1):
    plt.subplot(1, 2, position)
    sns.countplot(x=df_train[column])
    plt.xticks(rotation=45)
plt.show()

In [None]:
# Histograms of the training frame's columns to inspect how each is distributed.

df_train.hist(figsize=(12,9))
plt.show()

In [None]:
# Pair plot of Price against Travel_Hour (before any outlier removal).
sns.pairplot(data=df_train,vars=['Price','Travel_Hour'])

In [None]:
# Box plots of Price and Travel_Hour to look for outliers.
# `x` must be the column name itself: wrapping it in a list makes seaborn treat
# the list as the data to plot instead of looking the column up in `data`.
plt.subplot(2, 2, 1)
sns.boxplot(x='Price', data=df_train)
plt.subplot(2, 2, 2)
sns.boxplot(x='Travel_Hour', data=df_train)

In [None]:
# Price outlier check using the interquartile range (IQR).
Q1 = df_train['Price'].quantile(0.25)
Q3 = df_train['Price'].quantile(0.75)
IQR = Q3 - Q1

print(Q1)
print(Q3)
print(IQR)

# Remove rows whose Price lies more than 1.5 * IQR above Q3 or below Q1.
df_train = df_train[~((df_train['Price'] > Q3 + 1.5 * IQR) | (df_train['Price'] < Q1 - 1.5 * IQR))]

# Re-draw the box plot on the filtered data. `x` is the column name, not a
# one-element list, so seaborn looks the column up in `data`.
sns.boxplot(x='Price', data=df_train)

In [None]:
# Pair plot of Price against Travel_Hour again, now on the outlier-filtered frame.
sns.pairplot(data=df_train,vars=['Price','Travel_Hour'])

In [None]:
# Look for implausibly short flights: zero travel hours and under 50 minutes.
short_flight = (df_train['Travel_Hour'] == 0) & (df_train['Travel_Minute'] < 50)
df_train[short_flight]

# The original note reports a single such record (a five-minute trip from
# Mumbai to Hyderabad), which is not a possible flight time.

# Drop the flagged rows.
df_train = df_train[~short_flight]

# Re-check on the filtered frame; the mask is rebuilt because the index changed.
df_train[(df_train['Travel_Hour'] == 0) & (df_train['Travel_Minute'] < 50)]

# Data preprocessing is almost done at this point.
# What remains is encoding the categorical columns as integers.

In [None]:
# Column dtypes of the training frame; object columns still need encoding.
df_train.dtypes

In [None]:
# Column dtypes of the test frame; object columns still need encoding.
df_test.dtypes

In [None]:
# One-hot encode the Source and Destination cities in BOTH frames so that the
# test set ends up with the same feature layout the models are trained on.

city_columns = ['Source', 'Destination']

# get_dummies keeps the untouched columns first, then appends the Source_* and
# Destination_* indicator columns.
df_new = pd.get_dummies(df_train, columns=city_columns)
df_train = df_new

# Encode the test frame the same way, then align it to the training feature
# columns (everything except the Price target): a city that only occurs in
# training becomes an all-zero column, and one that only occurs in the test
# set is dropped because no model has a coefficient for it.
feature_columns = [column for column in df_train.columns if column != 'Price']
df_test = pd.get_dummies(df_test, columns=city_columns)
df_test = df_test.reindex(columns=feature_columns, fill_value=0)

df_train.head(5)

In [None]:
# Integer-encode the remaining text (object-dtype) columns of both frames.
# The original note lists Airline, Route, Total_Stops and Info as the expected
# columns; the code simply takes whatever still has object dtype.

# Masks selecting the object-dtype columns.
categorial_mask1 = df_train.dtypes == object
categorial_mask2 = df_test.dtypes == object

# Names of the categorical columns in each frame.
categorical_cols1 = df_train.columns[categorial_mask1].tolist()
categorical_cols2 = df_test.columns[categorial_mask2].tolist()

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le2 = LabelEncoder()

# A column that is categorical in both frames must use ONE encoder fitted on
# the union of their values. Fitting train and test separately can give the
# same category different integer codes in each frame.
shared_cols = [col for col in categorical_cols1 if col in categorical_cols2]
label_encoders = {}
for col in shared_cols:
    encoder = LabelEncoder()
    encoder.fit(pd.concat([df_train[col], df_test[col]], ignore_index=True))
    df_train[col] = encoder.transform(df_train[col])
    df_test[col] = encoder.transform(df_test[col])
    label_encoders[col] = encoder  # kept so codes can be mapped back to labels

# Columns that are categorical in only one frame have nothing to agree with,
# so they keep the original per-frame encoding.
train_only_cols = [col for col in categorical_cols1 if col not in shared_cols]
test_only_cols = [col for col in categorical_cols2 if col not in shared_cols]
if train_only_cols:
    df_train[train_only_cols] = df_train[train_only_cols].apply(lambda col: le.fit_transform(col))
if test_only_cols:
    df_test[test_only_cols] = df_test[test_only_cols].apply(lambda col: le2.fit_transform(col))

In [None]:
# Preview the training frame after encoding.
df_train.head(5)

In [None]:
# Separate the Price target from the features and hold out 20% of the rows
# for evaluation (fixed random_state so the split is repeatable).
from sklearn.model_selection import train_test_split

y = df_train['Price']
X = df_train.drop(columns=['Price'])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=10
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
#cross validation
from sklearn.model_selection import cross_val_score

#function declaration
def cross_validation(reg_model,X,y):
    """Print 10-fold cross-validated RMSE scores for a regressor.

    Calls ``cross_val_score`` with ``scoring='neg_mean_squared_error'`` and
    ``cv=10``, converts the negated MSE of each fold to RMSE, and prints the
    per-fold scores followed by their mean and standard deviation.

    Parameters
    ----------
    reg_model : estimator passed straight to ``cross_val_score``
    X : feature data
    y : target values

    Returns
    -------
    None
        Results are printed only.
    """
    neg_mse_per_fold = cross_val_score(
        reg_model, X, y, scoring='neg_mean_squared_error', cv=10
    )
    # The scorer returns negated MSE, so flip the sign before the square root.
    fold_rmse = np.sqrt(-neg_mse_per_fold)

    report = (
        ("\nScores ", fold_rmse),
        ("Mean ", fold_rmse.mean()),
        ("Standard Deviation ", fold_rmse.std()),
    )
    for label, value in report:
        print(label, value)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Linear regression: fit on the training split and predict the hold-out rows.
reg = LinearRegression().fit(X_train, y_train)
y_pred = reg.predict(X_test)

# Hold-out RMSE.
holdout_mse = mean_squared_error(y_test, y_pred)
print(np.sqrt(holdout_mse))

# 10-fold cross-validated RMSE on the training split.
cross_validation(reg, X_train, y_train)

# Actual vs. predicted prices for the first ten hold-out rows.
comparison = pd.DataFrame({'Price': y_test, 'Price Predicted': y_pred})
comparison.head(10)

In [None]:
from sklearn.svm import SVR

# Support vector regression with a linear kernel.
reg = SVR(kernel='linear').fit(X_train, y_train)
y_pred = reg.predict(X_test)

# Hold-out RMSE.
holdout_mse = mean_squared_error(y_test, y_pred)
print(np.sqrt(holdout_mse))

# 10-fold cross-validated RMSE on the training split.
cross_validation(reg, X_train, y_train)

# Actual vs. predicted prices for the first ten hold-out rows.
comparison = pd.DataFrame({'Price': y_test, 'Price Predicted': y_pred})
comparison.head(10)

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# k-nearest-neighbours regression using the 3 closest training rows.
reg = KNeighborsRegressor(n_neighbors=3).fit(X_train, y_train)
y_pred = reg.predict(X_test)

# Hold-out RMSE.
holdout_mse = mean_squared_error(y_test, y_pred)
print(np.sqrt(holdout_mse))

# 10-fold cross-validated RMSE on the training split.
cross_validation(reg, X_train, y_train)

# Actual vs. predicted prices for the first ten hold-out rows.
comparison = pd.DataFrame({'Price': y_test, 'Price Predicted': y_pred})
comparison.head(10)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with 100 trees. The forest is randomised, so it is seeded with
# the same value as the train/test split; otherwise the reported scores change
# on every run.
reg = RandomForestRegressor(n_estimators=100, random_state=10)
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)

# Hold-out RMSE.
print(np.sqrt(mean_squared_error(y_test, y_pred)))

# 10-fold cross-validated RMSE on the training split.
cross_validation(reg, X_train, y_train)

# Actual vs. predicted prices for the first ten hold-out rows.
pd.DataFrame({'Price': y_test, 'Price Predicted': y_pred}).head(10)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting with 5 stages and a learning rate of 1.
# `loss` is left at its default (squared error). The old spelling 'ls' was
# deprecated in scikit-learn 1.0 and removed in 1.2, so passing it raises an
# error on current releases; the default works on old and new versions alike.
reg = GradientBoostingRegressor(n_estimators=5, learning_rate=1)
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)

# Hold-out RMSE.
print(np.sqrt(mean_squared_error(y_test, y_pred)))

# 10-fold cross-validated RMSE on the training split.
cross_validation(reg, X_train, y_train)

# Actual vs. predicted prices for the first ten hold-out rows.
pd.DataFrame({'Price': y_test, 'Price Predicted': y_pred}).head(10)