In [1]:
# Banner cell: prints the project title and the two authors with their IDs.
print("DMBI Project\nFlight Fare Prediction System\nParth Tanna-191310132010\nMahesh Santoki-191310132006")

In [2]:
#Importing Libraries
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score
from math import sqrt
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from prettytable import PrettyTable

In [3]:
#Reading the Training data of our dataset

In [4]:
# Load the training CSV and preview its first ten rows.
train_df = pd.read_csv(filepath_or_buffer="../input/flight-fare/Data_Train.csv")
train_df.head(n=10)

In [5]:
# EDA (exploratory data analysis)

# Show the column labels of the training frame.
train_df.columns

In [6]:
# Print the frame summary (column dtypes, non-null counts, memory usage).
train_df.info()

In [7]:
# Descriptive statistics for the training frame (pandas defaults).
train_df.describe()

In [8]:
# Boolean missing-value mask, first five rows only.
train_df.isna().head(n=5)

In [9]:
# Missing-value count for each column of the training frame.
train_df.isna().sum(axis=0)

In [10]:
# Remove, in place, every training row that has at least one missing value.
train_df.dropna(axis=0, how="any", inplace=True)

In [11]:
# Preview rows that repeat an earlier row exactly (the first occurrence is not flagged).
train_df.loc[train_df.duplicated(keep="first")].head(n=5)

In [12]:
# Drop exact duplicate rows in place, keeping the first occurrence, then preview.
train_df.drop_duplicates(inplace=True, keep="first")
train_df.head(n=5)

In [13]:
# (rows, columns) of the training frame after the null/duplicate removal above.
train_df.shape

In [14]:
# Frequency of each distinct Additional_Info value, most common first.
train_df.loc[:, "Additional_Info"].value_counts(sort=True, ascending=False)

In [15]:
# Distinct airline names present in the training data, in order of appearance.
pd.unique(train_df["Airline"])

In [16]:
# Distinct route strings present in the training data, in order of appearance.
pd.unique(train_df["Route"])

In [17]:
# Load the test CSV and preview its first ten rows.
test_df = pd.read_csv(filepath_or_buffer="../input/flight-fare/Test_set.csv")
test_df.head(n=10)

In [18]:
# Show the column labels of the test frame.
test_df.columns

In [19]:
# Print the test frame summary (column dtypes, non-null counts, memory usage).
test_df.info()

In [20]:
# Descriptive statistics for the test frame (pandas defaults).
test_df.describe()

In [21]:
# Missing-value count for each column of the test frame.
test_df.isna().sum(axis=0)

In [22]:
# Data visualisation

# Price distribution per airline (boxen plot), rows ordered by descending price.
sns.catplot(
    data=train_df.sort_values(by="Price", ascending=False),
    x="Airline",
    y="Price",
    kind="boxen",
    height=8,
    aspect=3,
)
plt.show()

In [23]:
# Price distribution per source city (violin plot), rows ordered by descending price.
sns.catplot(
    data=train_df.sort_values(by="Price", ascending=False),
    x="Source",
    y="Price",
    kind="violin",
    height=4,
    aspect=3,
)
plt.show()

In [24]:
# Price distribution per destination city (box plot), rows ordered by descending price.
sns.catplot(
    data=train_df.sort_values(by="Price", ascending=False),
    x="Destination",
    y="Price",
    kind="box",
    height=4,
    aspect=3,
)
plt.show()

In [25]:
# Preview of the training frame after null/duplicate removal
# (feature engineering has not been applied yet at this point).
train_df.head()

In [26]:
# Convert the textual Duration column ("2h 50m", "19h", "5m") to total minutes.
# The values are parsed with a regular expression instead of being rewritten
# into an arithmetic expression and passed to eval(): eval on text read from a
# CSV executes arbitrary code and gives opaque SyntaxErrors on bad rows.
_DURATION_PATTERN = r"^\s*(?:(?P<hours>\d+)\s*h)?\s*(?:(?P<minutes>\d+)\s*m)?\s*$"


def _duration_to_minutes(duration):
    """Return ``duration`` expressed as whole minutes (int64 Series).

    Parameters
    ----------
    duration : pandas.Series
        Strings of the form "<H>h <M>m", "<H>h" or "<M>m". A Series that is
        already numeric is returned unchanged so the cell can be re-run safely.

    Raises
    ------
    ValueError
        If any value (including a missing one) matches none of the forms above.
    """
    if pd.api.types.is_numeric_dtype(duration):
        return duration
    parts = duration.str.extract(_DURATION_PATTERN)
    # A row with neither an hour nor a minute component was not understood.
    unparsed = parts.isna().all(axis=1)
    if unparsed.any():
        examples = duration[unparsed].unique()[:5].tolist()
        raise ValueError(f"Unrecognised Duration value(s): {examples}")
    hours = pd.to_numeric(parts["hours"]).fillna(0)
    minutes = pd.to_numeric(parts["minutes"]).fillna(0)
    return (hours * 60 + minutes).astype("int64")


train_df['Duration'] = _duration_to_minutes(train_df['Duration'])
test_df['Duration'] = _duration_to_minutes(test_df['Duration'])

In [27]:
# Split Date_of_Journey on "/" into numeric features: the first field becomes
# Journey_day and the second Journey_month; the raw text column is then dropped.
# The same transformation is applied to test_df so that the test features stay
# aligned with the training features (Duration is already handled for both).
for _frame in (train_df, test_df):
    if "Date_of_Journey" not in _frame.columns:
        continue  # already converted (cell re-run)
    _date_parts = _frame["Date_of_Journey"].str.split("/", expand=True)
    _frame["Journey_day"] = _date_parts[0].astype(int)
    _frame["Journey_month"] = _date_parts[1].astype(int)
    _frame.drop(columns=["Date_of_Journey"], inplace=True)

In [28]:
# Replace Dep_Time with two numeric features, Dep_hour and Dep_min.
# The column is parsed once per frame (not once per derived feature), and the
# same transformation is applied to test_df so that the test features stay
# aligned with the training features.
for _frame in (train_df, test_df):
    if "Dep_Time" not in _frame.columns:
        continue  # already converted (cell re-run)
    _departure = pd.to_datetime(_frame["Dep_Time"])
    _frame["Dep_hour"] = _departure.dt.hour
    _frame["Dep_min"] = _departure.dt.minute
    _frame.drop(columns=["Dep_Time"], inplace=True)

In [29]:
# Replace Arrival_Time with two numeric features, Arrival_hour and Arrival_min.
# The column is parsed once per frame (not once per derived feature), and the
# same transformation is applied to test_df so that the test features stay
# aligned with the training features.
# NOTE(review): if Arrival_Time mixes several textual layouts, recent pandas
# versions may need an explicit `format=` argument here - confirm against the data.
for _frame in (train_df, test_df):
    if "Arrival_Time" not in _frame.columns:
        continue  # already converted (cell re-run)
    _arrival = pd.to_datetime(_frame["Arrival_Time"])
    _frame["Arrival_hour"] = _arrival.dt.hour
    _frame["Arrival_min"] = _arrival.dt.minute
    _frame.drop(columns=["Arrival_Time"], inplace=True)

In [30]:
# Preview the training frame after the duration/date/time feature engineering.
train_df.head()

In [31]:
# Bar chart: number of flights in each journey month, with the count written above each bar.
plt.figure(figsize=(10, 5))
ax = sns.countplot(data=train_df, x='Journey_month')
ax.set_title('Count of flights month wise')
ax.set_xlabel('Month')
ax.set_ylabel('Count of flights')
for bar in ax.patches:
    bar_height = bar.get_height()
    # Place the label slightly right of the bar's left edge and just above its top.
    ax.annotate(int(bar_height), (bar.get_x() + 0.25, bar_height + 1), va='bottom', color='black')

In [32]:
# Bar chart: number of flights per airline, with the count written above each bar.
plt.figure(figsize=(20, 5))
ax = sns.countplot(data=train_df, x='Airline')
ax.set_title('Count of flights with different Airlines')
ax.set_xlabel('Airline')
ax.set_ylabel('Count of flights')
plt.xticks(rotation=45)
for bar in ax.patches:
    bar_height = bar.get_height()
    # Place the label slightly right of the bar's left edge and just above its top.
    ax.annotate(int(bar_height), (bar.get_x() + 0.25, bar_height + 1), va='bottom', color='black')

In [33]:
# Scatter plot of ticket price against airline.
# The stray bare `plt.xticks` expression (a no-op attribute lookup) was removed,
# and the cell now ends with plt.show() so the tick objects returned by
# plt.xticks(...) are not echoed as cell output.
plt.figure(figsize = (15,4))
plt.title('Price VS Airlines')
plt.scatter(train_df['Airline'], train_df['Price'])
plt.xlabel('Airline')
plt.ylabel('Price of ticket')
plt.xticks(rotation = 90)  # airline names are long; rotate so they do not overlap
plt.show()

In [34]:
# Correlation heatmap of the numeric columns.
# train_df still contains text columns at this point; DataFrame.corr() on such
# a frame raises in pandas >= 2.0, so the numeric columns are selected
# explicitly (this works on every pandas version).
plt.figure(figsize = (15,15))
sns.heatmap(train_df.select_dtypes(include="number").corr(), annot = True, cmap = "RdYlGn")
plt.show()

In [35]:
# Feature frame: every training column except the Price target.
data = train_df.drop(columns=["Price"])

In [36]:
# Split train and test features into numeric and non-numeric (categorical) parts.
# "number" selects every numeric dtype, so a column is never treated as
# categorical merely because its integer/float width differs from a
# hard-coded list (the previous list also repeated 'int32').
train_categorical_data = data.select_dtypes(exclude="number")
train_numerical_data = data.select_dtypes(include="number")

test_categorical_data = test_df.select_dtypes(exclude="number")
test_numerical_data = test_df.select_dtypes(include="number")
train_categorical_data.head()

In [37]:
# Label-encode the categorical columns (no one-hot encoding is performed here).
# For every column present in both frames, one encoder is fitted on the union
# of the train and test values and then applied to both, so a given category
# always maps to the same integer code. Fitting separate encoders per frame
# would assign codes independently and make train/test codes incomparable.
le = LabelEncoder()  # not used in this cell; kept in case later cells reference it

_train_encoded = train_categorical_data.copy()
_test_encoded = test_categorical_data.copy()

for _column in _train_encoded.columns:
    _encoder = LabelEncoder()
    if _column in _test_encoded.columns:
        _encoder.fit(pd.concat([train_categorical_data[_column], test_categorical_data[_column]]))
        _test_encoded[_column] = _encoder.transform(test_categorical_data[_column])
    else:
        _encoder.fit(train_categorical_data[_column])
    _train_encoded[_column] = _encoder.transform(train_categorical_data[_column])

# Columns that exist only in the test frame have no train counterpart to agree with.
for _column in _test_encoded.columns.difference(_train_encoded.columns):
    _test_encoded[_column] = LabelEncoder().fit_transform(test_categorical_data[_column])

train_categorical_data = _train_encoded
test_categorical_data = _test_encoded
train_categorical_data.head()

In [38]:
# Assemble the model inputs: encoded categorical columns followed by the numeric
# columns (aligned on the row index), plus the Price target.
y = train_df["Price"]
X = pd.concat((train_categorical_data, train_numerical_data), axis="columns")
test_set = pd.concat((test_categorical_data, test_numerical_data), axis="columns")
X.head()

In [39]:
# Preview the first rows of the target (Price) series.
y.head()