In [96]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [97]:
# Read the flight-price dataset from the CSV that sits next to the notebook.
csv_path = 'Flight_Price_Dataset_Q2.csv'
df = pd.read_csv(csv_path)

# Show the raw frame so the untouched categorical columns are visible.
df

Unnamed: 0,departure_time,stops,arrival_time,class,duration,days_left,price
0,Evening,zero,Night,Economy,2.17,1,5953
1,Early_Morning,zero,Morning,Economy,2.33,1,5953
2,Early_Morning,zero,Early_Morning,Economy,2.17,1,5956
3,Morning,zero,Afternoon,Economy,2.25,1,5955
4,Morning,zero,Morning,Economy,2.33,1,5955
...,...,...,...,...,...,...,...
270133,Early_Morning,one,Night,Business,17.25,49,68739
270134,Morning,one,Evening,Business,10.08,49,69265
270135,Afternoon,one,Night,Business,10.42,49,77105
270136,Early_Morning,one,Evening,Business,10.00,49,81585


In [98]:
# Turn every categorical column into a numeric one so the frame can be passed
# to a regression model.
#
# NOTE: this cell overwrites `df`, so it can only be run once per load; re-run
# the loading cell first if you need to execute it again.

# One-hot encode `class`. drop_first=True drops the first category's indicator
# and keeps the rest (`class_Economy` for this data); dtype=int emits the
# indicator as an integer directly, so no separate cast is needed.
df = pd.get_dummies(df, columns=['class'], drop_first=True, dtype=int)

# Ordinal codes for the time-of-day buckets, shared by arrival and departure.
time_of_day_codes = {
    'Early_Morning': 0,
    'Morning': 1,
    'Afternoon': 2,
    'Evening': 3,
    'Night': 4,
    'Late_Night': 5,
}

# Ordinal codes for the number of stops.
stops_codes = {
    'zero': 0,
    'one': 1,
    'two_or_more': 2,
}

# Source column -> mapping used to encode it. The insertion order determines
# the order in which the `*_encode` columns are appended to the frame.
column_codes = {
    'arrival_time': time_of_day_codes,
    'departure_time': time_of_day_codes,
    'stops': stops_codes,
}

# Series.map silently turns any label missing from its mapping into NaN, so
# reject unknown labels up front with a clear message. Missing values already
# present in the data are left alone, as before.
for column, codes in column_codes.items():
    unknown_labels = set(df[column].dropna().unique()) - set(codes)
    if unknown_labels:
        raise ValueError(
            f"Unmapped values in column '{column}': "
            f"{sorted(map(str, unknown_labels))}"
        )

# Add the encoded columns, then remove the original text columns.
for column, codes in column_codes.items():
    df[f'{column}_encode'] = df[column].map(codes)

df = df.drop(columns=list(column_codes))

df

Unnamed: 0,duration,days_left,price,class_Economy,arrival_time_encode,departure_time_encode,stops_encode
0,2.17,1,5953,1,4,3,0
1,2.33,1,5953,1,1,0,0
2,2.17,1,5956,1,0,0,0
3,2.25,1,5955,1,2,1,0
4,2.33,1,5955,1,1,1,0
...,...,...,...,...,...,...,...
270133,17.25,49,68739,0,4,0,1
270134,10.08,49,69265,0,3,1,1
270135,10.42,49,77105,0,4,2,1
270136,10.00,49,81585,0,3,0,1


In [100]:
# `price` is the regression target; every remaining column is a feature.
y_df = df['price']
X_df = df.drop('price', axis='columns')

In [101]:
# Display the feature frame (everything in `df` except `price`).
X_df

Unnamed: 0,duration,days_left,class_Economy,arrival_time_encode,departure_time_encode,stops_encode
0,2.17,1,1,4,3,0
1,2.33,1,1,1,0,0
2,2.17,1,1,0,0,0
3,2.25,1,1,2,1,0
4,2.33,1,1,1,1,0
...,...,...,...,...,...,...
270133,17.25,49,0,4,0,1
270134,10.08,49,0,3,1,1
270135,10.42,49,0,4,2,1
270136,10.00,49,0,3,0,1


In [102]:
# Summarise the feature columns: dtypes, non-null counts and memory usage.
X_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 270138 entries, 0 to 270137
Data columns (total 6 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   duration               270138 non-null  float64
 1   days_left              270138 non-null  int64  
 2   class_Economy          270138 non-null  int64  
 3   arrival_time_encode    270138 non-null  int64  
 4   departure_time_encode  270138 non-null  int64  
 5   stops_encode           270138 non-null  int64  
dtypes: float64(1), int64(5)
memory usage: 12.4 MB


In [103]:
# Display the target series (`price`).
y_df

0          5953
1          5953
2          5956
3          5955
4          5955
          ...  
270133    68739
270134    69265
270135    77105
270136    81585
270137    81585
Name: price, Length: 270138, dtype: int64

In [104]:
# Summarise the target series: dtype, non-null count and memory usage.
y_df.info()

<class 'pandas.core.series.Series'>
RangeIndex: 270138 entries, 0 to 270137
Series name: price
Non-Null Count   Dtype
--------------   -----
270138 non-null  int64
dtypes: int64(1)
memory usage: 2.1 MB


In [105]:
# Hold out 20% of the rows for evaluation. The fixed random_state keeps the
# shuffle, and therefore the split, the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    X_df,
    y_df,
    test_size=0.20,
    random_state=5,
)

In [106]:
# Display the training features returned by train_test_split.
x_train

Unnamed: 0,duration,days_left,class_Economy,arrival_time_encode,departure_time_encode,stops_encode
147146,29.67,35,1,3,1,1
95531,10.00,26,1,0,4,1
109471,5.83,15,1,2,1,1
130674,7.33,24,1,4,2,1
124939,8.92,24,1,4,2,1
...,...,...,...,...,...,...
136592,2.42,40,1,4,3,0
232422,13.50,37,0,1,4,1
124605,11.17,22,1,4,1,1
20463,7.83,16,1,2,0,1


In [107]:
# Display the held-out test features returned by train_test_split.
x_test

Unnamed: 0,duration,days_left,class_Economy,arrival_time_encode,departure_time_encode,stops_encode
11154,5.75,14,1,4,3,1
94855,12.17,21,1,5,2,1
143147,16.83,6,1,1,3,1
39825,3.67,9,1,1,0,1
3170,5.42,19,1,2,1,1
...,...,...,...,...,...,...
24798,8.00,42,1,0,4,1
56152,19.75,7,1,0,1,1
70241,25.92,7,1,1,0,1
47927,8.50,4,1,4,1,1


In [92]:
# Fit a linear regression of price on the training features.
# NOTE(review): this cell's recorded execution count (92) is lower than that of
# the train/test split cell (105); restart the kernel and run all cells so the
# fitted model matches the current split.
model = LinearRegression()
model.fit(X=x_train, y=y_train)

In [93]:
# Predict prices for the held-out test features with the fitted model.
y_pred = model.predict(X=x_test)

In [94]:
# Score the predictions against the held-out targets. The MSE is computed once
# and reused for the RMSE.
mse = metrics.mean_squared_error(y_test, y_pred)
scores = {
    'R^2:': metrics.r2_score(y_test, y_pred),
    'MAE:': metrics.mean_absolute_error(y_test, y_pred),
    'MSE:': mse,
    'RMSE:': np.sqrt(mse),
}

# Print one "label value" line per metric, in the order defined above.
for label, value in scores.items():
    print(label, value)

R^2: 0.9017844944123111
MAE: 4616.909502587092
MSE: 50853788.384245865
RMSE: 7131.1842203273545


In [83]:
# Display the predicted prices for the test rows.
y_pred

array([ 8500.55136268,  8454.08783126,  9488.66818654, ...,
       10221.91820571, 10120.84212229, 50754.16810155])

In [85]:
# Display the actual prices for the test rows, for comparison with y_pred.
y_test

11154      7424
94855      3393
143147     7774
39825      5943
3170       4926
          ...  
24798      7366
56152      9828
70241      9201
47927     11129
251905    45883
Name: price, Length: 54028, dtype: int64