# Team Project 3

### Importing the modules

In [69]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import math
import numpy as np

### Exploring the data

In [70]:
# Load the flight dataset from the CSV file (path is relative to the working directory).
data = pd.read_csv(filepath_or_buffer='Clean_Dataset.csv')

In [71]:
# Summarize the loaded frame: column names, non-null counts, and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300153 entries, 0 to 300152
Data columns (total 12 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Unnamed: 0        300153 non-null  int64  
 1   airline           300153 non-null  object 
 2   flight            300153 non-null  object 
 3   source_city       300153 non-null  object 
 4   departure_time    300153 non-null  object 
 5   stops             300153 non-null  object 
 6   arrival_time      300153 non-null  object 
 7   destination_city  300153 non-null  object 
 8   class             300153 non-null  object 
 9   duration          300153 non-null  float64
 10  days_left         300153 non-null  int64  
 11  price             300153 non-null  int64  
dtypes: float64(1), int64(3), object(8)
memory usage: 27.5+ MB


In [72]:
# Preview the first five rows of the raw data.
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,0,SpiceJet,SG-8709,Delhi,Evening,zero,Night,Mumbai,Economy,2.17,1,5953
1,1,SpiceJet,SG-8157,Delhi,Early_Morning,zero,Morning,Mumbai,Economy,2.33,1,5953
2,2,AirAsia,I5-764,Delhi,Early_Morning,zero,Early_Morning,Mumbai,Economy,2.17,1,5956
3,3,Vistara,UK-995,Delhi,Morning,zero,Afternoon,Mumbai,Economy,2.25,1,5955
4,4,Vistara,UK-963,Delhi,Morning,zero,Morning,Mumbai,Economy,2.33,1,5955


## Preprocessing

### Dropping unneeded columns
- `Unnamed: 0` - just repeats the row index, so it doesn't contain any information
- `flight` - contains the flight number, which is irrelevant for predicting the price

In [73]:
# Remove the two columns the notebook treats as uninformative:
# 'Unnamed: 0' (repeats the row index) and 'flight' (the flight number).
# `data` is rebound to the result, so without errors='ignore' a second run of
# this cell would raise KeyError because the columns are already gone.
data = data.drop(columns=['Unnamed: 0', 'flight'], errors='ignore')

In [74]:
# Preview the first five rows to confirm the columns were removed.
data.head(n=5)

Unnamed: 0,airline,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,SpiceJet,Delhi,Evening,zero,Night,Mumbai,Economy,2.17,1,5953
1,SpiceJet,Delhi,Early_Morning,zero,Morning,Mumbai,Economy,2.33,1,5953
2,AirAsia,Delhi,Early_Morning,zero,Early_Morning,Mumbai,Economy,2.17,1,5956
3,Vistara,Delhi,Morning,zero,Afternoon,Mumbai,Economy,2.25,1,5955
4,Vistara,Delhi,Morning,zero,Morning,Mumbai,Economy,2.33,1,5955


## Splitting the dataset into independent and dependent variables

In [75]:
# List the column labels that remain after dropping the unneeded columns.
data.columns

Index(['airline', 'source_city', 'departure_time', 'stops', 'arrival_time',
       'destination_city', 'class', 'duration', 'days_left', 'price'],
      dtype='object')

In [76]:
# Independent variables: every column except the target.
x = data.loc[
    :,
    [
        'airline', 'source_city', 'departure_time', 'stops', 'arrival_time',
        'destination_city', 'class', 'duration', 'days_left',
    ],
]
# Dependent variable, kept as a single-column DataFrame (2-D), not a Series.
y = data.loc[:, ['price']]

### Handle categorical variables

In [77]:
# One-hot encode the categorical (object) columns; numeric columns pass through.
# drop_first=True drops one level per categorical column so the dummy columns
# are not perfectly collinear with the intercept.
# dtype is pinned to uint8: pandas >= 2.0 defaults to bool dummies, which would
# make `x.values` an object array when mixed with the float/int columns.
x = pd.get_dummies(
    data[['airline', 'source_city', 'departure_time', 'stops', 'arrival_time',
          'destination_city', 'class', 'duration', 'days_left']],
    drop_first=True,
    dtype=np.uint8,
)

In [78]:
# Inspect the encoded feature matrix: column names, non-null counts, and dtypes.
x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300153 entries, 0 to 300152
Data columns (total 30 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   duration                      300153 non-null  float64
 1   days_left                     300153 non-null  int64  
 2   airline_Air_India             300153 non-null  uint8  
 3   airline_GO_FIRST              300153 non-null  uint8  
 4   airline_Indigo                300153 non-null  uint8  
 5   airline_SpiceJet              300153 non-null  uint8  
 6   airline_Vistara               300153 non-null  uint8  
 7   source_city_Chennai           300153 non-null  uint8  
 8   source_city_Delhi             300153 non-null  uint8  
 9   source_city_Hyderabad         300153 non-null  uint8  
 10  source_city_Kolkata           300153 non-null  uint8  
 11  source_city_Mumbai            300153 non-null  uint8  
 12  departure_time_Early_Morning  300153 non-nul

## Training/Test split

In [79]:
# Hold out 40% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.40,
    random_state=42,
)

## Multiple Linear Regression 

In [80]:
# Fit a linear regression on the training split.
# The features are passed as a bare NumPy array (`.values`), so the fitted
# model does not record the DataFrame's column names.
regressor = LinearRegression()
regressor.fit(X=x_train.values, y=y_train)

LinearRegression()

In [81]:
# Report the fitted parameters: one coefficient per feature column, then the intercept.
for label, value in (
    ("Coefficients", regressor.coef_),
    ("Intercept", regressor.intercept_),
):
    print(label, value)

Coefficients [[    44.99177533   -131.70983061     70.35630482   1658.12071656
    2114.83017495   2243.55026698   4055.66084938    -66.63451914
   -1391.49145474  -1644.7549495    1630.01848907   -182.78299148
     797.75087165    736.3557677    1591.93926159    877.41479327
     601.63850448   2076.05250386  -7606.24796549   -738.18019453
     948.86566014    995.2161516     519.11525073   1146.21711676
    -235.81626158  -1559.38919385  -1674.8715848    1370.02949236
     -45.1251909  -44907.13555817]]
Intercept [52556.94256783]


In [82]:
# R^2 on the held-out split.
# The model was fitted on a bare array (`x_train.values`), so score it on a bare
# array too; passing the DataFrame triggers scikit-learn's
# "X has feature names, but LinearRegression was fitted without feature names" warning.
print(regressor.score(x_test.values, y_test))

0.9117210443104038




## Predicting values with our model

In [83]:
# Predict prices for the held-out rows, using the same bare-array form as in fit.
y_pred = regressor.predict(X=x_test.values)

In [84]:
# Print the predicted prices for the test split (a 2-D, single-column array here).
print(y_pred)

[[ 3458.13028932]
 [55131.58489142]
 [10431.42884423]
 ...
 [51589.23081233]
 [ 6604.83549584]
 [55935.99920633]]


In [85]:
# Convert 2D array to 1D so we can use a dataframe to view results
y_test = np.ravel(y_test)
y_pred = np.ravel(y_pred)

In [89]:
# Put actual and predicted prices side by side and show the first five rows.
pred_view = pd.DataFrame(data={'Actual': y_test, 'Predicted': y_pred})
pred_view.head(n=5)

Unnamed: 0,Actual,Predicted
0,7366,3458.130289
1,64831,55131.584891
2,6195,10431.428844
3,60160,54915.776573
4,6578,6558.240775


In [87]:
# R^2 on the held-out split, re-checked after flattening y_test.
# Pass the bare array to match how the model was fitted (`x_train.values`);
# passing the DataFrame makes scikit-learn warn about unexpected feature names.
regressor.score(x_test.values, y_test)



0.9117210443104038

## R-squared, MSE, and RMSE

In [88]:
# Compute each metric once; the MSE was previously recomputed over the whole
# test set just to take its square root.
r_squared = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = math.sqrt(mse)  # same units as the target, unlike MSE

print(f"R-Square: {r_squared:.2f}")
print(f"MSE: {mse:.2f}")
print(f"RMSE: {rmse:.2f}")

R-Square: 0.91
MSE: 45536092.63
RMSE: 6748.04
