In [1]:
#importing all the models and functions from packages
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [2]:
#reaing train and test data
train=pd.read_csv('Train.csv')
test=pd.read_csv('Test.csv')
train.head()

Unnamed: 0,Origin Airport Code,Destination Airport Code,Origin Latitude,Origin Longitude,Destination Latitude,Destination Longitude,Great Circle Distance,Timezone Difference,Continent Origin,Continent Destination,Route Popularity,Flight_Distance
0,ORG754,DST883,-0.283823,-43.312707,38.508357,24.024226,8076.863942,1,South America,Europe,397,8120.606391
1,ORG301,DST627,25.954354,82.253018,40.630074,145.859627,6000.224245,11,Asia,Asia,650,5998.108994
2,ORG501,DST681,1.386828,-81.540082,1.265511,82.773779,18246.26383,9,South America,Asia,655,18207.57227
3,ORG310,DST812,23.367517,34.904649,-24.384679,121.948601,10777.98486,-2,Africa,Australia,740,10813.85026
4,ORG608,DST878,3.328282,-16.364917,40.396593,45.871911,7442.778451,-2,Africa,Asia,173,7426.288895


In [3]:
train.isnull().sum() #checking null value of train data

Origin Airport Code         0
Destination Airport Code    0
Origin Latitude             0
Origin Longitude            0
Destination Latitude        0
Destination Longitude       0
Great Circle Distance       0
Timezone Difference         0
Continent Origin            0
Continent Destination       0
Route Popularity            0
Flight_Distance             0
dtype: int64

In [4]:
test.isnull().sum() #checking null value of test data

Origin Airport Code           0
Destination Airport Code      0
Origin Latitude               0
Origin Longitude              0
Destination Latitude          0
Destination Longitude         0
Great Circle Distance         0
Timezone Difference           0
Continent Origin              0
Continent Destination         0
Route Popularity              0
Flight_Distance             200
dtype: int64

In [5]:
print(train.shape) #checking the shape of train data
print(test.shape) #checking the shape of test data

(19800, 12)
(200, 12)


In [6]:
#considering only the required features for train data
columns=['Origin Latitude','Origin Longitude','Destination Latitude','Destination Longitude','Great Circle Distance']
X_train=train[columns]
#seperating target variable from train dataset
y_train=train[['Flight_Distance']]
#considering only the required features for test data
X_test=test[columns]
#seperating target variable from test dataset
y_test=test[['Flight_Distance']]
print(X_train.head())
print(y_train.head())

   Origin Latitude  Origin Longitude  Destination Latitude  \
0        -0.283823        -43.312707             38.508357   
1        25.954354         82.253018             40.630074   
2         1.386828        -81.540082              1.265511   
3        23.367517         34.904649            -24.384679   
4         3.328282        -16.364917             40.396593   

   Destination Longitude  Great Circle Distance  
0              24.024226            8076.863942  
1             145.859627            6000.224245  
2              82.773779           18246.263830  
3             121.948601           10777.984860  
4              45.871911            7442.778451  
   Flight_Distance
0      8120.606391
1      5998.108994
2     18207.572270
3     10813.850260
4      7426.288895


In [7]:
#seperating features and target variables for train data
X=train[columns] 
y=train[['Flight_Distance']]
#splitting the train data as train and validation dataset with 80% as training data and 20% as validation data
X_train,X_val,y_train,y_val=train_test_split(X,y,test_size=0.2)
#seperating features and target variables for validation data
X_val=X_val[columns]
y_val=y_val[['Flight_Distance']]

print(y_val.head())

       Flight_Distance
10358     12852.515190
10390      9973.658350
4212       4180.994111
19377     14244.985800
13242      9269.197224


In [None]:
from sklearn.tree import DecisionTreeRegressor
DecisionTreeRegressor

In [43]:
#parameters of model(i.e. RandomForestRegressor)
param_grid={
    "n_estimators":[50,100,150],
    "max_depth":[25,50,75],
    "min_samples_leaf":[5,15,25],
    "max_leaf_nodes":[10,30,50],
    "min_samples_split":[2,4,6]
}
#Defining RandomForestRegressor model
model=RandomForestRegressor()
#Defining GridSearchCV tool with estimator, param_grid, cv, njobs as parameters
#model -> model used for hyperparameter tuning
#param_grid -> parameters of model with various values as list for tuning
#cv -> cross validation of data i.e. no. of folds
#n_jobs -> no of cores used for hyperparameter tuning (n_jobs=-1 signifies using all the cores)
grid=GridSearchCV(estimator=model,param_grid=param_grid,cv=5,n_jobs=-1)


In [47]:
import numpy as np
grid.fit(X_train,np.array(y_train).ravel()) #fitting the training data with gridsearchcv tool [target variable must be passsed as 1D-array. 
#Ravel is used to flatten a multidimensional into a 1D-array]
best_parameters=grid.best_params_ # after multiple combinations of paramters passed gridsearchcv gives the best parameters suited for the model
print(best_parameters)

{'max_depth': 25, 'max_leaf_nodes': 50, 'min_samples_leaf': 5, 'min_samples_split': 2, 'n_estimators': 150}


In [48]:
best_grid=grid.best_estimator_ #stores the best parameters of the model after training dataset in to the attribute and used during predicting
print(best_grid) 
pred=best_grid.predict(X_val) #predicting the validation data with the best parameters of the model
pred=pd.DataFrame(pred) # transforming the predicted target variable into a dataframe
accuracy=mean_absolute_error(y_val,pred) #calculating the mean_absolute_error of the actual target variable and predicted variable
print(accuracy)

RandomForestRegressor(max_depth=25, max_leaf_nodes=50, min_samples_leaf=5,
                      n_estimators=150)
57.87513460736089


In [49]:
best_grid #stores the model with the best parameters

In [50]:
test_predict=best_grid.predict(X_test) #predicting the test data with the best parameters of the model
y_test=pd.DataFrame(test_predict,columns=['Flight_Distance']) # transforming the predicted target variable into a dataframe
y_test.to_csv('Submission.csv',index=False) #storing the predicted variable in the dataset
print("Done")

Done


In [51]:
y_test

Unnamed: 0,Flight_Distance
0,14330.709132
1,10385.708408
2,4316.335893
3,14900.938182
4,9555.818309
...,...
195,12538.839629
196,3291.726225
197,17143.189532
198,4543.246746
