In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, VotingRegressor , StackingRegressor

In [2]:
def rmsle(y_true, y_pred):
    return 1 - np.sqrt(np.square(np.log10(y_pred +1) - np.log10(y_true +1)).mean())

In [3]:
train = pd.read_csv('train_preprocessed.csv')
test = pd.read_csv('test_preprocessed.csv')

In [4]:
X = train.drop(columns=['Price'])
y = train['Price']

In [27]:
train_X, test_X, train_y , test_y = train_test_split(X,y,test_size=0.15,random_state=24)

In [6]:
def run_model_evaluate(model,train_X,train_y,test_X,test_y,X,y,test,filename):
    model.fit(train_X , train_y)
    train_pred = model.predict(train_X)
    test_pred = model.predict(test_X)
    print('train_rmsle:', rmsle(train_y , train_pred))
    print('test_rmsle:', rmsle(test_y , test_pred))
    model.fit(X,y)
    pred = model.predict(test)
    result =pd.DataFrame(pred , columns=['Price'])
    result.to_csv(filename, index=False)

In [96]:
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

In [97]:
parameters = {'n_estimators':[25,50,75,100] , 'max_depth':[1,2,3,4,5,6],'min_samples_leaf':[1,2,3,4,5]}

In [98]:
%%time
model =RandomForestRegressor()
gridsearch= GridSearchCV(model, parameters, verbose =1, scoring='neg_mean_absolute_error')
gridsearch.fit(train_X, train_y)

Fitting 5 folds for each of 120 candidates, totalling 600 fits
Wall time: 3min 51s


GridSearchCV(estimator=RandomForestRegressor(),
             param_grid={'max_depth': [1, 2, 3, 4, 5, 6],
                         'min_samples_leaf': [1, 2, 3, 4, 5],
                         'n_estimators': [25, 50, 75, 100]},
             scoring='neg_mean_absolute_error', verbose=1)

In [99]:
gridsearch.best_estimator_

RandomForestRegressor(max_depth=6, min_samples_leaf=3, n_estimators=75)

In [100]:
train_pred = gridsearch.predict(train_X)
rmsle(train_y , train_pred)

0.9183629488182841

In [101]:
test_pred = gridsearch.predict(test_X)
rmsle(test_y , test_pred)

0.9152734733997839

In [102]:
%%time
model =RandomForestRegressor()
randomsearch = RandomizedSearchCV(model, parameters, cv =5, verbose =1, scoring='neg_mean_absolute_error')
randomsearch.fit(train_X, train_y)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Wall time: 23.9 s


RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(),
                   param_distributions={'max_depth': [1, 2, 3, 4, 5, 6],
                                        'min_samples_leaf': [1, 2, 3, 4, 5],
                                        'n_estimators': [25, 50, 75, 100]},
                   scoring='neg_mean_absolute_error', verbose=1)

In [103]:
randomsearch.best_estimator_

RandomForestRegressor(max_depth=5, min_samples_leaf=3, n_estimators=75)

In [104]:
train_pred = randomsearch.predict(train_X)
rmsle(train_y , train_pred)

0.9084868794896765

In [105]:
test_pred = randomsearch.predict(test_X)
rmsle(test_y , test_pred)

0.9071197941545098

In [84]:
linear_reg = LinearRegression()
run_model_evaluate(linear_reg,train_X,train_y,test_X,test_y,X,y,test,'flight_price_linear_regression.csv')

train_rmsle: 0.8657215077650018
test_rmsle: 0.8476848837880661


  return 1 - np.sqrt(np.square(np.log10(y_pred +1) - np.log10(y_true +1)).mean())


In [85]:
KNN = KNeighborsRegressor()
run_model_evaluate(KNN,train_X,train_y,test_X,test_y,X,y,test,'flight_price_kNN.csv')

train_rmsle: 0.9090685395482615
test_rmsle: 0.8872084217457497


In [38]:
decision_tree = DecisionTreeRegressor(max_depth =12)
run_model_evaluate(decision_tree,train_X,train_y,test_X,test_y,X,y,test,'flight_price_decision_tree.csv')

train_rmsle: 0.951006895642498
test_rmsle: 0.933692160218361


In [None]:
Random_forest = RandomForestRegressor(max_depth = 22, min_samples_leaf=2, n_estimators=250)
run_model_evaluate(Random_forest,train_X,train_y,test_X,test_y,X,y,test,'flight_price_Random_forest.csv')

train_rmsle: 0.9671577138365498
test_rmsle: 0.9462701669502811


In [88]:
Ada_boost = AdaBoostRegressor()
run_model_evaluate(Ada_boost,train_X,train_y,test_X,test_y,X,y,test,'flight_price_Ada_Boost.csv')

train_rmsle: 0.7664415341375617
test_rmsle: 0.7729036193031404


In [22]:
gradient_boost = GradientBoostingRegressor()
run_model_evaluate(gradient_boost,train_X,train_y,test_X,test_y,X,y,test,'flight_price_Ada_Boost.csv')

train_rmsle: 0.9201672555314012
test_rmsle: 0.9189781651484619


In [8]:
model1 =DecisionTreeRegressor()
model2 =RandomForestRegressor()
model4 = GradientBoostingRegressor()

voting = VotingRegressor(estimators=[('dt', model1),('rf',model2),('gra',model4)])

               
               

In [9]:
run_model_evaluate(voting,train_X,train_y,test_X,test_y,X,y,test,'flight_price_voting.csv')

train_rmsle: 0.9661075682932911
test_rmsle: 0.9414155292542026


In [14]:
estimators=[('dt', model1),('rf',model2),('gra',model4)]

In [15]:
# estimators = [('ridge', RidgeCV()),
#         ('lasso', LassoCV(random_state=42)),
#             ('knr', KNeighborsRegressor(n_neighbors=20,
#                                          metric='euclidean'))]

In [16]:
final_estimator = RandomForestRegressor()


In [17]:
stack =StackingRegressor(estimators=estimators, final_estimator=final_estimator)

In [18]:
run_model_evaluate(stack,train_X,train_y,test_X,test_y,X,y,test,'flight_price_stacking.csv')

train_rmsle: 0.9664118301273534
test_rmsle: 0.9424739026586505
