# Importing the required libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold

# Reading the competition data and merging in the previously generated predictions

In [2]:
# Load the competition train/test tables.
training_data = pd.read_csv("../input/30-days-of-ml/train.csv")
testing_data = pd.read_csv("../input/30-days-of-ml/test.csv")

# Tag every training row with the fold in which it serves as validation data.
# -1 is only a placeholder; the loop below overwrites it for each row that
# KFold assigns to a validation split.
training_data["kfold"] = -1
kf = KFold(n_splits=5, shuffle=True, random_state=0)
for fold, (train_ind, valid_ind) in enumerate(kf.split(training_data)):
    # KFold yields positional indices; they match the .loc labels here because
    # read_csv produced the default 0..n-1 index.
    training_data.loc[valid_ind, "kfold"] = fold

# Attach the five saved prediction files as columns pred_1 ... pred_5.
# One loop replaces ten copy-pasted read/rename/merge stanzas.
PRED_DIR = "../input/abcdef"
for model_no in range(1, 6):
    pred_col = f"pred_{model_no}"

    train_pred = pd.read_csv(f"{PRED_DIR}/train_pred_{model_no}.csv")
    train_pred.columns = ["id", pred_col]
    # validate="many_to_one" raises if an id appears twice in the prediction
    # file, which would otherwise silently duplicate training rows.
    training_data = training_data.merge(
        train_pred, on="id", how="left", validate="many_to_one"
    )

    test_pred = pd.read_csv(f"{PRED_DIR}/test_pred_{model_no}.csv")
    test_pred.columns = ["id", pred_col]
    testing_data = testing_data.merge(
        test_pred, on="id", how="left", validate="many_to_one"
    )


# Making the Models and saving predictions

## Model-1

In [3]:
# Model-1: XGBoost trained on the five merged prediction columns.
# Writes out-of-fold predictions for the training rows and fold-averaged
# predictions for the test rows, both under the column name "pred_1".
sample_submission_data = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")
features_list = ["pred_1", "pred_2", "pred_3", "pred_4", "pred_5"]
# Re-selecting the feature columns is idempotent, so re-running the cell is safe.
testing_data = testing_data[features_list]
# The test matrix is the same for every fold, so it is copied once up front.
x_test = testing_data.copy()

final_test_predictions = []   # one array of test predictions per fold
final_valid_predictions = {}  # id -> out-of-fold prediction
for fold in range(5):
    # Split the training and validation data based on the fold labels
    x_train = training_data[training_data.kfold != fold].reset_index(drop=True)
    x_valid = training_data[training_data.kfold == fold].reset_index(drop=True)
    y_train = x_train.target
    y_valid = x_valid.target

    valid_ids = x_valid.id.values.tolist()

    x_train = x_train[features_list]
    x_valid = x_valid[features_list]

    # Build the model
    params = {
        'random_state': 1,
        'booster': 'gbtree',
        'n_estimators': 7000,
        'learning_rate': 0.03,
        'max_depth': 2
    }
    model = XGBRegressor(
        n_jobs=4,
        **params)

    # Train the model, using the validation fold for early stopping.
    # NOTE(review): newer xgboost releases expect early_stopping_rounds on the
    # constructor rather than on fit() - confirm against the installed version.
    model.fit(x_train, y_train, early_stopping_rounds=300, eval_set=[(x_valid, y_valid)], verbose=1000)

    # Score the validation fold and remember the predictions by row id
    validation_prediction = model.predict(x_valid)
    final_valid_predictions.update(dict(zip(valid_ids, validation_prediction)))
    # np.sqrt(MSE) instead of squared=False: that keyword was deprecated and
    # later removed from scikit-learn, while this form works on every version.
    rmse = np.sqrt(mean_squared_error(y_valid, validation_prediction))
    print("RMSE of Fold "+ str(fold) + ":", rmse)

    # Keep this fold's test predictions for averaging below
    test_preds = model.predict(x_test)
    final_test_predictions.append(test_preds)

# Save the out-of-fold predictions
final_valid_predictions = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
final_valid_predictions.columns = ["id", "pred_1"]
final_valid_predictions.to_csv("model_1_train_pred.csv", index=False)

# Save the test predictions, averaged across the five fold models.
# Bracket assignment always targets the column, never a plain attribute.
sample_submission_data["target"] = np.mean(np.column_stack(final_test_predictions), axis=1)
sample_submission_data.columns = ["id", "pred_1"]
sample_submission_data.to_csv("model_1_test_pred.csv", index=False)



[0]	validation_0-rmse:7.54372
[547]	validation_0-rmse:0.71805
RMSE of Fold 0: 0.7180207955116213
[0]	validation_0-rmse:7.54662
[607]	validation_0-rmse:0.71501
RMSE of Fold 1: 0.71497841321097
[0]	validation_0-rmse:7.54842
[642]	validation_0-rmse:0.71971
RMSE of Fold 2: 0.7196773114628148
[0]	validation_0-rmse:7.54510
[558]	validation_0-rmse:0.71403
RMSE of Fold 3: 0.7140109564671597
[0]	validation_0-rmse:7.54905
[579]	validation_0-rmse:0.71459
RMSE of Fold 4: 0.7145106898591731


Unnamed: 0,pred_1,pred_2,pred_3,pred_4,pred_5
0,False,False,False,False,False
1,False,False,False,False,False
2,False,False,False,False,False
3,False,False,False,False,False
4,False,False,False,False,False
...,...,...,...,...,...
239995,False,False,False,False,False
239996,False,False,False,False,False
239997,False,False,False,False,False
239998,False,False,False,False,False


## Model-2

In [4]:
# Model-2: random forest trained on the five merged prediction columns.
# Writes out-of-fold predictions for the training rows and fold-averaged
# predictions for the test rows, both under the column name "pred_2".
sample_submission_data = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")
features_list = ["pred_1", "pred_2", "pred_3", "pred_4", "pred_5"]
# Re-selecting the feature columns is idempotent, so re-running the cell is safe.
testing_data = testing_data[features_list]
# The test matrix is the same for every fold, so it is copied once up front.
x_test = testing_data.copy()

final_test_predictions = []   # one array of test predictions per fold
final_valid_predictions = {}  # id -> out-of-fold prediction
for fold in range(5):
    # Split the training and validation data based on the fold labels
    x_train = training_data[training_data.kfold != fold].reset_index(drop=True)
    x_valid = training_data[training_data.kfold == fold].reset_index(drop=True)
    y_train = x_train.target
    y_valid = x_valid.target

    valid_ids = x_valid.id.values.tolist()

    x_train = x_train[features_list]
    x_valid = x_valid[features_list]

    # Build the model. random_state is fixed so that the bootstrap samples,
    # and therefore the saved predictions, are reproducible between runs.
    model = RandomForestRegressor(n_estimators=500, n_jobs=-1, max_depth=3, random_state=1)

    # Train the model
    model.fit(x_train, y_train)

    # Score the validation fold and remember the predictions by row id
    validation_prediction = model.predict(x_valid)
    final_valid_predictions.update(dict(zip(valid_ids, validation_prediction)))
    # np.sqrt(MSE) instead of squared=False: that keyword was deprecated and
    # later removed from scikit-learn, while this form works on every version.
    rmse = np.sqrt(mean_squared_error(y_valid, validation_prediction))
    print("RMSE of Fold "+ str(fold) + ":", rmse)

    # Keep this fold's test predictions for averaging below
    test_preds = model.predict(x_test)
    final_test_predictions.append(test_preds)

# Save the out-of-fold predictions
final_valid_predictions = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
final_valid_predictions.columns = ["id", "pred_2"]
final_valid_predictions.to_csv("model_2_train_pred.csv", index=False)

# Save the test predictions, averaged across the five fold models.
# Bracket assignment always targets the column, never a plain attribute.
sample_submission_data["target"] = np.mean(np.column_stack(final_test_predictions), axis=1)
sample_submission_data.columns = ["id", "pred_2"]
sample_submission_data.to_csv("model_2_test_pred.csv", index=False)

RMSE of Fold 0: 0.7183604758781754
RMSE of Fold 1: 0.7152359642504859
RMSE of Fold 2: 0.7200741701835357
RMSE of Fold 3: 0.7142541505382889
RMSE of Fold 4: 0.7147885553324435


## Model-3

In [5]:
# Model-3: gradient boosting trained on the five merged prediction columns.
# Writes out-of-fold predictions for the training rows and fold-averaged
# predictions for the test rows, both under the column name "pred_3".
sample_submission_data = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")
features_list = ["pred_1", "pred_2", "pred_3", "pred_4", "pred_5"]
# Re-selecting the feature columns is idempotent, so re-running the cell is safe.
testing_data = testing_data[features_list]
# The test matrix is the same for every fold, so it is copied once up front.
x_test = testing_data.copy()

final_test_predictions = []   # one array of test predictions per fold
final_valid_predictions = {}  # id -> out-of-fold prediction
for fold in range(5):
    # Split the training and validation data based on the fold labels
    x_train = training_data[training_data.kfold != fold].reset_index(drop=True)
    x_valid = training_data[training_data.kfold == fold].reset_index(drop=True)
    y_train = x_train.target
    y_valid = x_valid.target

    valid_ids = x_valid.id.values.tolist()

    x_train = x_train[features_list]
    x_valid = x_valid[features_list]

    # Build the model. random_state is fixed so that repeated runs produce the
    # same trees and therefore the same saved predictions.
    model = GradientBoostingRegressor(n_estimators=500, max_depth=3, random_state=1)

    # Train the model
    model.fit(x_train, y_train)

    # Score the validation fold and remember the predictions by row id
    validation_prediction = model.predict(x_valid)
    final_valid_predictions.update(dict(zip(valid_ids, validation_prediction)))
    # np.sqrt(MSE) instead of squared=False: that keyword was deprecated and
    # later removed from scikit-learn, while this form works on every version.
    rmse = np.sqrt(mean_squared_error(y_valid, validation_prediction))
    print("RMSE of Fold "+ str(fold) + ":", rmse)

    # Keep this fold's test predictions for averaging below
    test_preds = model.predict(x_test)
    final_test_predictions.append(test_preds)

# Save the out-of-fold predictions
final_valid_predictions = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
final_valid_predictions.columns = ["id", "pred_3"]
final_valid_predictions.to_csv("model_3_train_pred.csv", index=False)

# Save the test predictions, averaged across the five fold models.
# Bracket assignment always targets the column, never a plain attribute.
sample_submission_data["target"] = np.mean(np.column_stack(final_test_predictions), axis=1)
sample_submission_data.columns = ["id", "pred_3"]
sample_submission_data.to_csv("model_3_test_pred.csv", index=False)

RMSE of Fold 0: 0.7186144051527593
RMSE of Fold 1: 0.7155345835036059
RMSE of Fold 2: 0.7201910247662089
RMSE of Fold 3: 0.7144954289273687
RMSE of Fold 4: 0.7151622638112822


# Training the final model and writing the submission file

## This model uses the predictions of the previous three models as its input features

In [6]:
# Final blender: a linear regression over the predictions written by Model-1,
# Model-2 and Model-3. The raw tables are re-read so that the pred_* columns
# merged below come only from the model_N_*_pred.csv files.
training_data = pd.read_csv("../input/30-days-of-ml/train.csv")
testing_data = pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission_data = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

# Rebuild the fold labels with the same KFold settings (5 splits, shuffled,
# seed 0) that the earlier cells used.
training_data["kfold"] = -1
kf = KFold(n_splits=5, shuffle=True, random_state=0)
for fold, (train_ind, valid_ind) in enumerate(kf.split(training_data)):
    # KFold yields positional indices; they match the .loc labels here because
    # read_csv produced the default 0..n-1 index.
    training_data.loc[valid_ind, "kfold"] = fold

# Merge each model's saved predictions (columns "id" and "pred_N") by id.
for model_no in range(1, 4):
    train_pred = pd.read_csv(f"./model_{model_no}_train_pred.csv")
    test_pred = pd.read_csv(f"./model_{model_no}_test_pred.csv")
    # validate="many_to_one" raises if an id is repeated in a prediction file,
    # which would otherwise silently duplicate rows.
    training_data = training_data.merge(
        train_pred, on="id", how="left", validate="many_to_one"
    )
    testing_data = testing_data.merge(
        test_pred, on="id", how="left", validate="many_to_one"
    )

features_list = ["pred_1", "pred_2", "pred_3"]
testing_data = testing_data[features_list]
# The test matrix is the same for every fold, so it is copied once up front.
x_test = testing_data.copy()

final_test_predictions = []  # one array of test predictions per fold
for fold in range(5):
    # Split the training and validation data based on the fold labels
    x_train = training_data[training_data.kfold != fold].reset_index(drop=True)
    x_valid = training_data[training_data.kfold == fold].reset_index(drop=True)
    y_train = x_train.target
    y_valid = x_valid.target

    x_train = x_train[features_list]
    x_valid = x_valid[features_list]

    # Build the model
    model = LinearRegression()

    # Train the model
    model.fit(x_train, y_train)

    # Score the validation fold
    validation_prediction = model.predict(x_valid)
    # np.sqrt(MSE) instead of squared=False: that keyword was deprecated and
    # later removed from scikit-learn, while this form works on every version.
    rmse = np.sqrt(mean_squared_error(y_valid, validation_prediction))
    print("RMSE of Fold "+ str(fold) + ":", rmse)

    # Keep this fold's test predictions for averaging below
    test_preds = model.predict(x_test)
    final_test_predictions.append(test_preds)

# Average the five fold models' test predictions and write the submission.
# Bracket assignment always targets the column, never a plain attribute.
sample_submission_data["target"] = np.mean(np.column_stack(final_test_predictions), axis=1)
sample_submission_data.to_csv("submission.csv", index=False)




RMSE of Fold 0: 0.7180217434950062
RMSE of Fold 1: 0.7149820295930154
RMSE of Fold 2: 0.7196779426951816
RMSE of Fold 3: 0.7140102452949226
RMSE of Fold 4: 0.7145211811564163
