# Necessary libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from math import sqrt

from lightgbm import LGBMRegressor

from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

# Data

In [None]:
# Directory that holds the input CSV files. Kept in one place so the notebook
# can be pointed at another location by editing a single value.
DATA_DIR = '/content'

# Labelled rows used for fitting and validation.
data_train = pd.read_csv(f'{DATA_DIR}/train.csv')
# Rows to predict for the submission file (the 'id' column is dropped before prediction).
data_test = pd.read_csv(f'{DATA_DIR}/test.csv')

In [None]:
# Target column to be predicted.
y = data_train['MedHouseVal']
# Feature matrix: every column except 'id' and the target itself.
X = data_train.drop(columns=['id', 'MedHouseVal'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=45,
)

# Training the model

In [None]:
# Baseline: fit a LightGBM regressor with default hyper-parameters on the
# training split and score it on the held-out split.
lgbm = LGBMRegressor()

lgbm.fit(X_train, y_train)
lgbm_predictions = lgbm.predict(X_test)

# Mean squared error on the hold-out rows, and its square root (RMSE).
mse_lgbm = mean_squared_error(y_test, lgbm_predictions)
rmse_lgbm = np.sqrt(mse_lgbm)

print("MSE LightGBM:", mse_lgbm)
# This line reports the RMSE, so it is labelled as such (it was labelled "MSE").
print("RMSE LightGBM:", rmse_lgbm)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002538 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1837
[LightGBM] [Info] Number of data points in the train set: 29709, number of used features: 8
[LightGBM] [Info] Start training from score 2.077071
MSE LightGBM: 0.31132178578218883
MSE LightGBM: 0.5579621723577584


In [None]:
lgbm = LGBMRegressor()

def run_kfold(RFR, n_splits=300):
    """Cross-validate a regressor with K-fold splits and report the RMSE per fold.

    The notebook-level feature frame ``X`` and target series ``y`` are split
    into ``n_splits`` consecutive folds. For every fold the estimator is
    re-fitted on the training part and scored on the held-out part.

    Parameters
    ----------
    RFR : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place, so after the call it holds the fit from the last fold.
    n_splits : int, default 300
        Number of folds passed to ``KFold``.

    Returns
    -------
    float
        Mean of the per-fold RMSE values.
    """
    kf = KFold(n_splits=n_splits)
    rmse_scores = []

    for fold, (train_index, test_index) in enumerate(kf.split(X), start=1):
        # Positional slicing keeps the pandas objects (and their column names),
        # matching how the estimator is fitted in the baseline cell.
        X_fold_train, X_fold_test = X.iloc[train_index], X.iloc[test_index]
        y_fold_train, y_fold_test = y.iloc[train_index], y.iloc[test_index]

        # Use the estimator that was passed in rather than a notebook global.
        RFR.fit(X_fold_train, y_fold_train)
        fold_predictions = RFR.predict(X_fold_test)
        fold_rmse = sqrt(mean_squared_error(y_fold_test, fold_predictions))

        rmse_scores.append(fold_rmse)
        # The reported value is an RMSE, not an accuracy.
        print("Fold {0} RMSE: {1}".format(fold, fold_rmse))

    average_rmse = np.mean(rmse_scores)
    print(f'Average RMSE {average_rmse}')
    return average_rmse

# Assign the result so the returned mean is not echoed a second time as cell output.
cv_rmse = run_kfold(lgbm)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003021 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1837
[LightGBM] [Info] Number of data points in the train set: 37013, number of used features: 8
[LightGBM] [Info] Start training from score 2.079828
Fold 1 accuracy: 0.5666604196915134
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002847 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1837
[LightGBM] [Info] Number of data points in the train set: 37013, number of used features: 8
[LightGBM] [Info] Start training from score 2.080361
Fold 2 accuracy: 0.4810384607874187
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002790 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1837
[LightGBM] [Info] Number of data points in the tra

In [None]:
# Keep the 'id' column aside; it is written to the output but not fed to the model.
ids = data_test['id']
test_features = data_test.drop(columns=['id'])

# Predict with the current `lgbm` estimator (whatever fit it holds at this point).
predictions = lgbm.predict(test_features)

# Assemble the two-column result table and write it to disk without the index.
output = pd.DataFrame({'id': ids, 'MedHouseVal': predictions})
output.to_csv('predictions.csv', index=False)
output.head()


Unnamed: 0,id,MedHouseVal
0,37137,0.708973
1,37138,1.009468
2,37139,3.998536
3,37140,3.40944
4,37141,2.494327
