In [28]:
from sklearn.model_selection import train_test_split
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
import joblib

In [2]:
df = pd.read_csv(r"data\train.csv")
df.head()

Unnamed: 0,x_0,x_1,x_2,x_3,x_4,x_5,x_6,x_7,x_8,x_9,...,x_12,x_13,x_14,x_15,x_16,x_17,x_18,x_19,x_20,y
0,-2.509198,9.014286,4.639879,1.97317,-6.879627,-6.88011,-8.838328,7.323523,2.0223,4.161452,...,6.648853,-5.753218,-6.363501,-6.33191,-3.915155,0.495129,-1.3611,-4.175417,2.237058,38.446314
1,-7.210123,-4.157107,-2.672763,-0.8786,5.703519,-6.006524,0.284689,1.848291,-9.070992,2.150897,...,8.977711,9.312641,6.167947,-3.907725,-8.046558,3.684661,-1.19695,-7.559235,-0.096462,12.794101
2,-9.31223,8.186408,-4.8244,3.250446,-3.765778,0.40136,0.934206,-6.302911,9.391693,5.502656,...,1.958,8.437485,-8.23015,-6.080343,-9.095454,-3.493393,-2.226454,-4.573019,6.57475,31.741684
3,-2.864933,-4.38131,0.853922,-7.181516,6.04394,-8.508987,9.737739,5.444895,-6.025686,-9.889558,...,4.580143,5.425407,-8.519107,-2.830685,-7.682619,7.262069,2.465963,-3.38204,-8.728833,6.389571
4,-3.780354,-3.496334,4.592124,2.751149,7.744255,-0.555701,-7.608115,4.264896,5.215701,1.225544,...,0.454657,-1.44918,-9.491617,-7.842171,-9.371416,2.728208,-3.71288,0.171414,8.151329,16.003422


In [11]:
df.shape

(10000, 22)

In [4]:
X = df.drop(columns = ["y",])
Y = df["y"]

In [8]:
X_train, x_test, Y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=23)

In [29]:
param_grid_RF = {
    'n_estimators': [40, 100, 300, 1000],
    'max_depth': [3, 5],
    'min_samples_split': [2, 4, 6]
}

model_RF = RandomForestRegressor()

grid_search_RF = GridSearchCV(model_RF, param_grid = param_grid_RF, cv = 4, scoring = "neg_mean_squared_error", n_jobs = -1)

grid_search_RF.fit(X_train, Y_train) 

best_param = grid_search_RF.best_params_

print("Best params:", best_param)

modelRF = grid_search_RF.best_estimator_

predictRF = modelRF.predict(x_test)

MAE = mean_absolute_error(y_test, predictRF)
R2 = r2_score(y_test, predictRF)

print(f"MAE: {MAE}")
print(f"R2: {R2}")

joblib.dump(modelRF, "model\modelRF")

Best params: {'max_depth': 5, 'min_samples_split': 4, 'n_estimators': 300}
MAE: 14.72296739231603
R2: 0.28208535244022437


['model\\modelRF']

In [32]:
param_grid_XGB = {
    'n_estimators': [100, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'gamma': [0, 1]
}

model_XGB = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_jobs=-1,
    verbosity=0,
    random_state=42
)

grid_search_XGB = GridSearchCV(
    model_XGB,
    param_grid=param_grid_XGB,
    cv=4,
    scoring="neg_mean_squared_error",
    n_jobs=-1
)

grid_search_XGB.fit(X_train, Y_train)

best_param = grid_search_XGB.best_params_

print("Best params:", best_param)


modelXGB = grid_search_XGB.best_estimator_

predictXGB = modelRF.predict(x_test)

MAE = mean_absolute_error(y_test, predictXGB)
R2 = r2_score(y_test, predictXGB)

print(f"MAE: {MAE}")
print(f"R2: {R2}")

joblib.dump(modelXGB, "model\modelXGB")

Best params: {'colsample_bytree': 1.0, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 300, 'subsample': 0.8}
MAE: 14.72296739231603
R2: 0.28208535244022437


['model\\modelXGB']

In [36]:
predict1_train = modelRF.predict(X_train)
predict2_train = modelXGB.predict(X_train)

X_meta_train = pd.DataFrame({
    "predict1": predict1_train,
    "predict2": predict2_train
})

meta_model = LinearRegression()
meta_model.fit(X_meta_train, Y_train)

predict1_test = modelRF.predict(x_test)
predict2_test = modelXGB.predict(x_test)

X_meta_test = pd.DataFrame({
    "predict1": predict1_test,
    "predict2": predict2_test
})

y_pred = meta_model.predict(X_meta_test)

MAE = mean_absolute_error(y_test, y_pred)
R2 = r2_score(y_test, y_pred)

print(f"MAE: {MAE:.5f}")
print(f"R2: {R2:.5f}")

MAE: 9.36001
R2: 0.72040
