# Stacking Ensemble: 30 Days of ML
**Importing libraries:**

In [1]:
# Optional: pip install scikit-learn==0.24  (note: the run recorded below reports scikit-learn 0.23.2)

In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
import pkg_resources
import types
def get_imports():
    """Yield a package name for every import found in ``globals()``.

    Modules, classes and functions in the namespace are mapped to the root
    of the package they come from (``sklearn.linear_model`` -> ``sklearn``).
    Two import names are translated to the names their distributions are
    installed under: ``PIL`` -> ``Pillow`` and ``sklearn`` -> ``scikit-learn``.

    Globals that are not imports (data frames, lists, dunder strings, ...)
    are skipped.  Without this filter the *variable name* itself would be
    yielded, so a variable that happens to share its name with an installed
    package would be reported as an import.

    Yields:
        str: one name per matching global.  Names can repeat, so callers
        should deduplicate (e.g. with ``set``).
    """
    # Iterate over a snapshot so the namespace cannot change size while this
    # generator is suspended between yields.
    for val in list(globals().values()):
        if isinstance(val, types.ModuleType):
            qualified = val.__name__
        elif isinstance(val, (type, types.FunctionType, types.BuiltinFunctionType)):
            # Covers `from pkg import SomeClass` and `from pkg import some_func`.
            qualified = getattr(val, "__module__", None)
        else:
            # Not an import: ignore instead of yielding the variable's name.
            continue

        # Some builtins report no module; there is nothing to attribute then.
        if not isinstance(qualified, str):
            continue

        # Split ensures you get the root package, not a submodule.
        name = qualified.split(".")[0]

        # Some packages are weird and have different
        # imported names vs. system names
        if name == "PIL":
            name = "Pillow"
        elif name == "sklearn":
            name = "scikit-learn"

        yield name
# Unique candidate package names collected from the notebook namespace.
imports = list({*get_imports()})

# Keep (name, version) for every installed distribution that matches one of
# the collected names; pip itself is never listed.
requirements = []
for m in pkg_resources.working_set:
    if m.project_name == "pip" or m.project_name not in imports:
        continue
    requirements.append((m.project_name, m.version))

# Print the matches in requirements.txt style, one "name==version" per line.
for r in requirements:
    print("{}=={}".format(r[0], r[1]))


scikit-learn==0.23.2
pandas==1.2.4
numpy==1.19.5


**Getting Pretrained Base Learners:**
> The base learners are XGBoost and LightGBM models optimized with Optuna. Their saved predictions are loaded below and merged onto the train and test frames by `id`.

In [3]:
# Training frame (its `kfold` column is used for the CV split further down),
# the test frame and the submission template.
df = pd.read_csv("../input/30days-folds/train_folds.csv")
df_test = pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

# Base-learner files for the training rows ...
df1 = pd.read_csv("../input/30dml-stacking-5cv/base_train_1.csv")
df2 = pd.read_csv("../input/30dml-stacking-5cv/base_train_2.csv")
df3 = pd.read_csv("../input/30dml-stacking-5cv/base_train_3.csv")

# ... and the matching files for the test rows.
df_test1 = pd.read_csv("../input/30dml-stacking-5cv/base_test_1.csv")
df_test2 = pd.read_csv("../input/30dml-stacking-5cv/base_test_2.csv")
df_test3 = pd.read_csv("../input/30dml-stacking-5cv/base_test_3.csv")

# Every join is a left join on `id`, so each row of the left-hand frame is kept.
join_on_id = dict(on="id", how="left")

df = (
    df.merge(df1, **join_on_id)
      .merge(df2, **join_on_id)
      .merge(df3, **join_on_id)
)

df_test = (
    df_test.merge(df_test1, **join_on_id)
           .merge(df_test2, **join_on_id)
           .merge(df_test3, **join_on_id)
)

**Training the meta-model (Lasso linear regression):**

In [4]:
# The meta-model sees only these three prediction columns.
useful_features = ["pred_1", "pred_2", "pred_3"]
df_test = df_test[useful_features]

# The test matrix is identical for every fold, so copy it once up front
# instead of once per iteration.
xtest = df_test.copy()

final_predictions = []  # one array of test-set predictions per fold
scores = []             # validation RMSE per fold
for fold in range(5):
    # Rows whose `kfold` differs from the current fold train the meta-model;
    # rows of the current fold are held out for validation.
    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)

    ytrain = xtrain.target
    yvalid = xvalid.target

    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]

    # Alternative that was tried: LinearRegression(fit_intercept=False).
    # positive=True asks Lasso for non-negative coefficients, i.e. the blend
    # never subtracts one base prediction from another.
    model = Lasso(
        alpha=0.0001,
        precompute=True,
        max_iter=10000,
        positive=True,
        random_state=999,
        fit_intercept=True,
    )
    model.fit(xtrain, ytrain)

    # Per-fold blend weights, in the order of `useful_features`.
    print(model.coef_)
    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_predictions.append(test_preds)

    # RMSE as the square root of the MSE. This avoids the `squared=False`
    # keyword, which newer scikit-learn releases deprecate and then remove,
    # and gives the same value on every version.
    rmse = np.sqrt(mean_squared_error(yvalid, preds_valid))
    print(fold, rmse)
    scores.append(rmse)

# Mean and spread of the validation RMSE across the folds.
print(np.mean(scores), np.std(scores))

[0.9843047 0.025773  0.       ]
0 0.7154442234330887
[1.00461572 0.00341021 0.        ]
1 0.715566707590499
[0.99557115 0.01427209 0.        ]
2 0.717505282118116
[0.94744891 0.06450304 0.        ]
3 0.7174257732110915
[0.996319 0.       0.      ]
4 0.715414742797615
0.716271345830082 0.0009767002205870472


In [5]:
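# Reference score noted by the author: 0.7163095767796752 (presumably the mean CV RMSE of an earlier configuration; confirm)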

**Submitting to the leaderboard:**

In [6]:
# Blend the folds: put each fold's test predictions in its own column and
# average across columns, giving one prediction per test row.
fold_test_preds = np.column_stack(final_predictions)

# Bracket assignment always writes a real column. Attribute-style assignment
# (`sample_submission.target = ...`) only updates a column that already
# exists; otherwise it sets a plain Python attribute and the predictions
# would be missing from the CSV.
sample_submission["target"] = fold_test_preds.mean(axis=1)
sample_submission.to_csv("submission.csv", index=False)