In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from datetime import datetime 
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
import seaborn as sns

from lightgbm import LGBMRegressor
import lightgbm as lgb

In [2]:
train_features = pd.read_csv('../data/train_features.csv').drop(['Unnamed: 0'], axis=1)

In [3]:
train_features = train_features.drop(['uid','date','split', 'region', 'latitude', 'longitude'], axis=1)

In [4]:
X = train_features.drop(['density'], axis=1)
Y = train_features['density']

In [5]:
features_name = np.array(X.columns)

In [6]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [8]:
gb = GradientBoostingRegressor()
gb_param_grid = {
    'n_estimators': [50,100,500,1000,1500,3000],
    'learning_rate': [0.05, 0.1, 0.5, 1],
    'max_features': [2,3,4,5,6,7],
    'min_samples_split': [2,3,5,10,15,20],
    'loss': ['absolute_error', 'squared_error', 'huber', 'quantile']}

gb_grid_search = GridSearchCV(estimator=gb,param_grid=gb_param_grid, n_jobs=-1, verbose=1)

In [9]:
gb_grid_search.fit(X_train, y_train)

gb_grid_search.best_params_

Fitting 5 folds for each of 3456 candidates, totalling 17280 fits


{'learning_rate': 0.05,
 'loss': 'squared_error',
 'max_features': 7,
 'min_samples_split': 5,
 'n_estimators': 3000}

In [10]:
y_pred = gb_grid_search.best_estimator_.predict(X_test)

In [11]:
metrics.root_mean_squared_error(y_test, y_pred)

1680615.0005856536

In [12]:
y_pred = gb_grid_search.best_estimator_.predict(X_train)

In [13]:
metrics.root_mean_squared_error(y_train, y_pred)

322781.0824102935

In [14]:
rf = RandomForestRegressor()

In [15]:
rf_params = {
    'n_estimators': [50,100,200,500],
    'max_depth': [3,4,5,6,7],
    'min_samples_split': [2,3,5,10,15,20],
    'criterion' :['squared_error', 'absolute_error', 'friedman_mse', 'poisson']
}

In [16]:
grid = GridSearchCV(estimator=rf,param_grid=rf_params, n_jobs=-1)

In [17]:
grid.fit(X_train, y_train)

grid.best_params_

{'criterion': 'absolute_error',
 'max_depth': 7,
 'min_samples_split': 5,
 'n_estimators': 50}

In [18]:

grid.best_params_

{'criterion': 'absolute_error',
 'max_depth': 7,
 'min_samples_split': 5,
 'n_estimators': 50}

In [22]:
y_pred = grid.best_estimator_.predict(X_test)

In [23]:
metrics.root_mean_squared_error(y_test, y_pred)

1645378.4738400155

In [26]:
rn = RandomForestRegressor(
    criterion='absolute_error',
    max_depth=7,
    min_samples_split=5,
    n_estimators=50,
    n_jobs=-1
)

In [27]:
rn.fit(X_train, y_train)

In [28]:
y_pred = rn.predict(X_test)

In [29]:
metrics.root_mean_squared_error(y_test, y_pred)

1773865.6596315484

In [31]:
train_features = pd.read_csv('../data/train_features.csv').drop(['Unnamed: 0'], axis=1)

In [43]:
X = train_features
Y = train_features

X['month'] = pd.DatetimeIndex(X.date).month
X['year'] = pd.DatetimeIndex(X.date).year

Y['month'] = pd.DatetimeIndex(Y.date).month
Y['year'] = pd.DatetimeIndex(Y.date).year

In [44]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [30]:
lgb_params = {'application': 'regression',
          'boosting': 'gbdt',
          'metric': 'rmse',
          'learning_rate': 0.005,
          'bagging_fraction': 0.3,
          'feature_fraction': 0.3,
          'min_split_gain': 0.1,
          'verbosity': -1,
          'data_random_seed': 2023
}

In [41]:
import os
from sklearn.cluster import KMeans
from sklearn.model_selection import GroupKFold

In [51]:
%%time
if not os.path.exists('../outputs/weights'):
    os.makedirs('../outputs/weights')

test_pred_list = []
rmses = []
models = []
for i in range(100):
    # northeast', 'midwest
    train = X_train[X_train.region.isin(['northeast', 'midwest'])].copy()
    km = KMeans(n_clusters=100)
    train['cluster'] = km.fit_predict(train[['longitude', 'latitude']].values).astype(str)
    train['fold'] = -1
    gkf = GroupKFold(n_splits=5)
    for idx, (trn, val) in enumerate(gkf.split(train, groups=train.cluster)):
        train.iloc[val, -1] = idx
    verbose_eval = 1000
    num_rounds = 30000
    early_stop = 500
    test_preds = []

    oofs = []
    for f in range(5):
        trn_data = train[(train.fold != f)].drop(
            labels=['latitude', 'longitude', 'year', 'cluster', 'fold', 'uid', 'date', 'split', 'severity', 'density'], axis=1).copy()
        trn_label = train[(train.fold != f)].severity
        val_data = train[(train.fold == f)].drop(
            labels=['latitude', 'longitude', 'year', 'cluster', 'fold', 'uid', 'date', 'split', 'density'], axis=1).copy()
        test_data = X_test.drop(['latitude', 'longitude', 'year', 'uid', 'date', 'split', 'severity'], axis=1).copy()

        trn_data['region'] = trn_data['region'].map({
            'midwest': 0,
            'south': 1,
            'northeast': 2,
            'west': 3
        })

        val_data['region'] = val_data['region'].map({
            'midwest': 0,
            'south': 1,
            'northeast': 2,
            'west': 3
        })

        test_data['region'] = test_data['region'].map({
            'midwest': 0,
            'south': 1,
            'northeast': 2,
            'west': 3
        })


        d_train = lgb.Dataset(trn_data, label=trn_label.values, categorical_feature=['region'])
        d_valid = lgb.Dataset(val_data.drop(labels='severity', axis=1),
                              label=val_data.severity, categorical_feature=['region'])


        model = lgb.train(lgb_params, d_train, num_boost_round=num_rounds, valid_sets=d_valid)

        val_pred = model.predict(val_data.drop(labels='severity', axis=1))
        val_data['pred'] = np.round(val_pred).astype(np.int64)
    #     val_data['pred'] = [pseudo_round(x) for x in val_pred]
        val_data['raw_pred'] = val_pred

        #test_pred = model.predict(test_data)
        #test_preds.append(test_pred)
        oofs.append(val_data)
        
        model.save_model(f'../outputs/weights/model_i{i}_f{f}.bin')

    oof = pd.concat(oofs)
    rmses.append(np.sqrt(metrics.mean_squared_error(oof.severity, oof.pred)))
    #test_pred_list.append(test_preds)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
Deprecated in NumPy 1.20

CPU times: user 18h 23min 56s, sys: 1min 14s, total: 18h 25min 10s
Wall time: 58min 31s


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


In [52]:
rmses

[0.9151190755166396,
 0.9494050047338589,
 0.9397366773894963,
 0.9479610419138021,
 0.9353533261548057,
 0.9171128013559504,
 0.922573331779873,
 0.9200952898834737,
 0.9136209259572768,
 0.9000253674760821,
 0.9176105559659236,
 0.9035697848890165,
 0.9195988801953932,
 0.9265244133774754,
 0.9230681418619906,
 0.9280017320090908,
 0.941193257975453,
 0.9171128013559504,
 0.9131209965485952,
 0.9195988801953932,
 0.9637256473123827,
 0.9343764555737658,
 0.9532448788673129,
 0.9250447354355775,
 0.9314396967468151,
 0.9377910507114687,
 0.9289853059279986,
 0.9260314501005581,
 0.9377910507114687,
 0.896466936387801,
 0.9363291775690444,
 0.9333985626260246,
 0.9191022023950811,
 0.9146199649938201,
 0.9191022023950811,
 0.9151190755166396,
 0.9270171145095474,
 0.9255382242599222,
 0.9171128013559504,
 0.9131209965485952,
 0.9126207932809591,
 0.9146199649938201,
 0.9230681418619906,
 0.9176105559659236,
 0.9551590271380404,
 0.9035697848890165,
 0.9191022023950811,
 0.9195988801953

In [53]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold

In [58]:
%%time
model = GradientBoostingRegressor(
    learning_rate=0.05,
    loss='squared_error',
    max_features=7,
    min_samples_split=5,
    n_estimators=3000
)

cv = RepeatedStratifiedKFold(
    n_splits=10,
    n_repeats=100,
    random_state=1
)

x = X.drop(['latitude', 'longitude', 'year', 'uid', 'date', 'split', 'severity', 'density'], axis=1)
y = Y.severity

x['region'] = x['region'].map({
            'midwest': 0,
            'south': 1,
            'northeast': 2,
            'west': 3
        })

n_scores = cross_val_score(model, x, y, scoring='neg_root_mean_squared_error', cv=cv, n_jobs=-1)

CPU times: user 5.09 s, sys: 2.16 s, total: 7.26 s
Wall time: 27min 34s


In [59]:
np.mean(n_scores)

-0.6799767865973736

In [64]:
model = GradientBoostingRegressor(
    learning_rate=0.05,
    loss='squared_error',
    max_features=7,
    min_samples_split=5,
    n_estimators=3000
)
model.fit(x,y)

In [65]:
pred = model.predict(x)

metrics.root_mean_squared_error(pred, y)

0.42950789566911907