In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [2]:
%cd drive/MyDrive/kaggle/WiDS

%load_ext autoreload
%autoreload 2

In [3]:
import pandas as pd
import numpy as np
pd.options.display.float_format = '{:,.2f}'.format
pd.set_option('display.max_columns', 40)
pd.set_option('display.max_rows', 100)

import matplotlib.pyplot as plt
import seaborn as sb
%matplotlib inline

import os
import sys
from datetime import datetime
import gc

from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV, GridSearchCV
from sklearn.feature_selection import RFE
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from statsmodels.tsa.arima.model import ARIMA
from scipy.stats import uniform, randint, iqr
from skopt import BayesSearchCV
from skopt.space import Real, Integer

from catboost import CatBoostRegressor
import xgboost as xgb
import lightgbm as lgb
import optuna

import warnings; warnings.filterwarnings('ignore')

# Put the parent of the working directory on the import path, then load the project modules.
sys.path.append('..')
from config import CFG
from src.data_processing.reduce_mem import reduce_mem_usage
# Rebind CFG from the imported class to an instance; path attributes are attached to it below.
CFG = CFG()

In [4]:
# Directory layout, rooted at the current working directory:
#   <cwd>           -> BASE_PATH
#   <cwd>/data      -> DATA_PATH
#   <cwd>/data/raw  -> RAW_DATA_PATH
BASE_PATH = os.getcwd()
DATA_PATH = os.path.join(BASE_PATH, 'data')
RAW_DATA_PATH = os.path.join(DATA_PATH, 'raw')

# Publish the resolved locations on the shared config object.
for _attr_name, _attr_value in (
    ('DATA_PATH', DATA_PATH),
    ('RAW_DATA_PATH', RAW_DATA_PATH),
    ('BASE_PATH', BASE_PATH),
):
    setattr(CFG, _attr_name, _attr_value)

In [5]:
# Name of the column to predict.
target = 'contest-tmp2m-14d__tmp2m'

# Train and test files are read from the raw-data folder; `startdate` is parsed as a date on load.
_raw_csv_options = {'parse_dates': ["startdate"]}
cc_train = pd.read_csv(os.path.join(CFG.RAW_DATA_PATH, 'train_data.csv'), **_raw_csv_options)
cc_test = pd.read_csv(os.path.join(CFG.RAW_DATA_PATH, 'test_data.csv'), **_raw_csv_options)

# The sample submission is read from the data folder itself.
cc_sample = pd.read_csv(os.path.join(CFG.DATA_PATH, 'sample_solution.csv'))

In [6]:
# Pass the training frame through the project's reduce_mem_usage helper and rebind cc_train to the result.
# NOTE(review): cc_test is not passed through the helper — confirm that is intended.
cc_train = reduce_mem_usage(cc_train)

In [7]:
# Independent copies of the loaded frames.
train_df = cc_train.copy()
test_df = cc_test.copy()

In [8]:
def remove_outliers_tukey(data, alpha=1.5):
    '''
    Blank out outliers using Tukey's fences, computed separately for each numeric column.

    For every numeric column the fences are [Q1 - alpha * IQR, Q3 + alpha * IQR],
    where Q1 and Q3 are that column's 25th and 75th percentiles (missing values are
    ignored when computing them). Values outside the fences are replaced with NaN.
    No rows are dropped, so the result has the same shape as the input.

    Parameters:
    data (numpy array or pandas dataframe): The data to remove outliers from.
    alpha (float): The sensitivity parameter, which determines the range to consider outliers.
                   A value of 1.5 is the default, which is a commonly used value.

    Returns:
    pandas dataframe: The numeric columns (outliers set to NaN) followed by the
                      untouched non-numeric columns.
    '''
    # Accept array-like input as documented; DataFrames pass through unchanged.
    if not isinstance(data, pd.DataFrame):
        data = pd.DataFrame(data)

    # Select only the numerical columns
    data_num = data.select_dtypes(include=[np.number])

    # Per-column quartiles. Pooling all columns into one pair of quartiles would
    # mix features on different scales, and a single NaN would turn the pooled
    # percentiles into NaN so that nothing is ever flagged.
    q1 = data_num.quantile(0.25)
    q3 = data_num.quantile(0.75)
    iqr_val = q3 - q1

    # Fences outside of which a value counts as an outlier (one pair per column)
    lower_fence = q1 - alpha * iqr_val
    upper_fence = q3 + alpha * iqr_val

    # Comparisons against NaN are False, so existing missing values are left alone.
    outliers = data_num.lt(lower_fence, axis='columns') | data_num.gt(upper_fence, axis='columns')
    data_num_no_outliers = data_num.mask(outliers)

    # Merge the numerical columns back with the non-numerical ones
    data_no_outliers = pd.concat([data_num_no_outliers, data.select_dtypes(exclude=[np.number])], axis=1)

    return data_no_outliers

# Apply the Tukey filter to the working copy of the training data and display the resulting shape.
data_with_removed_outliers = remove_outliers_tukey(train_df)
data_with_removed_outliers.shape

(375734, 246)

In [9]:
def target_var_visualized():
  """Plot the training target side by side: a KDE curve (left panel) and a box plot (right panel)."""
  target_values = cc_train['contest-tmp2m-14d__tmp2m']
  _, (kde_ax, box_ax) = plt.subplots(1, 2, figsize=(15,7))
  sb.kdeplot(target_values, color = "#ffd514", ax=kde_ax)
  sb.boxplot(data=target_values, color = "#ff355d", ax=box_ax)
target_var_visualized()

In [10]:
def histogram_plot(data, label, title):
    """Draw a blue histogram of `data` with legend entry `label` and heading `title`, then show it."""
    axes = sb.histplot(data, color='blue', label=label)
    axes.legend()
    axes.set_title(title)
    plt.show()
histogram_plot(data=cc_train['contest-tmp2m-14d__tmp2m'], label="contest-tmp2m-14d__tmp2m", title="Target Variable distribution")

In [11]:
def train_test_dist(train, test):
    """Overlay filled KDE curves of `train` (blue) and `test` (orange) on a single axis and show the figure."""
    fig, ax = plt.subplots(figsize = (10, 5))
    series_to_draw = (
        (train, 'blue', "Train Data"),
        (test, 'orange', "Test Data"),
    )
    for values, colour, legend_label in series_to_draw:
        sb.kdeplot(data=values, color=colour, fill=True, ax=ax, label=legend_label)
    ax.legend()
    plt.show()

# Compare the train/test distribution of one feature column.
#train_target = cc_train['contest-tmp2m-14d__tmp2m']
#test_target = cc_test['contest-tmp2m-14d__tmp2m']
train_target = cc_train['nmme0-tmp2m-34w__nmme0mean']
test_target = cc_test['nmme0-tmp2m-34w__nmme0mean']
train_test_dist(train_target, test_target)

In [12]:
def location_feature(train, test):
    """
    Add a `loc_group` id shared by train and test rows that have the same (lat, lon).

    Coordinates are rounded to 14 decimals before grouping. The input frames are
    not modified; new frames are returned.

    Reads the module-level `target` column name: after the two frames are stacked,
    that column is dropped from the test part.

    Parameters:
    train (pandas dataframe): Training rows with `lat` and `lon` columns.
    test (pandas dataframe): Test rows with `lat` and `lon` columns.

    Returns:
    tuple: (train, test) carrying the rounded coordinates and the new `loc_group` column.
    """
    # Reference: https://www.kaggle.com/code/flaviafelicioni/wids-2023-different-locations-train-test-solved
    scale = 14
    n_train = len(train)

    # assign() builds new frames, so the caller's data is never written to.
    train = train.assign(lat=train['lat'].round(scale), lon=train['lon'].round(scale))
    test = test.assign(lat=test['lat'].round(scale), lon=test['lon'].round(scale))

    # Number the locations over train and test together so the ids agree across both.
    train_and_test = pd.concat([train, test], axis=0)
    train_and_test['loc_group'] = train_and_test.groupby(['lat', 'lon']).ngroup()
    print(f'{train_and_test.loc_group.nunique()} unique locations')

    # Split back by position; .copy() detaches the training part from the stacked frame.
    train = train_and_test.iloc[:n_train].copy()
    test = train_and_test.iloc[n_train:].drop(target, axis=1)

    return train, test

def cat_encode(train, test):
    """
    Integer-encode the climate-region column of both frames.

    One LabelEncoder is fitted on the training values and reused for the test
    frame, so both frames share the same mapping. The column is overwritten in
    the frames that were passed in, and those frames are returned.
    """
    region_col = 'climateregions__climateregion'
    encoder = LabelEncoder()
    train[region_col] = encoder.fit_transform(train[region_col])
    test[region_col] = encoder.transform(test[region_col])
    return train, test

def fill_na_rows(dataset):
    """
    Replace missing values with the mean of their column.

    The frame is updated in place and also returned. A column with no
    non-missing values has a NaN mean and is therefore left as it is.

    Parameters:
    dataset (pandas dataframe): Frame whose missing values should be imputed.

    Returns:
    pandas dataframe: The same frame object, with the imputed columns.
    """
    # Find the columns with missing values
    columns_with_missing_values = dataset.columns[dataset.isnull().any()].tolist()

    # Impute the missing values with the mean value of that column.
    # The filled column is assigned back explicitly: calling
    # `dataset[col].fillna(..., inplace=True)` works on an intermediate object
    # and leaves the frame unchanged when pandas copy-on-write is active.
    for col in columns_with_missing_values:
        dataset[col] = dataset[col].fillna(dataset[col].mean())

    return dataset

def create_new_feat(dataset):
    """
    Add calendar columns derived from `startdate`.

    Writes `year`, `month` and `day` (the day of the year) into the frame that
    was passed in and returns that same frame.
    """
    start = dataset['startdate'].dt
    for column, date_part in (('year', 'year'), ('month', 'month'), ('day', 'dayofyear')):
        dataset[column] = getattr(start, date_part)
    return dataset

def feature_engineering(origin_train, origin_test):
    """
    Turn the train/test frames into model inputs.

    Steps: mean-impute missing values in train, add calendar columns to both
    frames, label-encode the climate region in both, then keep every train
    column that is not in the exclusion list.

    The helpers write into the frames they receive, so the frames passed in
    here are modified.

    NOTE(review): only the training frame is imputed, and the climate-region
    column is encoded and then excluded from the features — confirm both are
    intended.

    Returns:
    tuple: (X, y, X_test) — training features, training target, test features.
    """
    target_col = 'contest-tmp2m-14d__tmp2m'
    excluded = ['index', 'startdate', target_col, 'climateregions__climateregion']

    train = create_new_feat(fill_na_rows(origin_train))
    test = create_new_feat(origin_test)
    train, test = cat_encode(train, test)

    features = [name for name in train.columns if name not in excluded]
    return train[features], train[target_col], test[features]

In [13]:
# Build the model inputs from fresh copies, because the feature helpers write into the frames they receive.
X, y, X_test = feature_engineering(cc_train.copy(), cc_test.copy())

In [14]:
## Identify correlated features to drop that fall above a correlation threshold 
## https://goodboychan.github.io/python/datacamp/machine_learning/2020/07/08/02-Feature-selection-I-selecting-for-feature-information.html 

def identify_correlated(df, threshold):
    """
    List columns whose absolute correlation with a later column exceeds `threshold`.

    Only the lower triangle of the correlation matrix is inspected, so for each
    correlated pair the column that comes first in the frame is the one reported.

    Parameters:
    df (pandas dataframe): Frame to analyse; non-numeric columns are ignored.
    threshold (float): Absolute correlation above which a column is reported.

    Returns:
    list: Column names to drop, in frame order.
    """
    # Correlation is only defined for numeric data; handing text columns to
    # DataFrame.corr() raises in pandas 2.x, so restrict the frame up front.
    numeric_df = df.select_dtypes(include=['number', 'bool'])
    corr_matrix = numeric_df.corr().abs()

    # Hide the diagonal and the upper triangle so each pair is looked at once.
    mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
    reduced_corr_matrix = corr_matrix.mask(mask)

    # Masked (NaN) cells compare as False and therefore never trigger a drop.
    exceeds = (reduced_corr_matrix > threshold).any(axis=0)
    features_to_drop = exceeds[exceeds].index.tolist()
    return features_to_drop

# Flag columns of the training frame whose absolute correlation with another column exceeds 0.80.
features_to_drop = identify_correlated(cc_train, .80)

In [15]:
# Number of flagged columns, followed by their names.
print(len(features_to_drop))
print(features_to_drop)

In [16]:
# The row id and the target are not model features, so take them out of the drop list.
remove_feature = ['index', 'contest-tmp2m-14d__tmp2m']
features_to_drop_v1 = list(filter(lambda name: name not in remove_feature, features_to_drop))
features_to_drop_v1

['contest-pevpr-sfc-gauss-14d__pevpr',
 'nmme0-tmp2m-34w__cancm30',
 'nmme0-tmp2m-34w__cancm40',
 'nmme0-tmp2m-34w__ccsm30',
 'nmme0-tmp2m-34w__ccsm40',
 'nmme0-tmp2m-34w__cfsv20',
 'nmme0-tmp2m-34w__gfdlflora0',
 'nmme0-tmp2m-34w__gfdlflorb0',
 'nmme0-tmp2m-34w__gfdl0',
 'nmme0-tmp2m-34w__nasa0',
 'nmme0-tmp2m-34w__nmme0mean',
 'contest-wind-h10-14d__wind-hgt-10',
 'nmme-tmp2m-56w__cancm3',
 'nmme-tmp2m-56w__cancm4',
 'nmme-tmp2m-56w__ccsm3',
 'nmme-tmp2m-56w__ccsm4',
 'nmme-tmp2m-56w__cfsv2',
 'nmme-tmp2m-56w__gfdl',
 'nmme-tmp2m-56w__gfdlflora',
 'nmme-tmp2m-56w__gfdlflorb',
 'nmme-tmp2m-56w__nasa',
 'nmme-tmp2m-56w__nmmemean',
 'nmme-prate-34w__cancm3',
 'nmme-prate-34w__cancm4',
 'nmme-prate-34w__ccsm3',
 'nmme-prate-34w__ccsm4',
 'nmme-prate-34w__cfsv2',
 'nmme-prate-34w__gfdl',
 'nmme-prate-34w__gfdlflora',
 'nmme-prate-34w__gfdlflorb',
 'nmme-prate-34w__nasa',
 'nmme-prate-34w__nmmemean',
 'contest-wind-h100-14d__wind-hgt-100',
 'nmme0-prate-56w__cancm30',
 'nmme0-prate-56w__ca

In [17]:
# Remove the highly correlated columns from both the training and the test feature matrices.
cc_train_reduced, cc_test_reduced = (
    pd.DataFrame(frame.drop(features_to_drop_v1, axis=1)) for frame in (X, X_test)
)
print("Dropped features that are highly correlated")

In [18]:
# Hold out 33% of the training rows (fixed seed 42) for validation.
# X_test_tts / y_test are this hold-out split; the competition test features are cc_test_reduced.
X_train, X_test_tts, y_train, y_test = train_test_split(cc_train_reduced, y, test_size=0.33, random_state=42)
print("Split the dataset for training successfully")

In [19]:
# Copy of the competition test frame; the model cells attach their predictions to it (or to copies of it) before writing CSVs.
cc_test_copy = cc_test.copy()

In [20]:
# CatBoost baseline: fit on the training split, score on the hold-out split,
# then predict the competition test set. CatBoostRegressor is imported in the
# imports cell at the top of the notebook.
params = {'iterations': 15000,
          'learning_rate': 0.01,
          'depth': 6,
          'l2_leaf_reg': 3,
          'bagging_temperature': 1,
          'border_count': 256,
          'loss_function': 'RMSE',
          'random_seed': None,
          'task_type': 'GPU',
          'verbose': 100}
# Define the CatBoostRegressor object
reg_catboost = CatBoostRegressor(**params)

# Fit the model to the training data. The eval set is the hold-out split, not
# the training rows, so the eval metric reported during training reflects
# unseen data (consistent with the LightGBM and XGBoost cells).
reg_catboost.fit(X_train, y_train, eval_set=(X_test_tts, y_test))

# Generate predictions on the hold-out split
y_pred_catboost = reg_catboost.predict(X_test_tts)

# Calculate the RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred_catboost))
print("RMSE:", rmse)

# Make predictions on the competition test data and write the submission file
cc_test_copy_v2 = cc_test_copy.copy()
cc_test_pred_catb = reg_catboost.predict(cc_test_reduced)
cc_test_copy_v2[target] = cc_test_pred_catb
cc_test_copy_v2[[target, "index"]].to_csv("catboost_predictions.csv", index=False)

In [21]:
# LightGBM baseline (num_leaves=31, max_depth=8).
print("Beginning training and fitting lightgbm model")
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    'num_leaves': 31,
    'max_depth': 8,
    'learning_rate': 0.03,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    # NOTE(review): min_child_samples is listed as an alias of min_data_in_leaf in the
    # LightGBM parameter docs, so these two lines give one setting two values — confirm which is intended.
    'min_child_samples': 50,
    'min_data_in_leaf': 100,
    'subsample_for_bin': 200000,
    'n_estimators': 15000,
    'early_stopping_rounds': 50,
    'device_type': 'gpu'
}

# Create the LightGBM model object
reg_lgb = lgb.LGBMRegressor(**params)

# Fit the model; both the training split and the hold-out split are tracked as eval sets
reg_lgb.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test_tts, y_test)])


# Get feature importances
importance_scores = reg_lgb.feature_importances_
feature_importances = pd.DataFrame({'feature': X_train.columns, 'importance': importance_scores})

# Sort the features by importance score
feature_importances = feature_importances.sort_values('importance', ascending=False)

# Output feature importances to a CSV file
feature_importances.to_csv("lgbm_feature_importances.csv", index=False)

# Generate predictions on the hold-out split
y_pred_lgb = reg_lgb.predict(X_test_tts)

# Calculate the RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred_lgb))
print("RMSE:", rmse)

# Make predictions on the competition test data
cc_test_copy_v2 = cc_test_copy.copy()
cc_test_pred_lgb = reg_lgb.predict(cc_test_reduced)
cc_test_copy_v2[target] = cc_test_pred_lgb
cc_test_copy_v2[[target, "index"]].to_csv("lgbpredictions.csv", index=False)
# The message names the files that were actually written above.
print("Finished training and fitting lightgbm model, created lgbpredictions.csv and lgbm_feature_importances.csv")

In [22]:
# Settings shared by every Optuna trial and by the final refit. Keeping them in
# one place guarantees the refit uses the same configuration that was tuned.
FIXED_LGB_PARAMS = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    'device_type': 'gpu',
    'verbosity': -1,
    'n_estimators': 6000,
}


def objective(trial):
    """
    Optuna objective: train one LightGBM model and return its hold-out RMSE.

    Parameters:
    trial (optuna.trial.Trial): Trial used to sample the hyperparameters.

    Returns:
    float: RMSE on the hold-out split (X_test_tts, y_test); lower is better.
    """
    gc.collect()
    params = {
        **FIXED_LGB_PARAMS,
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'num_leaves': trial.suggest_int('num_leaves', 10, 100),
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
        'lambda_l1': trial.suggest_int('lambda_l1', 0, 100),
        'lambda_l2': trial.suggest_int('lambda_l2', 0, 100),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
    }

    reg_lgb = lgb.LGBMRegressor(**params)

    reg_lgb.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test_tts, y_test)])

    y_pred_lgb = reg_lgb.predict(X_test_tts)

    rmse = np.sqrt(mean_squared_error(y_test, y_pred_lgb))

    return rmse

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=15)

print('Number of finished trials:', len(study.trials))
print('Best trial:')
trial = study.best_trial

print(f'  Value: {trial.value:.5f}')
print('  Params: ')
for key, value in trial.params.items():
    print(f'    {key}: {value}')    

# trial.params holds only the sampled values. The fixed settings (n_estimators,
# device_type, ...) must be added back, otherwise the refit silently falls back
# to the library defaults and is not the model that was tuned.
best_params = trial.params
reg_lgb = lgb.LGBMRegressor(**FIXED_LGB_PARAMS, **best_params)
reg_lgb.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test_tts, y_test)])
y_pred_lgb = reg_lgb.predict(X_test_tts)
rmse = np.sqrt(mean_squared_error(y_test, y_pred_lgb))
print("RMSE:", rmse)

# NOTE(review): this writes to the same lgbpredictions.csv as the baseline LightGBM cell.
cc_test_copy_v2 = cc_test_copy.copy()
cc_test_pred_lgb = reg_lgb.predict(cc_test_reduced)
cc_test_copy_v2[target] = cc_test_pred_lgb
cc_test_copy_v2[[target, "index"]].to_csv("lgbpredictions.csv", index=False)

In [23]:
# create a DMatrix from the training data
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_test)

# set up parameters for XGBoost
# list of learning_rates to test [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3]
# n_estimators = [50, 100, 150, 200]
# max_depth = [2, 4, 6, 8]
# param_grid = dict(max_depth=max_depth, n_estimators=n_estimators)
print("Training and predicting using xgboost")
# Define the search space for the hyperparameters
params = {'base_score': 0.5, 
          'booster': 'gbtree',
          'tree_method': 'gpu_hist',
          'n_estimators': 15000,
          'objective': 'reg:squarederror',
          'max_depth': 6,
          'subsample': 0.5,
          'colsample_bytree': 0.5,
          'gamma': 1.4,
          'min_child_weight': 7,
          'learning_rate': 0.01,
          'gpu_id': 0}

reg_xgb = xgb.XGBRegressor(**params)

reg_xgb.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test_tts, y_test)], verbose=1000)

# get the feature importance scores
importance_scores = reg_xgb.feature_importances_
feature_importances = pd.DataFrame({'feature': X_train.columns, 'importance': importance_scores})
feature_importances.to_csv("xgboostbestparameters.csv")
# sort the features by importance score
feature_importances = feature_importances.sort_values('importance', ascending=False)
print(feature_importances)


# make predictions on the test data
y_pred_xgb = reg_xgb.predict(X_test_tts)

# calculate the RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred_xgb))
print("RMSE:", rmse)


cc_test_pred = reg_xgb.predict(cc_test_reduced)
cc_test_copy[target] = cc_test_pred
cc_test_copy[[target,"index"]].to_csv("xgbpredictions.csv",index = False)
print("Finished training and fitting, created xgbpredictions,csv")

In [24]:
# LightGBM variant: max_depth=4 (num_leaves=31).
# NOTE(review): every LightGBM variant cell writes to lgbpredictions2.csv and
# lgbm_feature_importances2.csv, so each run replaces the previous files.
print("Beginning training and fitting lightgbm model")
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    'num_leaves': 31,
    'max_depth': 4,
    'learning_rate': 0.03,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    # NOTE(review): min_child_samples is listed as an alias of min_data_in_leaf in the
    # LightGBM parameter docs, so these two lines give one setting two values — confirm which is intended.
    'min_child_samples': 50,
    'min_data_in_leaf': 100,
    'subsample_for_bin': 200000,
    'n_estimators': 15000,
    'early_stopping_rounds': 50,
    'device_type': 'gpu'
}

# Create the LightGBM model object
reg_lgb = lgb.LGBMRegressor(**params)

# Fit the model; both the training split and the hold-out split are tracked as eval sets
reg_lgb.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test_tts, y_test)])


# Get feature importances
importance_scores = reg_lgb.feature_importances_
feature_importances = pd.DataFrame({'feature': X_train.columns, 'importance': importance_scores})

# Sort the features by importance score
feature_importances = feature_importances.sort_values('importance', ascending=False)

# Output feature importances to a CSV file
feature_importances.to_csv("lgbm_feature_importances2.csv", index=False)

# Generate predictions on the hold-out split
y_pred_lgb = reg_lgb.predict(X_test_tts)

# Calculate the RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred_lgb))
print("RMSE:", rmse)

# Make predictions on the competition test data
cc_test_copy_v2 = cc_test_copy.copy()
cc_test_pred_lgb = reg_lgb.predict(cc_test_reduced)
cc_test_copy_v2[target] = cc_test_pred_lgb
cc_test_copy_v2[[target, "index"]].to_csv("lgbpredictions2.csv", index=False)
# The message names the files that were actually written above.
print("Finished training and fitting lightgbm model, created lgbpredictions2.csv and lgbm_feature_importances2.csv")

In [25]:
# LightGBM variant: num_leaves=50 (max_depth=8).
# NOTE(review): every LightGBM variant cell writes to lgbpredictions2.csv and
# lgbm_feature_importances2.csv, so each run replaces the previous files.
print("Beginning training and fitting lightgbm model")
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    'num_leaves': 50,
    'max_depth': 8,
    'learning_rate': 0.03,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    # NOTE(review): min_child_samples is listed as an alias of min_data_in_leaf in the
    # LightGBM parameter docs, so these two lines give one setting two values — confirm which is intended.
    'min_child_samples': 50,
    'min_data_in_leaf': 100,
    'subsample_for_bin': 200000,
    'n_estimators': 15000,
    'early_stopping_rounds': 50,
    'device_type': 'gpu'
}

# Create the LightGBM model object
reg_lgb = lgb.LGBMRegressor(**params)

# Fit the model; both the training split and the hold-out split are tracked as eval sets
reg_lgb.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test_tts, y_test)])


# Get feature importances
importance_scores = reg_lgb.feature_importances_
feature_importances = pd.DataFrame({'feature': X_train.columns, 'importance': importance_scores})

# Sort the features by importance score
feature_importances = feature_importances.sort_values('importance', ascending=False)

# Output feature importances to a CSV file
feature_importances.to_csv("lgbm_feature_importances2.csv", index=False)

# Generate predictions on the hold-out split
y_pred_lgb = reg_lgb.predict(X_test_tts)

# Calculate the RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred_lgb))
print("RMSE:", rmse)

# Make predictions on the competition test data
cc_test_copy_v2 = cc_test_copy.copy()
cc_test_pred_lgb = reg_lgb.predict(cc_test_reduced)
cc_test_copy_v2[target] = cc_test_pred_lgb
cc_test_copy_v2[[target, "index"]].to_csv("lgbpredictions2.csv", index=False)
# The message names the files that were actually written above.
print("Finished training and fitting lightgbm model, created lgbpredictions2.csv and lgbm_feature_importances2.csv")

In [26]:
# LightGBM variant: max_depth=8, with the settings split into fixed and tunable dictionaries.
# NOTE(review): every LightGBM variant cell writes to lgbpredictions2.csv and
# lgbm_feature_importances2.csv, so each run replaces the previous files.
print("Beginning training and fitting lightgbm model")
# Settings that stay the same across the variant runs
fixed_params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    'device_type': 'gpu'
}
# Settings that are varied between runs
params = {
    'num_leaves': 31,
    'max_depth': 8,
    'learning_rate': 0.03,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    # NOTE(review): min_child_samples is listed as an alias of min_data_in_leaf in the
    # LightGBM parameter docs, so these two lines give one setting two values — confirm which is intended.
    'min_child_samples': 50,
    'min_data_in_leaf': 100,
    'subsample_for_bin': 200000,
    'n_estimators': 15000,
    'early_stopping_rounds': 50,
    
}

# Create the LightGBM model object
reg_lgb = lgb.LGBMRegressor(**params, **fixed_params)

# Fit the model; both the training split and the hold-out split are tracked as eval sets
reg_lgb.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test_tts, y_test)])


# Get feature importances
importance_scores = reg_lgb.feature_importances_
feature_importances = pd.DataFrame({'feature': X_train.columns, 'importance': importance_scores})

# Sort the features by importance score
feature_importances = feature_importances.sort_values('importance', ascending=False)

# Output feature importances to a CSV file
feature_importances.to_csv("lgbm_feature_importances2.csv", index=False)

# Generate predictions on the hold-out split
y_pred_lgb = reg_lgb.predict(X_test_tts)

# Calculate the RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred_lgb))
print("RMSE:", rmse)

# Make predictions on the competition test data
cc_test_copy_v2 = cc_test_copy.copy()
cc_test_pred_lgb = reg_lgb.predict(cc_test_reduced)
cc_test_copy_v2[target] = cc_test_pred_lgb
cc_test_copy_v2[[target, "index"]].to_csv("lgbpredictions2.csv", index=False)
# The message names the files that were actually written above.
print("Finished training and fitting lightgbm model, created lgbpredictions2.csv and lgbm_feature_importances2.csv")

In [27]:
# LightGBM variant: max_depth=10, with the settings split into fixed and tunable dictionaries.
# NOTE(review): every LightGBM variant cell writes to lgbpredictions2.csv and
# lgbm_feature_importances2.csv, so each run replaces the previous files.
print("Beginning training and fitting lightgbm model")
# Settings that stay the same across the variant runs
fixed_params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    'device_type': 'gpu'
}
# Settings that are varied between runs
params = {
    'num_leaves': 31,
    'max_depth': 10,
    'learning_rate': 0.03,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    # NOTE(review): min_child_samples is listed as an alias of min_data_in_leaf in the
    # LightGBM parameter docs, so these two lines give one setting two values — confirm which is intended.
    'min_child_samples': 50,
    'min_data_in_leaf': 100,
    'subsample_for_bin': 200000,
    'n_estimators': 15000,
    'early_stopping_rounds': 50,
    
}

# Create the LightGBM model object
reg_lgb = lgb.LGBMRegressor(**params, **fixed_params)

# Fit the model; both the training split and the hold-out split are tracked as eval sets
reg_lgb.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test_tts, y_test)])


# Get feature importances
importance_scores = reg_lgb.feature_importances_
feature_importances = pd.DataFrame({'feature': X_train.columns, 'importance': importance_scores})

# Sort the features by importance score
feature_importances = feature_importances.sort_values('importance', ascending=False)

# Output feature importances to a CSV file
feature_importances.to_csv("lgbm_feature_importances2.csv", index=False)

# Generate predictions on the hold-out split
y_pred_lgb = reg_lgb.predict(X_test_tts)

# Calculate the RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred_lgb))
print("RMSE:", rmse)

# Make predictions on the competition test data
cc_test_copy_v2 = cc_test_copy.copy()
cc_test_pred_lgb = reg_lgb.predict(cc_test_reduced)
cc_test_copy_v2[target] = cc_test_pred_lgb
cc_test_copy_v2[[target, "index"]].to_csv("lgbpredictions2.csv", index=False)
# The message names the files that were actually written above.
print("Finished training and fitting lightgbm model, created lgbpredictions2.csv and lgbm_feature_importances2.csv")

In [28]:
# Display the current timestamp.
datetime.now()

datetime.datetime(2023, 2, 23, 10, 19, 52, 705252)

In [29]:
# Per-round evaluation metrics recorded during the most recent LightGBM fit, keyed by eval-set name.
# This displays every recorded round, so the output is long.
reg_lgb.evals_result_

{'training': OrderedDict([('rmse',
               [9.604810165663874,
                9.348818281158453,
                9.09959901987316,
                8.858232971804432,
                8.62458358378795,
                8.399439000403053,
                8.180181285331042,
                7.966389953510873,
                7.760146399471705,
                7.560541834482607,
                7.367270968455104,
                7.178486266811673,
                6.99625588410554,
                6.8195159947513275,
                6.6476686215295215,
                6.480450410605093,
                6.320218415984396,
                6.162644337605795,
                6.012316055024826,
                5.864671166510509,
                5.722343163619358,
                5.585110811303676,
                5.451483928416136,
                5.321409962711398,
                5.196503708571715,
                5.074055228666145,
                4.956273236783989,
                4.841

In [30]:
# Names under which LightGBM stored the eval sets passed to fit().
reg_lgb.evals_result_.keys()

dict_keys(['training', 'valid_1'])

In [31]:
# Full per-round training RMSE history of the most recently fitted LightGBM model.
history = reg_lgb.evals_result_
history['training']['rmse']

[9.604810165663874,
 9.348818281158453,
 9.09959901987316,
 8.858232971804432,
 8.62458358378795,
 8.399439000403053,
 8.180181285331042,
 7.966389953510873,
 7.760146399471705,
 7.560541834482607,
 7.367270968455104,
 7.178486266811673,
 6.99625588410554,
 6.8195159947513275,
 6.6476686215295215,
 6.480450410605093,
 6.320218415984396,
 6.162644337605795,
 6.012316055024826,
 5.864671166510509,
 5.722343163619358,
 5.585110811303676,
 5.451483928416136,
 5.321409962711398,
 5.196503708571715,
 5.074055228666145,
 4.956273236783989,
 4.84177345261025,
 4.732141488473863,
 4.6249286813562165,
 4.522676697679745,
 4.424038732589133,
 4.325781131974011,
 4.231393074964938,
 4.140896019222796,
 4.053213378264655,
 3.9673053166684737,
 3.8849845562597545,
 3.805016024414378,
 3.7275601683204953,
 3.650946820953754,
 3.5791094870786555,
 3.5096958952178174,
 3.4415390699823534,
 3.376758392221133,
 3.3138377862314092,
 3.2524901474747367,
 3.193353399633038,
 3.1365680129637203,
 3.081124463

In [32]:
# Lowest training RMSE across all recorded rounds.
history = reg_lgb.evals_result_
min(history['training']['rmse'])

0.20449945480130485

In [33]:
# Training RMSE at the last recorded round.
history = reg_lgb.evals_result_
history['training']['rmse'][-1]

0.20449945480130485

In [34]:
# Hold-out RMSE at the last recorded round. The eval-set key is 'valid_1'
# (with an underscore); 'valid 1' is not a key of the history and raises KeyError.
history['valid_1']['rmse'][-1]

In [35]:
# List the eval-set names available in the recorded history.
history.keys()

dict_keys(['training', 'valid_1'])

In [36]:
# Hold-out ('valid_1') RMSE at the last recorded round.
history['valid_1']['rmse'][-1]

0.2837496850280117

In [37]:
# LightGBM variant: max_depth=9, tracked with Weights & Biases.
# NOTE(review): every LightGBM variant cell writes to lgbpredictions2.csv and
# lgbm_feature_importances2.csv, so each run replaces the previous files.
import wandb
wandb.login()
print("Beginning training and fitting lightgbm model")
# Settings that stay the same across the variant runs
fixed_params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    'device_type': 'gpu'
}
# Settings that are varied between runs
params = {
    'num_leaves': 31,
    'max_depth': 9,
    'learning_rate': 0.03,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    # NOTE(review): min_child_samples is listed as an alias of min_data_in_leaf in the
    # LightGBM parameter docs, so these two lines give one setting two values — confirm which is intended.
    'min_child_samples': 50,
    'min_data_in_leaf': 100,
    'subsample_for_bin': 200000,
    'n_estimators': 15000,
    'early_stopping_rounds': 50,
}

# Record the complete model configuration (fixed and varied settings) so the
# tracked run describes the model that is actually trained.
wandb.init(
    project="WiDS",
    config={**fixed_params, **params},
    name=f"{datetime.now()}"
)

# Create the LightGBM model object
reg_lgb = lgb.LGBMRegressor(**params, **fixed_params)

# Fit the model; both the training split and the hold-out split are tracked as eval sets
reg_lgb.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test_tts, y_test)])


# Get feature importances
importance_scores = reg_lgb.feature_importances_
feature_importances = pd.DataFrame({'feature': X_train.columns, 'importance': importance_scores})

# Sort the features by importance score
feature_importances = feature_importances.sort_values('importance', ascending=False)

# Output feature importances to a CSV file
feature_importances.to_csv("lgbm_feature_importances2.csv", index=False)

# Generate predictions on the hold-out split
y_pred_lgb = reg_lgb.predict(X_test_tts)

# Calculate the RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred_lgb))
print("RMSE:", rmse)

# Log the RMSE of the last recorded round for both eval sets
history = reg_lgb.evals_result_
train_rmse = history['training']['rmse'][-1]
val_rmse = history['valid_1']['rmse'][-1]
wandb.log({"train rmse": train_rmse, "val rmse": val_rmse})

# Make predictions on the competition test data
cc_test_copy_v2 = cc_test_copy.copy()
cc_test_pred_lgb = reg_lgb.predict(cc_test_reduced)
cc_test_copy_v2[target] = cc_test_pred_lgb
cc_test_copy_v2[[target, "index"]].to_csv("lgbpredictions2.csv", index=False)
# The message names the files that were actually written above.
print("Finished training and fitting lightgbm model, created lgbpredictions2.csv and lgbm_feature_importances2.csv")

In [38]:
# Close the Weights & Biases run opened by wandb.init().
wandb.finish()