In [None]:
# import required packages
import pandas as pd
import math
import sklearn.metrics
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import matplotlib.dates as md
import seaborn as sns
import numpy as np
import xgboost as xgb
from category_encoders.target_encoder import TargetEncoder


In [None]:
# load the train and test sets, parsing 'startdate' as a datetime column
date_columns = ['startdate']
train_df = pd.read_csv('train_data.csv', parse_dates=date_columns)
test_df = pd.read_csv('test_data.csv', parse_dates=date_columns)

In [None]:
train_df.head()

In [None]:
test_df.head()

In [None]:
train_df.info()

In [None]:
test_df.info()

In [None]:
train_df.describe()

In [None]:
test_df.describe()

In [None]:
# Boolean NaN mask of the training frame, computed once and reused below
train_nan_mask = train_df.isna()

# Per column: does it contain any NaN at all?
nan_cols_train = train_nan_mask.any()

# NaN count, restricted to the columns that have at least one NaN
nan_count_train = train_nan_mask.sum()[nan_cols_train]

print(nan_count_train)

In [None]:
# Same NaN check as for the training data, applied to the test frame
test_nan_mask = test_df.isna()
nan_cols_test = test_nan_mask.any()

# NaN count, restricted to the columns that have at least one NaN
nan_count_test = test_nan_mask.sum()[nan_cols_test]

print(nan_count_test)

In [None]:
nan_cols_list = nan_count_train.index.tolist()
nan_cols_list

In [None]:
# Impute the NaNs found above (training data only):
# * tmp columns: use the same-row value of the column nmme-tmp2m-34w__cfsv2, as it
#   has the best rmse (from a calculation done further below)
# * prate columns: use the mean of the column

tmp_columns = ['nmme0-tmp2m-34w__ccsm30', 'nmme-tmp2m-56w__ccsm3', 'nmme-tmp2m-34w__ccsm3', 'ccsm30']
prate_columns = ['nmme-prate-34w__ccsm3','nmme0-prate-56w__ccsm30','nmme0-prate-34w__ccsm30','nmme-prate-56w__ccsm3']

# Iterate over the frame's own columns so that names missing from train_df are skipped.
for column in train_df.columns:
    if column in tmp_columns:
        train_df[column] = train_df[column].fillna(value=train_df['nmme-tmp2m-34w__cfsv2'])
    if column in prate_columns:
        # calculate the mean value of the column
        mean_value = train_df[column].mean()

        # Assign the filled column back. Calling fillna(inplace=True) on the
        # selection `train_df[column]` acts on a temporary object and is not
        # guaranteed to write through to train_df (it never does under pandas
        # copy-on-write), which would silently leave the NaNs in place.
        train_df[column] = train_df[column].fillna(value=mean_value)

In [None]:
# Sanity check after imputation: the printed Series should now be empty

# Boolean NaN mask of the training frame
train_nan_mask = train_df.isna()

# Columns that still contain NaNs, and how many each of them has
nan_cols_train = train_nan_mask.any()
nan_count_train = train_nan_mask.sum()[nan_cols_train]

print(nan_count_train)

In [None]:
# Shorter aliases for the most frequently used columns; the same mapping is applied
# to both frames (names that a frame does not contain are left alone by rename)
column_aliases = {
    'climateregions__climateregion': 'region',
    'contest-tmp2m-14d__tmp2m': 'target_tmp',
}

train_df = train_df.rename(columns=column_aliases)
test_df = test_df.rename(columns=column_aliases)

#### Merge lon and lat

Combining the `lon` and `lat` columns into a single key gives a unique identifier for each location.

In [None]:
# Round lon and lat to a fixed number of decimals in both frames.
# Based on Kaggle notebook: https://www.kaggle.com/code/flaviafelicioni/wids-2023-different-locations-train-test-solved#Solution
# (presumably so that the same location gets identical coordinates in train and test)

scale = 14

for frame in (train_df, test_df):
    for coord in ('lat', 'lon'):
        frame.loc[:, coord] = frame[coord].round(scale)

In [None]:
# Columns whose values are joined (in this order) into the location key
columns_to_merge = ['lon', 'lat']

# Build the '<lon>_<lat>' key column by column instead of with a row-wise
# DataFrame.apply(axis=1), which calls a Python function once per row.
# Every value is still converted with astype(str) and joined with '_',
# so the resulting strings are the same as before.
for frame in (train_df, test_df):
    parts = [frame[col].astype(str) for col in columns_to_merge]
    lon_lat_key = parts[0]
    for part in parts[1:]:
        lon_lat_key = lon_lat_key + '_' + part
    frame['lon_lat'] = lon_lat_key

In [None]:
# number of distinct values per column, laid out as a single wide row

unique_counts_df = train_df.nunique().to_frame(name='Unique Values').T
unique_counts_df.head()

#### Calculate moving averages for daily features

In [None]:
# # change daily values to a moving average calculated the same way as target_tmp (avg of min and max value over 14 days)

# daily_measures_list = train_df.filter(regex='mjo1d__amplitude|^sst|^wind').columns.tolist()


# def create_moving_averages(df):
#     for column in df.columns:
#         if column in daily_measures_list:
#             new_col_name = column+'_avg_14d'
#             # calculate the rolling average for 14 days, NOTE: how to do min max instead?
#             df[new_col_name] = df.groupby('region')[column].transform(lambda x: x.rolling(14, 1).mean())
#     df.drop(daily_measures_list, axis=1, inplace=True)        

# create_moving_averages(train_df)
# create_moving_averages(test_df)


In [None]:
# train_df['lon_lat'].unique().tolist()[0:15]

In [None]:
# example_df = train_df[(train_df['startdate']< '2015-10-01') & ((train_df['lon_lat']== '0.83333333333333_0.0') | (train_df['lon_lat']== '0.83333333333333_0.04545454545455'))]

# example_df = example_df[['startdate','lon_lat', 'wind-vwnd-925-2010-11_avg_14d']]

# example_df.tail(50)

In [None]:
# forecasts_34_56_list = train_df.filter(regex='34|56').columns.tolist()
# forecasts_34_56_list

### EDA

The following graphs will be created to get a better understanding of the data:

* The average target_tmp for each region over time
* The variance in temperature per region on a specific date
* Each temperature forecast vs target temperature

In [None]:
# Mean target temperature per (region, startdate), stored in a column named 'avg_tmp'
avg_tmp_df = (
    train_df
    .groupby(['region', 'startdate'])['target_tmp']
    .mean()
    .rename('avg_tmp')
    .reset_index()
)

avg_tmp_df.info()

In [None]:
# One panel per region: average target temperature over time

# set the regions
regions = avg_tmp_df['region'].unique()

# prepare the figure: size the grid from the number of regions instead of
# assuming a fixed 5x3 layout (15 regions still give a 5x3 grid of 15x10 inches)
n_cols = 3
n_rows = max(1, math.ceil(len(regions) / n_cols))
fig, axs = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(15, 2 * n_rows), squeeze=False)
flat_axes = axs.ravel()  # row-major order: panel i sits at [i // n_cols, i % n_cols]

# iterate through the regions
for ax, region in zip(flat_axes, regions):
    data = avg_tmp_df[avg_tmp_df['region'] == region]
    sns.lineplot(ax = ax, x='startdate', y='avg_tmp', data=data)
    ax.set_title(region)
    ax.xaxis.set_major_locator(md.WeekdayLocator(byweekday = 1, interval = 12))
    ax.xaxis.set_major_formatter(md.DateFormatter('%Y-%m-%d'))
    plt.setp(ax.xaxis.get_majorticklabels(), rotation = 45)
    ax.xaxis.set_minor_locator(md.DayLocator(interval = 1))
    ax.tick_params(axis = 'x', which = 'major', length = 10)
    ax.tick_params(axis = 'x', which = 'minor', length = 5)
    # Label this panel explicitly: plt.xlabel / plt.ylabel act on the *current*
    # axes only, not on the `ax` being drawn in this iteration.
    ax.set_xlabel('Date')
    ax.set_ylabel('Temperature')

# hide the panels that are left over when the grid has more cells than regions
for unused_ax in flat_axes[len(regions):]:
    unused_ax.set_visible(False)

# adjust the layout
fig.tight_layout()

# display the plot
plt.show()

#### Checking how much the temperature varies within each location of a region

In [None]:
# Mean and standard deviation of 'target_tmp' for every region
grouped = train_df.groupby('region')
target_by_region = grouped['target_tmp']

mean_temp = target_by_region.mean()
std_temp = target_by_region.std()

# print the results, one labelled Series after the other
for label, stats in (
    ("Mean Temperature by Region: \n", mean_temp),
    ("Standard deviation Temperature by Region: \n", std_temp),
):
    print(label, stats)

In [None]:
# Per (region, startdate): mean, standard deviation and row count of target_tmp,
# ordered so that the groups with the largest spread come first
agg_temp = (
    train_df
    .groupby(['region', 'startdate'])['target_tmp']
    .agg(Mean='mean', Std='std', Count='count')
    .sort_values(by='Std', ascending=False)
    .reset_index()
)

agg_temp.head(10)

In [None]:
# How much did the target temperature differ within each region on 2015-01-01?

on_date = train_df['startdate'] == '2015-01-01'
specific_date_df = train_df[on_date]

# one box per region, showing the spread of target_tmp across that region's rows
fig, ax = plt.subplots(figsize=(10, 8))
sns.boxplot(data=specific_date_df, x="region", y="target_tmp", ax=ax)
plt.show()

#### Display each temperature forecast together with target_tmp to see how well the forecasts do

In [None]:
# subset df to only contain nmme columns, startdate, lon_lat and target for visualisation
df_filtered = train_df.filter(regex='^nmme-tmp|^nmme0-tmp|^target|^startdate|^lon_lat')
df_filtered.head()

In [None]:
# Narrow the data down to a short date range and one specific location,
# then drop the location key since it is constant in the selection

in_date_range = df_filtered['startdate'] < '2015-10-01'
at_location = df_filtered['lon_lat'] == '0.83333333333333_0.0'

df_example = df_filtered[in_date_range & at_location].drop(columns='lon_lat')
df_example.head()


In [None]:
# # plot the forecasts vs target_tmp

# df_example.set_index('startdate', inplace=True)


# # Loop through the temperature forecasting columns
# for column in df_example.columns:
#     if column != 'target_tmp':
#         # Create a line plot of the current temperature column and target_tmp
#         df_example[[column, 'target_tmp']].plot(kind='line')

#         # Add a title and labels for the x and y axis
#         plt.title(f'Temperature comparison ({column} vs target_tmp)')
#         plt.xlabel('Date')
#         plt.ylabel('Temperature (°C)')

#         # Show the plot
#         plt.show()

### Calculate RMSE for forecasts

To check how well the various forecasts perform, let's calculate the RMSE of each one against the target temperature.

In [None]:
# Score every temperature-forecast column against the observed target temperature
df_filtered = train_df.filter(regex='^nmme-tmp|^nmme0-tmp|nmme0$|^target|^cancm30|^cancm40|^ccsm30|^ccsm40|^cfsv20')

actual = df_filtered['target_tmp']

for column in df_filtered.columns:
    if column == 'target_tmp':
        continue
    # Fill gaps with the column mean so the metric can be computed. fillna()
    # returns a new Series here, so df_filtered itself is left untouched; the
    # previous in-place fill on a selected column was a chained assignment
    # whose effect on the underlying frame is not reliable.
    predicted = df_filtered[column].fillna(df_filtered[column].mean())
    mse = sklearn.metrics.mean_squared_error(actual, predicted)
    rmse = math.sqrt(mse)
    print(column,': ','\n', rmse, '\n')

In [None]:
# Remove the forecast-model columns (and the other columns matched by the regex below,
# which don't seem to be useful when comparing train vs test distributions)
# from both the train and the test frame.
forecast_models_col_list = train_df.filter(
    regex='^nmme|^cancm|^ccsm|cfsv20|nasa0|gfdlflora0|gfdlflorb0|gfdl0|^icec'
).columns.tolist()

# Keep exactly one temperature model (the one with the lowest rmse from above) and the
# most relevant prate model by taking them off the drop list. list.remove raises
# ValueError if a name is not in the list.
for kept_column in ('nmme0mean', 'nmme0-prate-56w__cancm30'):
    forecast_models_col_list.remove(kept_column)

train_df = train_df.drop(columns=forecast_models_col_list)
test_df = test_df.drop(columns=forecast_models_col_list)


### Creating dataframes for the same type of columns to implement correlation matrices

In [None]:
def filter_df(df, string):
    """Select the columns of ``df`` whose name starts with ``string`` or with ``target_tmp``.

    ``string`` is interpolated into the regular expression as-is (not escaped),
    so regex metacharacters in it keep their special meaning.

    Returns the filtered DataFrame produced by ``DataFrame.filter``.
    """
    pattern = f'^{string}|^target_tmp'
    return df.filter(regex=pattern)


In [None]:
icec_df = filter_df(train_df, "icec")
icec_df.head()

In [None]:
# nmme_prate_df = filter_df(train_df, "nmme-prate")
# nmme0_prate_df = filter_df(train_df, "nmme0-prate")
# nmme_tmp_df = filter_df(train_df, "nmme-tmp")
# nmme0_tmp_df = filter_df(train_df, "nmme0-tmp")
# sst_df = filter_df(train_df, "sst")
# wind_hgt_10_df = filter_df(train_df, "wind-hgt-10-")
# wind_hgt_850_df = filter_df(train_df, "wind-hgt-850-")
# wind_uwnd_250_df = filter_df(train_df, "wind-uwnd-250")
# wind_vwnd_250_df = filter_df(train_df, "wind-vwnd-250")

In [None]:
def create_corr_matrix(df):
    """Plot the pairwise correlation matrix of ``df`` as an annotated heatmap.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose numeric (and boolean) columns are correlated with each
        other. Columns of any other dtype are ignored.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # Correlate numeric data only, so that a text or datetime column cannot make
    # corr() raise (pandas >= 2.0 no longer drops such columns silently).
    corr = df.select_dtypes(include=['number', 'bool']).corr()

    # Set up the matplotlib figure
    fig, ax = plt.subplots(figsize=(11, 9))

    # Generate a custom diverging colormap
    cmap = sns.diverging_palette(10, 210, as_cmap=True)

    # Draw the heatmap on the axes created above, with a fixed -1..1 colour
    # scale centred on 0 so that different matrices are visually comparable.
    sns.heatmap(corr, cmap=cmap, annot=True, vmin=-1, vmax=1, center=0, ax=ax)

    plt.show()

In [None]:
# create_corr_matrix(icec_df)

In [None]:
# create_corr_matrix(nmme_prate_df)

In [None]:
# create_corr_matrix(nmme0_prate_df)

In [None]:
# create_corr_matrix(nmme_tmp_df)

In [None]:
# create_corr_matrix(nmme0_tmp_df)

In [None]:
# create_corr_matrix(sst_df)

In [None]:
# create_corr_matrix(wind_hgt_850_df)

In [None]:
# create_corr_matrix(wind_uwnd_250_df)

In [None]:
# create_corr_matrix(wind_vwnd_250_df)

## XGBoost 

#### Feature engineering

* Create time features
* Make them cyclical where necessary

In [None]:
# Resource: https://colab.research.google.com/drive/10r73mOp1R7cORfeuP97V65a-rgwGyfWr?usp=sharing#scrollTo=p8piDWNxFq0H

from sklearn.preprocessing import FunctionTransformer

def sin_transformer(period):
    """Return a FunctionTransformer that maps x to sin(x / period * 2 * pi)."""
    def _sine(x):
        return np.sin(x / period * 2 * np.pi)

    return FunctionTransformer(_sine)


def cos_transformer(period):
    """Return a FunctionTransformer that maps x to cos(x / period * 2 * pi)."""
    def _cosine(x):
        return np.cos(x / period * 2 * np.pi)

    return FunctionTransformer(_cosine)

def encode_cyclical(df):
    """Add cyclical (sin/cos) calendar features derived from ``df['startdate']``.

    The frame is modified in place. The following columns are added, in this
    order: ``day_sin``, ``day_cos`` (day of the year, period 365), ``week_sin``,
    ``week_cos`` (ISO week number, period 52), ``month_sin``, ``month_cos``
    (month, period 12) and ``year`` (plain calendar year, not encoded).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime column named ``startdate``.

    Returns
    -------
    None
    """
    dates = df['startdate'].dt

    # (values, period) for each encoded component.
    # * 'day' uses the day of the YEAR: the day of the month (1-31) would only
    #   cover a small fraction of a 365-day cycle.
    # * isocalendar().week comes back with a nullable UInt32 dtype; every
    #   component is cast to plain float64 below so that all resulting feature
    #   columns are ordinary NumPy floats.
    components = {
        'day': (dates.dayofyear, 365),
        'week': (dates.isocalendar().week, 52),
        'month': (dates.month, 12),
    }

    for prefix, (values, period) in components.items():
        angle = values.astype('float64') / period * 2 * np.pi
        df[f'{prefix}_sin'] = np.sin(angle)
        df[f'{prefix}_cos'] = np.cos(angle)

    # year does not need encoding
    df['year'] = dates.year

In [None]:
encode_cyclical(train_df)
encode_cyclical(test_df)
train_df.head()

### Encode categorical variables

#### One-hot-encoding

In [None]:
# # encode categorical variables so that they can be used by the model

# train_df_onehot = no_fc_df.copy()
# train_df_onehot = pd.get_dummies(train_df_onehot, columns=['year', 'month', 'week','day', 'region', 'lon_lat'])

# #drop columns we don't need anymore
# train_df_onehot = train_df_onehot.drop(['lon', 'lat'], axis=1)

# train_df_onehot.head()

#### Target encoding

In [None]:
# Target-encode the two categorical location columns
encoder = TargetEncoder(cols=['region', 'lon_lat'], smoothing = 10)

# Fit on the training features, with the target passed separately.
# NOTE(review): the encoder is fit on ALL training rows, including the rows that are
# later held out as the validation set (split on startdate), so the encoded values
# carry target information from the validation period - confirm this is acceptable.
train_df_no_target = train_df.drop(columns='target_tmp')
encoder.fit(train_df_no_target, train_df['target_tmp'])

# Transform both frames and drop the columns that are not used as features
unused_columns = ['index', 'lat', 'lon']
encoded_df_train = encoder.transform(train_df_no_target).drop(columns=unused_columns)
encoded_df_test = encoder.transform(test_df).drop(columns=unused_columns)

# Put the target back next to the encoded training features
encoded_df_train = pd.concat([encoded_df_train, train_df['target_tmp']], axis=1)
encoded_df_train.head()
# encoded_df_test.head()

#### Perform PCA for dimensionality reduction

In [None]:
# startdate_df = encoded_df_train[['startdate', 'target_tmp']]
# startdate_df.head()

In [None]:
# from sklearn.preprocessing import StandardScaler
# from sklearn.decomposition import PCA
# import numpy as np

In [None]:
# #define scaler
# scaler = StandardScaler()

# #create copy of DataFrame
# scaled_df = encoded_df_train.copy()
# # scaled_df = scaled_df.drop(['year', 'month', 'week', 'day', 'region', 'lon_lat', 'startdate', 'lon', 'lat', 'target_tmp'], axis=1)
# scaled_df = scaled_df.drop(['startdate'], axis=1)
# pca_columns = list(scaled_df.columns.values)

# #created scaled version of DataFrame
# scaled_df = pd.DataFrame(scaler.fit_transform(scaled_df), columns=scaled_df.columns)
# scaled_df.head()

In [None]:
# scaled_df.shape

In [None]:
# #define PCA model to use
# pca = PCA(n_components=50)

# #fit PCA model to data
# pca_fit = pca.fit(scaled_df)


In [None]:
# pc_values = np.arange(pca.n_components_) + 1
# plt.plot(pc_values, pca.explained_variance_ratio_, 'o-', linewidth=2, color='blue')
# plt.title('Scree Plot')
# plt.xlabel('Principal Component')
# plt.ylabel('Variance Explained')
# plt.show() 

In [None]:
# plt.plot(np.cumsum(pca_fit.explained_variance_ratio_))
# plt.grid()
# plt.xlabel('Number of Principal Components')
# plt.ylabel('Explained Variance')
# sns.despine();

In [None]:
# # transform the original dataset to the reduced dataset
# reduced_df = pca_fit.transform(scaled_df)
# reduced_df = pd.DataFrame(reduced_df)
# reduced_df.shape

In [None]:
# # merge the reduced dataframe with the startdate

# train_df_onehot_reduced = encoded_df_train.drop(pca_columns, axis=1)
# train_df_onehot_reduced.head()
# train_df_onehot_reduced.shape

In [None]:
# merged_df = pd.concat([reduced_df, startdate_df], axis=1)
# merged_df.shape

In [None]:
# Time-based hold-out: rows before the split date are used for training,
# rows on or after it for validation
split_date = '2016-07-01'
startdates = encoded_df_train['startdate']

train = encoded_df_train.loc[startdates < split_date]
validation = encoded_df_train.loc[startdates >= split_date]

In [None]:
train = train.drop(['startdate'], axis=1)
len(train)

In [None]:
validation = validation.drop(['startdate'], axis=1)
len(validation)

In [None]:
no_target = train.drop(['target_tmp'], axis=1)
no_target.head()

In [None]:
#Get the list of all column names from headers
features = list(no_target.columns.values)
target = 'target_tmp'


In [None]:
# Feature matrix and target vector for the training and the validation split
X_train, y_train = train[features], train[target]
X_val, y_val = validation[features], validation[target]

In [None]:
# data_drift_list = encoded_df_train.filter(regex='^mei|^sst|^wind|contest-pevpr-sfc-gauss-14d__pevpr')\
#                          .columns.tolist()

# X_train_2 = X_train[data_drift_list]
# X_val_2 = X_val[data_drift_list]

In [None]:
# data_drift_list

In [None]:
# from sklearn.model_selection import cross_val_score
# from sklearn.model_selection import RepeatedKFold
# from numpy import absolute

In [None]:
# X = train_df_onehot[features]
# y = train_df_onehot[target]

In [None]:
# # why did it have an extra index column?

# X.drop(columns=['index'], inplace=True)
# X.head()

In [None]:
# y.head()

In [None]:
# model = xgb.XGBRegressor(n_estimators=100, learning_rate = 0.1, max_depth = 5, gamma = 1, subsample = 0.5)
# # define model evaluation method
# cv = RepeatedKFold(n_splits=2, n_repeats=1, random_state=1)
# # evaluate model
# scores = cross_val_score(model, X, y, scoring='neg_root_mean_squared_error', cv=cv, n_jobs=-1, verbose = 1)
# # force scores to be positive
# scores = absolute(scores)
# print('RMSE: %.3f (%.3f)' % (scores.mean(), scores.std()) )

In [None]:
# # BEST model so far, with rmse of 0.90139 after 1000 rounds (n_estimators = 1000), next try with 10000)

# reg = xgb.XGBRegressor(n_estimators=1000, early_stopping_rounds = 30, learning_rate = 0.02, max_depth = 5, gamma = 1, subsample = 0.5) 
# reg.fit(X_train, y_train, eval_set = [(X_train, y_train), (X_val, y_val)], verbose = True)

In [None]:
# Try next

# reg = xgb.XGBRegressor(n_estimators=1000, early_stopping_rounds = 30, learning_rate = 0.02, max_depth = 5, gamma = 1) 
# reg.fit(X_train, y_train, eval_set = [(X_train, y_train), (X_val, y_val)], verbose = True)

In [None]:
# # average rmse: 0.95

# from datetime import datetime, timedelta

# exclude_cols = ['startdate']
# features = [c for c in encoded_df_train.columns if ((c != target) & (c not in exclude_cols))]
# target = 'target_tmp'

# N_FOLDS = 5

# for fold in range(N_FOLDS):
#     training_date = encoded_df_train['startdate'].max() - timedelta(14) * (N_FOLDS-fold)
#     valid_date = training_date + timedelta(14)
#     print(f"\nFold {fold}: \ntraining data from {encoded_df_train['startdate'].min()} to {training_date}\nvalidation data from {training_date + timedelta(1)} to {valid_date}")
#     train = encoded_df_train[encoded_df_train['startdate'] <= training_date]
#     val  = encoded_df_train[(encoded_df_train['startdate'] > training_date) & (encoded_df_train['startdate'] <= valid_date) ]
    
#     X_train = train[features]
#     y_train = train[target]

#     X_val = val[features]
#     y_val = val[target]
    
#     reg = xgb.XGBRegressor(n_estimators=1000, learning_rate = 0.05, max_depth = 8, gamma = 1, early_stopping_rounds = 30, seed=30) 
#     reg.fit(X_train, y_train, eval_set = [(X_train, y_train), (X_val, y_val)], verbose = 1)


In [None]:
# exclude_cols = ['startdate']
# features = [c for c in encoded_df_train.columns if ((c != target) & (c not in exclude_cols))]
# target = 'target_tmp'

# X_train = encoded_df_train[features]
# y_train = encoded_df_train[target]

# model = xgb.XGBRegressor(n_estimators=2000, learning_rate = 0.05, max_depth = 6, gamma = 1, seed=30) 
# model.fit(X_train, y_train)


In [None]:
# with n_estimators=1000, learning_rate = 0.1, max_depth = 6, gamma = 1, RMSE = 0.97 (forgot to put seed with this)

# training the model
# model = xgb.XGBRegressor(n_estimators=500, learning_rate = 0.08, max_depth = 6, seed=30, early_stopping_rounds = 100, gamma=1) 
# model.fit(X_train, y_train, eval_set = [(X_train, y_train), (X_val, y_val)], verbose = True)


In [None]:
# model_2 = xgb.XGBRegressor(n_estimators=100, learning_rate = 0.1, max_depth = 6, seed=30, early_stopping_rounds = 100, gamma=1) 
# model_2.fit(X_train_2, y_train, eval_set = [(X_train_2, y_train), (X_val_2, y_val)], verbose = True)

In [None]:
# from xgboost import plot_importance

# # plot feature importance
# plot_importance(model, max_num_features = 30)
# plt.show()

In [None]:
# # from xgboost import plot_importance

# # plot feature importance
# plot_importance(model_2, max_num_features = 30)
# plt.show()

In [None]:
# fig, ax = plt.subplots()
# sns.kdeplot(data=train_df['contest-pevpr-sfc-gauss-14d__pevpr'],  color='blue', fill=True, ax=ax)
# sns.kdeplot(data=test_df['contest-pevpr-sfc-gauss-14d__pevpr'], color='orange', fill=True, ax=ax)
# plt.show()

In [None]:
# # loading the sample submission file
# submission = pd.read_csv('sample_solution.csv')
# submission.head()

In [None]:
# # #making predictions and replacing the values of the sample file
# target_variable = 'contest-tmp2m-14d__tmp2m'
# submission[target_variable] = model.predict(encoded_df_test[features])

In [None]:
# #making predictions and replacing the values of the sample file
# # target_variable = 'contest-tmp2m-14d__tmp2m'
# submission['model_2'] = model_2.predict(encoded_df_test[data_drift_list])

In [None]:
# submission[target_variable]= submission['model_1'] * 0.9 + submission['model_2'] * 0.1

In [None]:
# submission.drop(['model_1', 'model_2'], axis=1, inplace = True)

In [None]:
# submission.tail(20)

In [None]:
# #save the submission file
# submission.to_csv('submission.csv', index = False) 

### Catboost

In [None]:
# import catboost
# print(catboost.__version__)

In [None]:
# from catboost import CatBoostRegressor, Pool

In [None]:
# cb = CatBoostRegressor(n_estimators=500, learning_rate = 0.1, max_depth = 6, early_stopping_rounds = 100, loss_function='RMSE', verbose = True)

# pool_train = Pool(X_train, y_train, cat_features = ['region', 'lon_lat'])

# pool_val = Pool(X_val, cat_features = ['region', 'lon_lat'])

# pool_test = Pool(test_df, cat_features = ['region', 'lon_lat'])

# cb.fit(pool_train, eval_set = [(X_train, y_train), (X_val, y_val)])
# y_pred = cb.predict(pool_val)

# model.fit(X_train, y_train, eval_set = [(X_train, y_train), (X_val, y_val)])


In [None]:
# # loading the sample submission file
# submission = pd.read_csv('sample_solution.csv')
# submission.head()

In [None]:
# #making predictions and replacing the values of the sample file
# target_variable = 'contest-tmp2m-14d__tmp2m'
# submission[target_variable] = cb.predict(pool_test)

In [None]:
# submission.tail(20)

In [None]:
# #save the submission file
# submission.to_csv('submission.csv', index = False) 

In [None]:
# bla = test_df[features].columns.tolist()
# bla

In [None]:
# LightGBM is trained without the two cyclical week features
week_features = ['week_sin', 'week_cos']

X_train_lgbm = X_train.drop(columns=week_features)
X_val_lgbm = X_val.drop(columns=week_features)

In [None]:
from lightgbm import LGBMRegressor, plot_importance 

In [None]:
# Train a LightGBM regressor with early stopping on the validation set.
# Early stopping and per-iteration logging are configured through callbacks:
# the `verbose` argument of fit() was removed in LightGBM 4.
from lightgbm import early_stopping, log_evaluation

lgbm = LGBMRegressor(
    n_estimators=5000,
    learning_rate=0.05,
    max_depth=6,
    random_state=30,
    # `loss_function='RMSE'` is a CatBoost parameter that LightGBM does not
    # recognise; request the L2 regression objective and RMSE as the evaluation
    # metric explicitly instead.
    objective='regression',
    metric='rmse',
)
lgbm.fit(
    X_train_lgbm, y_train,
    eval_set=[(X_train_lgbm, y_train), (X_val_lgbm, y_val)],
    callbacks=[
        # stop once the validation metric has not improved for 100 rounds
        early_stopping(stopping_rounds=100),
        # print the evaluation results after every boosting round
        log_evaluation(period=1),
    ],
)


In [None]:
import lightgbm

plt.rcParams["figure.figsize"] = (12, 22)

lightgbm.plot_importance(lgbm, max_num_features = 60, height=.9)

In [None]:
# loading the sample submission file
submission = pd.read_csv('sample_solution.csv')
submission.head()

In [None]:
# Test feature matrix with the same columns the LightGBM model was trained on
test = encoded_df_test[features].drop(columns=['week_sin', 'week_cos'])

In [None]:
# #making predictions and replacing the values of the sample file
target_variable = 'contest-tmp2m-14d__tmp2m'
submission[target_variable] = lgbm.predict(test)

In [None]:
submission.tail(20)

In [None]:
#save the submission file
submission.to_csv('submission.csv', index = False) 