# **Using a CatBoost ensemble model after dropping some features**

Based on the [CatBoost Beginner](https://www.kaggle.com/ycca1018/wids-2022-catboost-beginner-jp-en-score-31-71/notebook) notebook.

In [None]:
# importing all the libraries 
import os
import gc
import copy

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file 

from sklearn.preprocessing import LabelEncoder 
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import KFold, RepeatedKFold
from sklearn.metrics import mean_squared_error 
import lightgbm as lgb 

import matplotlib.pyplot as plt
plt.rcParams.update({'font.size':18})
plt.style.use('ggplot')
import seaborn as sns
from scipy import stats

import shap

from sklearn.preprocessing import StandardScaler

import optuna.integration.lightgbm as lgbm 
import optuna


import warnings
warnings.filterwarnings('ignore')

import wandb


In [None]:
# Load the train and test splits from the Kaggle input directory.
train = pd.read_csv("../input/d/mansi55/widsdataset/train.csv")
test = pd.read_csv("../input/d/mansi55/widsdataset/test.csv")

# DataFrame.shape is (rows, columns), so both counts are printed.
print("Number of train samples are", train.shape)
print("Number of test samples are", test.shape)

# Column groups used by the distribution plots further down.
categorical_features = ['State_Factor', 'building_class', 'facility_type']
# All numeric-dtype columns of train, as they are right after loading.
numerical_features = train.select_dtypes('number').columns

In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# (rows, columns) of the training data.
train.shape

**Check which columns have missing values**

In [None]:
# Heatmap of the boolean missing-value mask: one row per sample,
# one column per feature, drawn with a two-colour map.
is_missing = train.isna().values
plt.figure(figsize=(25, 11))
sns.heatmap(
    is_missing,
    xticklabels=train.columns,
    cmap=['#ffd514', '#ff355d'],
)
plt.title("Missing values in training data", size=20)

In [None]:
# Number of missing values in each column of the training data.
missingvalues_count = train.isna().sum()
# Names of the columns that contain at least one missing value.
missing_columns = [name for name, n_missing in missingvalues_count.items() if n_missing > 0]
# One-column table restricted to those columns, shown through the pandas Styler.
has_missing = missingvalues_count != 0
missingvalues_df = missingvalues_count[has_missing].to_frame("Null Values Count")
missingvalues_df.style

In [None]:
# Basic descriptive statistics of the features (DataFrame.describe),
# shown through the pandas Styler.
train.describe().style

**Find out how the target variable is distributed in order to spot outliers**

In [None]:
# Distribution of the target: density estimate on the left, box plot on the right.
plt.figure(figsize=(15, 7))
plt.subplot(121)
# The series is passed as `x=`: seaborn's only positional slot is `data`,
# so a positional Series is not guaranteed to be plotted along the x axis.
sns.kdeplot(x=train.site_eui, color="#ffd514")
plt.subplot(122)
sns.boxplot(x=train.site_eui, color="#ff355d")

In [None]:
# Probability plot of the target, drawn on the current pyplot figure.
# No `dist` is given, so scipy's default reference distribution is used.
res = stats.probplot(train['site_eui'], plot=plt) #probability plot

**Find out how numeric and categorical features are distributed**

In [None]:
def kdeplot_features(df_train, df_test, feature, title):
    '''Overlay the kernel density estimates of one column in train and test.

    df_train, df_test: dataframes that both contain `feature`
    feature: name of the column to plot
    title: title of the figure
    '''

    values_train = df_train[feature].to_numpy()
    values_test = df_test[feature].to_numpy()

    plt.figure(figsize = (18,3))

    # Label both curves so that plt.legend() has entries to display, and
    # pass the values as `x=` so they are plotted along the x axis.
    sns.kdeplot(x = values_train, color = '#ffd514', label = 'train')
    sns.kdeplot(x = values_test, color = '#ff355d', label = 'test')

    plt.title(title, fontsize=15)
    plt.legend()
    plt.show()

    # Drop the local array references and run a collection pass right away.
    del values_train, values_test
    gc.collect()

################################

def countplot_features(df_train, feature, title):
    '''Plot how often each value of one column occurs.

    df_train: dataframe containing `feature`
    feature: name of the column to count
    title: title of the figure
    '''

    plt.figure(figsize = (10, 5))

    # Keyword `x=`: a positional Series would be taken as seaborn's `data`
    # argument rather than as the variable to count.
    sns.countplot(x = df_train[feature], color = "#ff355d")
    plt.title(title, fontsize=15)
    plt.show()
    
###################################

def create_wandb_hist(x_data=None, x_name=None, title=None, log=None):
    '''Log a histogram of `x_data` to Weights & Biases.

    x_data: iterable of values (e.g. a pandas Series) to put in the histogram
    x_name: column name given to the values in the W&B table
    title: title of the chart
    log: key under which the chart is logged
    '''

    # Each value becomes its own one-element row of the table.
    rows = [[value] for value in x_data]
    hist_table = wandb.Table(data=rows, columns=[x_name])
    chart = wandb.plot.histogram(hist_table, x_name, title=title)
    wandb.log({log: chart})
    

In [None]:
# Compare the train and test densities of every numerical feature.

for feature in numerical_features:
    # site_eui is the prediction target and is left out of the comparison.
    if feature == "site_eui":
        continue
    kdeplot_features(train, test, feature=feature, title=feature + " distribution")

In [None]:
# plot distributions of categorical features

for feature in categorical_features:
    # Show the distinct values of the feature that is about to be plotted.
    print(train[feature].unique())
    countplot_features(train, feature=feature, title = "Frequency of "+feature)

**Handling nan/missing values in categorical features using Label encoding**

In [None]:
# Sort the columns into string-valued ones and all others by looking at
# one sample value per column.
str_list = []
num_list = []
# DataFrame.items() is used because iteritems() was removed in pandas 2.0.
for colname, colvalue in train.items():
    # The value in the second row decides which list the column goes to.
    if isinstance(colvalue.iloc[1], str):
        str_list.append(colname)
    else:
        num_list.append(colname)

for col in str_list:
    # Fit the encoder on train only and encode train with it.
    encoder = LabelEncoder()
    encoder.fit(train[col])
    train[col] = encoder.transform(train[col])

    # Labels that occur only in test are appended after the train classes, so
    # they get new codes instead of making transform() fail on an unseen label.
    known = set(encoder.classes_)
    unseen = [label for label in np.unique(test[col]) if label not in known]
    if unseen:
        encoder.classes_ = np.append(encoder.classes_, unseen)
    test[col] = encoder.transform(test[col])

In [None]:
# Show the training data after the string columns were label-encoded.
train

**dropping those columns whose values are 0 in test data** 

In [None]:
# Number of missing values in each column of the test data.
test.isnull().sum()

In [None]:
# Remove the same four wind/fog columns from both train and test, in place.
wind_fog_columns = [
    'direction_max_wind_speed',
    'direction_peak_wind_speed',
    'max_wind_speed',
    'days_with_fog',
]
train.drop(columns=wind_fog_columns, inplace=True)
test.drop(columns=wind_fog_columns, inplace=True)

In [None]:
# Every column whose name contains 'temp', apart from 'avg_temp'.
temp_list = [name for name in train.columns if 'temp' in name and name != 'avg_temp']
# Of those, the ones that mention neither 'january' nor 'july'.
temp_list_2 = [name for name in temp_list if not ('january' in name or 'july' in name)]

In [None]:
# Temperature columns that do not mention January or July.
temp_list_2

In [None]:
# Drop the selected temperature columns from both frames, in place.
for frame in (train, test):
    frame.drop(columns=temp_list_2, inplace=True)

In [None]:
# Show the training data after the column drops above.
train

**Drop site_eui and id columns** 

In [None]:
# Target: the site_eui column.
y = train.site_eui
# Features: every remaining column except the target and the row id.
X = train.drop(columns=["site_eui", "id"])

In [None]:
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 5)

**Using CatBoost Regressor**

In [None]:
import catboost as cb


# Settings passed to the CatBoost regressor below.
MODEL_MAX_DEPTH = 12
MODEL_TASK_TYPE = 'GPU'  # passed as task_type
MODEL_RL = 0.025  # passed as learning_rate
MODEL_EVAL_METRIC = 'RMSE'
MODEL_LOSS_FUNCTION = 'RMSE'
MODEL_ESR = 10  # passed as early_stopping_rounds
MODEL_VERBOSE = 1000
MODEL_ITERATIONS = 28000

SEED = 2022

# Gather the constructor arguments in one mapping and unpack it.
model_params = {
    'verbose': MODEL_VERBOSE,
    'early_stopping_rounds': MODEL_ESR,
    'random_seed': SEED,
    'max_depth': MODEL_MAX_DEPTH,
    'task_type': MODEL_TASK_TYPE,
    'learning_rate': MODEL_RL,
    'iterations': MODEL_ITERATIONS,
    'loss_function': MODEL_LOSS_FUNCTION,
    'eval_metric': MODEL_EVAL_METRIC,
}
catBoostmodel = cb.CatBoostRegressor(**model_params)


In [None]:
# Fit the regressor on the full training set.
# NOTE(review): no eval_set is passed here, so the early_stopping_rounds
# configured on the model presumably has no validation data to monitor — confirm.
catBoostmodel.fit(X, y)

In [None]:
# train_dataset = cb.Pool(X_train, y_train) 

In [None]:
# grid = {'iterations': [10000, 28000],
#         'learning_rate': [0.03, 0.1,0.025],
#         'depth': [2, 4, 6, 8,12],
#         'l2_leaf_reg': [0.2, 0.5, 1, 3]}
# catBoostmodel.grid_search(grid, train_dataset)

In [None]:
# from sklearn.metrics import r2_score

# pred = catBoostmodel.predict(X_test)
# rmse = (np.sqrt(mean_squared_error(y_test, pred)))
# r2 = r2_score(y_test, pred)

In [None]:
# print("Testing performance")
# print('RMSE: {:.2f}'.format(rmse))
# print('R2: {:.2f}'.format(r2))

In [None]:
# catBoostmodel.fit(X_train, y_train)

In [None]:
# from sklearn.model_selection import cross_val_score
# accuracies = cross_val_score(estimator = catBoostmodel, X = X_train, y = y_train, cv = 5)
# print("Accuracy:{:.2f} %".format(accuracies.mean()*100))

In [None]:
#submission
X_test = test.drop(["id"],axis=1)
pred_test = catBoostmodel.predict(X_test)
sub = pd.DataFrame(test['id'],columns={'id'})
sub["site_eui"] = pred_test
sub.to_csv('submission.csv', index=False)
sub.head()