In [41]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [42]:
import numpy as np
import pandas as pd
import datetime
import random

# Plots
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

# Silence FutureWarning messages so they do not clutter the cell outputs.
warnings.filterwarnings("ignore", category=FutureWarning)


# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Global seaborn theme: white grid, muted palette, fonts scaled by 1.5.
sns.set(style="whitegrid", palette="muted", font_scale=1.5)
# Default (width, height) for figures that do not set their own size.
plt.rcParams["figure.figsize"] = (10, 5)

In [43]:
# Read the competition train and test CSV files into two DataFrames.
data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/widsdatathon2022/train.csv",
        "../input/widsdatathon2022/test.csv",
    )
)

In [44]:
# Report the (rows, columns) shape of each loaded frame.
for message, frame in (
    ("Number of train samples are", data),
    ("Number of test samples are", test_data),
):
    print(message, frame.shape)

In [45]:
# Preview the first rows of the training frame.
data.head()


In [46]:
# Show every column name available in the training frame.
data.columns


In [47]:
# Summary statistics for all columns (numeric and non-numeric), transposed so
# each original column becomes a row, and wrapped in a Styler for display.
data.describe(include="all").T.style

In [48]:
# Per-column count of missing entries in the training frame.
miss_count = data.isna().sum()
# Names of the columns that contain at least one missing value.
cols_with_missing = miss_count.index[miss_count > 0].tolist()
# Table of absolute count and fraction missing, limited to affected columns.
miss_df = pd.DataFrame(
    {
        "Missing count": miss_count,
        "Missing value": miss_count / len(data),
    }
)
miss_df = miss_df[miss_count != 0]
miss_df.style.background_gradient(cmap="coolwarm")

In [49]:
# Non-numeric columns with more than one distinct value are treated as
# categorical features.
# The dtype test goes through `select_dtypes` instead of comparing a dtype to
# the abstract `np.number` class: that comparison relies on a deprecated NumPy
# coercion and can stop matching float columns on recent NumPy versions, which
# would wrongly flag every float column as categorical.
non_numeric_cols = data.select_dtypes(exclude="number").columns
categorical_cols = [c for c in non_numeric_cols if data[c].nunique() > 1]
categorical_cols

In [50]:
# Convert every categorical feature to the pandas "category" dtype in one call.
data = data.astype(dict.fromkeys(categorical_cols, "category"))

In [51]:
# Names of all numeric columns in the training frame (displayed below).
num_col = data.select_dtypes("number").columns
num_col

In [52]:
# Distribution of the target variable.
# `sns.distplot` is deprecated; `histplot` with a KDE overlay on a density
# scale is its documented replacement.
sns.histplot(data["site_eui"], kde=True, stat="density");

In [53]:
# One boxplot of site_eui per categorical feature, stacked vertically.
# The grid is sized from the number of categorical columns instead of a
# hard-coded 3 rows, so the cell keeps working if that list changes.
sns.set(style="white")
n_panels = len(categorical_cols)
if n_panels:
    # 5 inches of height per panel reproduces the original 15x15 figure
    # for three columns.
    fig, axes = plt.subplots(
        n_panels, 1, figsize=(15, 5 * n_panels), squeeze=False
    )
    for ax, col in zip(axes.ravel(), categorical_cols):
        sns.boxplot(data=data, x=col, y="site_eui", ax=ax)
        ax.tick_params(labelrotation=45)
    plt.show()

In [54]:
# Curated subset of numeric features.
# NOTE(review): this list is not used by the loop below, which iterates over
# every numeric column in `num_col`; confirm which of the two was intended.
cols = [
    "Year_Factor",
    "floor_area",
    "year_built",
    "energy_star_rating",
    "ELEVATION",
    "cooling_degree_days",
    "heating_degree_days",
    "precipitation_inches",
    "snowfall_inches",
    "snowdepth_inches",
    "avg_temp",
    "days_below_30F",
    "days_below_20F",
    "days_below_10F",
    "days_below_0F",
    "days_above_80F",
    "days_above_90F",
    "days_above_100F",
    "days_above_110F",
    "direction_max_wind_speed",
    "direction_peak_wind_speed",
    "max_wind_speed",
    "days_with_fog",
]

# One histogram + KDE per numeric column.
# `sns.distplot` is deprecated; `histplot(kde=True, stat="density")` is its
# replacement. Each figure is closed after display so that looping over many
# columns does not accumulate open figures.
for col in num_col:
    fig, ax = plt.subplots(figsize=(15, 2))
    sns.histplot(data[col], kde=True, stat="density", ax=ax)
    ax.set_title(col + " distribution")
    plt.show()
    plt.close(fig)

In [55]:
# Confirm that the id column has no repeated values.
if data["id"].is_unique:
    print("id values are unique")

In [56]:
def _plot_monthly_kde(frame, columns, title):
    """Overlay one filled KDE curve per column on a single 15x5 figure.

    Parameters
    ----------
    frame : DataFrame holding the columns to plot.
    columns : list of column names; also used as the legend labels.
    title : title of the figure.

    Returns
    -------
    The matplotlib Axes that was drawn on.
    """
    fig, ax = plt.subplots(figsize=(15, 5))
    for col in columns:
        # `fill=True` replaces the deprecated `shade=True` keyword.
        sns.kdeplot(frame[col], fill=True, ax=ax)
    ax.set_xlabel("Temperature")
    ax.set_title(title)
    ax.legend(columns)
    return ax


# Month prefixes shared by the min / max / avg temperature columns.
_MONTHS = [
    "january",
    "february",
    "march",
    "april",
    "may",
    "june",
    "july",
    "august",
    "september",
    "october",
    "november",
    "december",
]

min_temp = [f"{month}_min_temp" for month in _MONTHS]
max_temp = [f"{month}_max_temp" for month in _MONTHS]
avg_temp = [f"{month}_avg_temp" for month in _MONTHS]

_plot_monthly_kde(data, min_temp, "Monthly distribution of min temp")
_plot_monthly_kde(data, max_temp, "Monthly distribution of max temp")
_plot_monthly_kde(data, avg_temp, "Monthly distribution of avg temp");

In [57]:
# Pairwise correlation between numeric columns only. The category columns are
# excluded explicitly because `DataFrame.corr()` on a frame that still holds
# non-numeric columns raises on pandas >= 2.0.
data_cor = data.select_dtypes("number").corr()
sns.heatmap(data_cor)

# Keep only the strict upper triangle so each pair is inspected once and the
# diagonal (always 1.0) is ignored.
upper_mask = np.triu(np.ones(data_cor.shape, dtype=bool), k=1)
data_df_upper = data_cor.where(upper_mask)

# Columns whose absolute correlation with an earlier column is at least 0.8.
cor80 = [
    col
    for col in data_df_upper.columns
    if (data_df_upper[col].abs() >= 0.8).any()
]

print(cor80)

In [58]:
# Fill missing construction years with 2022, for train and test data alike.
for frame in (data, test_data):
    frame["year_built"] = frame["year_built"].fillna(2022)

In [59]:
from sklearn.impute import SimpleImputer

# Columns that are imputed in the following cell.
null_col = [
    "energy_star_rating",
    "direction_max_wind_speed",
    "direction_peak_wind_speed",
    "max_wind_speed",
    "days_with_fog",
]
# Summary statistics of these columns before imputation.
data[null_col].describe()

In [60]:
# Median-impute the columns in `null_col`. The medians are learned from the
# training frame and then reused for the test frame.
imputer = SimpleImputer(strategy='median')
data_transformed = imputer.fit_transform(data[null_col])
# Assign the array directly: wrapping it in pd.DataFrame(...) would create a
# fresh RangeIndex, and the index-aligned assignment would silently write NaN
# for any row whose label is not in that range.
data[null_col] = data_transformed
## for test data
test_data_transformed = imputer.transform(test_data[null_col])
test_data[null_col] = test_data_transformed

In [61]:
# Summary statistics of the same columns after imputation.
data[null_col].describe()


In [62]:
# Re-check which columns still contain null values.
cols_with_missing = data.columns[data.isnull().any()].tolist()
cols_with_missing

In [63]:
# Target vector and feature matrix (target and identifier removed).
y = data["site_eui"]
X = data.drop(columns=["site_eui", "id"])
# Remove the identifier from the test frame in place, keeping it for the
# submission file.
ids = test_data.pop("id")

In [64]:
# Display the feature columns that have the "category" dtype.
X.select_dtypes('category')


In [65]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical features. The encoder is fitted on the
# training features and reused for the test frame; categories unseen during
# fitting are ignored (encoded as all zeros).
ohe = OneHotEncoder(handle_unknown="ignore")
X_onehot = ohe.fit_transform(X[categorical_cols])

# `get_feature_names` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out`; fall back to the old name on older releases.
if hasattr(ohe, "get_feature_names_out"):
    onehot_names = ohe.get_feature_names_out()
else:
    onehot_names = ohe.get_feature_names()

# Reuse the source frame's index so the column-wise concat aligns row for row
# even if the frame does not carry a default RangeIndex.
X_onehot = pd.DataFrame(X_onehot.toarray(), columns=onehot_names, index=X.index)
X = pd.concat([X.drop(categorical_cols, axis=1), X_onehot], axis=1)

test_data_onehot = ohe.transform(test_data[categorical_cols])
test_data_onehot = pd.DataFrame(
    test_data_onehot.toarray(), columns=onehot_names, index=test_data.index
)
test_data = pd.concat(
    [test_data.drop(categorical_cols, axis=1), test_data_onehot], axis=1
)

In [66]:
from sklearn.preprocessing import MinMaxScaler

# Learn the min-max scaling from the training features, then apply the same
# fitted scaler to both the training features and the test frame.
scaler = MinMaxScaler().fit(X)
X_scaled = scaler.transform(X)
test_scaled = scaler.transform(test_data)

In [67]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for validation, with a fixed seed.
holdout_split = train_test_split(X_scaled, y, test_size=0.2, random_state=50)
X_train, X_test, y_train, y_test = holdout_split

In [68]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares baseline fitted on the training split.
baseline_model = LinearRegression().fit(X_train, y_train)

# Baseline predictions for the training split and the held-out split.
y_train_baseline_predict, y_test_baseline_predict = (
    baseline_model.predict(features) for features in (X_train, X_test)
)

In [69]:
# Baseline: true vs. predicted values on the training split.
fig, ax = plt.subplots()
ax.scatter(y_train, y_train_baseline_predict, s=5, alpha=0.7)
ax.set_xlabel("True values")
ax.set_ylabel("Predicted values")
plt.show()

In [70]:
# Baseline: true vs. predicted values on the held-out split.
fig, ax = plt.subplots()
ax.scatter(y_test, y_test_baseline_predict, s=5, alpha=0.7)
ax.set_xlabel("True values")
ax.set_ylabel("Predicted values")
plt.show()

In [71]:
from sklearn.metrics import mean_squared_error, r2_score

# Compute every baseline metric first, then report them.
train_mse = mean_squared_error(y_train, y_train_baseline_predict)
test_mse = mean_squared_error(y_test, y_test_baseline_predict)
train_r2 = r2_score(y_train, y_train_baseline_predict)
test_r2 = r2_score(y_test, y_test_baseline_predict)

# Root-mean-squared error on both splits.
print(
    f"RMSE on the training data is {np.round(np.sqrt(train_mse), 2)}.",
    f"RMSE on the test data is {np.round(np.sqrt(test_mse), 2)}.",
)

# Coefficient of determination on both splits.
print(
    f"r2 on the training data is {np.round(train_r2, 2)}.",
    f"r2 on the test data is {np.round(test_r2, 2)}.",
)

In [72]:
from sklearn.model_selection import RandomizedSearchCV
import lightgbm
#https://towardsdatascience.com/kagglers-guide-to-lightgbm-hyperparameter-tuning-with-optuna-in-2021-ed048d9838b5
model = lightgbm.LGBMRegressor()

# those are just some example parameters to help you get started!
# NOTE: 'device': 'gpu' requires a GPU-enabled LightGBM build and runtime.
parameters = {'num_leaves':[140, 200, 300], 'max_depth':[10, 15], 'device' : ['gpu'],
             'learning_rate':[0.05],'reg_lambda':[0.01], 'min_data_in_leaf' : [10, 20, 50, 80]}

# The grid above holds fewer combinations than the 100 iterations that were
# requested, which makes scikit-learn warn and fall back to an exhaustive
# search. Cap n_iter at the grid size, and fix the sampling seed so the
# candidate order is the same on every run.
n_candidates = int(np.prod([len(values) for values in parameters.values()]))
model_grid = RandomizedSearchCV(
    model,
    parameters,
    scoring='r2',
    n_iter=min(100, n_candidates),
    cv=5,
    random_state=50,
)
model_grid.fit(X_train, y_train)

print(model_grid.best_params_)
# Mean cross-validated r2 of every candidate, in evaluation order.
plt.plot(model_grid.cv_results_['mean_test_score'])
print(model_grid.best_score_, model_grid.cv_results_['mean_test_score'][0])

In [73]:
# RandomizedSearchCV (refit=True by default) has already refit the best
# configuration on all of X_train, so fitting it a second time is redundant.
model = model_grid.best_estimator_

# Predictions of the tuned model on the training and held-out splits.
y_train_predict = model.predict(X_train)
y_test_predict = model.predict(X_test)

In [74]:
# Compute every metric of the tuned model first, then report them.
train_mse = mean_squared_error(y_train, y_train_predict)
test_mse = mean_squared_error(y_test, y_test_predict)
train_r2 = r2_score(y_train, y_train_predict)
test_r2 = r2_score(y_test, y_test_predict)

# Root-mean-squared error on both splits.
print(
    f"RMSE on the training data is {np.round(np.sqrt(train_mse), 2)}.",
    f"RMSE on the test data is {np.round(np.sqrt(test_mse), 2)}.",
)

# Coefficient of determination on both splits.
print(
    f"r2 on the training data is {np.round(train_r2, 2)}.",
    f"r2 on the test data is {np.round(test_r2, 2)}.",
)

In [75]:
# Tuned model: true vs. predicted values on the training split.
fig, ax = plt.subplots()
ax.scatter(y_train, y_train_predict, s=5, alpha=0.7)
ax.set_xlabel("True values")
ax.set_ylabel("Predicted values")
plt.show()

In [76]:
# Tuned model: true vs. predicted values on the held-out split.
fig, ax = plt.subplots()
ax.scatter(y_test, y_test_predict, s=5, alpha=0.7)
ax.set_xlabel("True values")
ax.set_ylabel("Predicted values")
plt.show()

In [77]:
# testdata prediction
prediction = model_grid.best_estimator_.predict(test_scaled)
# Pair each prediction with the id taken from the test frame.
final = pd.DataFrame({'id': ids, 'site_eui': prediction})
final.to_csv('submission.csv', index=False)
# Preview goes last so the notebook actually renders it; a bare expression in
# the middle of a cell is evaluated and discarded.
final.head()