In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file in it.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the generation data; the file is semicolon-separated and uses a comma as the decimal mark.
generation_df = pd.read_csv('/kaggle/input/enerjisaverimaratonu/generation.csv', sep=';', decimal=",")
generation_df.head()

In [None]:
# Dimensions (rows, columns) of the generation table.
generation_df.shape

In [None]:
# Column dtypes and non-null counts of the generation table.
generation_df.info()

In [None]:
# Summary statistics, transposed so that each column is shown as a row.
generation_df.describe().T

In [None]:
# Load the weather data; same CSV dialect as the generation file (';' separator, ',' decimal mark).
temperature_df = pd.read_csv('/kaggle/input/enerjisaverimaratonu/temperature.csv', sep=";", decimal=",")
temperature_df.head()

In [None]:
# Dimensions (rows, columns) of the weather table.
temperature_df.shape

In [None]:
# Column dtypes and non-null counts of the weather table.
temperature_df.info()

In [None]:
# Summary statistics of the numeric weather columns, one row per column.
temperature_df.describe().T

In [None]:
# Summary (count / unique / top / freq) of the object-dtype columns only.
temperature_df.describe(include=["O"]).T

In [None]:
# Look at the last rows of the generation table.
generation_df.tail()

In [None]:
# Inspect positional rows 25550..25560, i.e. the rows around the 25560 cut-off applied further down.
generation_df[25550:25561]

In [None]:
# Count the missing values per column in every row from position 25560 onward.
generation_df[25560:].isnull().sum()

In [None]:
# Keep only the first 25560 rows (the rows from 25560 onward are the ones
# checked for missing values in the previous cell).
# An explicit copy is taken because this frame is modified in place later
# (outlier suppression); mutating a bare slice triggers SettingWithCopyWarning
# and makes it unclear which object actually receives the change.
generation_df = generation_df.iloc[:25560].copy()
generation_df.tail()

In [None]:
# Keep the first 26304 weather rows: the 25560 rows that have a generation
# value plus the 744 rows (25560..26303) that are later used for prediction.
# An explicit copy is taken because new columns are added to this frame and its
# values are clipped in place further down; doing that on a bare slice triggers
# SettingWithCopyWarning.
temperature_df = temperature_df.iloc[:26304].copy()
temperature_df.tail()

In [None]:
def outlier_suppression(variable, df):
    """Clip the outliers of one column to the Tukey (1.5 * IQR) fences, in place.

    Values below ``Q1 - 1.5 * IQR`` are raised to that lower fence and values
    above ``Q3 + 1.5 * IQR`` are lowered to that upper fence. Missing values
    are left as they are.

    Parameters
    ----------
    variable : str
        Name of the column in ``df`` to process.
    df : pandas.DataFrame
        Frame holding the column; its ``variable`` column is overwritten.

    Returns
    -------
    None
        The frame is modified in place.
    """
    column = df[variable]
    q1 = column.quantile(0.25)
    q3 = column.quantile(0.75)
    iqr = q3 - q1
    lower_limit = q1 - 1.5 * iqr
    upper_limit = q3 + 1.5 * iqr
    # Assign the clipped column back to the frame. Writing through the
    # extracted Series (``df[variable][mask] = ...``) is chained assignment:
    # it raises SettingWithCopyWarning and does not modify ``df`` at all when
    # pandas Copy-on-Write is active.
    df[variable] = column.clip(lower=lower_limit, upper=upper_limit)

In [None]:
# Clip the target first, then every weather column from the third one onward
# (the first two columns of temperature_df are skipped).
outlier_suppression("Generation", generation_df)
for column_name in temperature_df.columns[2:]:
    outlier_suppression(column_name, temperature_df)

In [None]:
import calendar

# Parse the timestamp column so the .dt accessor becomes available.
# NOTE(review): the CSV uses a European dialect (';' and ','); if the dates are
# day-first, pd.to_datetime may need dayfirst=True -- confirm against the data.
temperature_df['DateTime'] = pd.to_datetime(temperature_df['DateTime'])

# Numeric calendar features, added in this order: Day, Month, Year, Hour.
timestamps = temperature_df['DateTime'].dt
temperature_df["Day"] = timestamps.day
temperature_df["Month"] = timestamps.month
temperature_df["Year"] = timestamps.year
temperature_df["Hour"] = timestamps.hour

# Snapshot the feature table *before* the text-valued columns below are added,
# and set positional rows 25560..26303 aside as the prediction set.
trainer = temperature_df.copy()
test_df = trainer[25560:26304]

# Weekday number (Monday=0 ... Sunday=6) -> English weekday name.
day_mapping = dict(enumerate((
    'Monday',
    'Tuesday',
    'Wednesday',
    'Thursday',
    'Friday',
    'Saturday',
    'Sunday',
)))
temperature_df['Day of The Week'] = timestamps.weekday.map(day_mapping)

# Replace the month number with its abbreviated name from the calendar module.
temperature_df['Month'] = temperature_df['Month'].map(
    lambda month_number: calendar.month_abbr[month_number]
)

In [None]:
# Check the weather table after the calendar columns were added.
temperature_df.head()

In [None]:
# Put the weather features and the Generation target side by side.
# pd.concat(axis=1) aligns on the row index, so weather rows without a matching
# generation row get NaN in "Generation".
enerji_df = pd.concat([temperature_df, generation_df["Generation"]], axis=1)
enerji_df.head(10)

In [None]:
# Use the timestamp as the row index.
# Not re-runnable as-is: after the first run 'DateTime' is no longer a column.
enerji_df = enerji_df.set_index(['DateTime'])

In [None]:
# Last 20 rows of the combined table.
enerji_df.tail(20)

In [None]:
# Missing-value count per column of the combined table.
enerji_df.isnull().sum()

In [None]:
# Fill the missing weather codes with the column median.
# The result is assigned back to the column: calling fillna(inplace=True) on
# the selection enerji_df["WWCode"] is chained assignment, which pandas warns
# about and which leaves enerji_df untouched when Copy-on-Write is active.
enerji_df["WWCode"] = enerji_df["WWCode"].fillna(enerji_df["WWCode"].median())
enerji_df.isnull().sum()

In [None]:
# Summary statistics of the numeric columns of the combined table.
enerji_df.describe().T

In [None]:
# Summary of the object-dtype columns of the combined table.
enerji_df.describe(include=["O"]).T

In [None]:
# Mean generation per day of the month, highest first.
enerji_df.groupby("Day")[["Generation"]].mean().sort_values("Generation", ascending=False)

In [None]:
# Mean generation per month, highest first.
enerji_df.groupby("Month")[["Generation"]].mean().sort_values("Generation", ascending=False)

In [None]:
# Mean generation per year, highest first.
enerji_df.groupby("Year")[["Generation"]].mean().sort_values("Generation", ascending=False)

In [None]:
# Mean generation per hour of the day, highest first.
enerji_df.groupby("Hour")[["Generation"]].mean().sort_values("Generation", ascending=False)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import warnings
# Silences every warning for the rest of the session, including pandas
# chained-assignment and deprecation warnings raised by later cells.
warnings.filterwarnings("ignore")

In [None]:
# Half-open bands [low, high) of EffectiveCloudCover and the label for each.
CLOUD_BANDS = (
    (0, 1, "Sky Clear"),
    (1, 3, "Few"),
    (3, 5, "Scattered"),
    (5, 7, "Broken"),
)


def _cloud_label(rate):
    """Return the cloud-cover label for one value.

    Anything that falls in none of the bands (7 and above, negative values,
    NaN) is labelled "Overcast".
    """
    for low, high, label in CLOUD_BANDS:
        if low <= rate < high:
            return label
    return "Overcast"


cloud_cat = [_cloud_label(rate) for rate in enerji_df["EffectiveCloudCover"]]
enerji_df["CloudCategory"] = cloud_cat

In [None]:
# Mean generation per cloud-cover category, highest first.
enerji_df.groupby("CloudCategory")[["Generation"]].mean().sort_values("Generation", ascending=False)

In [None]:
# Half-open bands [low, high) of AirTemperature and the label for each.
AIR_TEMPERATURE_BANDS = (
    (0, 15, "Cold"),
    (15, 25, "Warm"),
    (25, 35, "Hot"),
)


def _air_temperature_label(temp):
    """Return the temperature label for one value.

    Negative values are "Very Cold"; anything matching no band (35 and above,
    NaN) is labelled "Very Hot".
    """
    if temp < 0:
        return "Very Cold"
    for low, high, label in AIR_TEMPERATURE_BANDS:
        if low <= temp < high:
            return label
    return "Very Hot"


airTemp_cat = [_air_temperature_label(temp) for temp in enerji_df["AirTemperature"]]
enerji_df["AirTemperatureCategory"] = airTemp_cat

In [None]:
# Mean generation per air-temperature category, highest first.
enerji_df.groupby("AirTemperatureCategory")[["Generation"]].mean().sort_values("Generation", ascending=False)

In [None]:
# Pairwise correlation of the numeric columns.
# enerji_df also holds text columns (Month, Day of The Week, CloudCategory,
# AirTemperatureCategory); DataFrame.corr() raises on those in pandas >= 2.0,
# so the numeric columns are selected explicitly first.
corr_df = enerji_df.select_dtypes(include="number").corr()
corr_df

In [None]:
# First 10 rows of the combined table, now including the category columns.
enerji_df.head(10)

In [None]:
# Fill the missing weather codes with the median of the whole feature table.
# fillna(inplace=True) on the selection trainer["WWCode"] is chained
# assignment (a no-op under pandas Copy-on-Write), so the result is assigned
# back instead. test_df was sliced from trainer before this point; it is
# imputed explicitly with the same median so the prediction rows do not depend
# on whether that slice happens to share memory with trainer.
wwcode_median = trainer["WWCode"].median()
trainer["WWCode"] = trainer["WWCode"].fillna(wwcode_median)
test_df["WWCode"] = test_df["WWCode"].fillna(wwcode_median)

In [None]:
# The raw timestamp is not used as a model feature; only the derived calendar columns stay.
trainer = trainer.drop(columns=["DateTime"])
trainer.head()

In [None]:
from sklearn import preprocessing
# Standardise the first 25560 feature rows column by column.
standard_X = preprocessing.scale(trainer[:25560])
# Standardised copy of the target. It is not referenced again in the visible
# notebook: y in the next cell is taken from the unscaled column.
standard_y = preprocessing.scale(generation_df["Generation"])

In [None]:
# Model inputs: standardised features, and the target kept in its original units.
X = standard_X
y = generation_df["Generation"]

In [None]:
# First 10 rows of the standardised feature matrix.
X[:10]

In [None]:
# First 10 target values.
y[:10]

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error

In [None]:
# Hold out 20% of the rows. The seed is fixed so that "Restart & Run All"
# produces the same split (and therefore the same trained model) every time.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
import catboost as cb

In [None]:
# Drop the raw timestamp from the prediction rows. test_df is a slice of
# trainer, so it is rebound to a new frame instead of being mutated with
# inplace=True (which triggers SettingWithCopyWarning on a slice).
test_df = test_df.drop(columns=["DateTime"])

In [None]:
# Standardise the prediction rows with the mean / standard deviation of the
# TRAINING rows (the same rows standard_X was computed from). Scaling test_df
# with its own statistics would put its features on a different scale than the
# one the model is trained on.
# Expects trainer and test_df to hold the same columns in the same order
# (both already without "DateTime").
feature_scaler = preprocessing.StandardScaler().fit(trainer[:25560])
test_df = feature_scaler.transform(test_df)

In [None]:
# First 10 rows of the scaled prediction features (a NumPy array at this point).
test_df[:10]

In [None]:
# Train a CatBoost regressor with fixed hyper-parameters on the training split
# and predict the held-back prediction rows.
# X_test / y_test from the split are not used in this cell, so no validation
# score is computed here.
tuned_cb = cb.CatBoostRegressor(depth=10, iterations=2000, learning_rate=0.1)
tuned_cb.fit(X_train, y_train)
y_pred = tuned_cb.predict(test_df)

In [None]:
# Order the features from least to most important so the largest bar is drawn on top.
feature_importance = tuned_cb.feature_importances_
sorted_idx = np.argsort(feature_importance)
bar_positions = range(len(sorted_idx))

fig, ax = plt.subplots(figsize=(12, 6))
ax.barh(bar_positions, feature_importance[sorted_idx], align='center')
# Label each bar with the matching column name of the feature table.
ax.set_yticks(bar_positions)
ax.set_yticklabels(trainer.columns[sorted_idx])
ax.set_title('Feature Importance')

In [None]:
# Negative predictions are replaced by 0; all other predictions are kept as they are.
new_pred = [0 if prediction < 0 else prediction for prediction in y_pred]

In [None]:
# Start from the sample submission and overwrite its Generation column with the
# clipped predictions (new_pred must have exactly one value per submission row).
submission = pd.read_csv('/kaggle/input/enerjisaverimaratonu/sample_submission.csv')
submission['Generation'] = new_pred

In [None]:
# Write the submission file to the working directory, without the row index.
submission.to_csv('submission.csv', index=False)

In [None]:
# Generation values of every row whose timestamp falls on 2020-12-24.
enerji_df.loc["2020-12-24", "Generation"]