# Preparation

In [None]:
# Basic
import pandas as pd
import numpy as np
# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
# Encoding
from sklearn.preprocessing import LabelEncoder
# Modeling 
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCV
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.svm import SVR
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

In [None]:
# Load the raw competition data: df_train holds the labelled rows, df_test the rows to predict.
df_train = pd.read_csv('../input/a-fine-windy-day-hackerearth-ml-challenge/train_data.csv')
df_test = pd.read_csv('../input/a-fine-windy-day-hackerearth-ml-challenge/test_data.csv')

# Exploratory Data Analysis 

## About df_train & df_test

### df_train

In [None]:
# Preview the first rows of the training data.
df_train.head()

In [None]:
# (rows, columns) of the training data.
print(df_train.shape)

In [None]:
# Column dtypes and non-null counts of the training data.
df_train.info()

In [None]:
# Summary statistics of the numeric training columns.
df_train.describe()

### df_test

In [None]:
# Preview the first rows of the test data.
df_test.head()

In [None]:
# (rows, columns) of the test data.
print(df_test.shape)

In [None]:
# Column dtypes and non-null counts of the test data.
df_test.info()

In [None]:
# Summary statistics of the numeric test columns.
df_test.describe()

## Missing Values & Correlation

### df_train

In [None]:
# Per-column missing-value count and missing rate (%) for the training data,
# both sorted from most to least missing; the 20 worst columns are displayed.
train_na_counts = df_train.isna().sum()
df_train_total_na = train_na_counts.sort_values(ascending=False)
df_train_miss_rate = (train_na_counts / len(df_train) * 100).sort_values(ascending=False)
df_train_miss_values = pd.concat(
    {'Total': df_train_total_na, 'Missing Rate': df_train_miss_rate},
    axis=1,
)
df_train_miss_values.head(20)

In [None]:
# Row-by-column view of where values are missing in the training data.
msno.matrix(df_train)

In [None]:
# How strongly missingness in one training column co-occurs with missingness in another.
msno.heatmap(df_train) 

In [None]:
# Lower-triangle correlation heatmap of the training data.
# Only numeric/bool columns are correlated: pandas >= 2.0 raises on object
# columns instead of silently dropping them.
corr = df_train.select_dtypes(include=['number', 'bool']).corr()
plt.figure(figsize=(12,12))
# Builtin bool: the np.bool alias was removed in NumPy 1.24.
mask = np.zeros_like(corr, dtype=bool)
# Hide the upper triangle (diagonal included); it mirrors the lower one.
mask[np.triu_indices_from(mask)] = True
sns.heatmap(corr,mask=mask,annot=True,cbar=False,cmap='Blues')
plt.show()

### df_test

In [None]:
# Per-column missing-value count and missing rate (%) for the test data,
# both sorted from most to least missing; the 20 worst columns are displayed.
test_na_counts = df_test.isna().sum()
df_test_total_na = test_na_counts.sort_values(ascending=False)
df_test_miss_rate = (test_na_counts / len(df_test) * 100).sort_values(ascending=False)
df_test_miss_values = pd.concat(
    {'Total': df_test_total_na, 'Missing Rate': df_test_miss_rate},
    axis=1,
)
df_test_miss_values.head(20)

In [None]:
# Row-by-column view of where values are missing in the test data.
msno.matrix(df_test)

In [None]:
# How strongly missingness in one test column co-occurs with missingness in another.
msno.heatmap(df_test) 

In [None]:
# Lower-triangle correlation heatmap of the test data.
# Only numeric/bool columns are correlated: pandas >= 2.0 raises on object
# columns instead of silently dropping them.
corr = df_test.select_dtypes(include=['number', 'bool']).corr()
plt.figure(figsize=(12,12))
# Builtin bool: the np.bool alias was removed in NumPy 1.24.
mask = np.zeros_like(corr, dtype=bool)
# Hide the upper triangle (diagonal included); it mirrors the lower one.
mask[np.triu_indices_from(mask)] = True
sns.heatmap(corr,mask=mask,annot=True,cbar=False,cmap='Blues')
plt.show()

## Categorical & Numerical Features

In [None]:
# Split the test-set columns by dtype: object columns are categorical, the rest numeric.
# The test set is used so that the target column is not counted as a feature.
cat_features = [name for name in df_test.columns if df_test[name].dtype == 'object']
num_features = [name for name in df_test.columns if not (df_test[name].dtype == 'object')]
print('There are totally', len(num_features), 'numeric features. (Without Target)')
print('There are totally', len(cat_features), 'categorical features.')

In [None]:
# Names of the numeric feature columns.
num_features

In [None]:
# Names of the object-dtype (categorical) feature columns.
cat_features

In [None]:
# About Numeric Features
# One boxplot per numeric feature of the training data, laid out on a 6x3 grid.
fig = plt.figure(figsize=(30, 60))
fig.subplots_adjust(hspace=0.2, wspace=0.2)
for position, feature in enumerate(num_features, start=1):
    ax = fig.add_subplot(6, 3, position)
    sns.boxplot(y=feature, data=df_train, ax=ax)
plt.show()

We will not plot the categorical features yet, because 'tracking_id' & 'datetime' will be processed later. Further analysis of all features follows below.

In [None]:
# Exclude the identifier and the timestamp from the categorical feature list.
# Filtering (rather than list.remove) keeps this cell idempotent: re-running it
# no longer raises ValueError once the names are already gone.
cat_features = [name for name in cat_features if name not in ('tracking_id', 'datetime')]

## About Each Feature 

### wind_speed(m/s)

In [None]:
# Side-by-side boxplots of wind speed: training set (left) vs. test set (right).
box_col = 'wind_speed(m/s)'
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for ax, (title, frame) in zip(axes, (('Train Set', df_train), ('Test Set', df_test))):
    sns.boxplot(y=frame[box_col], ax=ax)
    ax.set_title(title)
    ax.set_ylabel(box_col)
plt.show()

### atmospheric_temperature(°C)

In [None]:
# Side-by-side boxplots of atmospheric temperature: training set (left) vs. test set (right).
box_col = 'atmospheric_temperature(°C)'
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for ax, (title, frame) in zip(axes, (('Train Set', df_train), ('Test Set', df_test))):
    sns.boxplot(y=frame[box_col], ax=ax)
    ax.set_title(title)
    ax.set_ylabel(box_col)
plt.show()

### shaft_temperature(°C)

In [None]:
# Side-by-side boxplots of shaft temperature: training set (left) vs. test set (right).
box_col = 'shaft_temperature(°C)'
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for ax, (title, frame) in zip(axes, (('Train Set', df_train), ('Test Set', df_test))):
    sns.boxplot(y=frame[box_col], ax=ax)
    ax.set_title(title)
    ax.set_ylabel(box_col)
plt.show()

### blades_angle(°)

In [None]:
# Side-by-side boxplots of blade angle: training set (left) vs. test set (right).
box_col = 'blades_angle(°)'
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for ax, (title, frame) in zip(axes, (('Train Set', df_train), ('Test Set', df_test))):
    sns.boxplot(y=frame[box_col], ax=ax)
    ax.set_title(title)
    ax.set_ylabel(box_col)
plt.show()

### gearbox_temperature(°C)

In [None]:
# Side-by-side boxplots of gearbox temperature: training set (left) vs. test set (right).
box_col = 'gearbox_temperature(°C)'
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for ax, (title, frame) in zip(axes, (('Train Set', df_train), ('Test Set', df_test))):
    sns.boxplot(y=frame[box_col], ax=ax)
    ax.set_title(title)
    ax.set_ylabel(box_col)
plt.show()

### engine_temperature(°C)

In [None]:
# Side-by-side boxplots of engine temperature: training set (left) vs. test set (right).
box_col = 'engine_temperature(°C)'
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for ax, (title, frame) in zip(axes, (('Train Set', df_train), ('Test Set', df_test))):
    sns.boxplot(y=frame[box_col], ax=ax)
    ax.set_title(title)
    ax.set_ylabel(box_col)
plt.show()

### motor_torque(N-m)

In [None]:
# Side-by-side boxplots of motor torque: training set (left) vs. test set (right).
box_col = 'motor_torque(N-m)'
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for ax, (title, frame) in zip(axes, (('Train Set', df_train), ('Test Set', df_test))):
    sns.boxplot(y=frame[box_col], ax=ax)
    ax.set_title(title)
    ax.set_ylabel(box_col)
plt.show()

### generator_temperature(°C)

In [None]:
# Side-by-side boxplots of generator temperature: training set (left) vs. test set (right).
box_col = 'generator_temperature(°C)'
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for ax, (title, frame) in zip(axes, (('Train Set', df_train), ('Test Set', df_test))):
    sns.boxplot(y=frame[box_col], ax=ax)
    ax.set_title(title)
    ax.set_ylabel(box_col)
plt.show()

### atmospheric_pressure(Pascal)

In [None]:
# Side-by-side boxplots of atmospheric pressure: training set (left) vs. test set (right).
box_col = 'atmospheric_pressure(Pascal)'
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for ax, (title, frame) in zip(axes, (('Train Set', df_train), ('Test Set', df_test))):
    sns.boxplot(y=frame[box_col], ax=ax)
    ax.set_title(title)
    ax.set_ylabel(box_col)
plt.show()

### area_temperature(°C)

In [None]:
# Side-by-side boxplots of area temperature: training set (left) vs. test set (right).
box_col = 'area_temperature(°C)'
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for ax, (title, frame) in zip(axes, (('Train Set', df_train), ('Test Set', df_test))):
    sns.boxplot(y=frame[box_col], ax=ax)
    ax.set_title(title)
    ax.set_ylabel(box_col)
plt.show()

### windmill_body_temperature(°C)

In [None]:
# Side-by-side boxplots of windmill body temperature: training set (left) vs. test set (right).
box_col = 'windmill_body_temperature(°C)'
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for ax, (title, frame) in zip(axes, (('Train Set', df_train), ('Test Set', df_test))):
    sns.boxplot(y=frame[box_col], ax=ax)
    ax.set_title(title)
    ax.set_ylabel(box_col)
plt.show()

### wind_direction(°)

In [None]:
# Side-by-side boxplots of wind direction: training set (left) vs. test set (right).
box_col = 'wind_direction(°)'
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for ax, (title, frame) in zip(axes, (('Train Set', df_train), ('Test Set', df_test))):
    sns.boxplot(y=frame[box_col], ax=ax)
    ax.set_title(title)
    ax.set_ylabel(box_col)
plt.show()

### resistance(ohm)

In [None]:
# Side-by-side boxplots of resistance: training set (left) vs. test set (right).
box_col = 'resistance(ohm)'
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for ax, (title, frame) in zip(axes, (('Train Set', df_train), ('Test Set', df_test))):
    sns.boxplot(y=frame[box_col], ax=ax)
    ax.set_title(title)
    ax.set_ylabel(box_col)
plt.show()

### rotor_torque(N-m)

In [None]:
# Side-by-side boxplots of rotor torque: training set (left) vs. test set (right).
box_col = 'rotor_torque(N-m)'
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for ax, (title, frame) in zip(axes, (('Train Set', df_train), ('Test Set', df_test))):
    sns.boxplot(y=frame[box_col], ax=ax)
    ax.set_title(title)
    ax.set_ylabel(box_col)
plt.show()

### blade_length(m)

In [None]:
# Side-by-side boxplots of blade length: training set (left) vs. test set (right).
box_col = 'blade_length(m)'
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for ax, (title, frame) in zip(axes, (('Train Set', df_train), ('Test Set', df_test))):
    sns.boxplot(y=frame[box_col], ax=ax)
    ax.set_title(title)
    ax.set_ylabel(box_col)
plt.show()

### blade_breadth(m)

In [None]:
# Side-by-side boxplots of blade breadth: training set (left) vs. test set (right).
box_col = 'blade_breadth(m)'
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for ax, (title, frame) in zip(axes, (('Train Set', df_train), ('Test Set', df_test))):
    sns.boxplot(y=frame[box_col], ax=ax)
    ax.set_title(title)
    ax.set_ylabel(box_col)
plt.show()

### windmill_height(m)

In [None]:
# Side-by-side boxplots of windmill height: training set (left) vs. test set (right).
box_col = 'windmill_height(m)'
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for ax, (title, frame) in zip(axes, (('Train Set', df_train), ('Test Set', df_test))):
    sns.boxplot(y=frame[box_col], ax=ax)
    ax.set_title(title)
    ax.set_ylabel(box_col)
plt.show()

***

### turbine_status

In [None]:
# Category frequencies of turbine_status: training set (left) vs. test set (right).
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
for ax, (title, frame) in zip(axes, (('Train Set', df_train), ('Test Set', df_test))):
    sns.countplot(data=frame, x='turbine_status', ax=ax)
    ax.set_title(title)
plt.show()

### cloud_level

In [None]:
# Category frequencies of cloud_level: training set (left) vs. test set (right).
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
for ax, (title, frame) in zip(axes, (('Train Set', df_train), ('Test Set', df_test))):
    sns.countplot(data=frame, x='cloud_level', ax=ax)
    ax.set_title(title)
plt.show()

## About Target - windmill_generated_power(kW/h)

In [None]:
# Summary statistics of the target column.
df_train['windmill_generated_power(kW/h)'].describe()

In [None]:
# Distribution of the target with a KDE overlay.
# kde must be a boolean; the previous 'Ture' string only worked because any
# non-empty string is truthy.
sns.displot(df_train['windmill_generated_power(kW/h)'], kde=True)

In [None]:
# Numeric Features & Target
# One scatter plot per numeric feature against the target, on a 6x3 grid.
fig = plt.figure(figsize=(30, 60))
fig.subplots_adjust(hspace=0.2, wspace=0.2)
for position, feature in enumerate(num_features, start=1):
    ax = fig.add_subplot(6, 3, position)
    sns.scatterplot(x=feature, y='windmill_generated_power(kW/h)', data=df_train, alpha=0.5, ax=ax)
    ax.set_xlabel(feature)
    ax.set_ylabel('windmill_generated_power(kW/h)')
plt.show()

In [None]:
# Categorical Features & Target
# One violin plot of the target per categorical feature, stacked vertically (2 rows).
fig = plt.figure(figsize=(15, 20))
fig.subplots_adjust(hspace=0.2, wspace=0.2)
for position, feature in enumerate(cat_features, start=1):
    ax = fig.add_subplot(2, 1, position)
    sns.violinplot(x=feature, y="windmill_generated_power(kW/h)", data=df_train, ax=ax)
    ax.set_xlabel(feature)
    ax.set_ylabel('windmill_generated_power(kW/h)')
plt.show()

## After EDA

1. 'tracking_id' can be deleted as it's useless.
2. 'windmill_body_temperature(°C)' seems to have a different distribution in the training and test sets, and it is not highly correlated with the target, so we may drop it.
3.  Some features have negative values. A negative wind_speed may represent wind flowing in the opposite direction, but most of the negative values are meaningless sentinels, such as -99 and -999, which can be considered missing values. These extreme outliers should be removed.
4. 'datetime' should be converted from object to timestamp or datetime format. We can also extract new features like 'Hour' & 'Month'.
5. Features with null values need to be imputed.  
6. Some pairs of features have high correlation or missing value correlation, we should pay attention to these features.

# Data Processing

In [None]:
# Work on independent copies: a plain assignment only aliases the frames, so
# every in-place preprocessing step below would also rewrite df_train / df_test.
train = df_train.copy()
test = df_test.copy()

## Handle Extreme Outliers

In [None]:
# About The Numeric Features & Outliers (Train set)
# One 30-bin histogram per numeric feature, on a 6x3 grid.
fig = plt.figure(figsize=(30, 60))
fig.subplots_adjust(hspace=0.2, wspace=0.2)
for position, feature in enumerate(num_features, start=1):
    ax = fig.add_subplot(6, 3, position)
    ax.hist(x=feature, data=train, bins=30, rwidth=0.9)
    ax.set_title(feature)
plt.show()

It's obvious that 'atmospheric_temperature(°C)', 'shaft_temperature(°C)', 'blades_angle(°)', 'windmill_body_temperature(°C)', 'resistance(ohm)', 'rotor_torque(N-m)', 'blade_length(m)' have extreme outliers like -99.

In [None]:
def nan_replace_outlier(col, value):
    """Replace a sentinel value with NaN in one column of both data sets.

    Operates on the notebook-level ``train`` and ``test`` frames.

    Parameters
    ----------
    col : str
        Name of the column to clean; must exist in both frames.
    value : scalar
        Sentinel (e.g. -99) to turn into ``np.nan``.
    """
    # Assign the result back instead of calling replace(..., inplace=True) on
    # frame[col]: the chained in-place form is deprecated and does nothing
    # under pandas Copy-on-Write.
    train[col] = train[col].replace(value, np.nan)
    test[col] = test[col].replace(value, np.nan)

In [None]:
# Columns in which -99 is treated as a missing-value sentinel.
col_out = [
    'atmospheric_temperature(°C)',
    'shaft_temperature(°C)',
    'blades_angle(°)',
    'windmill_body_temperature(°C)',
    'resistance(ohm)',
    'rotor_torque(N-m)',
    'blade_length(m)',
]

for sentinel_col in col_out:
    nan_replace_outlier(sentinel_col, -99)

# windmill_body_temperature is additionally cleaned of the -999 sentinel.
nan_replace_outlier('windmill_body_temperature(°C)', -999)

## Handle Missing Values

Replace the missing values of numeric features with the `mean`, and those of categorical features with the `mode`.

In [None]:
# Impute missing values: column mean for numeric features, column mode for
# categorical ones. Each frame is filled from its own statistics.
# Results are assigned back rather than using fillna(..., inplace=True) on
# frame[col]; the chained in-place form is deprecated and does nothing under
# pandas Copy-on-Write.
for col in num_features:
    train[col] = train[col].fillna(train[col].mean())
    test[col] = test[col].fillna(test[col].mean())

for col in cat_features:
    train[col] = train[col].fillna(train[col].mode()[0])
    test[col] = test[col].fillna(test[col].mode()[0])

# NOTE(review): missing targets are filled with the target's mode, i.e. labels
# are fabricated for those rows; dropping them may be preferable. TODO confirm.
train['windmill_generated_power(kW/h)'] = train['windmill_generated_power(kW/h)'].fillna(
    train['windmill_generated_power(kW/h)'].mode()[0]
)

In [None]:
# Remaining missing values per training column after imputation.
train.isna().sum()

In [None]:
# Remaining missing values per test column after imputation.
test.isna().sum()

## Convert Datetime

Year and second are not that important in this case. 

In [None]:
# Parse the training timestamps, then derive month/day/hour/minute columns from them.
train['datetime'] = pd.to_datetime(train['datetime'], format='%Y/%m/%d %H:%M:%S')
for part in ('month', 'day', 'hour', 'minute'):
    train[part] = getattr(train['datetime'].dt, part)

In [None]:
# Parse the test timestamps, then derive month/day/hour/minute columns from them.
test['datetime'] = pd.to_datetime(test['datetime'], format='%Y/%m/%d %H:%M:%S')
for part in ('month', 'day', 'hour', 'minute'):
    test[part] = getattr(test['datetime'].dt, part)

In [None]:
# Confirm the new dtypes and the added date-part columns.
train.info()

In [None]:
# Maximum generated power per month.
# Grouping on the integer column keeps the x-axis in calendar order; casting
# the key to str sorted it lexicographically (1, 10, 11, 12, 2, ...).
plt.figure(figsize=(12,5))
month_power = train.groupby('month')['windmill_generated_power(kW/h)'].max()
plt.plot(month_power.index.values, month_power)
plt.xlabel('Month')
plt.ylabel('windmill_generated_power(kW/h)')
plt.show()

In [None]:
# Maximum generated power per day of month.
# Grouping on the integer column keeps the x-axis in numeric order; casting
# the key to str sorted it lexicographically (1, 10, 11, ..., 2, 20, ...).
plt.figure(figsize=(12,5))
day_power = train.groupby('day')['windmill_generated_power(kW/h)'].max()
plt.plot(day_power.index.values, day_power)
plt.xlabel('Day')
plt.ylabel('windmill_generated_power(kW/h)')
plt.show()

In [None]:
# Maximum generated power per hour of day.
# Plots hour_power; the previous version drew day_power under an 'Hour' label
# (copy-paste slip). Grouping on the integer column also keeps the x-axis in
# numeric order instead of the lexicographic order of string keys.
plt.figure(figsize=(12,5))
hour_power = train.groupby('hour')['windmill_generated_power(kW/h)'].max()
plt.plot(hour_power.index.values, hour_power)
plt.xlabel('Hour')
plt.ylabel('windmill_generated_power(kW/h)')
plt.show()

## Encoding On Categorical Features

Label encoding for cloud_level

In [None]:
# Integer-encode cloud_level with one encoder shared by both frames, so that
# the same label maps to the same code in train and test.
le = LabelEncoder().fit(['Medium', 'Low', 'Extremely Low'])
for frame in (train, test):
    frame['cloud_level'] = le.transform(frame['cloud_level'])

Get_dummies for turbine_status

In [None]:
# One-hot encode turbine_status and append the indicator columns to each frame.
train_dum = pd.get_dummies(train['turbine_status'])
# Align the test indicators to the training columns: a category absent from
# test becomes an all-zero column and a category unseen in training is dropped,
# so both frames end up with the same feature layout.
test_dum = pd.get_dummies(test['turbine_status']).reindex(columns=train_dum.columns, fill_value=0)

train = pd.concat([train, train_dum], axis=1)
test = pd.concat([test, test_dum], axis=1)

In [None]:
# The original categorical column is redundant once its indicator columns exist.
for frame in (train, test):
    frame.drop(columns=['turbine_status'], inplace=True)

train.info()

In [None]:
# Lower-triangle correlation heatmap of the processed training data.
# Only numeric/bool columns are correlated: pandas >= 2.0 raises on the
# remaining object column (tracking_id) instead of silently dropping it.
corr = train.select_dtypes(include=['number', 'bool']).corr()
plt.figure(figsize=(25,25))
# Builtin bool: the np.bool alias was removed in NumPy 1.24.
mask = np.zeros_like(corr, dtype=bool)
# Hide the upper triangle (diagonal included); it mirrors the lower one.
mask[np.triu_indices_from(mask)] = True
sns.heatmap(corr,mask=mask,annot=True,cbar=False,cmap='Blues')
plt.show()

# Modeling

In [None]:
# The identifier and the raw timestamp are not used as model inputs.
for frame in (train, test):
    frame.drop(columns=['tracking_id', 'datetime'], inplace=True)

In [None]:
# Separate the target (Y) from the feature matrix (X); X_test is the frame to predict on.
target_col = 'windmill_generated_power(kW/h)'
Y = train[target_col]
X = train.drop(columns=[target_col])

X_test = test

print(X.shape, Y.shape)
print(X_test.shape)

In [None]:
# Scale the features with a RobustScaler fitted on X; the same fitted scaler
# is then applied to X_test.
# NOTE(review): the scaler is fitted on all of X before the train/hold-out
# split in the next cell, so hold-out rows influence the scaling statistics.
scaler = RobustScaler()
X = scaler.fit_transform(X)
X_test = scaler.transform(X_test)

In [None]:
# Hold out 14% of the labelled rows for evaluation (fixed seed for reproducibility).
x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.86, random_state=42)

for features, labels in ((x_train, y_train), (x_test, y_test)):
    print(features.shape, labels.shape)

## LinearRegression

In [None]:
# Ordinary least squares baseline; prints R^2 on the training split, then on the hold-out split.
lr_model = LinearRegression().fit(x_train, y_train)
y_train_pred, y_test_pred = lr_model.predict(x_train), lr_model.predict(x_test)
for actual, predicted in ((y_train, y_train_pred), (y_test, y_test_pred)):
    print(r2_score(y_true=actual, y_pred=predicted))

## Ridge

In [None]:
# Ridge regression with alpha chosen by 5-fold CV on R^2; prints train then hold-out R^2.
ridge_alphas = [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1, 1.0, 10]
ridge_model = RidgeCV(scoring="r2", alphas=ridge_alphas, cv=5).fit(x_train, y_train)
y_train_pred, y_test_pred = ridge_model.predict(x_train), ridge_model.predict(x_test)
for actual, predicted in ((y_train, y_train_pred), (y_test, y_test_pred)):
    print(r2_score(y_true=actual, y_pred=predicted))

## Lasso

In [None]:
# Lasso regression with alpha chosen by 5-fold CV; prints train then hold-out R^2.
lasso_alphas = [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1, 1.0, 10]
lasso_model = LassoCV(alphas=lasso_alphas, cv=5).fit(x_train, y_train)
y_train_pred, y_test_pred = lasso_model.predict(x_train), lasso_model.predict(x_test)
for actual, predicted in ((y_train, y_train_pred), (y_test, y_test_pred)):
    print(r2_score(y_true=actual, y_pred=predicted))

## ElasticNet

In [None]:
# Elastic net with l1_ratio and alpha chosen by 5-fold CV; prints train then hold-out R^2.
enet_model = ElasticNetCV(
    l1_ratio=[0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1],
    alphas=[1, 0.1, 0.01, 0.001, 0.0005],
    cv=5,
).fit(x_train, y_train)
y_train_pred, y_test_pred = enet_model.predict(x_train), enet_model.predict(x_test)
for actual, predicted in ((y_train, y_train_pred), (y_test, y_test_pred)):
    print(r2_score(actual, predicted))

## AdaBoostRegressor

In [None]:
# AdaBoost with default hyper-parameters; prints train then hold-out R^2.
# random_state is pinned (same seed as the other tree ensembles in this
# notebook) so the scores are reproducible across runs.
ab_model = AdaBoostRegressor(random_state=0)
ab_model.fit(x_train,y_train)
y_train_pred = ab_model.predict(x_train)
y_test_pred = ab_model.predict(x_test)
print(r2_score(y_train,y_train_pred))
print(r2_score(y_test,y_test_pred))

## RandomForestRegressor

In [None]:
# Random forest with default hyper-parameters; prints train then hold-out R^2.
# random_state is pinned (same seed as the other tree ensembles in this
# notebook) so the scores, and the averaged ensemble built from this model,
# are reproducible across runs.
rf_model = RandomForestRegressor(random_state=0)
rf_model.fit(x_train,y_train)
y_train_pred = rf_model.predict(x_train)
y_test_pred = rf_model.predict(x_test)
print(r2_score(y_train,y_train_pred))
print(r2_score(y_test,y_test_pred))

## GradientBoostingRegressor

In [None]:
# Gradient boosting (500 trees, depth 5); prints train then hold-out R^2.
# 'squared_error' is the current name of the 'mse' split criterion, which was
# deprecated in scikit-learn 1.0 and removed in 1.2.
gb_model = GradientBoostingRegressor(criterion='squared_error',random_state=0,max_depth=5, n_estimators=500,min_samples_split=2,min_samples_leaf=2)
gb_model.fit(x_train,y_train)
y_train_pred = gb_model.predict(x_train)
y_test_pred = gb_model.predict(x_test)
print(r2_score(y_train,y_train_pred))
print(r2_score(y_test,y_test_pred))

## ExtraTreesRegressor

In [None]:
# Extra-trees ensemble (1000 trees, depth <= 20); prints train then hold-out R^2.
# 'squared_error' is the current name of the 'mse' split criterion, which was
# deprecated in scikit-learn 1.0 and removed in 1.2.
extra_model = ExtraTreesRegressor(criterion='squared_error', random_state=0, n_jobs=-1, min_samples_leaf=1, max_depth=20, min_samples_split=3, n_estimators=1000)
extra_model.fit(x_train, y_train)
y_train_pred = extra_model.predict(x_train)
y_test_pred = extra_model.predict(x_test)
print(r2_score(y_train,y_train_pred))
print(r2_score(y_test,y_test_pred))

## SVR

In [None]:
# Support vector regression with default hyper-parameters; prints train then hold-out R^2.
svr_model = SVR().fit(x_train, y_train)
y_train_pred, y_test_pred = svr_model.predict(x_train), svr_model.predict(x_test)
for actual, predicted in ((y_train, y_train_pred), (y_test, y_test_pred)):
    print(r2_score(actual, predicted))

## LGBMRegressor

In [None]:
# LightGBM regressor (300 trees, depth 6, 30 leaves); prints train then hold-out R^2.
lgbm_model = LGBMRegressor(
    learning_rate=0.05,
    max_depth=6,
    n_estimators=300,
    num_leaves=30,
)
lgbm_model.fit(x_train, y_train)
y_train_pred, y_test_pred = lgbm_model.predict(x_train), lgbm_model.predict(x_test)
for actual, predicted in ((y_train, y_train_pred), (y_test, y_test_pred)):
    print(r2_score(actual, predicted))

## XGBRegressor

In [None]:
# XGBoost regressor (500 trees, depth 5, L1/L2 regularised); prints train then hold-out R^2.
xgb_model = XGBRegressor(
    n_estimators=500,
    max_depth=5,
    booster='gbtree',
    n_jobs=-1,
    learning_rate=0.1,
    reg_lambda=0.01,
    reg_alpha=0.3,
)
xgb_model.fit(x_train, y_train)
y_train_pred, y_test_pred = xgb_model.predict(x_train), xgb_model.predict(x_test)
for actual, predicted in ((y_train, y_train_pred), (y_test, y_test_pred)):
    print(r2_score(actual, predicted))

## Mix Model

In [None]:
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    """Ensemble regressor that predicts the unweighted mean of its base models.

    Parameters
    ----------
    models : iterable of estimators
        Base regressors. They are cloned in ``fit``, so the objects passed in
        are never fitted or modified by this class.
    """

    def __init__(self, models):
        self.models = models

    def fit(self, X, y):
        """Clone every base model, fit each clone on ``(X, y)`` and return ``self``."""
        # Fresh clones are stored in ``models_``.
        self.models_ = list(map(clone, self.models))
        for fitted in self.models_:
            fitted.fit(X, y)
        return self

    def predict(self, X):
        """Return the row-wise mean of the fitted clones' predictions for ``X``."""
        # One column per base model, one row per sample.
        stacked = np.column_stack([fitted.predict(X) for fitted in self.models_])
        return stacked.mean(axis=1)

In [None]:
# Average the five tree-based models; prints train then hold-out R^2 of the ensemble.
base_models = (rf_model, gb_model, extra_model, lgbm_model, xgb_model)
averaged_model = AveragingModels(models=base_models)
averaged_model.fit(x_train, y_train)
y_train_pred, y_test_pred = averaged_model.predict(x_train), averaged_model.predict(x_test)
for actual, predicted in ((y_train, y_train_pred), (y_test, y_test_pred)):
    print(r2_score(actual, predicted))

# Submission

In [None]:
# Predict the target for the scaled test features with the averaged ensemble.
result = averaged_model.predict(X_test)

In [None]:
# Columns from the test frame that accompany each prediction in the submission.
test_final = df_test[['tracking_id','datetime']]

In [None]:
# Attach the predictions to the submission frame.
# assign() returns a new frame that is bound back to test_final, so the column
# is guaranteed to be on the frame written to CSV. The previous version wrote
# to a separate wrapper frame and relied on it sharing storage with
# test_final, which does not hold under pandas Copy-on-Write.
test_final = test_final.assign(**{'windmill_generated_power(kW/h)': result})

In [None]:
# Preview the submission frame.
test_final.head()

In [None]:
# (rows, columns) of the submission frame.
print(test_final.shape)

In [None]:
# Write the submission file (header row included, index omitted).
test_final.to_csv('./sample_submission.csv',header=True,index=False)