In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 
#Installing windrose to have the wind direction overview
!pip install windrose
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
# Use seaborn style defaults and set the default figure size
sns.set(rc={'figure.figsize':(11, 4)})
from windrose import WindroseAxes

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# Any results you write to the current directory are saved as output.

In [None]:
# Load the wind turbine SCADA records (T1.csv) into a pandas DataFrame.
df = pd.read_csv(filepath_or_buffer='../input/wind-turbine-scada-dataset/T1.csv')

In [None]:
# Preview the first rows of the dataframe (DataFrame.head() returns 5 rows by default)
df.head()

In [None]:
# Count the missing values in each column of the dataframe
df.isnull().sum()

In [None]:
# Use the 'Date/Time' column as the index; set_index also removes it from the columns.
df = df.set_index('Date/Time')

In [None]:
# Preview the dataframe now that 'Date/Time' is the index rather than a column
df.head()

In [None]:
# Draw every signal in its own panel, using point markers only (no connecting line).
cols_plot = [
    'LV ActivePower (kW)',
    'Wind Speed (m/s)',
    'Theoretical_Power_Curve (KWh)',
    'Wind Direction (°)',
]
axes = df.loc[:, cols_plot].plot(
    subplots=True,
    figsize=(11, 9),
    linestyle='None',
    marker='.',
    alpha=0.5,
)

In [None]:
# Plot the kernel density estimate of each of the first four columns on a 2x2 grid.
plt.figure(figsize=(10, 8))
for position, column in enumerate(df.columns[:4], start=1):
    plt.subplot(2, 2, position)
    # `shade=` is deprecated in seaborn in favour of `fill=` and is scheduled to
    # become an error, so use `fill=True` to keep the filled-curve look.
    sns.kdeplot(df[column], fill=True)
    plt.title(column)
plt.tight_layout()
plt.show()

In [None]:
# Wind rose: bar chart of wind speed grouped by wind direction, normalised (normed=True).
wind_direction = df['Wind Direction (°)']
wind_speed = df['Wind Speed (m/s)']
ax = WindroseAxes.from_ax()
ax.bar(wind_direction, wind_speed, normed=True, opening=0.8, edgecolor='white')
ax.set_legend()

The wind rose plot above shows that the wind comes mostly from the north-east, while a significant share of the wind also comes from the south-west.

In [None]:
# Print the largest and smallest wind direction to help choose suitable bin edges
direction = df['Wind Direction (°)']
print(direction.max(), direction.min(), sep='\n')

In [None]:
# Bin edges for the wind direction: 0, 45, ..., 360 (the stop value 375 is excluded,
# so 360 is the last edge produced).
bins_range = np.arange(0, 375, step=45)

In [None]:
# Show the wind direction bin edges as a sanity check before binning
print(bins_range)

In [None]:
#Map a value to the label of the bin that contains it
def binning(x, bins):
    """Return the label ``'[lower-upper]'`` of the bin in ``bins`` that contains ``x``.

    Bins are half-open, ``lower <= x < upper``, except that a value equal to the
    last edge is assigned to the last bin instead of falling past it.

    Parameters
    ----------
    x : scalar
        Value to classify.
    bins : 1-D indexable sequence (e.g. ``numpy.ndarray``)
        Bin edges in increasing order.

    Returns
    -------
    str
        ``'[{lower}-{upper}]'`` formatted from the two surrounding edges exactly
        as they are stored in ``bins``.

    Raises
    ------
    ValueError
        If ``x`` is NaN or lies outside ``[bins[0], bins[-1]]``. Previously a value
        below the first edge silently produced a wrap-around label such as
        ``'[360-0]'``, and a value above the last edge (or NaN) failed with an
        IndexError.
    """
    lowest, highest = bins[0], bins[-1]
    # The chained comparison is False for NaN too, so NaN is rejected here as well.
    if not (lowest <= x <= highest):
        raise ValueError(
            'value {0!r} is outside the bin range [{1}, {2}]'.format(x, lowest, highest)
        )
    # Close the right side only for the very last edge so that x == highest lands
    # in the final bin; everywhere else the bins stay half-open on the right.
    close_right = bool(x == highest)
    upper_idx = int(np.digitize([x], bins, right=close_right)[0])
    return '[{0}-{1}]'.format(bins[upper_idx - 1], bins[upper_idx])

In [None]:
# Label every record with the wind direction bin it falls into
df['Bin'] = df['Wind Direction (°)'].map(lambda angle: binning(angle, bins=bins_range))

In [None]:
#Group the records by wind direction bin and compute the per-bin standard deviation and mean
grouped = df.groupby('Bin')
grouped_std = grouped.std()
grouped_mean = grouped.mean()
# The bin labels are strings, so the default group order is lexicographic
# ('[135-180]' sorts before '[45-90]'). Rebuild the labels from consecutive bin
# edges and reindex so the rows appear in ascending direction order.
direction_labels = ['[{0}-{1}]'.format(lower, upper)
                    for lower, upper in zip(bins_range[:-1], bins_range[1:])]
grouped_mean = grouped_mean.reindex(direction_labels)
# Display every bin: there are only eight, and head() would cut the table to five rows.
grouped_mean

The analysis above shows that the highest average wind speed was recorded in the 180°-225° direction bin.

Contrary to the impression given by the wind rose plot, the south to south-west sector looks like a good site for a wind turbine because it has the highest average wind speed. This sector also has the highest theoretical power and LV active power.

In [None]:
# Print the largest and smallest wind speed to help choose suitable bin edges
speed = df['Wind Speed (m/s)']
print(speed.max(), speed.min(), sep='\n')

In [None]:
# Bin edges for the wind speed: 0.0, 0.5, ..., 25.5 (the stop value 26 is excluded)
bins_range_ws = np.arange(0, 26, step=0.5)

In [None]:
# Overwrite the 'Bin' column with the wind speed bin of every record
df['Bin'] = df['Wind Speed (m/s)'].map(lambda speed: binning(speed, bins=bins_range_ws))

In [None]:
# Per wind speed bin: mean and standard deviation of every column
grouped = df.groupby(by='Bin')
grouped_mean = grouped.mean()
grouped_std = grouped.std()
grouped_mean

In [None]:
# Put the wind speed bins in ascending numeric order (the string labels sort lexicographically).
step = bins_range_ws[1] - bins_range_ws[0]
# Every edge except the last one opens a bin, so the last edge is left out here.
new_index = ['[{0}-{1}]'.format(lower, lower + step) for lower in bins_range_ws[:-1]]
grouped_mean = grouped_mean.reindex(new_index)

In [None]:
# Display the per-bin means after reordering the wind speed bins
grouped_mean

Looking at the table above, it can be assumed that the cut-in wind speed is 3.0-3.5 m/s, the rated wind speed is 12.5-13.0 m/s and the cut-out wind speed is around 25 m/s. This analysis will help us determine better filter conditions for the power curve analysis.

In [None]:
# Power curve analysis: theoretical power against wind speed
theory_fig, theory_ax = plt.subplots()
theory_ax.scatter(df['Wind Speed (m/s)'], df['Theoretical_Power_Curve (KWh)'])
theory_ax.set_xlabel('Wind speed (m/s)')
theory_ax.set_ylabel('Theoretical_Power (KWh)')
theory_ax.grid(True)
theory_ax.legend([' Theoretical_Power_Curve'], loc='upper left')
plt.show()

In [None]:
# Measured power curve: LV ActivePower (kW) against wind speed
active_fig, active_ax = plt.subplots()
active_ax.scatter(df['Wind Speed (m/s)'], df['LV ActivePower (kW)'])
active_ax.set_xlabel('Wind speed (m/s)')
active_ax.set_ylabel('LV ActivePower (kW)')
active_ax.grid(True)
active_ax.legend([' LV ActivePower (kW) CP_CURVE'], loc='upper left')
plt.show()

Using the information gathered above, we can now set a filter condition for our LV ActivePower (kW) power curve

In [None]:
#Condition 1
#Remove downtime events, i.e. near-zero power at high wind speeds:
#keep a record when the wind speed is below 4.5 m/s or the active power exceeds 100 kW.
light_wind = df['Wind Speed (m/s)'] < 4.5
producing = df['LV ActivePower (kW)'] > 100.0
new = df[light_wind | producing]

In [None]:
#Condition 2
#Keep a record when the wind speed is below 12.5 m/s or the active power is at least 3000 kW.
below_12_5 = new['Wind Speed (m/s)'] < 12.5
at_least_3000 = new['LV ActivePower (kW)'] >= 3000
new_1 = new[below_12_5 | at_least_3000]

In [None]:
#Condition 3
#Keep a record when the wind speed is below 9.5 m/s or the active power is at least 1500 kW.
below_9_5 = new_1['Wind Speed (m/s)'] < 9.5
at_least_1500 = new_1['LV ActivePower (kW)'] >= 1500
new_2 = new_1[below_9_5 | at_least_1500]

In [None]:
#Filtered LV ActivePower (kW) power curve plotted together with the theoretical power curve
# Each scatter carries its own label so the legend has one entry per series. Passing a
# single combined string to plt.legend() labelled only the first scatter (the filtered
# data) and left the theoretical curve without a legend entry.
plt.scatter(new_2['Wind Speed (m/s)'], new_2['LV ActivePower (kW)'],
            label='Filtered LV ActivePower (kW) CP_CURVE')
plt.scatter(df['Wind Speed (m/s)'], df['Theoretical_Power_Curve (KWh)'],
            label='Theoretical_Power_Curve (KWh)')
plt.ylabel('Power (kW)')
plt.xlabel('Wind speed (m/s)')
plt.grid(True)
plt.legend(loc='upper left')
plt.show()

The filtered power curve can still be improved. Suggestions for better filter conditions are welcome.