# Predicting Wind Turbine Output with a Random Forest

In [9]:
'''
Read in the wind turbine data, store them in a Pandas dataframe, 
do some cleanup, and compute basic statistics.
The input csv file contains the following fields:

ID:         Unique ID for each observation
ZONEID:     Zone/Turbine ID, a number between 1 and 10
TIMESTAMP:  Time of observation, in the format "YYYYMMDD h:mm" or "YYYYMMDD hh:mm"
TARGETVAR:  Wind turbine output
U10:        Zonal wind vector at 10 m
V10:        Meridional wind vector at 10 m
U100:       Zonal wind vector at 100 m
V100:       Meridional wind vector at 100 m

'''
import pandas as pd
import numpy as np

# Function to check the training data and make some conversions
def transformDF(row):
    """
    Validate one raw training observation and convert its timestamp.

    Parameters
    ----------
    row : positionally indexed sequence
        row[0] = ID, row[1] = ZONEID, row[2] = TIMESTAMP string
        ("YYYYMMDD h:mm" or "YYYYMMDD hh:mm"), row[3] = TARGETVAR,
        row[4]..row[7] = U10, V10, U100, V100.

    Returns
    -------
    list of 10 values
        [ID, ZONEID, year, day_of_year, hour, U10, V10, U100, V100, TARGETVAR]

    Raises
    ------
    AssertionError
        If the year, month, day, hour or minute parsed from the timestamp is
        outside its allowed range.
    """
    stamp = row[2]

    year = int(stamp[:4])
    assert 2012 <= year <= 2013, "Invalid year %i" %year

    month = int(stamp[4:6])
    assert 1 <= month <= 12, "Invalid month %i" %month

    # A plain divisible-by-4 test is enough because the year was just
    # restricted to 2012-2013.
    february = 29 if year % 4 == 0 else 28
    month_lengths = (31, february, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31)

    day = int(stamp[6:8])
    assert 1 <= day <= month_lengths[month - 1], "Invalid day of month %i" %day

    # Days in all fully elapsed months plus the day within the current month.
    day_of_year = sum(month_lengths[:month - 1]) + day

    # The time of day starts after the 8-digit date and one separator character.
    clock = stamp[9:].split(":")
    hour = int(clock[0])
    assert 0 <= hour < 24, "Invalid hour %i" %hour
    # The minute is only validated; it is not part of the returned row.
    minute = int(clock[1])
    assert 0 <= minute < 60, "Invalid minute %i" %minute

    # The target (row[3]) is moved to the last position.
    return [row[0], row[1], year, day_of_year, hour,
            row[4], row[5], row[6], row[7], row[3]]


# Load the raw training observations (comma-separated, first line is the header).
df0         = pd.read_csv("data/Train_O4UPEyW.csv", sep=',', header=0)

# transformDF returns 10 values per row but the file only has 8 columns.
# Pad the frame with two placeholder columns so that the raw row-wise apply
# below has one slot for every returned value.
df0['new1'] = df0['V100']
df0['new2'] = df0['V100']

# `broadcast` and `reduce` used to be passed here at their default values; both
# keywords have been removed from DataFrame.apply, so they are omitted.
df1         = df0.apply(transformDF, axis=1, raw=True)

# The result still carries df0's column labels although transformDF reorders the
# values, so every column is relabelled by position:
#   Id, ZoneId, Year, DayOfYear, Hour, U10, V10, U100, V100, Target
df1         = df1.rename(columns={'ID':'Id','ZONEID':'ZoneId','TIMESTAMP':'Year','TARGETVAR':'DayOfYear',
                                  'U10':'Hour','V10':'U10','U100':'V10','V100':'U100','new1':'V100',
                                  'new2':'Target'})

# Give the identifier and calendar columns integer dtypes; ZoneId additionally
# becomes a categorical so it can be dummy-encoded later.
df1['Id']        = df1['Id'].astype(int)
df1['ZoneId']    = df1['ZoneId'].astype(int).astype("category")
df1['Year']      = df1['Year'].astype(int)
df1['DayOfYear'] = df1['DayOfYear'].astype(int)
df1['Hour']      = df1['Hour'].astype(int)

print('\nDataframe df0:\n%s' %df0.head())
print('\nDataframe df1:\n%s' %df1.head())
print('\nData Types in df1:\n%s' %df1.dtypes)
print('\nLength of df0: %i, df1: %i' %(len(df0),len(df1)))


Dataframe df0:
         ID  ZONEID      TIMESTAMP  TARGETVAR       U10       V10      U100  \
0  11010101       1  20120101 1:00   0.000000  2.124600 -2.681966  2.864280   
1  11010201       1  20120101 2:00   0.054879  2.521695 -1.796960  3.344859   
2  11010301       1  20120101 3:00   0.110234  2.672210 -0.822516  3.508448   
3  11010401       1  20120101 4:00   0.165116  2.457504 -0.143642  3.215233   
4  11010501       1  20120101 5:00   0.156940  2.245898  0.389576  2.957678   

       V100      new1      new2  
0 -3.666076 -3.666076 -3.666076  
1 -2.464761 -2.464761 -2.464761  
2 -1.214093 -1.214093 -1.214093  
3 -0.355546 -0.355546 -0.355546  
4  0.332701  0.332701  0.332701  

Dataframe df1:
         Id ZoneId  Year  DayOfYear  Hour       U10       V10      U100  \
0  11010101      1  2012          1     1  2.124600 -2.681966  2.864280   
1  11010201      1  2012          1     2  2.521695 -1.796960  3.344859   
2  11010301      1  2012          1     3  2.672210 -0.822516  3

In [10]:
'''
Convert categorical variables in dataframe: dummy-encode ZoneId (dropping the
first level) and remove the Id column, which only identifies the observation.
'''
df2 = pd.get_dummies(df1,drop_first=True)
# Pass axis by keyword: the positional form drop('Id', 1) is no longer accepted
# by recent pandas releases.
df2 = df2.drop('Id', axis=1)
print('\nDataframe df2:\n%s' %df2.head())
print('\nData Types in df2:\n%s' %df2.dtypes)


Dataframe df2:
   Year  DayOfYear  Hour       U10       V10      U100      V100    Target  \
0  2012          1     1  2.124600 -2.681966  2.864280 -3.666076  0.000000   
1  2012          1     2  2.521695 -1.796960  3.344859 -2.464761  0.054879   
2  2012          1     3  2.672210 -0.822516  3.508448 -1.214093  0.110234   
3  2012          1     4  2.457504 -0.143642  3.215233 -0.355546  0.165116   
4  2012          1     5  2.245898  0.389576  2.957678  0.332701  0.156940   

   ZoneId_2  ZoneId_3  ZoneId_4  ZoneId_5  ZoneId_6  ZoneId_7  ZoneId_8  \
0       0.0       0.0       0.0       0.0       0.0       0.0       0.0   
1       0.0       0.0       0.0       0.0       0.0       0.0       0.0   
2       0.0       0.0       0.0       0.0       0.0       0.0       0.0   
3       0.0       0.0       0.0       0.0       0.0       0.0       0.0   
4       0.0       0.0       0.0       0.0       0.0       0.0       0.0   

   ZoneId_9  ZoneId_10  
0       0.0        0.0  
1       0.0   

In [76]:
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

# Convert pandas dataframe to its numpy representation, separating out features from target variable.
features = df2.columns.values.tolist()
features.remove("Target")
X = df2.as_matrix(columns=features)
Y = np.array(df2["Target"].tolist())

# Create training and test sets.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.106, random_state=0)

rfr = RandomForestRegressor(n_estimators=1000, criterion="mse", max_depth=30, min_samples_split=2, 
                      min_samples_leaf=5, max_features="sqrt", max_leaf_nodes=None, bootstrap=True,
                      oob_score=True, n_jobs=-1, random_state=0, verbose=0)

# Fit to (X,Y) for final submission.
%time rfr.fit(X, Y)

CPU times: user 6min 56s, sys: 4.58 s, total: 7min 1s
Wall time: 2min 13s


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=30,
           max_features='sqrt', max_leaf_nodes=None, min_samples_leaf=5,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=1000, n_jobs=-1, oob_score=True, random_state=0,
           verbose=0, warm_start=False)

In [75]:
'''
Examine fit results.
'''
# Pair every feature name with its importance and list them in decreasing order.
FeatureImportances = [[feature, importance] for (feature,importance) in zip(features,rfr.feature_importances_)]
print('Feature Importances:\n')
for fi in sorted(FeatureImportances, key=lambda x: x[1], reverse=True):
    print('   %10s: %f' %(fi[0],fi[1]))

# Root-mean-square error, computed with array arithmetic instead of a Python loop.
y_true, y_pred = y_test, rfr.predict(X_test)
rmse_test = np.sqrt(np.mean((np.asarray(y_true) - np.asarray(y_pred))**2))
print('\nRMSE on test data set: %s\n' %rmse_test)
print('Score of train data set: %s' %rfr.score(X_train,y_train))
print('Score of test data set:  %s\n' %rfr.score(X_test,y_test))

# NOTE: when the notebook is run top to bottom, rfr has been fit on all of
# (X, Y), which contains X_test. The "test" figures above are then in-sample
# and optimistic. The forest was built with oob_score=True, so the out-of-bag
# score is reported as an estimate that does not reuse each tree's own
# training samples.
print('Out-of-bag score:        %s\n' %rfr.oob_score_)

Feature Importances:

         V100: 0.256210
         U100: 0.234857
          V10: 0.175593
          U10: 0.166792
    DayOfYear: 0.041393
         Hour: 0.036123
    ZoneId_10: 0.019499
     ZoneId_3: 0.012050
     ZoneId_6: 0.010575
     ZoneId_5: 0.009509
     ZoneId_8: 0.007270
     ZoneId_4: 0.007185
     ZoneId_7: 0.006986
     ZoneId_2: 0.006331
         Year: 0.004897
     ZoneId_9: 0.004729

RMSE on test data set: 0.112389233368

Score of train data set: 0.866235491745
Score of test data set:  0.863619566853



In [77]:
'''
Prepare test data set in the same way as the training set, make predictions, and create output csv file.
'''
import csv

# Function to check the test data and make some conversions
def transformDFtt(row):
    """
    Validate one raw test-set observation and convert its timestamp.

    Same checks as for the training rows, but the input has no target column.

    Parameters
    ----------
    row : positionally indexed sequence
        row[0] = ID, row[1] = ZONEID, row[2] = TIMESTAMP string
        ("YYYYMMDD h:mm" or "YYYYMMDD hh:mm"), row[3]..row[6] = U10, V10,
        U100, V100.

    Returns
    -------
    list of 9 values
        [ID, ZONEID, year, day_of_year, hour, U10, V10, U100, V100]

    Raises
    ------
    AssertionError
        If the year, month, day, hour or minute parsed from the timestamp is
        outside its allowed range.
    """
    timestamp = row[2]
    date_part = timestamp[:8]    # "YYYYMMDD"
    time_part = timestamp[9:]    # "h:mm" or "hh:mm", after the separator

    year = int(date_part[:4])
    assert not (year < 2012 or year > 2013), "Invalid year %i" %year

    month = int(date_part[4:6])
    assert not (month < 1 or month > 12), "Invalid month %i" %month

    # Only 2012 and 2013 get this far, so divisibility by 4 identifies the leap year.
    lengths = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
    if year % 4 == 0:
        lengths[1] += 1

    day = int(date_part[6:8])
    assert not (day < 1 or day > lengths[month - 1]), "Invalid day of month %i" %day

    # Add up the months that have fully elapsed, then the day within this month.
    elapsed = 0
    for month_length in lengths[:month - 1]:
        elapsed += month_length
    day_of_year = elapsed + day

    fields = time_part.split(":")
    hour = int(fields[0])
    assert not (hour < 0 or hour >= 24), "Invalid hour %i" %hour
    # The minute is checked for validity only; it is not returned.
    minute = int(fields[1])
    assert not (minute < 0 or minute >= 60), "Invalid minute %i" %minute

    outrow = [row[0], row[1], year, day_of_year, hour]
    outrow.extend([row[3], row[4], row[5], row[6]])
    return outrow


# Load the raw test observations (comma-separated, first line is the header).
ttdf0         = pd.read_csv("data/Test_uP7dymh.csv", sep=',', header=0)

# transformDFtt returns 9 values per row but the file only has 7 columns.
# Pad the frame with two placeholder columns so that the raw row-wise apply
# below has one slot for every returned value.
ttdf0['new1'] = ttdf0['V100']
ttdf0['new2'] = ttdf0['V100']

# `broadcast` and `reduce` used to be passed here at their default values; both
# keywords have been removed from DataFrame.apply, so they are omitted.
ttdf1         = ttdf0.apply(transformDFtt, axis=1, raw=True)

# The result still carries ttdf0's column labels although transformDFtt reorders
# the values, so every column is relabelled by position:
#   Id, ZoneId, Year, DayOfYear, Hour, U10, V10, U100, V100
ttdf1         = ttdf1.rename(columns={'ID':'Id','ZONEID':'ZoneId','TIMESTAMP':'Year','U10':'DayOfYear',
                                      'V10':'Hour','U100':'U10','V100':'V10','new1':'U100','new2':'V100'})
ttdf1['Id']        = ttdf1['Id'].astype(int)
ttdf1['ZoneId']    = ttdf1['ZoneId'].astype(int).astype("category")
ttdf1['Year']      = ttdf1['Year'].astype(int)
ttdf1['DayOfYear'] = ttdf1['DayOfYear'].astype(int)
ttdf1['Hour']      = ttdf1['Hour'].astype(int)

# Dummy-encode ZoneId exactly as for the training frame and drop the identifier.
# axis is passed by keyword because the positional form is no longer accepted
# by recent pandas releases.
ttdf2 = pd.get_dummies(ttdf1,drop_first=True)
ttdf2 = ttdf2.drop('Id', axis=1)

ttid  = ttdf1['Id'].tolist()
# Select the columns in the training feature order. DataFrame.as_matrix has been
# removed from pandas; column selection plus .values yields the same array.
ttX   = ttdf2[features].values

# Count how many observations we have in August and September 2013
# (days 213 through 273 of a non-leap year).
day_of_year = ttdf1['DayOfYear']
nAugSep2013 = int(((day_of_year > 212) & (day_of_year < 274)).sum())
print('\nNumber of observations in August and September 2013: %i\n' %nAugSep2013)

print('\nDataframe df0:\n%s' %ttdf0.head())
print('\nDataframe df1:\n%s' %ttdf1.head())
print('\nData Types in df1:\n%s' %ttdf1.dtypes)
print('\nLength of df0: %i, df1: %i' %(len(ttdf0),len(ttdf1)))
print('\nDataframe df2:\n%s' %ttdf2.head())
print('\nData Types in df2:\n%s' %ttdf2.dtypes)

# Compute predictions for test set.
tty = rfr.predict(ttX)

# Generate output csv file: one row per observation Id with its predicted output.
with open('data/turbineOutputs.csv', 'w') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["ID","TARGETVAR"])
    for idval,yval in zip(ttid,tty):
        writer.writerow([idval,yval])


Number of observations in August and September 2013: 14640


Dataframe df0:
         ID  ZONEID      TIMESTAMP       U10       V10      U100      V100  \
0  30010001       1  20130801 0:00  0.566472 -3.866858  0.935726 -5.888026   
1  30010101       1  20130801 1:00  2.513531 -4.475081  3.426426 -6.415826   
2  30010201       1  20130801 2:00  2.730166 -4.752177  3.915867 -6.847804   
3  30010301       1  20130801 3:00  2.461083 -4.583656  3.541562 -6.545063   
4  30010401       1  20130801 4:00  2.146012 -4.407169  3.158082 -6.276412   

       new1      new2  
0 -5.888026 -5.888026  
1 -6.415826 -6.415826  
2 -6.847804 -6.847804  
3 -6.545063 -6.545063  
4 -6.276412 -6.276412  

Dataframe df1:
         Id ZoneId  Year  DayOfYear  Hour       U10       V10      U100  \
0  30010001      1  2013        213     0  0.566472 -3.866858  0.935726   
1  30010101      1  2013        213     1  2.513531 -4.475081  3.426426   
2  30010201      1  2013        213     2  2.730166 -4.752177  3.9158