In [1]:
import pandas as pd
import numpy as np
import zipfile

import warnings
warnings.simplefilter('ignore')

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline

from sklearn.datasets import make_circles, load_boston
from sklearn.model_selection import train_test_split as tts

from sklearn.linear_model import LinearRegression as LinReg
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import SGDRegressor as SGDR

from sklearn.svm import SVR
from sklearn.tree import ExtraTreeRegressor as ETR
from sklearn.neighbors import KNeighborsRegressor as KNNR
from sklearn.ensemble import GradientBoostingRegressor as GBR
 


In [2]:
zip = zipfile.ZipFile('solar-energy-prediction-datamex0320.zip')

In [3]:
traindf = pd.read_csv(zip.open('solar_train.csv'))

In [4]:
testdf = pd.read_csv(zip.open('solar_test.csv'))

In [5]:
example = pd.read_csv(zip.open('submission_example.csv'))

In [6]:
traindf.head()

Unnamed: 0,UNIXTime,Data,Time,Radiation,Temperature,Pressure,Humidity,WindDirection(Degrees),Speed,TimeSunRise,TimeSunSet
0,1475175023,9/29/2016 12:00:00 AM,08:50:23,634.99,61,30.46,41,14.96,6.75,06:13:00,18:13:00
1,1481799902,12/15/2016 12:00:00 AM,01:05:02,1.27,37,30.26,70,207.43,5.62,06:50:00,17:46:00
2,1478339417,11/4/2016 12:00:00 AM,23:50:17,1.21,47,30.49,33,168.2,5.62,06:25:00,17:47:00
3,1472887208,9/2/2016 12:00:00 AM,21:20:08,1.67,54,30.46,101,152.6,3.37,06:07:00,18:37:00
4,1478724901,11/9/2016 12:00:00 AM,10:55:01,839.78,62,30.47,36,291.95,7.87,06:28:00,17:45:00


In [7]:
testdf.head()

Unnamed: 0,id,UNIXTime,Data,Time,Temperature,Pressure,Humidity,WindDirection(Degrees),Speed,TimeSunRise,TimeSunSet
0,0,1478720107,11/9/2016 12:00:00 AM,09:35:07,59,30.47,44,312.67,3.37,06:28:00,17:45:00
1,1,1474063503,9/16/2016 12:00:00 AM,12:05:03,59,30.48,83,38.01,6.75,06:10:00,18:25:00
2,2,1476109221,10/10/2016 12:00:00 AM,04:20:21,47,30.39,78,213.62,5.62,06:16:00,18:03:00
3,3,1481475056,12/11/2016 12:00:00 AM,06:50:56,45,30.4,98,176.63,4.5,06:47:00,17:44:00
4,4,1477493117,10/26/2016 12:00:00 AM,04:45:17,45,30.4,34,175.89,6.75,06:21:00,17:52:00


In [8]:
example.head()

Unnamed: 0,id,Radiation
0,0,1041.317337
1,1,1064.230693
2,2,610.828947
3,3,602.742249
4,4,562.312327


In [None]:
traindf.dtypes

In [None]:
testdf.dtypes

In [None]:
example.dtypes

In [None]:
traindf.info()

In [None]:
testdf.info()

In [None]:
example.info()

In [None]:
traindf.describe()

In [None]:
testdf.describe()

In [None]:
example.describe()

In [9]:
testdf.columns

Index(['id', 'UNIXTime', 'Data', 'Time', 'Temperature', 'Pressure', 'Humidity',
       'WindDirection(Degrees)', 'Speed', 'TimeSunRise', 'TimeSunSet'],
      dtype='object')

In [10]:
testdf = testdf.drop(columns = ['UNIXTime', 'TimeSunRise', 'TimeSunSet'])

In [11]:
testdf.columns

Index(['id', 'Data', 'Time', 'Temperature', 'Pressure', 'Humidity',
       'WindDirection(Degrees)', 'Speed'],
      dtype='object')

In [12]:
testdf.dtypes

id                          int64
Data                       object
Time                       object
Temperature                 int64
Pressure                  float64
Humidity                    int64
WindDirection(Degrees)    float64
Speed                     float64
dtype: object

In [13]:
testdf['Data'] = pd.to_datetime(testdf['Data'])

In [14]:
testdf.dtypes

id                                 int64
Data                      datetime64[ns]
Time                              object
Temperature                        int64
Pressure                         float64
Humidity                           int64
WindDirection(Degrees)           float64
Speed                            float64
dtype: object

In [15]:
testdf['Data'] = pd.to_numeric(testdf.Data, errors = 'coerce').astype(float)

In [16]:
testdf.dtypes

id                          int64
Data                      float64
Time                       object
Temperature                 int64
Pressure                  float64
Humidity                    int64
WindDirection(Degrees)    float64
Speed                     float64
dtype: object

In [17]:
testdf['Time'] = pd.to_datetime(testdf['Time'])

In [18]:
testdf['Time'] = pd.to_numeric(testdf.Time, errors = 'coerce').astype(float)

In [19]:
testdf.dtypes

id                          int64
Data                      float64
Time                      float64
Temperature                 int64
Pressure                  float64
Humidity                    int64
WindDirection(Degrees)    float64
Speed                     float64
dtype: object

In [20]:
testdf.Temperature = testdf.Temperature.astype(float)
testdf.Humidity = testdf.Humidity.astype(float)

In [21]:
testdf.dtypes

id                          int64
Data                      float64
Time                      float64
Temperature               float64
Pressure                  float64
Humidity                  float64
WindDirection(Degrees)    float64
Speed                     float64
dtype: object

In [22]:
test = testdf.drop(columns = ['id'])

In [23]:
test.head()

Unnamed: 0,Data,Time,Temperature,Pressure,Humidity,WindDirection(Degrees),Speed
0,1.47865e+18,1.588931e+18,59.0,30.47,44.0,312.67,3.37
1,1.473984e+18,1.58894e+18,59.0,30.48,83.0,38.01,6.75
2,1.476058e+18,1.588912e+18,47.0,30.39,78.0,213.62,5.62
3,1.481414e+18,1.588921e+18,45.0,30.4,98.0,176.63,4.5
4,1.47744e+18,1.588913e+18,45.0,30.4,34.0,175.89,6.75


In [24]:
traindf.head()

Unnamed: 0,UNIXTime,Data,Time,Radiation,Temperature,Pressure,Humidity,WindDirection(Degrees),Speed,TimeSunRise,TimeSunSet
0,1475175023,9/29/2016 12:00:00 AM,08:50:23,634.99,61,30.46,41,14.96,6.75,06:13:00,18:13:00
1,1481799902,12/15/2016 12:00:00 AM,01:05:02,1.27,37,30.26,70,207.43,5.62,06:50:00,17:46:00
2,1478339417,11/4/2016 12:00:00 AM,23:50:17,1.21,47,30.49,33,168.2,5.62,06:25:00,17:47:00
3,1472887208,9/2/2016 12:00:00 AM,21:20:08,1.67,54,30.46,101,152.6,3.37,06:07:00,18:37:00
4,1478724901,11/9/2016 12:00:00 AM,10:55:01,839.78,62,30.47,36,291.95,7.87,06:28:00,17:45:00


In [25]:
traindf['Data'] = pd.to_datetime(traindf['Data'])
traindf['Data'] = pd.to_numeric(traindf.Data, errors = 'coerce'). astype(float)

In [26]:
traindf['Time'] = pd.to_datetime(traindf['Time'])
traindf['Time'] = pd.to_numeric(traindf.Time, errors = 'coerce'). astype(float)

In [27]:
traindf = traindf.drop(columns = ['UNIXTime'])
traindf = traindf.drop(columns = ['TimeSunRise'])
traindf = traindf.drop(columns = ['TimeSunSet'])

In [28]:
traindf.Temperature = traindf.Temperature.astype(float)
traindf.Humidity = traindf.Humidity.astype(float)

In [29]:
traindf.shape

(24514, 8)

In [30]:
traindf.corr()

Unnamed: 0,Data,Time,Radiation,Temperature,Pressure,Humidity,WindDirection(Degrees),Speed
Data,1.0,-0.009954,-0.081989,-0.371565,-0.332569,-0.062247,0.157208,0.174429
Time,-0.009954,1.0,0.009963,0.200672,0.089837,0.084771,-0.082111,-0.057691
Radiation,-0.081989,0.009963,1.0,0.736029,0.116763,-0.226432,-0.238388,0.07458
Temperature,-0.371565,0.200672,0.736029,1.0,0.310439,-0.281192,-0.27007,-0.032578
Pressure,-0.332569,0.089837,0.116763,0.310439,1.0,-0.22248,-0.229222,-0.082508
Humidity,-0.062247,0.084771,-0.226432,-0.281192,-0.22248,1.0,0.001631,-0.211095
WindDirection(Degrees),0.157208,-0.082111,-0.238388,-0.27007,-0.229222,0.001631,1.0,0.071282
Speed,0.174429,-0.057691,0.07458,-0.032578,-0.082508,-0.211095,0.071282,1.0


In [31]:
traindf.isnull().sum()

Data                      0
Time                      0
Radiation                 0
Temperature               0
Pressure                  0
Humidity                  0
WindDirection(Degrees)    0
Speed                     0
dtype: int64

In [32]:
X = traindf.drop('Radiation', axis = 1)
y = traindf.Radiation

In [33]:
X_train, X_test, y_train, y_test = tts(X, y)

In [34]:
linreg = LinReg(normalize=True)
linreg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True)

In [35]:
train_score = linreg.score(X_train, y_train)
test_score = linreg.score(X_test, y_test)
print(train_score, test_score)

0.622581640300351 0.6177815745075821


In [38]:
gbr = GBR()
gbr.fit(X_train, y_train)

train_score = gbr.score(X_train, y_train)
test_score = gbr.score(X_test, y_test)

print('train R2:', train_score, '-- test R2:', test_score)

train R2: 0.8932905486157516 -- test R2: 0.8840535995728795


In [39]:
knnr = KNNR(n_neighbors = 7)
knnr.fit(X_train, y_train)

train_score = knnr.score(X_train, y_train) 
test_score = knnr.score(X_test, y_test)

print('train R2:', train_score, '-- test R2:', test_score)

train R2: 0.9568800239748587 -- test R2: 0.941473794872581


In [None]:
from sklearn.metrics import mean_squared_error as Mse

In [None]:
y_pred = gbr.predict(test)

In [40]:
y_pred = knnr.predict(test)

In [41]:
Radiation = pd.DataFrame(y_pred)

In [42]:
Radiation.rename(columns={list(Radiation)[0]:'Radiation'}, inplace=True)
Radiation.columns

Index(['Radiation'], dtype='object')

In [43]:
df = pd.DataFrame(columns=[testdf['id'], Radiation['Radiation']])
df.head()

id,0,1,2,3,4,5,6,7,8,9,...,8162,8163,8164,8165,8166,8167,8168,8169,8170,8171
Radiation,648.957143,975.930000,1.227143,10.392857,1.210000,536.008571,1.284286,1.232857,628.352857,129.708571,...,1.224286,3.112857,831.054286,1.205714,1.235714,505.945714,1.231429,13.090000,1.217143,894.887143


In [44]:
df = df.T

In [45]:
df

id,Radiation
0,648.957143
1,975.930000
2,1.227143
3,10.392857
4,1.210000
...,...
8167,505.945714
8168,1.231429
8169,13.090000
8170,1.217143


In [46]:
df.reset_index(inplace=True)
df

Unnamed: 0,id,Radiation
0,0,648.957143
1,1,975.930000
2,2,1.227143
3,3,10.392857
4,4,1.210000
...,...,...
8167,8167,505.945714
8168,8168,1.231429
8169,8169,13.090000
8170,8170,1.217143


In [47]:
df.to_csv('predicts2.csv', index=False)

In [48]:
df1 = pd.read_csv('predicts2.csv')
df1

Unnamed: 0,id,Radiation
0,0,648.957143
1,1,975.930000
2,2,1.227143
3,3,10.392857
4,4,1.210000
...,...,...
8167,8167,505.945714
8168,8168,1.231429
8169,8169,13.090000
8170,8170,1.217143


In [None]:
knnr = KNNR(n_neighbors = 7)
knnr.fit(X_train, y_train)

train_score = knnr.score(X_train, y_train) 
test_score = knnr.score(X_test, y_test)

print('train R2:', train_score, '-- test R2:', test_score)

In [None]:
y2_pred = knnr.predict(test)

In [None]:
Radiation2 = pd.DataFrame(y2_pred)

In [None]:
Radiation2.rename(columns={list(Radiation2)[0]:'Radiation'}, inplace=True)
Radiation2.columns

In [None]:
df2 = pd.DataFrame(columns=[testdf['id'], Radiation['Radiation']])
df2.head()

In [None]:
df2 = df2.T

In [None]:
df2

In [None]:
df2.reset_index(inplace=True)
df2

In [None]:
df2 = df2.drop(columns = ['index']) 

In [None]:
df2

In [None]:
df1

In [None]:
df2.to_csv('predicts.csv', index=False)

In [None]:
df3 = pd.read_csv('predicts.csv')
df3