In [1]:
import pandas as pd
import numpy as np
import random

from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score

%matplotlib inline

In [2]:
def create_linear_model(x, y):
    model = linear_model.LinearRegression()
    model.fit(x, y)
    print('Coef:', model.coef_)
    print('Intercept:', model.intercept_)
    print('r2:', r2_score(y, model.predict(x)))
    return model

In [3]:
df = pd.read_csv('INPI.csv', sep=';', decimal=',')

In [4]:
df.head()

Unnamed: 0,ANO,MES,DIA,HORA,TempAr
0,2009,1,1,0,26.5
1,2009,1,1,300,26.0
2,2009,1,1,600,25.5
3,2009,1,1,900,24.5
4,2009,1,1,1200,28.0


In [5]:
def create_plus_3_minus_3_features(df, columns='TempAr'):
    for i in range(1,4):
        t_menos_1 = 'temp_t_{}'.format(i)
        df[t_menos_1] = df[columns].shift(i)
        t_mais_1 = 'temp_tm{}'.format(i)
        df[t_mais_1] = df[columns].shift(-i)

In [6]:
def create_month_hour_features(df, columns='HORA'):
    for i in range(3,22,3):
        desc_h = 'h{}'.format(i*100)
        df[desc_h] = df[columns].apply(lambda x: 1 if x == i*100 else 0)
    for i in range(1,12):
        desc_m = 'm{}'.format(i)
        df[desc_m] = df.MES.apply(lambda x: 1 if x == i else 0)

In [7]:
create_plus_3_minus_3_features(df)
create_month_hour_features(df)
df.head()

Unnamed: 0,ANO,MES,DIA,HORA,TempAr,temp_t_1,temp_tm1,temp_t_2,temp_tm2,temp_t_3,...,m2,m3,m4,m5,m6,m7,m8,m9,m10,m11
0,2009,1,1,0,26.5,,26.0,,25.5,,...,0,0,0,0,0,0,0,0,0,0
1,2009,1,1,300,26.0,26.5,25.5,,24.5,,...,0,0,0,0,0,0,0,0,0,0
2,2009,1,1,600,25.5,26.0,24.5,26.5,28.0,,...,0,0,0,0,0,0,0,0,0,0
3,2009,1,1,900,24.5,25.5,28.0,26.0,30.0,26.5,...,0,0,0,0,0,0,0,0,0,0
4,2009,1,1,1200,28.0,24.5,30.0,25.5,33.5,26.0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
df_clean = df.dropna()

In [9]:
features_col = df_clean.columns[5:]
features_col

Index(['temp_t_1', 'temp_tm1', 'temp_t_2', 'temp_tm2', 'temp_t_3', 'temp_tm3',
       'h300', 'h600', 'h900', 'h1200', 'h1500', 'h1800', 'h2100', 'm1', 'm2',
       'm3', 'm4', 'm5', 'm6', 'm7', 'm8', 'm9', 'm10', 'm11'],
      dtype='object')

In [10]:
x = df_clean[features_col].values

In [11]:
y = df_clean.TempAr.values.reshape(-1,1)

In [12]:
model_a = create_linear_model(x, y)

Coef: [[ 0.4239627   0.42548433  0.01394948 -0.00756265 -0.00674852 -0.00647283
   2.56432431  2.32529206  1.60977293  1.36144786  4.65997421  5.06012275
   5.53154187  0.12680418  0.06720489  0.01427058  0.03094921 -0.09795508
  -0.58584962 -0.35924795 -0.05948228  0.10192722  0.29717947  0.21543466]]
Intercept: [1.23305537]
r2: 0.7924057732761738


In [13]:
erro = y - model_a.predict(x)
df_clean['e'] = erro

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [14]:
def create_error_feature(df):
    for i in range(1,2):
        e_menos_1 = 'e_{}'.format(i)
        df[e_menos_1] = df.e.shift(i)
        e_mais_1 = 'em{}'.format(i)
        df[e_mais_1] = df.e.shift(-i)

In [15]:
create_error_feature(df_clean)
df_clean = df_clean.dropna()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [16]:
df_clean.drop(columns='e', inplace=True)
df_clean.head()

Unnamed: 0,ANO,MES,DIA,HORA,TempAr,temp_t_1,temp_tm1,temp_t_2,temp_tm2,temp_t_3,...,m4,m5,m6,m7,m8,m9,m10,m11,e_1,em1
4,2009,1,1,1200,28.0,24.5,30.0,25.5,33.5,26.0,...,0,0,0,0,0,0,0,0,-0.934374,-1.877967
5,2009,1,1,1500,30.0,28.0,33.5,24.5,35.0,25.5,...,0,0,0,0,0,0,0,0,2.426724,-0.380885
6,2009,1,1,1800,33.5,30.0,35.0,28.0,26.5,24.5,...,0,0,0,0,0,0,0,0,-1.877967,2.770236
7,2009,1,1,2100,35.0,33.5,26.5,30.0,27.0,28.0,...,0,0,0,0,0,0,0,0,-0.380885,-1.096815
8,2009,1,2,0,26.5,35.0,27.0,33.5,25.5,30.0,...,0,0,0,0,0,0,0,0,2.770236,1.108812


In [17]:
features_col = df_clean.columns[5:]
features_col

Index(['temp_t_1', 'temp_tm1', 'temp_t_2', 'temp_tm2', 'temp_t_3', 'temp_tm3',
       'h300', 'h600', 'h900', 'h1200', 'h1500', 'h1800', 'h2100', 'm1', 'm2',
       'm3', 'm4', 'm5', 'm6', 'm7', 'm8', 'm9', 'm10', 'm11', 'e_1', 'em1'],
      dtype='object')

In [18]:
x1 = df_clean[features_col].values
y1 = df_clean.TempAr.values.reshape(-1,1)

**Modelo: T(+1,+2,3,-1,-2,-3)e(+1,-1)**

In [19]:
model_b = create_linear_model(x1,y1)

Coef: [[ 1.17233939e+00  1.07167509e+00 -4.99166815e-01 -4.57956603e-01
   1.14811986e-03  2.74559791e-02  3.15389305e+00  2.65756295e+00
   4.67208613e+00 -1.35207032e+00  1.62662066e+00 -3.62628062e+00
   7.46502381e-01 -2.83205625e-01 -1.30825959e-01 -2.51142181e-02
  -3.44485532e-02  1.85153973e-01  1.19585092e+00  5.83327512e-01
   6.47707450e-02 -2.09924449e-01 -5.94190966e-01 -4.26435852e-01
  -1.17775325e+00 -1.06750803e+00]]
Intercept: [-9.25177253]
r2: 0.9795167201258269


In [20]:
def create_model(df, columns):
    x = df[columns].values
    y = df.TempAr.values.reshape(-1,1)
    return create_linear_model(x, y)

**Modelo: T(+1, +2,-1,-2,-3) e(+1,-1)**

In [21]:
features_col_c = [col for col in features_col if col not in ['temp_tm3']]
model_c = create_model(df_clean, features_col_c)

Coef: [[ 1.18359154e+00  1.05813662e+00 -5.03339512e-01 -4.36052446e-01
   4.33027166e-03  3.26821800e+00  2.84322596e+00  4.75256813e+00
  -1.25683184e+00  1.60564452e+00 -3.61060640e+00  7.43295961e-01
  -2.72165291e-01 -1.25268730e-01 -2.35270030e-02 -3.16536290e-02
   1.77563494e-01  1.16732181e+00  5.67280954e-01  5.88739261e-02
  -2.02886851e-01 -5.75637614e-01 -4.12623408e-01 -1.18760856e+00
  -1.05424427e+00]]
Intercept: [-9.08058007]
r2: 0.9792861331125733


**Modelo: T(+1,-1,-2,-3) e(+1,-1)**

In [22]:
features_col_c = [col for col in features_col if col not in ['temp_tm3', 'temp_tm2']]
model_d = create_model(df_clean, features_col_c)

Coef: [[ 1.78501019e+00  2.91963755e-01 -7.62095590e-01  8.27736908e-04
   3.85083500e+00  5.05723842e+00  7.77322282e+00  2.33773845e+00
   7.74534295e-01 -1.14551956e+00  5.30626783e+00 -2.84140500e-01
  -1.34508020e-01 -2.46501060e-02 -5.48132788e-02  1.87788402e-01
   1.18370415e+00  6.38366369e-01  6.38021269e-02 -2.41322577e-01
  -5.94748960e-01 -4.28822220e-01 -1.78016899e+00 -2.79227766e-01]]
Intercept: [-11.26440355]
r2: 0.9636606492758943


**Modelo: T(+1, -1,-2,-3) e(-1)**

In [23]:
features_col_c = [col for col in features_col if col not in ['temp_tm3', 'temp_tm2', 'em1']]
model_e = create_model(df_clean, features_col_c)

Coef: [[ 1.99496957  0.10899807 -0.84861127  0.00970545  5.00632301  6.11698222
   8.9600348   4.2182304   0.37966986 -0.75069698  7.06559101 -0.22545756
  -0.10938157 -0.01864733 -0.04818042  0.14519691  1.0074665   0.56201241
   0.02291646 -0.21548694 -0.49722359 -0.35663923 -1.98786959]]
Intercept: [-10.82169763]
r2: 0.9580958839513823


**Modelo: T(-1,-2,-3) e(-1)**

In [24]:
features_col_c = [col for col in features_col if col not in ['temp_tm3', 'temp_tm2', 'em1', 'temp_tm1']]
model_f = create_model(df_clean, features_col_c)

Coef: [[ 2.12595331e+00 -9.06938379e-01  1.93608079e-02  5.68152690e+00
   6.72263791e+00  9.66826398e+00  5.33489605e+00  1.29873080e-01
  -5.76600109e-01  8.09428035e+00 -1.95603099e-01 -9.36090751e-02
  -1.57787416e-02 -4.01184847e-02  1.19695612e-01  9.22077326e-01
   5.18781760e-01  9.22056513e-03 -2.01045778e-01 -4.47412588e-01
  -3.17127121e-01 -2.11601469e+00]]
Intercept: [-10.63325164]
r2: 0.954341767074941


**Modelo: T(+1, +2,+3,-1,-2,) e(+1,-1)**

In [25]:
features_col_c = [col for col in features_col if col not in ['temp_t_3']]
model_g = create_model(df_clean, features_col_c)

Coef: [[ 1.17212323  1.0718757  -0.49842395 -0.45802647  0.0275854   3.15021121
   2.65986027  4.6734218  -1.35760573  1.62253479 -3.62959351  0.74170286
  -0.28283646 -0.1306919  -0.02505572 -0.03449029  0.18492147  1.1944007
   0.58242475  0.06455263 -0.20967607 -0.59364985 -0.42590261 -1.17756428
  -1.06766337]]
Intercept: [-9.24007535]
r2: 0.979516327970856


**Modelo: T(+1, +2,+3,-1,) e(+1,-1) **

In [26]:
features_col_c = [col for col in features_col if col not in ['temp_t_3', 'temp_t_2']]
model_h = create_model(df_clean, features_col_c)

Coef: [[ 0.30245248  1.71196469 -0.73835747  0.03701152  2.35120339  0.43003397
   0.54816768 -4.47568941  3.51884462 -3.38472991 -2.66234007 -0.30311404
  -0.13295083 -0.0360171  -0.01649147  0.19006063  1.18713354  0.531881
   0.09631595 -0.1853193  -0.581638   -0.43503153 -0.3086326  -1.70888871]]
Intercept: [-7.73815768]
r2: 0.9608491900323092


**Modelo: T(+1, +2,+3,-1) e(+1)**

In [27]:
features_col_c = [col for col in features_col if col not in ['temp_t_3', 'temp_t_2', 'e_1']]
model_i = create_model(df_clean, features_col_c)

Coef: [[ 9.48959626e-02  1.93978771e+00 -8.35532187e-01  5.18948258e-02
   9.88941568e-01 -1.22739398e+00 -7.61510799e-01 -6.63728291e+00
   3.79031878e+00 -4.45937099e+00 -4.78973392e+00 -2.43237138e-01
  -1.05205026e-01 -2.32316769e-02 -3.64376902e-03  1.56920999e-01
   9.50142406e-01  3.69621685e-01  7.95091887e-02 -1.26636947e-01
  -4.74957477e-01 -3.41734970e-01 -1.94036118e+00]]
Intercept: [-4.93885922]
r2: 0.9540170739143107


**Modelo: T(+1, +2,+3) e(+1) **

In [28]:
features_col_c = [col for col in features_col if col not in ['temp_t_3', 'temp_t_2', 'e_1', 'temp_t_1']]
model_j = create_model(df_clean, features_col_c)

Coef: [[ 2.05330558e+00 -8.86605496e-01  6.25870957e-02  3.41797162e-01
  -2.01921987e+00 -1.37190093e+00 -7.67876914e+00  3.90161566e+00
  -5.03215973e+00 -5.83720686e+00 -2.20729624e-01 -9.46509162e-02
  -1.62412362e-02  3.68457155e-03  1.45137554e-01  8.69556244e-01
   3.09813412e-01  7.73517890e-02 -1.03236564e-01 -4.40386234e-01
  -3.04922841e-01 -2.05436356e+00]]
Intercept: [-3.79641431]
r2: 0.9513268470264369


In [29]:
import xgboost as xgb

In [30]:
model_xgb = xgb.XGBRegressor()

In [31]:
df_clean2 = df.dropna()
features_col2 = df_clean2.columns[5:]
features_col2

Index(['temp_t_1', 'temp_tm1', 'temp_t_2', 'temp_tm2', 'temp_t_3', 'temp_tm3',
       'h300', 'h600', 'h900', 'h1200', 'h1500', 'h1800', 'h2100', 'm1', 'm2',
       'm3', 'm4', 'm5', 'm6', 'm7', 'm8', 'm9', 'm10', 'm11'],
      dtype='object')

In [40]:
x = df_clean2[features_col2].values
y = df_clean2.TempAr.values.reshape(-1,1)

In [41]:
model_xgb.fit(x, y)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

In [42]:
r2_score(y, model_xgb.predict(x))

0.8699708876142914