In [3]:
import pandas as pd
import sklearn.metrics as metrics
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
import numpy as np

In [6]:
# Load the clock-speed CSV, shorten the verbose measurement column to 'Pulses',
# and index the rows by year, all in one chained expression.
data = (
    pd.read_csv("../../microprocessor-clock-speed.csv")
    .rename(columns={'Microprocessor clock speed (Hertz (pulses per second))': 'Pulses'})
    .set_index('Year')
)
data.head()

Unnamed: 0_level_0,_id,Entity,Code,Pulses
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1976,5f1a1d0557388108b2fc21be,World,OWID_WRL,1350000
1977,5f1a1d0557388108b2fc21bf,World,OWID_WRL,2060000
1978,5f1a1d0557388108b2fc21c0,World,OWID_WRL,2140000
1979,5f1a1d0557388108b2fc21c1,World,OWID_WRL,2290000
1980,5f1a1d0557388108b2fc21c2,World,OWID_WRL,1940000


In [7]:
def regression_results(y_true, y_pred):
    """Print a summary of regression metrics for a set of predictions.

    Reports explained variance, mean squared log error, R^2, MAE, MSE and
    RMSE, each rounded to four decimal places.

    Args:
        y_true: Array-like of observed target values.
        y_pred: Array-like of predicted target values, aligned with ``y_true``.

    Returns:
        None. The metrics are written to stdout.

    Note:
        Errors raised by the underlying ``sklearn.metrics`` functions are
        propagated unchanged (presumably ``mean_squared_log_error`` rejects
        negative values; confirm against the installed scikit-learn version).
    """
    explained_variance = metrics.explained_variance_score(y_true, y_pred)
    mean_absolute_error = metrics.mean_absolute_error(y_true, y_pred)
    mse = metrics.mean_squared_error(y_true, y_pred)
    mean_squared_log_error = metrics.mean_squared_log_error(y_true, y_pred)
    r2 = metrics.r2_score(y_true, y_pred)
    # The median absolute error used to be computed here but was never
    # reported; the dead computation has been dropped.
    print('explained_variance: ', round(explained_variance,4))    
    print('mean_squared_log_error: ', round(mean_squared_log_error,4))
    print('r2: ', round(r2,4))
    print('MAE: ', round(mean_absolute_error,4))
    print('MSE: ', round(mse,4))
    # RMSE is derived from the MSE so both figures stay consistent.
    print('RMSE: ', round(np.sqrt(mse),4))

In [8]:
# Build lag features from the target series.
# The explicit .copy() detaches the single-column selection from the loaded
# frame, so the column assignments below write to an independent object rather
# than to a slice (the pattern that triggers pandas' SettingWithCopyWarning).
data = data[['Pulses']].copy()
# 't-1' is the previous row's Pulses; 't-1_Diff' is the row-over-row change of that lag.
data['t-1'] = data['Pulses'].shift()
data['t-1_Diff'] = data['t-1'].diff()
# 't-2' is the value two rows back; 't-2_Diff' is the row-over-row change of that lag.
data['t-2'] = data['t-1'].shift()
data['t-2_Diff'] = data['t-2'].diff()
# The shifts/diffs leave NaN in the leading rows; drop them so every row has all features.
# NOTE: this cell rebinds `data`, so re-running it without reloading trims further rows.
data = data.dropna()

In [9]:
data.head()

Unnamed: 0_level_0,Pulses,t-1,t-1_Diff,t-2,t-2_Diff
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1979,2290000,2140000.0,80000.0,2060000.0,710000.0
1980,1940000,2290000.0,150000.0,2140000.0,80000.0
1981,2410000,1940000.0,-350000.0,2290000.0,150000.0
1982,2630000,2410000.0,470000.0,1940000.0,-350000.0
1983,4070000,2630000.0,220000.0,2410000.0,470000.0


In [10]:
# Target is the current row's Pulses; every remaining (lag) column is a feature.
y = data['Pulses']
X = data.drop(columns=['Pulses'])

In [11]:
# Hold out a test set; random_state=1 makes the split repeatable.
# NOTE(review): the held-out rows come out in non-chronological Year order, i.e. the
# split shuffles the series. The later TimeSeriesSplit cross-validation assumes
# time-ordered rows — confirm that a shuffled split is intended here.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [12]:
def rmse(actual, predict):
    """Return the root-mean-squared error between two array-likes.

    Args:
        actual: Observed values.
        predict: Predicted values, aligned element-wise with ``actual``.

    Returns:
        The square root of the mean squared difference, as a numpy scalar.
    """
    residuals = np.array(predict) - np.array(actual)
    return np.sqrt((residuals ** 2).mean())
# Wrap rmse as a scorer object for model selection. greater_is_better=False marks it
# as a loss (lower is better); presumably the scorer reports the negated RMSE so that
# "higher is better" still holds inside the search — confirm against scikit-learn docs.
rmse_score = make_scorer(rmse, greater_is_better = False)

In [13]:
# Seed the forest so the grid search and the reported metrics are reproducible
# across kernel restarts (the unseeded forest gave different numbers on every run).
model = RandomForestRegressor(random_state=1)
param_search = {
    'n_estimators': [20, 50, 100],
    # 1.0 means "consider every feature at each split", which is what 'auto' meant
    # for a regressor. 'auto' is deprecated and then removed in recent scikit-learn
    # releases, so the float form keeps this grid valid on old and new versions.
    'max_features': [1.0, 'sqrt', 'log2'],
    'max_depth': list(range(5, 15)),
}
# Expanding-window cross-validation over the training rows.
# NOTE(review): X_train comes from a shuffled split, so its rows are not in Year
# order — confirm whether the training rows should be sorted before this CV.
tscv = TimeSeriesSplit(n_splits=10)
gsearch = GridSearchCV(estimator=model, cv=tscv, param_grid=param_search, scoring=rmse_score)
gsearch.fit(X_train, y_train)
# rmse_score is registered with greater_is_better=False, so this is a loss-style score.
best_score = gsearch.best_score_
best_model = gsearch.best_estimator_
# Evaluate the selected estimator on the held-out rows.
y_true = y_test.values
y_pred = best_model.predict(X_test)
regression_results(y_true, y_pred)

explained_variance:  0.9113
mean_squared_log_error:  1.3161
r2:  0.9021
MAE:  446954250.0
MSE:  4.432480159782499e+17
RMSE:  665768740.6136


In [14]:
# Hand-built feature row for 2017, the first year after the data ends.
# 't-1' is the 2016 Pulses value and 't-2' is the 2016 't-1' value; each *_Diff is
# the change leading into the corresponding lag (28751e6 - 19348e6 and 19348e6 - 11511e6).
next_value = {
    'Year':2017,
    't-1':28751000000,
    't-1_Diff':9403000000,
    't-2':19348000000,
    't-2_Diff':7837000000
}

In [15]:
# Turn the feature dict into a one-row frame keyed by Year, mirroring the training layout.
# Note that this rebinds next_value from a dict to a DataFrame.
next_value = pd.DataFrame(next_value, index=['0']).set_index('Year')
next_value

Unnamed: 0_level_0,t-1,t-1_Diff,t-2,t-2_Diff
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017,28751000000,9403000000,19348000000,7837000000


In [16]:
# Predict Pulses for the hand-built 2017 feature row using the tuned estimator.
best_model.predict(next_value)

array([2.561655e+10])

In [17]:
import pickle

In [18]:
filename = 'pulses.sav'
# Persist the tuned estimator. The context manager guarantees the file handle is
# flushed and closed even if pickling fails (the bare open() call leaked it).
with open(filename, 'wb') as model_file:
    pickle.dump(best_model, model_file)


In [21]:
X_test.head(20)

Unnamed: 0_level_0,t-1,t-1_Diff,t-2,t-2_Diff
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2007,5631000000.0,458000000.0,5173000000.0,1183000000.0
1982,2410000.0,470000.0,1940000.0,-350000.0
1998,184280000.0,43780000.0,140500000.0,62460000.0
2003,2317000000.0,633000000.0,1684000000.0,1270320000.0
2002,1684000000.0,1270320000.0,413680000.0,76680000.0
2001,413680000.0,76680000.0,337000000.0,152720000.0
1999,337000000.0,152720000.0,184280000.0,43780000.0
1997,140500000.0,62460000.0,78040000.0,24660000.0


In [19]:
filename = 'pulses.sav'
# Reload the persisted estimator; the context manager closes the handle once loading
# finishes (the bare open() call leaked it).
# NOTE: unpickling can execute arbitrary code — only load files produced by this notebook.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
# Sanity check: the reloaded model's score on the held-out rows should match the r2
# reported for best_model.
result = loaded_model.score(X_test, y_test)
print(result)

0.9020988485514694


In [30]:
# Show the most recent row of the feature table.
data.iloc[-1]

Pulses      2.875100e+10
t-1         1.934800e+10
t-1_Diff    7.837000e+09
t-2         1.151100e+10
t-2_Diff    4.772000e+09
Name: 2016, dtype: float64

In [33]:
# Most recent row (target plus lag features); it seeds the next year's features.
last_year = data.iloc[-1]

In [54]:
# Year label of the most recent row.
last_year_index = data.index.tolist()[-1]

In [55]:
# Roll the lag window forward one year: last year's target becomes the new 't-1',
# last year's 't-1' becomes the new 't-2', and the diffs are the changes between them.
new_t_minus_1, new_t_minus_2 = last_year["Pulses"], last_year["t-1"]
new_t_diff = new_t_minus_1 - new_t_minus_2
new_t_diff_2 = new_t_minus_2 - last_year["t-2"]

In [56]:
new_t_diff

9403000000.0

In [57]:
new_t_diff_2

7837000000.0

In [59]:
def createNewYearDF(last_year, last_year_index):
    """Build the one-row feature frame for the year after ``last_year_index``.

    Args:
        last_year: Mapping/Series for the latest known year with the keys
            "Pulses", "t-1" and "t-2".
        last_year_index: Year label of ``last_year``; the new row is labelled
            ``last_year_index + 1``.

    Returns:
        A single-row DataFrame indexed by "year" with the columns
        "t-1", "t-1_Diff", "t-2" and "t-2_Diff".
    """
    latest = last_year["Pulses"]
    previous = last_year["t-1"]
    # Shift the lag window forward: the latest target becomes t-1, the old t-1 becomes t-2.
    row = {
        "year": last_year_index + 1,
        "t-1": latest,
        "t-1_Diff": latest - previous,
        "t-2": previous,
        "t-2_Diff": previous - last_year["t-2"],
    }
    frame = pd.DataFrame(row, index=['0'])
    return frame.set_index("year")



In [60]:
# Build the feature row for the year after the last observed one and display it.
new_year_df = createNewYearDF(last_year, last_year_index)
new_year_df

Unnamed: 0_level_0,t-1,t-1_Diff,t-2,t-2_Diff
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017,28751000000.0,9403000000.0,19348000000.0,7837000000.0


In [61]:
# Predict the next year's Pulses; predict() returns an array with one entry per input row.
new_pulses = best_model.predict(new_year_df)
# Year label of the predicted row (the frame's only index entry).
new_year_index = new_year_df.index[0]

In [62]:
# Collect the predicted year, its predicted Pulses, and the lag features used for it.
# The lag values come from the module-level scalars computed in an earlier cell, not
# from new_year_df.
# NOTE(review): "Pulses" is the raw one-element array returned by predict(), and the
# other values are numpy scalars. If this dict is meant to be serialized (e.g. as a
# JSON payload), they likely need converting to native float/int — confirm with the consumer.
services_body = {
        "year": new_year_index,
        "Pulses": new_pulses,
        "t-1": new_t_minus_1,
        "t-1_Diff": new_t_diff,
        "t-2": new_t_minus_2,
        "t-2_Diff": new_t_diff_2
}

In [63]:
services_body

{'year': 2017,
 'Pulses': array([2.561655e+10]),
 't-1': 28751000000.0,
 't-1_Diff': 9403000000.0,
 't-2': 19348000000.0,
 't-2_Diff': 7837000000.0}