In [1]:
import pandas as pd
import sklearn.metrics as metrics
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
import numpy as np

In [2]:
data = pd.read_csv("../project_two-master/data/supercomputer-power-flops.csv")
data = data.rename(columns={'Floating-Point Operations per Second (FLOPS)':'Flops'})
data = data.set_index('Year')
data.head()

Unnamed: 0_level_0,Entity,Code,Flops
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1993,World,OWID_WRL,124000000000
1994,World,OWID_WRL,170000000000
1995,World,OWID_WRL,170000000000
1996,World,OWID_WRL,368000000000
1997,World,OWID_WRL,1300000000000


In [3]:
def regression_results(y_true, y_pred):
    explained_variance=metrics.explained_variance_score(y_true, y_pred)
    mean_absolute_error=metrics.mean_absolute_error(y_true, y_pred) 
    mse=metrics.mean_squared_error(y_true, y_pred) 
    mean_squared_log_error=metrics.mean_squared_log_error(y_true, y_pred)
    median_absolute_error=metrics.median_absolute_error(y_true, y_pred)
    r2=metrics.r2_score(y_true, y_pred)
    print('explained_variance: ', round(explained_variance,4))    
    print('mean_squared_log_error: ', round(mean_squared_log_error,4))
    print('r2: ', round(r2,4))
    print('MAE: ', round(mean_absolute_error,4))
    print('MSE: ', round(mse,4))
    print('RMSE: ', round(np.sqrt(mse),4))

In [5]:
data = data[['Flops']]
data.loc[:,'t-1'] = data.loc[:,'Flops'].shift()
data.loc[:,'t-1_Diff'] = data.loc[:,'t-1'].diff()
data['t-2'] = data['t-1'].shift()
data['t-2_Diff'] = data['t-2'].diff()
data = data.dropna()
data.head()

Unnamed: 0_level_0,Flops,t-1,t-1_Diff,t-2,t-2_Diff
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1999,2400000000000,1300000000000.0,0.0,1300000000000.0,932000000000.0
2000,4900000000000,2400000000000.0,1100000000000.0,1300000000000.0,0.0
2001,7200000000000,4900000000000.0,2500000000000.0,2400000000000.0,1100000000000.0
2002,35900000000000,7200000000000.0,2300000000000.0,4900000000000.0,2500000000000.0
2003,35900000000000,35900000000000.0,28700000000000.0,7200000000000.0,2300000000000.0


In [6]:
X = data.drop(['Flops'], axis = 1)
y = data['Flops']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [8]:
def rmse(actual, predict):
    predict = np.array(predict)
    actual = np.array(actual)
    distance = predict - actual
    square_distance = distance ** 2
    mean_square_distance = square_distance.mean()
    score = np.sqrt(mean_square_distance)
    return score
rmse_score = make_scorer(rmse, greater_is_better = False)

In [9]:
model = RandomForestRegressor()
param_search = { 
    'n_estimators': [20, 50, 100],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [i for i in range(5,15)]
}
tscv = TimeSeriesSplit(n_splits=10)
gsearch = GridSearchCV(estimator=model, cv=tscv, param_grid=param_search, scoring = rmse_score)
gsearch.fit(X_train, y_train)
best_score = gsearch.best_score_
best_model = gsearch.best_estimator_
y_true = y_test.values
y_pred = best_model.predict(X_test)
regression_results(y_true, y_pred)

explained_variance:  0.8797
mean_squared_log_error:  1.1253
r2:  0.8519
MAE:  2382384600000000.0
MSE:  2.64464507573194e+31
RMSE:  5142611278068701.0


In [10]:
#data.to_csv('flops_auto.csv')

In [11]:
next_value = {
    'Year':2018,
    't-1':93000000000000000,
    't-1_Diff':0,
    't-2':93000000000000000,
    't-2_Diff':0
}

In [12]:
next_value = pd.DataFrame(data=next_value, index=['0'])
next_value = next_value.set_index('Year')
next_value

Unnamed: 0_level_0,t-1,t-1_Diff,t-2,t-2_Diff
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018,93000000000000000,0,93000000000000000,0


In [13]:
best_model.predict(next_value)

array([7.1138e+16])

In [15]:
import pickle
filename = 'flops.sav'
pickle.dump(best_model, open(filename, 'wb'))

In [16]:
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(X_test, y_test)
print(result)

0.8519216916671606
