In [1]:
import pandas as pd
import sklearn.metrics as metrics
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
import numpy as np

In [2]:
data = pd.read_csv("../project_two-master/data/transistors-per-microprocessor.csv")
data = data.rename(columns={'Transistors per microprocessor (transistors per chip)':'Transistors'})
data = data.set_index('Year')
data.head()

Unnamed: 0_level_0,Entity,Code,Transistors
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1971,World,OWID_WRL,2308
1972,World,OWID_WRL,3555
1974,World,OWID_WRL,6098
1979,World,OWID_WRL,29164
1982,World,OWID_WRL,135773


In [3]:
def regression_results(y_true, y_pred):
    explained_variance=metrics.explained_variance_score(y_true, y_pred)
    mean_absolute_error=metrics.mean_absolute_error(y_true, y_pred) 
    mse=metrics.mean_squared_error(y_true, y_pred) 
    mean_squared_log_error=metrics.mean_squared_log_error(y_true, y_pred)
    median_absolute_error=metrics.median_absolute_error(y_true, y_pred)
    r2=metrics.r2_score(y_true, y_pred)
    print('explained_variance: ', round(explained_variance,4))    
    print('mean_squared_log_error: ', round(mean_squared_log_error,4))
    print('r2: ', round(r2,4))
    print('MAE: ', round(mean_absolute_error,4))
    print('MSE: ', round(mse,4))
    print('RMSE: ', round(np.sqrt(mse),4))

In [4]:
data = data[['Transistors']]
data.loc[:,'t-1'] = data.loc[:,'Transistors'].shift()
data.loc[:,'t-1_Diff'] = data.loc[:,'t-1'].diff()
data['t-2'] = data['t-1'].shift()
data['t-2_Diff'] = data['t-2'].diff()
data = data.dropna()
data.head()

Unnamed: 0_level_0,Transistors,t-1,t-1_Diff,t-2,t-2_Diff
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1979,29164,6098.0,2543.0,3555.0,1247.0
1982,135773,29164.0,23066.0,6098.0,2543.0
1985,273842,135773.0,106609.0,29164.0,23066.0
1989,1207901,273842.0,138069.0,135773.0,106609.0
1993,3105900,1207901.0,934059.0,273842.0,138069.0


In [5]:
X = data.drop(['Transistors'], axis = 1)
y = data['Transistors']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [7]:
def rmse(actual, predict):
    predict = np.array(predict)
    actual = np.array(actual)
    distance = predict - actual
    square_distance = distance ** 2
    mean_square_distance = square_distance.mean()
    score = np.sqrt(mean_square_distance)
    return score
rmse_score = make_scorer(rmse, greater_is_better = False)

In [16]:
model = RandomForestRegressor()
param_search = { 
    'n_estimators': [20, 50, 100],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [i for i in range(5,15)]
}
tscv = TimeSeriesSplit(n_splits=10)
gsearch = GridSearchCV(estimator=model, cv=tscv, param_grid=param_search, scoring = rmse_score)
gsearch.fit(X_train, y_train)
best_score = gsearch.best_score_
best_model = gsearch.best_estimator_
y_true = y_test.values
y_pred = best_model.predict(X_test)
regression_results(y_true, y_pred)

explained_variance:  0.8209
mean_squared_log_error:  0.2871
r2:  0.8181
MAE:  643578587.869
MSE:  1.0275396377711313e+18
RMSE:  1013676298.3177


In [17]:
data.to_csv('transistors_auto.csv')

In [18]:
next_value = {
    'Year':2018,
    't-1':19200000000,
    't-1_Diff':9200000000,
    't-2':10000000000,
    't-2_Diff':4300000000
}

In [19]:
next_value = pd.DataFrame(data=next_value, index=['0'])
next_value = next_value.set_index('Year')
next_value

Unnamed: 0_level_0,t-1,t-1_Diff,t-2,t-2_Diff
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018,19200000000,9200000000,10000000000,4300000000


In [20]:
best_model.predict(next_value)

array([1.22989625e+10])

In [21]:
import pickle
filename = 'transistor.sav'
pickle.dump(best_model, open(filename, 'wb'))

In [22]:
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(X_test, y_test)
print(result)

0.8181124812775176
