In [None]:
import os
import random
import tempfile
import pickle

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import sklearn
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline


from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

In [None]:
# Use one seed value for every random number generator touched by the notebook.
SEED = 699

# NOTE(review): Python reads PYTHONHASHSEED when the interpreter starts, so setting it
# here does not change hashing in this already-running kernel — confirm this is intended.
os.environ['PYTHONHASHSEED'] = '0'

for seed_fn in (np.random.seed, random.seed):
    seed_fn(SEED)

In [None]:
import os, sys
from google.colab import drive
# Mount Google Drive at /content/drive so the project files are reachable from Colab.
drive.mount('/content/drive')
# NOTE(review): this is a relative path; it only points into the mount above when the
# working directory is /content — confirm, since the saved output shows FileNotFoundError.
path='drive/MyDrive/Capstone(Team10)/code'


#file = tf.keras.utils
# Raw data: the first CSV column becomes the index (parsed as dates) and is named 'date'.
raw_df = pd.read_csv(path+'/data/rawdata_USA.csv', index_col=0, parse_dates=True)
raw_df.index.name='date'
metadata=pd.read_csv(path+'/data/metadata_final.csv')

# Display the last five rows as the cell output.
raw_df.tail(5)

Mounted at /content/drive


FileNotFoundError: ignored

In [None]:
# Restore X_data from its pickle file.
# pickle can execute arbitrary code on load: only open files produced by this project.
with open(path+'/data/X_data_full.pkl','rb') as x_data_file:
    X_data=pickle.load(x_data_file)

In [None]:
## reference : https://gmnam.tistory.com/230#:~:text=class%20BlockingTimeSeriesSplit%28%29%3A%20def%20__init__%28self%2C%20n_splits%29%3A%20self.n_splits%20%3D%20n_splits,indices%20%5Bstart%3A%20mid%5D%2C%20indices%20%5Bmid%20%2B%20margin%3A%20stop%5D


from sklearn.model_selection import TimeSeriesSplit
from matplotlib.patches import Patch
import matplotlib.pyplot as plt

# Colormaps and the global matplotlib style used by the plotting helper in this cell.
cmap_data = plt.cm.Paired  # not referenced by any of the cells visible here
cmap_cv = plt.cm.coolwarm  # colours the train/test markers drawn by plot_cv_indices
plt.style.use('fivethirtyeight')

class BlockingTimeSeriesSplit():
    """Cross-validation splitter that cuts ordered samples into disjoint blocks.

    The samples are divided into ``n_splits`` consecutive, non-overlapping
    blocks of ``len(X) // n_splits`` samples.  Inside each block the leading
    ``train_ratio`` share is the training fold and the remainder, after
    skipping ``margin`` samples, is the test fold.  When ``len(X)`` is not a
    multiple of ``n_splits`` the left-over samples at the end are never
    yielded.

    Parameters
    ----------
    n_splits : int
        Number of blocks (folds); must be at least 1.
    train_ratio : float, default 0.8
        Share of each block used for training; must lie strictly between 0 and 1.
    margin : int, default 0
        Number of samples dropped between the training and the test fold.

    Raises
    ------
    ValueError
        If any constructor argument is outside the ranges given above.
    """

    # Class-level defaults: instances created (or unpickled) without these
    # attributes fall back to the original hard-coded behaviour.
    train_ratio = 0.8
    margin = 0

    def __init__(self, n_splits, train_ratio=0.8, margin=0):
        if n_splits < 1:
            raise ValueError('n_splits must be at least 1, got {}'.format(n_splits))
        if not 0 < train_ratio < 1:
            raise ValueError('train_ratio must be between 0 and 1, got {}'.format(train_ratio))
        if margin < 0:
            raise ValueError('margin must be non-negative, got {}'.format(margin))
        self.n_splits = n_splits
        self.train_ratio = train_ratio
        self.margin = margin

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of folds; the arguments are accepted but ignored."""
        return self.n_splits

    def split(self, X, y=None, groups=None):
        """Yield ``(train_indices, test_indices)`` integer arrays, one pair per block.

        Only ``len(X)`` is used; ``y`` and ``groups`` are ignored.

        Raises
        ------
        ValueError
            If ``X`` has fewer samples than ``n_splits`` (raised when the
            generator is first advanced).
        """
        n_samples = len(X)
        if n_samples < self.n_splits:
            raise ValueError(
                'Cannot split {} samples into {} blocks'.format(n_samples, self.n_splits))

        block_size = n_samples // self.n_splits
        indices = np.arange(n_samples)

        for block in range(self.n_splits):
            start = block * block_size
            stop = start + block_size
            # First index of the test part of this block, before the margin is applied.
            mid = start + int(self.train_ratio * block_size)
            yield indices[start: mid], indices[mid + self.margin: stop]

def plot_cv_indices(cv, X, n_splits, lw=10):
    """Create a sample plot for indices of a cross-validation object.

    One horizontal row is drawn per fold produced by ``cv.split(X=X)``;
    training samples are marked with value 0, test samples with value 1 and
    samples belonging to neither stay NaN.

    Parameters
    ----------
    cv : object
        Splitter exposing ``split(X=...)`` that yields ``(train_idx, test_idx)``.
    X : sized sequence
        Data handed to ``cv.split``; its length sets the x-axis range.
    n_splits : int
        Number of rows (folds) labelled on the y axis.
    lw : int, default 10
        Line width of the markers.

    Returns
    -------
    None
        The figure is created through ``plt.subplots()``.  The colours come
        from the module-level ``cmap_cv`` colormap.
    """
    fig, ax = plt.subplots()
    n_samples = len(X)
    positions = range(n_samples)

    # Generate the training/testing visualizations for each CV split
    for ii, (tr, tt) in enumerate(cv.split(X=X)):
        # Fill in indices with the training/test groups
        indices = np.array([np.nan] * n_samples)
        indices[tt] = 1
        indices[tr] = 0

        # Visualize the results
        ax.scatter(positions, [ii + .5] * n_samples,
                   c=indices, marker='_', lw=lw, cmap=cmap_cv,
                   vmin=-.2, vmax=1.2)

    # Formatting
    yticklabels = list(range(n_splits))
    ax.set(yticks=np.arange(n_splits) + .5, yticklabels=yticklabels,
           xlabel='Sample index', ylabel="CV iteration",
           ylim=[n_splits+0.1, -.1], xlim=[0, n_samples])
    ax.set_title('{}'.format(type(cv).__name__), fontsize=15)

    ax.legend([Patch(color=cmap_cv(.8)), Patch(color=cmap_cv(.02))],
              ['Testing set', 'Training set'], loc=(1.02, .8))

In [None]:
## make index for train, validation dataset
def make_split(X_data, n_splits, test_year):
  """Build train/validation index arrays for the part of X_data not held out for testing.

  The last ``test_year*12`` rows of ``X_data`` are excluded and the remaining
  rows are split with ``BlockingTimeSeriesSplit(n_splits)``.

  Parameters
  ----------
  X_data : sliceable, sized sequence
      Samples ordered so that the hold-out rows are at the end.
  n_splits : int
      Number of blocks passed to ``BlockingTimeSeriesSplit``.
  test_year : int
      Number of 12-row periods held out at the end; 0 keeps every row.

  Returns
  -------
  (list, list)
      Training index arrays and validation index arrays, one entry per split.

  Raises
  ------
  ValueError
      If ``test_year`` is negative.
  """
  if test_year < 0:
    raise ValueError('test_year must be non-negative, got {}'.format(test_year))

  # An explicit end position is used because X_data[:-0] would be empty when
  # test_year is 0; the max() keeps the slice empty when the hold-out is
  # longer than the data.
  n_dev = max(len(X_data) - test_year*12, 0)

  tss=BlockingTimeSeriesSplit(n_splits=n_splits)

  train_idxs=[]
  val_idxs=[]
  for train_idx, val_idx in tss.split(X_data[:n_dev]):
    train_idxs.append(train_idx)
    val_idxs.append(val_idx)

  return train_idxs, val_idxs

In [None]:
def classification_report_csv(report):
    """Turn a text classification report into a DataFrame with one row per class.

    The parsing is positional: the first two lines are skipped as header,
    the last five lines are skipped as summary, and the accuracy is taken
    from the second-to-last token of the fourth line from the end.

    Parameters
    ----------
    report : str
        Report text; class labels must be numeric because they are converted with ``float``.

    Returns
    -------
    pandas.DataFrame
        Columns ``class``, ``precision``, ``recall``, ``f1_score``, ``support``
        and ``accuracy`` (the same accuracy value is repeated on every row).
        An empty DataFrame is returned when the report holds no class rows.
    """
    lines = report.split('\n')
    class_lines = lines[2:len(lines)-5]

    if not class_lines:
        return pd.DataFrame([])

    # The accuracy is identical for every class row, so parse it once.
    accuracy = float(lines[-4].split()[-2])

    report_data = []
    for line in class_lines:
        row_data = line.split()
        report_data.append({
            'class': round(float(row_data[0]),0),
            'precision': float(row_data[1]),
            'recall': float(row_data[2]),
            'f1_score': float(row_data[3]),
            'support': float(row_data[4]),
            'accuracy': accuracy,
        })

    return pd.DataFrame(report_data)

1. Random Forest Modeling

In [None]:
## class weighting is chosen through the 'class_weight' entry of param_rf (e.g. 'balanced' or None)

def rf_grid_search(path, X_data, y_data, y_type, n_splits, test_year, param_rf):
  """Grid-search a random forest, evaluate it on the hold-out period and pickle the results.

  The last ``test_year*12`` rows of ``X_data`` / ``y_data`` form the test set
  and everything before them is the training set.  Hyper-parameters are
  selected by macro recall using ``BlockingTimeSeriesSplit(n_splits)``.

  Parameters
  ----------
  path : str
      Project folder; the result is written to ``path + '/model/clf_<y_type>_rf.pkl'``.
  X_data, y_data : objects supporting ``.iloc`` and ``len``
      Features and labels with matching row order.
  y_type : str
      Label name, used in the output file name and in the CV result table.
  n_splits : int
      Number of CV blocks.
  test_year : int
      Number of 12-row periods held out for testing; must be positive.
  param_rf : dict
      Parameter grid handed to ``GridSearchCV``.

  Returns
  -------
  None
      The list ``[clf, df_cvresult, y_pred, y_pred_prob, clf_report]`` is pickled to disk.

  Raises
  ------
  ValueError
      If ``X_data`` and ``y_data`` differ in length or ``test_year`` is not positive.
  """
  if len(X_data) != len(y_data):
    raise ValueError('X_data has {} rows but y_data has {}'.format(len(X_data), len(y_data)))
  if test_year <= 0:
    raise ValueError('test_year must be positive, got {}'.format(test_year))

  n_test=test_year*12
  X_train=X_data.iloc[:-n_test]
  y_train=y_data.iloc[:-n_test]
  X_test=X_data.iloc[-n_test:]
  y_test=y_data.iloc[-n_test:]

  tss=BlockingTimeSeriesSplit(n_splits=n_splits)

  model = GridSearchCV(RandomForestClassifier(n_jobs=-1, oob_score=True, random_state=699),
                     param_rf, cv=tss,
                     verbose=3, n_jobs=-1, scoring=['recall_macro'],
                     refit='recall_macro'
                     )

  # NOTE(review): the scaler is fitted on all of X_train before GridSearchCV
  # splits it, so each validation fold is scaled with statistics that include
  # that fold — confirm this is acceptable.
  clf = make_pipeline(StandardScaler(), model)

  clf.fit(X_train, y_train)

  y_pred_prob=clf.predict_proba(X_test)
  y_pred = clf.predict(X_test)

  clf_report=classification_report(y_test, y_pred)

  # Build fresh dicts instead of adding keys to cv_results_['params'], which
  # belongs to the fitted search object that is pickled below.
  cv_results=model.cv_results_
  records=[
      dict(params,
           model='RF',
           y_type=y_type,
           mean_test_recall=mean_score,
           std_test_recall=std_score,
           rank_test_recall=rank)
      for params, mean_score, std_score, rank in zip(
          cv_results['params'],
          cv_results['mean_test_recall_macro'],
          cv_results['std_test_recall_macro'],
          cv_results['rank_test_recall_macro'])
  ]

  df_cvresult=pd.DataFrame(records)

  with open (path+'/model/clf_{}_rf.pkl'.format(y_type), 'wb') as f:
      pickle.dump([clf, df_cvresult, y_pred, y_pred_prob, clf_report], f)

In [None]:
# Random-forest search space; append further dicts to try other parameter combinations.
param_rfs=[
    dict(
        n_estimators=[30,50,100],
        max_features=[0.2, 0.3, 0.5, 0.7, 0.9],
        class_weight=['balanced'],  # other candidates: 'balanced_subsample', None
        warm_start=[True,False],
    ),
]
y_types=['y_agg','y_oecd','y_nber']
test_year, n_splits, threshold = 8, 3, 50

for param_rf in param_rfs:
  for y_type in y_types:
    # Label window: threshold*12 observations ending 6 observations before the end of the series.
    y=raw_df[y_type]
    y_data=y[-(threshold*12+6):-6]
    rf_grid_search(path, X_data, y_data, y_type, n_splits, test_year, param_rf)

Fitting 3 folds for each of 30 candidates, totalling 90 fits


  warn(


Fitting 3 folds for each of 30 candidates, totalling 90 fits


  warn(


Fitting 3 folds for each of 30 candidates, totalling 90 fits


  warn(


In [None]:
y_types=['y_agg','y_oecd']

dict_rf={}

# Reload the pickled random-forest results written by rf_grid_search.
# pickle can execute arbitrary code on load: only open files produced by this project.
for y_type in y_types:
  with open (path+'/model/clf_{}_rf.pkl'.format(y_type), 'rb') as f:
      [clf, df_cvresult, y_pred, y_pred_prob, clf_report]=pickle.load(f)
  dict_rf['clf_{}_rf.pkl'.format(y_type)]=[clf, df_cvresult, y_pred, y_pred_prob, clf_report]

# Stack the CV tables (second element of each entry) with a single concat.
df_rf=pd.concat([results[1] for results in dict_rf.values()])

# Show the best-ranked parameter combinations per target.
df_rf[df_rf.rank_test_recall==1]

Unnamed: 0,class_weight,max_features,n_estimators,warm_start,model,y_type,mean_test_recall,std_test_recall,rank_test_recall
2,balanced,0.2,50,True,RF,y_agg,0.485891,0.119221,1
3,balanced,0.2,50,False,RF,y_agg,0.485891,0.119221,1
4,balanced,0.2,100,True,RF,y_agg,0.485891,0.119221,1
5,balanced,0.2,100,False,RF,y_agg,0.485891,0.119221,1
6,balanced,0.3,30,True,RF,y_oecd,0.58122,0.05936,1
7,balanced,0.3,30,False,RF,y_oecd,0.58122,0.05936,1


In [None]:
# Parse the stored text report (last element of each entry) and tag every row
# with the file name of the model it belongs to; build the table with one concat.
df_rf_creport=pd.concat([
    classification_report_csv(results[-1]).assign(model=model_name)
    for model_name, results in dict_rf.items()
])


df_rf_creport

Unnamed: 0,class,precision,recall,f1_score,support,accuracy,model
0,0.0,0.71,0.82,0.76,68.0,0.64,clf_y_agg_rf.pkl
1,1.0,0.21,0.12,0.15,26.0,0.64,clf_y_agg_rf.pkl
2,2.0,0.67,1.0,0.8,2.0,0.64,clf_y_agg_rf.pkl
0,0.0,0.75,0.79,0.77,68.0,0.67,clf_y_oecd_rf.pkl
1,1.0,0.42,0.36,0.38,28.0,0.67,clf_y_oecd_rf.pkl


2. Support Vector Machine Classifier

In [None]:
# NOTE(review): y_type is not assigned in this cell; it keeps whatever value an earlier
# cell left behind, so y_data is built here for one target only — confirm this is intended.
y=raw_df[y_type]
# Label window: threshold*12 observations ending 6 observations before the end of the series.
y_data=y[-(threshold*12+6):-6]

def svc_grid_search(path, X_data, y_data, y_type, n_splits, test_year, param_svc):
  """Grid-search an SVC, evaluate it on the hold-out period and pickle the results.

  The last ``test_year*12`` rows of ``X_data`` / ``y_data`` form the test set
  and everything before them is the training set.  Hyper-parameters are
  selected by macro recall using ``BlockingTimeSeriesSplit(n_splits)``.

  Parameters
  ----------
  path : str
      Project folder; the result is written to ``path + '/model/clf_<y_type>_svc.pkl'``.
  X_data, y_data : objects supporting ``.iloc`` and ``len``
      Features and labels with matching row order.
  y_type : str
      Label name, used in the output file name and in the CV result table.
  n_splits : int
      Number of CV blocks.
  test_year : int
      Number of 12-row periods held out for testing; must be positive.
  param_svc : dict
      Parameter grid handed to ``GridSearchCV``.

  Returns
  -------
  None
      The list ``[clf, df_cvresult, y_pred, y_pred_prob, clf_report]`` is pickled to disk.

  Raises
  ------
  ValueError
      If ``X_data`` and ``y_data`` differ in length or ``test_year`` is not positive.
  """
  if len(X_data) != len(y_data):
    raise ValueError('X_data has {} rows but y_data has {}'.format(len(X_data), len(y_data)))
  if test_year <= 0:
    raise ValueError('test_year must be positive, got {}'.format(test_year))

  n_test=test_year*12
  X_train=X_data.iloc[:-n_test]
  y_train=y_data.iloc[:-n_test]
  X_test=X_data.iloc[-n_test:]
  y_test=y_data.iloc[-n_test:]

  tss=BlockingTimeSeriesSplit(n_splits=n_splits)

  model = GridSearchCV(SVC(random_state=699, probability=True),
                     param_svc, cv=tss,
                     verbose=3, n_jobs=-1, scoring=['recall_macro'],
                     refit='recall_macro'
                     )

  # NOTE(review): the scaler is fitted on all of X_train before GridSearchCV
  # splits it, so each validation fold is scaled with statistics that include
  # that fold — confirm this is acceptable.
  clf = make_pipeline(StandardScaler(), model)

  clf.fit(X_train, y_train)

  y_pred_prob=clf.predict_proba(X_test)
  y_pred = clf.predict(X_test)

  clf_report=classification_report(y_test, y_pred)

  # Build fresh dicts instead of adding keys to cv_results_['params'], which
  # belongs to the fitted search object that is pickled below.
  cv_results=model.cv_results_
  records=[
      dict(params,
           model='SVC',
           y_type=y_type,
           mean_test_recall=mean_score,
           std_test_recall=std_score,
           rank_test_recall=rank)
      for params, mean_score, std_score, rank in zip(
          cv_results['params'],
          cv_results['mean_test_recall_macro'],
          cv_results['std_test_recall_macro'],
          cv_results['rank_test_recall_macro'])
  ]

  df_cvresult=pd.DataFrame(records)

  with open (path+'/model/clf_{}_svc.pkl'.format(y_type), 'wb') as f:
      pickle.dump([clf, df_cvresult, y_pred, y_pred_prob, clf_report], f)

In [None]:
# SVC search space; append further dicts to try other parameter combinations.
param_svcs=[{'C':[0.5, 1, 1.5, 2, 5],
             'kernel':['poly','rbf','sigmoid']}]
y_types=['y_agg','y_oecd']
test_year=8
n_splits=3

for param_svc in param_svcs:
  for y_type in y_types:
    # Rebuild the labels for the current target. Without this, every y_type is
    # trained on whichever y_data an earlier cell left behind and all targets
    # produce identical results.
    y=raw_df[y_type]
    y_data=y[-(threshold*12+6):-6]
    svc_grid_search(path, X_data, y_data, y_type, n_splits, test_year, param_svc)

Fitting 3 folds for each of 15 candidates, totalling 45 fits
Fitting 3 folds for each of 15 candidates, totalling 45 fits


In [None]:
y_types=['y_agg','y_oecd']

dict_svc={}

# Reload the pickled SVC results written by svc_grid_search.
# pickle can execute arbitrary code on load: only open files produced by this project.
for y_type in y_types:
  with open (path+'/model/clf_{}_svc.pkl'.format(y_type), 'rb') as f:
      [clf, df_cvresult, y_pred, y_pred_prob, clf_report]=pickle.load(f)
  dict_svc['clf_{}_svc.pkl'.format(y_type)]=[clf, df_cvresult, y_pred, y_pred_prob, clf_report]

# Stack the CV tables (second element of each entry) with a single concat.
df_svc=pd.concat([results[1] for results in dict_svc.values()])

# Show the best-ranked parameter combinations per target.
df_svc[df_svc.rank_test_recall==1]

Unnamed: 0,C,kernel,model,y_type,mean_test_recall,std_test_recall,rank_test_recall
8,1.5,sigmoid,SVC,y_agg,0.660711,0.165592,1
8,1.5,sigmoid,SVC,y_oecd,0.660711,0.165592,1


In [None]:
# Parse the stored text report (last element of each entry) and tag every row
# with the file name of the model it belongs to; build the table with one concat.
df_svc_creport=pd.concat([
    classification_report_csv(results[-1]).assign(model=model_name)
    for model_name, results in dict_svc.items()
])

df_svc_creport

Unnamed: 0,class,precision,recall,f1_score,support,accuracy,model
0,0.0,0.67,0.49,0.56,68.0,0.47,clf_y_agg_svc.pkl
1,1.0,0.26,0.43,0.32,28.0,0.47,clf_y_agg_svc.pkl
0,0.0,0.67,0.49,0.56,68.0,0.47,clf_y_oecd_svc.pkl
1,1.0,0.26,0.43,0.32,28.0,0.47,clf_y_oecd_svc.pkl


In [None]:
dict_list=[df_rf, df_rf_creport, df_svc, df_svc_creport]
id_list=['df_rf', 'df_rf_creport', 'df_svc', 'df_svc_creport']

# Persist each summary table next to the model pickles.  The leading '/' is
# required: path has no trailing separator, so path+'model/...' would point to
# a non-existent '<path>model' folder.
for table_name, table in zip(id_list, dict_list):
    with open(path+'/model/{}.pkl'.format(table_name), 'wb') as f:
        pickle.dump(table, f)