In [64]:
%run Imports.ipynb
%run DataFrameBuilder.ipynb

In [206]:
def Predict(df, scaler = None, classifier = None, test_size = 0.25, disp = False):
    """ Train a classifier on a random train/test split and predict the test part
    Parameters:
        df (pandas):dataframe; column 'target' is the label, every other column is a feature
        scaler (object): Instance of scaler, fitted on the training features only
                         and then applied to the test features (None = no scaling)
        classifier(object) : Instance of Classsifier exposing fit/predict; when None a fresh
                             RandomForestClassifier(n_estimators=101, random_state=0) is built
        test_size(float) : size of tests, forwarded to train_test_split
        disp(bool) : display sns heatmap conf. matrix
    Returns:
        (y_test, y_pred): the held-out 'target' column (as a dataframe) and the
                          predictions made for the held-out features
    """
    # Build the default estimator per call: an instance placed in the signature
    # is created once at definition time and shared by every call.
    if classifier is None:
        classifier = RandomForestClassifier(n_estimators=101, random_state=0)

    is_target = df.columns == 'target'
    X = df.loc[:, ~is_target]
    y = df.loc[:, is_target]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=0)
    if scaler is not None:
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

    # y_train is a one-column dataframe; fit() gets it flattened to 1-D
    classifier.fit(X_train, y_train.values.ravel())
    y_pred = classifier.predict(X_test)

    if disp:
        df_cm = confusion_matrix(y_test, y_pred)
        sns.set(font_scale=1.4)  # label size
        sns.heatmap(df_cm, annot=True, annot_kws={"size": 16})  # annotation font size
        plt.title('Confusion matrix')
        plt.ylabel('True label')
        plt.xlabel('Predicted label')
        plt.show()

    return  y_test , y_pred

def PredictRegression(df, scaler = None, classifier = None, test_size = 0.25):
    """ Fit an estimator on a random train/test split and print the test MAPE
    Parameters:
        df (pandas):dataframe; column 'target' is the value to predict, every other column is a feature
        scaler (object): Instance of scaler, fitted on the training features only
                         and then applied to the test features (None = no scaling)
        classifier(object) : Instance of estimator exposing fit/predict; when None a fresh
                             RandomForestClassifier(n_estimators=101, random_state=0) is built
        test_size(float) : size of tests, forwarded to train_test_split
    Returns:
        (classifier, X_train_r): the fitted estimator and a copy of the
                                 unscaled training features
    """
    # Build the default estimator per call: the fitted estimator is returned,
    # so a shared default instance would be refitted under the caller's feet
    # by the next call.
    if classifier is None:
        classifier = RandomForestClassifier(n_estimators=101, random_state=0)

    is_target = df.columns == 'target'
    X = df.loc[:, ~is_target]
    y = df.loc[:, is_target]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=0)
    # keep the features as they were before scaling; this copy is returned
    X_train_r = X_train.copy()
    if scaler is not None:
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

    classifier.fit(X_train, y_train.values.ravel())
    y_pred = classifier.predict(X_test)

    print("--------------------MAPE------------------")
    # Predictions belong to the test split, so they are scored against y_test
    # (flattened to 1-D so both inputs have the same shape).
    print(mean_absolute_percentage_error(y_test.values.ravel(), y_pred))

    return  classifier , X_train_r

def mean_absolute_percentage_error(y_true, y_pred):
    """ Mean absolute percentage error: mean(|(y_true - y_pred) / y_true|) * 100
    Parameters:
        y_true (array-like): actual values
        y_pred (array-like): predicted values, same number of elements as y_true
    Returns:
        float: the error in percent
    Raises:
        ValueError: when the two inputs do not hold the same number of elements
    Note:
        zeros in y_true are not special-cased; the division then follows
        numpy semantics (inf / nan).
    """
    # Flatten both inputs so a column vector (n, 1) and a flat vector (n,)
    # are compared element by element instead of being broadcast to (n, n).
    actual = np.asarray(y_true, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()
    if actual.shape != predicted.shape:
        raise ValueError(
            "y_true and y_pred must have the same number of elements, got %d and %d"
            % (actual.size, predicted.size)
        )

    return np.mean(np.abs((actual - predicted) / actual)) * 100

In [None]:
def LinearRegressionCross(df):
    """ 10-fold cross-validation of LinearRegression, printing mean (std) per metric
    Parameters:
        df (pandas):dataframe; column 'target' is the value to predict, every other column is a feature
    Returns:
        None, the scores are only printed

    Each metric is printed as "<name>: <mean> (<std>)" over the folds. The MAE
    and MSE lines use the 'neg_*' scorers, so the printed values are whatever
    those scorers return.
    """
    is_target = df.columns == 'target'
    X = df.loc[:, ~is_target]
    Y = df.loc[:, is_target]
    kfold = model_selection.KFold(n_splits=10)
    model = LinearRegression()

    metrics = (
        ("MAE", 'neg_mean_absolute_error'),
        ("MSE", 'neg_mean_squared_error'),
        ("R^2", 'r2'),
    )
    for label, scoring in metrics:
        print("------------")
        results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
        # interpolate the values into the format string instead of printing it verbatim
        print("%s: %.3f (%.3f)" % (label, results.mean(), results.std()))

def RegressionPredictTwoClass(df, ratio = 0.8 ):
    """ Linear regression on a chronological split, reported as Rise/Fall classes
    Parameters:
        df (pandas):dataframe; column 'target' is the value to predict, every other column is a feature
        ratio (float): share of leading rows used for training, the rest is the test part
    Returns:
        None, a count plot of the predicted classes is shown and the
        classification report, confusion matrix and accuracy are printed

    A value > 0 is labelled 'Rise', anything else 'Fall'.
    NOTE: when the computed test size is 0, `iloc[-0:]` selects every row.
    """
    n_train = int(len(df) * ratio)
    n_test = len(df) - n_train
    head_rows, tail_rows = df.iloc[:n_train], df.iloc[-n_test:]

    head_is_target = head_rows.columns == 'target'
    tail_is_target = tail_rows.columns == 'target'
    X_train, y_train = head_rows.loc[:, ~head_is_target], head_rows.loc[:, head_is_target]
    X_test, y_test = tail_rows.loc[:, ~tail_is_target], tail_rows.loc[:, tail_is_target]

    model = LinearRegression().fit(X_train, y_train)
    print("LR score>>>")
    # score on the training part
    print(model.score(X_train, y_train))

    rdf = pd.DataFrame(y_test, columns=["target"])
    rdf['prediction'] = model.predict(X_test)
    for raw_col, label_col in (('prediction', 'prediction_r'), ('target', 'target_r')):
        rdf[label_col] = np.where(rdf[raw_col] > 0, 'Rise', 'Fall')

    sns.countplot(rdf.prediction_r)
    plt.show()

    reports = (
        ("---------------------Summary------------------", classification_report),
        ("----------------Confusion_matrix--------------", confusion_matrix),
        ("--------------------Accuracy------------------", accuracy_score),
    )
    for banner, metric in reports:
        print(banner)
        print(metric(rdf.target_r, rdf.prediction_r))
    
def RegressionPredictThreeClass(df, ratio = 0.8, stay_boundary = 0.2 ):
    """ Linear regression on a chronological split, reported as Rise/Stay/Fall classes
    Parameters:
        df (pandas):dataframe; column 'target' is the value to predict, every other column is a feature
        ratio (float): share of leading rows used for training, the rest is the test part
        stay_boundary (float): values inside [-stay_boundary, stay_boundary] are labelled 'Stay'
    Returns:
        None, a count plot of the predicted classes is shown and the
        classification report, confusion matrix and accuracy are printed

    NOTE: when the computed test size is 0, `iloc[-0:]` selects every row.
    """
    def to_label(values):
        # precedence: 'Stay' over 'Fall' over 'Rise'; anything matching no
        # band (e.g. NaN) ends up as 'Rise'
        values = np.asarray(values)
        in_band = (values >= -(stay_boundary)) & (values <= stay_boundary)
        below = values < -(stay_boundary)
        return np.select([in_band, below], ['Stay', 'Fall'], default='Rise')

    n_train = int(len(df) * ratio)
    n_test = len(df) - n_train
    head_rows, tail_rows = df.iloc[:n_train], df.iloc[-n_test:]

    head_is_target = head_rows.columns == 'target'
    tail_is_target = tail_rows.columns == 'target'
    X_train, y_train = head_rows.loc[:, ~head_is_target], head_rows.loc[:, head_is_target]
    X_test, y_test = tail_rows.loc[:, ~tail_is_target], tail_rows.loc[:, tail_is_target]

    model = LinearRegression().fit(X_train, y_train)
    print("LR score>>>")
    # score on the training part
    print(model.score(X_train, y_train))

    rdf = pd.DataFrame(y_test, columns=["target"])
    rdf['prediction'] = model.predict(X_test)
    rdf['prediction_r'] = to_label(rdf['prediction'])
    rdf['target_r'] = to_label(rdf['target'])

    sns.countplot(rdf.prediction_r)
    plt.show()

    reports = (
        ("---------------------Summary------------------", classification_report),
        ("----------------Confusion_matrix--------------", confusion_matrix),
        ("--------------------Accuracy------------------", accuracy_score),
    )
    for banner, metric in reports:
        print(banner)
        print(metric(rdf.target_r, rdf.prediction_r))


In [5]:
def Regression(df, train_ratio = 0.8 ):
    """ Linear regression fitted on the leading rows and applied to the trailing rows
    Parameters:
        df (pandas):dataframe; column 'target' is the value to predict, every other column is a feature
        train_ratio (float): share of leading rows used for training, the rest is the test part
    Returns:
        (y_test, y_pred): arrays with the actual and the predicted target values of the test part

    NOTE: when the computed test size is 0, `iloc[-0:]` selects every row.
    """
    n_train = int(len(df) * train_ratio)
    n_test = len(df) - n_train
    head_rows, tail_rows = df.iloc[:n_train], df.iloc[-n_test:]

    head_is_target = head_rows.columns == 'target'
    tail_is_target = tail_rows.columns == 'target'
    X_train, y_train = head_rows.loc[:, ~head_is_target], head_rows.loc[:, head_is_target]
    X_test, y_test = tail_rows.loc[:, ~tail_is_target], tail_rows.loc[:, tail_is_target]

    model = LinearRegression().fit(X_train, y_train)

    # collect actual and predicted values side by side
    rdf = pd.DataFrame(y_test.values, columns=["y_test"])
    rdf['y_pred'] = model.predict(X_test)

    actual, predicted = rdf['y_test'].values, rdf['y_pred'].values
    return  actual, predicted
    

In [3]:
def RegressionClassicPredictTwoClass(df, train_ratio = 0.8 ):
    """ Linear regression on a chronological split, returned as Rise/Fall labels
    Parameters:
        df (pandas):dataframe; column 'target' is the value to predict, every other column is a feature
        train_ratio (float): share of leading rows used for training, the rest is the test part
    Returns:
        (y_test, y_pred): arrays of 'Rise'/'Fall' labels for the actual and the
                          predicted test values; a value >= 0 is 'Rise'

    NOTE: when the computed test size is 0, `iloc[-0:]` selects every row.
    """
    n_train = int(len(df) * train_ratio)
    n_test = len(df) - n_train
    head_rows, tail_rows = df.iloc[:n_train], df.iloc[-n_test:]

    head_is_target = head_rows.columns == 'target'
    tail_is_target = tail_rows.columns == 'target'
    X_train, y_train = head_rows.loc[:, ~head_is_target], head_rows.loc[:, head_is_target]
    X_test, y_test = tail_rows.loc[:, ~tail_is_target], tail_rows.loc[:, tail_is_target]

    model = LinearRegression().fit(X_train, y_train)

    rdf = pd.DataFrame(y_test.values, columns=["y_test"])
    rdf['y_pred'] = model.predict(X_test)

    # sign of the value decides the class
    for raw_col, label_col in (('y_pred', 'y_pred_r'), ('y_test', 'y_test_r')):
        rdf[label_col] = np.where(rdf[raw_col] >= 0, 'Rise', 'Fall')

    return  rdf['y_test_r'].values, rdf['y_pred_r'].values
    

In [227]:
def RegressionClassicPredictThreeClass(df, train_ratio = 0.8, stay_boundary = 0.2 ):
    """ Linear regression on a chronological split, returned as Rise/Stay/Fall labels
    Parameters:
        df (pandas):dataframe; column 'target' is the value to predict, every other column is a feature
        train_ratio (float): share of leading rows used for training, the rest is the test part
        stay_boundary (float): values inside [-stay_boundary, stay_boundary] are labelled 'Stay'
    Returns:
        (y_test, y_pred): arrays of 'Rise'/'Stay'/'Fall' labels for the actual
                          and the predicted test values

    NOTE: when the computed test size is 0, `iloc[-0:]` selects every row.
    """
    def to_label(values):
        # precedence: 'Stay' over 'Fall' over 'Rise'; anything matching no
        # band (e.g. NaN) ends up as 'Rise'
        values = np.asarray(values)
        in_band = (values >= -(stay_boundary)) & (values <= stay_boundary)
        below = values < -(stay_boundary)
        return np.select([in_band, below], ['Stay', 'Fall'], default='Rise')

    n_train = int(len(df) * train_ratio)
    n_test = len(df) - n_train
    head_rows, tail_rows = df.iloc[:n_train], df.iloc[-n_test:]

    head_is_target = head_rows.columns == 'target'
    tail_is_target = tail_rows.columns == 'target'
    X_train, y_train = head_rows.loc[:, ~head_is_target], head_rows.loc[:, head_is_target]
    X_test, y_test = tail_rows.loc[:, ~tail_is_target], tail_rows.loc[:, tail_is_target]

    model = LinearRegression().fit(X_train, y_train)

    # in this frame the '*_r' columns hold the raw numbers, the plain ones the labels
    rdf = pd.DataFrame(y_test.values, columns=["y_test_r"])
    rdf['y_pred_r'] = model.predict(X_test)
    rdf['y_pred'] = to_label(rdf['y_pred_r'])
    rdf['y_test'] = to_label(rdf['y_test_r'])

    return  rdf['y_test'].values, rdf['y_pred'].values

In [None]:
def RegressionRollingPredictTwoClass(df, train_ratio = 0.8 ):
    """ Rolling one-step-ahead linear regression, returned as Rise/Fall labels
    Parameters:
        df (pandas):dataframe; column 'target' is the value to predict, every other column is a feature
        train_ratio (float): share of leading rows used for the first fit, the rest is the test part
    Returns:
        (y_test, y_pred): arrays of 'Rise'/'Fall' labels for the actual and the
                          predicted test values; a value >= 0 is 'Rise'

    For every test row the model is refitted on all rows that precede it
    (the initial training part plus the test rows already passed) and then
    predicts that single row.
    """
    train_size = int(len(df) * train_ratio)
    test_size = len(df) - train_size

    is_target = df.columns == 'target'
    features = df.loc[:, ~is_target]
    target = df.loc[:, is_target]

    y_pred = []
    for i in range(test_size):
        # Rows [0, seen) are exactly "training part + first i test rows";
        # slicing avoids DataFrame.append, which pandas 2.0 removed.
        seen = train_size + i
        reg = LinearRegression().fit(features.iloc[:seen], target.iloc[:seen])
        # predict() yields a single value for the single row; store it as a plain float
        y_pred.append(float(reg.predict(features.iloc[[seen]]).ravel()[0]))

    # iloc[train_size:] stays empty for an empty test part (iloc[-0:] would not)
    rdf = pd.DataFrame(target.iloc[train_size:].values, columns=["y_test"])
    rdf['y_pred'] = np.asarray(y_pred, dtype=float)

    rdf['y_pred_r'] = np.where(rdf['y_pred']>=0, 'Rise', 'Fall')
    rdf['y_test_r'] = np.where(rdf['y_test']>=0, 'Rise', 'Fall')

    return  rdf['y_test_r'].values, rdf['y_pred_r'].values
    

In [7]:
def RegressionRolling(df, train_ratio = 0.8 ):
    """ Rolling one-step-ahead linear regression
    Parameters:
        df (pandas):dataframe; column 'target' is the value to predict, every other column is a feature
        train_ratio (float): share of leading rows used for the first fit, the rest is the test part
    Returns:
        (y_test, y_pred): arrays with the actual test values and the predicted
                          values (one float per test row)

    For every test row the model is refitted on all rows that precede it
    (the initial training part plus the test rows already passed) and then
    predicts that single row.
    """
    train_size = int(len(df) * train_ratio)
    test_size = len(df) - train_size

    is_target = df.columns == 'target'
    features = df.loc[:, ~is_target]
    target = df.loc[:, is_target]

    y_pred = []
    for i in range(test_size):
        # Rows [0, seen) are exactly "training part + first i test rows";
        # slicing avoids DataFrame.append, which pandas 2.0 removed.
        seen = train_size + i
        reg = LinearRegression().fit(features.iloc[:seen], target.iloc[:seen])
        # predict() yields a single value for the single row; store it as a plain float
        y_pred.append(float(reg.predict(features.iloc[[seen]]).ravel()[0]))

    # iloc[train_size:] stays empty for an empty test part (iloc[-0:] would not)
    rdf = pd.DataFrame(target.iloc[train_size:].values, columns=["y_test"])
    rdf['y_pred'] = pd.Series(y_pred, index=rdf.index, dtype=float)

    return  rdf['y_test'].values, rdf['y_pred'].values

In [228]:
def RegressionRollingPredictThreeClass(df, train_ratio = 0.8 , stay_boundary = 0.2 ):
    """ Rolling one-step-ahead linear regression, returned as Rise/Stay/Fall labels
    Parameters:
        df (pandas):dataframe; column 'target' is the value to predict, every other column is a feature
        train_ratio (float): share of leading rows used for the first fit, the rest is the test part
        stay_boundary (float): values inside [-stay_boundary, stay_boundary] are labelled 'Stay'
    Returns:
        (y_test, y_pred): arrays of 'Rise'/'Stay'/'Fall' labels for the actual
                          and the predicted test values

    For every test row the model is refitted on all rows that precede it
    (the initial training part plus the test rows already passed) and then
    predicts that single row.
    """
    def to_label(values):
        # precedence: 'Stay' over 'Fall' over 'Rise'; anything matching no
        # band (e.g. NaN) ends up as 'Rise'
        values = np.asarray(values, dtype=float)
        in_band = (values >= -(stay_boundary)) & (values <= stay_boundary)
        below = values < -(stay_boundary)
        return np.select([in_band, below], ['Stay', 'Fall'], default='Rise')

    train_size = int(len(df) * train_ratio)
    test_size = len(df) - train_size

    is_target = df.columns == 'target'
    features = df.loc[:, ~is_target]
    target = df.loc[:, is_target]

    y_pred = []
    for i in range(test_size):
        # Rows [0, seen) are exactly "training part + first i test rows";
        # slicing avoids DataFrame.append, which pandas 2.0 removed.
        seen = train_size + i
        reg = LinearRegression().fit(features.iloc[:seen], target.iloc[:seen])
        # predict() yields a single value for the single row; store it as a plain float
        y_pred.append(float(reg.predict(features.iloc[[seen]]).ravel()[0]))

    # in this frame the '*_r' columns hold the raw numbers, the plain ones the labels;
    # iloc[train_size:] stays empty for an empty test part (iloc[-0:] would not)
    rdf = pd.DataFrame(target.iloc[train_size:].values, columns=["y_test_r"])
    rdf['y_pred_r'] = np.asarray(y_pred, dtype=float)

    rdf['y_pred'] = to_label(rdf['y_pred_r'])
    rdf['y_test'] = to_label(rdf['y_test_r'])

    return  rdf['y_test'].values, rdf['y_pred'].values
    

In [6]:
def DefArimaRegression(df, splitRatio = 0.8, p = 1, d = 1, q = 1):
    """ Rolling one-step-ahead ARIMA forecast over the test part, uses just the 'target' col from df
    Parameters:
        df (pandas):dataframe
        splitRatio (float): train/test split; the number of forecasts equals the test size
        p, d, q(int) : ARIMA(p, d, q)
    Returns:
        (y_test, y_pred): arrays with the actual test values and the forecast values

    For every test position a new ARIMA is fitted on all values that precede
    it (the training part plus the test values already passed) and one step
    is forecast.
    NOTE(review): fit(disp=-1) and the 3-value result of forecast() are kept
    exactly as this file called them; confirm they match the ARIMA class that
    Imports.ipynb provides.
    """
    trainshape = int(df.shape[0]*splitRatio)
    forecast = df.shape[0] - trainshape

    series = df.target
    y_test = series.iloc[trainshape:]
    y_pred = []

    ### Build Arimas
    for i in range(forecast):
        print(f'Arima {i} / {forecast}', end='\r')
        # first trainshape + i values == training part + first i test values;
        # slicing avoids Series.append, which pandas 2.0 removed
        inner_train = series.iloc[:trainshape + i]
        model = ARIMA(inner_train, order=(p, d, q))
        fitted = model.fit(disp=-1)
        fc, se, conf = fitted.forecast(1, alpha=0.05)  # 95% conf
        # one step was requested; keep the single forecast as a plain float
        y_pred.append(float(np.ravel(fc)[0]))

    return  y_test.values, np.asarray(y_pred, dtype=float)


In [224]:
def DefArima(df, splitRatio = 0.8, p = 1, d = 1, q = 1):
    """ Rolling one-step-ahead ARIMA forecast returned as Rise/Fall labels, uses just the 'target' col from df
    Parameters:
        df (pandas):dataframe
        splitRatio (float): train/test split; the number of forecasts equals the test size
        p, d, q(int) : ARIMA(p, d, q)
    Returns:
        (y_test, y_pred): arrays of 'Rise'/'Fall' labels for the actual and the
                          forecast test values; a value >= 0 is 'Rise'

    For every test position a new ARIMA is fitted on all values that precede
    it (the training part plus the test values already passed) and one step
    is forecast. A plot of training, actual and forecast values is shown.
    NOTE(review): fit(disp=-1) and the 3-value result of forecast() are kept
    exactly as this file called them; confirm they match the ARIMA class that
    Imports.ipynb provides.
    """
    trainshape = int(df.shape[0]*splitRatio)
    forecast = df.shape[0] - trainshape

    series = df.target
    train = series.iloc[:trainshape]
    y_test = series.iloc[trainshape:]
    y_pred = []

    ### Build Arimas
    for i in range(forecast):
        print(f'Arima {i} / {forecast}', end='\r')
        # first trainshape + i values == training part + first i test values;
        # slicing avoids Series.append, which pandas 2.0 removed
        inner_train = series.iloc[:trainshape + i]
        model = ARIMA(inner_train, order=(p, d, q))
        fitted = model.fit(disp=-1)
        fc, se, conf = fitted.forecast(1, alpha=0.05)  # 95% conf
        # one step was requested; keep the single forecast as a plain float
        y_pred.append(float(np.ravel(fc)[0]))

    # keep the original index so the plot lines up with the training series
    resdf = pd.DataFrame(
        {'target': y_test.values, 'prediction': np.asarray(y_pred, dtype=float)},
        index=y_test.index,
    )

    resdf['y_pred_r'] = np.where(resdf['prediction']>=0, 'Rise', 'Fall')
    resdf['y_test_r'] = np.where(resdf['target']>=0, 'Rise', 'Fall')

    plt.figure(figsize=(12,5), dpi=100)
    plt.plot(train, label='training')
    plt.plot(resdf.target, label='actual')
    plt.plot(resdf.prediction, label='forecast')
    plt.title('Arima - pôvodná vs predikovaná hodnota')
    plt.legend(loc='upper left', fontsize=8)
    plt.show()

    return  resdf['y_test_r'].values, resdf['y_pred_r'].values


In [2]:
def DefArima3Class(df, splitRatio = 0.8, p = 1, d = 1, q = 1, stay_boundary = 0.2 ):
    """ Rolling one-step-ahead ARIMA forecast returned as Rise/Stay/Fall labels, uses just the 'target' col from df
    Parameters:
        df (pandas):dataframe
        splitRatio (float): train/test split; the number of forecasts equals the test size
        p, d, q(int) : ARIMA(p, d, q)
        stay_boundary (float): values inside [-stay_boundary, stay_boundary] are labelled 'Stay'
    Returns:
        (y_test, y_pred): arrays of 'Rise'/'Stay'/'Fall' labels for the actual
                          and the forecast test values

    For every test position a new ARIMA is fitted on all values that precede
    it (the training part plus the test values already passed) and one step
    is forecast. A plot of training, actual and forecast values is shown.
    NOTE(review): fit(disp=-1) and the 3-value result of forecast() are kept
    exactly as this file called them; confirm they match the ARIMA class that
    Imports.ipynb provides.
    """
    def to_label(values):
        # precedence: 'Stay' over 'Fall' over 'Rise'; anything matching no
        # band (e.g. NaN) ends up as 'Rise'
        values = np.asarray(values, dtype=float)
        in_band = (values >= -(stay_boundary)) & (values <= stay_boundary)
        below = values < -(stay_boundary)
        return np.select([in_band, below], ['Stay', 'Fall'], default='Rise')

    trainshape = int(df.shape[0]*splitRatio)
    forecast = df.shape[0] - trainshape

    series = df.target
    train = series.iloc[:trainshape]
    y_test = series.iloc[trainshape:]
    y_pred = []

    ### Build Arimas
    for i in range(forecast):
        print(f'Arima {i} / {forecast}', end='\r')
        # first trainshape + i values == training part + first i test values;
        # slicing avoids Series.append, which pandas 2.0 removed
        inner_train = series.iloc[:trainshape + i]
        model = ARIMA(inner_train, order=(p, d, q))
        fitted = model.fit(disp=-1)
        fc, se, conf = fitted.forecast(1, alpha=0.05)  # 95% conf
        # one step was requested; keep the single forecast as a plain float
        y_pred.append(float(np.ravel(fc)[0]))

    # keep the original index so the plot lines up with the training series
    resdf = pd.DataFrame(
        {'target': y_test.values, 'prediction': np.asarray(y_pred, dtype=float)},
        index=y_test.index,
    )

    resdf['y_pred'] = to_label(resdf['prediction'])
    resdf['y_test'] = to_label(resdf['target'])

    plt.figure(figsize=(12,5), dpi=100)
    plt.plot(train, label='training')
    plt.plot(resdf.target, label='actual')
    plt.plot(resdf.prediction, label='forecast')
    plt.title('Arima - pôvodná vs predikovaná hodnota')
    plt.legend(loc='upper left', fontsize=8)
    plt.show()

    return  resdf['y_test'].values, resdf['y_pred'].values
