In [269]:
#Import Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures


In [258]:
class DataPreprocessing():
    """Load a CSV and slice it into next-row regression inputs and targets.

    Usage is two-step: ``read_from_csv`` fills ``dataframe``, then
    ``set_attributes_and_output`` derives ``X``, ``y1`` and ``y2`` from it.
    ``final_train_test_data`` finally returns an unshuffled train/test split
    of selected ``X`` columns against ``y1``.
    """

    # Columns of ``X`` used when the caller does not pick any. Kept as an
    # immutable tuple so the default can never be mutated between calls.
    DEFAULT_ATTRIBUTES = (1, 2, 3, 4, 5)

    def __init__(self):
        self.dataframe = None  # raw frame, set by read_from_csv
        self.X = None          # feature rows: every row except the last
        self.y1 = None         # fifth-from-last column, taken one row ahead of X
        self.y2 = None         # last column, taken one row ahead of X

    def read_from_csv(self, path='datavn30fm.csv'):
        """Read ``path`` into ``self.dataframe``.

        Parameters
        ----------
        path : str or path-like, default 'datavn30fm.csv'
            CSV file to load. The default preserves the original hard-coded
            file name, so existing zero-argument calls behave as before.
        """
        self.dataframe = pd.read_csv(path, index_col=False)

    def set_attributes_and_output(self):
        """Derive ``X``, ``y1`` and ``y2`` from the loaded frame.

        ``X`` takes columns ``1:-2`` of every row but the last; the targets
        take one column of every row but the first, so row *i* of ``X`` is
        paired with the target values found in row *i + 1* of the frame.
        """
        values = self.dataframe.values
        self.X = values[:-1, 1:-2]
        self.y2 = values[1:, -1]
        self.y1 = values[1:, -5]

    def final_train_test_data(self, attributes_list=None, test_size=0.2):
        """Split the selected ``X`` columns and ``y1`` into train/test parts.

        Parameters
        ----------
        attributes_list : sequence of int, optional
            Column indices into ``X``. Defaults to ``DEFAULT_ATTRIBUTES``
            (``1, 2, 3, 4, 5``), matching the previous default.
        test_size : float, default 0.2
            Forwarded to ``train_test_split``.

        Returns
        -------
        list
            ``[X_train, X_test, y_train, y_test]`` as returned by
            ``train_test_split``; rows keep their original order because the
            split is made with ``shuffle=False``.
        """
        if attributes_list is None:
            attributes_list = self.DEFAULT_ATTRIBUTES
        # A list (not a tuple) makes the column selection unambiguous for NumPy.
        columns = list(attributes_list)
        return train_test_split(
            self.X[:, columns],
            self.y1,
            shuffle=False,
            test_size=test_size,
        )

        
                

In [261]:
# Load the CSV and derive the feature matrix / target vectors.
dp = DataPreprocessing()
dp.read_from_csv()
dp.set_attributes_and_output()

# Unshuffled 80/20 split using feature columns 1, 2, 3 and 5 of X.
X_train, X_test, y_train, y_test = dp.final_train_test_data(
    attributes_list=[1, 2, 3, 5],
    test_size=0.2,
)

# Report the size of each part, then peek at the first five training rows.
print('Shape of X_train: ', X_train.shape)
print('Shape of y_train: ', y_train.shape)
print('Shape of X_test: ', X_test.shape)
print('Shape of y_test: ', y_test.shape)
print(X_train[:5])


Shape of X_train:  (45969, 4)
Shape of y_train:  (45969,)
Shape of X_test:  (11493, 4)
Shape of y_test:  (11493,)
[[943.6 942.9 943.1 271.7999999999588]
 [943.5 942.9 943.3 -132.29999999987967]
 [943.3 942.6 943.1 -181.04999999997256]
 [943.1 942.3 942.6 119.6000000000272]
 [943.7 942.4 943.7 -1147.2500000001603]]


In [263]:
from sklearn.metrics import mean_squared_error,precision_score

class BaseClassRegressionAnalysis():
    """Thin wrapper around a ``LinearRegression`` estimator.

    Bundles training, prediction, two scoring helpers and a comparison plot
    behind one object. Subclasses can override ``fit``/``predict`` to
    pre-process inputs before they reach ``self.regressor``.
    """

    def __init__(self):
        # The wrapped estimator that does the actual learning.
        self.regressor = LinearRegression()

    def fit(self, X, y):
        """Train the wrapped regressor on inputs ``X`` and outputs ``y``."""
        self.regressor.fit(X, y)

    def predict(self, X):
        """Return the trained regressor's predictions for inputs ``X``."""
        return self.regressor.predict(X)

    def mean_square_error(self, y_real, y_predict):
        """Return the mean squared error between real and predicted outputs."""
        return mean_squared_error(y_real, y_predict)

    def precision_scored(self, y_real, y_predict):
        """Return the precision of ``y_predict`` against ``y_real``.

        ``zero_division=0`` is forwarded to ``precision_score``.
        """
        return precision_score(y_real, y_predict, zero_division=0)

    def visualize_prediction(self, y_real, y_predict):
        """Plot real and predicted outputs against the instance number."""
        # Instances are numbered from 0 along the x-axis.
        instance_idx = np.arange(len(y_predict))
        for series, legend in ((y_real, 'Real'), (y_predict, 'Predict')):
            plt.plot(instance_idx, series, label=legend)
        plt.xlabel('Instances')
        plt.ylabel('Prediction & Real Prices')
        plt.legend()
        plt.show()

In [264]:
class LinearRegressionAnalysis(BaseClassRegressionAnalysis):
    """Plain linear regression: inherits every behaviour from the base class unchanged."""
     

In [266]:
class PolynomialRegressionAnalysis(BaseClassRegressionAnalysis):
    """Linear regression on polynomial feature expansions of the inputs.

    The feature expansion is fitted once, on the training inputs, and the
    same fitted transformer is reused at prediction time.
    """

    def __init__(self, degree):
        """Create the model.

        Parameters
        ----------
        degree : int
            Degree of the polynomial feature expansion.
        """
        super().__init__()
        self.degree = degree
        # Unfitted until ``fit`` runs; calling ``predict`` first therefore
        # fails inside scikit-learn rather than silently fitting on test data.
        self._poly = PolynomialFeatures(degree=degree)

    def fit(self, X, y, degree=None):
        """Fit the feature expansion on ``X`` and train the regressor.

        Parameters
        ----------
        X, y
            Training inputs and outputs.
        degree : int, optional
            When given, replaces the degree chosen at construction time.
            (Previously this argument was accepted but silently ignored.)
        """
        if degree is not None:
            self.degree = degree
        # Rebuild so that a changed ``self.degree`` always takes effect.
        self._poly = PolynomialFeatures(degree=self.degree)
        Xt = self._poly.fit_transform(X)
        super().fit(Xt, y)

    def predict(self, X):
        """Expand ``X`` with the transformer fitted in ``fit`` and predict."""
        Xt = self._poly.transform(X)
        return super().predict(Xt)

In [267]:
# Train a plain linear regressor on the training split.
lr = LinearRegressionAnalysis()
lr.fit(X_train, y_train)

# Turn both the model outputs and the real targets into 0/1 labels
# using a 0.5 cut-off.
y_pred = [1 if value > 0.5 else 0 for value in lr.predict(X_test)]
y_test = [1 if value > 0.5 else 0 for value in y_test]

# Comparison of predicted and real labels.
print('First 10 instances prediction :     ', np.array(y_pred[:10]))
print('Real output of first 10 instances : ', y_test[:10])
print('Precision of prediction : ', lr.precision_scored(y_test, y_pred))

# Number of matching labels, reported as accuracy rescaled from [0, 1] to [-1, 1].
score = sum(1 for actual, guess in zip(y_test, y_pred) if actual == guess)
print('Score : ', score/len(y_test)*2-1)


First 10 instances prediction :      [1 1 1 1 1 1 1 1 1 1]
Real output of first 10 instances :  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Precision of prediction :  1.0
Score :  1.0


In [236]:
# Class balance of the thresholded predictions: how many 0s vs. 1s.
pd.DataFrame(y_pred).value_counts()


1    10744
0      749
dtype: int64

In [237]:
# Class balance of the real labels, for comparison with the predictions.
pd.DataFrame(y_test).value_counts()

0    6033
1    5460
dtype: int64

In [245]:
# Sweep the decision threshold over 0.00 .. 0.99 and record
#   * good_p            - threshold with the largest |2 * accuracy - 1|
#   * precision_score_p - threshold with the highest precision
# NOTE(review): both thresholds are selected on the test split itself, so the
# reported numbers are optimistic - consider tuning on a validation split.

# The raw model outputs and the binarised labels do not depend on the
# threshold, so they are computed once instead of on every iteration.
raw_pred = lr.predict(X_test)
y_test = [1 if x > 0.5 else 0 for x in y_test]
T = len(y_test)

maxscore = 0
good_p = 0
precision_score_p = 0
max_precision_score = 0
for p in range(0, 100, 1):
    threshold = p / 100
    y_pred = [1 if value > threshold else 0 for value in raw_pred]

    # Number of labels the thresholded prediction gets right.
    score = sum(1 for actual, guess in zip(y_test, y_pred) if actual == guess)
    centred_score = abs(score / T * 2 - 1)
    if maxscore < centred_score:
        maxscore = centred_score
        good_p = threshold

    # Precision is evaluated once per threshold and reused below.
    precision = lr.precision_scored(y_test, y_pred)
    if max_precision_score < precision:
        max_precision_score = precision
        precision_score_p = threshold
print(maxscore, good_p, max_precision_score, precision_score_p)

# How many instances are predicted positive at the best-precision threshold.
y_pred = [1 if value > precision_score_p else 0 for value in raw_pred]
count = y_pred.count(1)
print(count)

# NOTE(review): -0.18 lies outside the swept range and is not derived from the
# results above - confirm where it comes from or replace it with a named constant.
y_pred = [1 if value > -0.18 else 0 for value in raw_pred]


0.04985643435134435 0.0 0.5 0.54
32


In [246]:
# Class balance of the predictions after the last re-thresholding of y_pred.
pd.DataFrame(y_pred).value_counts()

1    11493
dtype: int64

In [247]:
# Class balance of the real labels (same check as the earlier y_test cell).
pd.DataFrame(y_test).value_counts()

0    6033
1    5460
dtype: int64

In [242]:
def backtest(data, model, predictors, start=1000, step=750,
             target="Target", threshold=0.5):
    """Walk-forward evaluation of a probabilistic classifier.

    For each split point ``i`` in ``range(start, len(data), step)`` the model
    is re-fitted on rows ``[0, i)`` and used to label rows ``[i, i + step)``.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows in evaluation order; must contain ``predictors`` and ``target``.
    model : object
        Anything exposing ``fit(X, y)`` and ``predict_proba(X)`` whose second
        output column is the probability of the positive class.
    predictors : list of str
        Names of the feature columns.
    start : int, default 1000
        Number of rows in the first training window.
    step : int, default 750
        Number of rows in each test window.
    target : str, default "Target"
        Name of the label column in ``data``.
    threshold : float, default 0.5
        Probabilities strictly above this value are labelled 1, others 0.

    Returns
    -------
    pandas.DataFrame
        Columns ``"Target"`` (real labels) and ``"Predictions"`` (0.0 / 1.0),
        indexed like the tested rows of ``data``.

    Raises
    ------
    ValueError
        If ``start`` leaves no rows to test, so no window can be evaluated.
    """
    n_rows = data.shape[0]
    folds = []
    for split_at in range(start, n_rows, step):
        # Slices are only read from, so no defensive copies are needed.
        train = data.iloc[:split_at]
        test = data.iloc[split_at:split_at + step]

        # Re-fit the supplied model on everything seen so far.
        model.fit(train[predictors], train[target])

        # Probability of the positive class, turned into hard 0/1 labels.
        positive_proba = pd.Series(
            model.predict_proba(test[predictors])[:, 1], index=test.index
        )
        preds = (positive_proba > threshold).astype(float)

        # Real labels and predictions side by side for this window.
        folds.append(
            pd.concat({"Target": test[target], "Predictions": preds}, axis=1)
        )

    if not folds:
        raise ValueError(
            f"backtest produced no test windows: start={start} but data has "
            f"only {n_rows} rows"
        )
    return pd.concat(folds)

In [224]:
#predictions = backtest(dp,LinearRegression,[],start = 1000,step = 750)

AttributeError: 'DataPreprocessing' object has no attribute 'shape'

In [209]:

# Disabled experiment: the threshold sweep for the polynomial model `pr`,
# kept as a bare string literal so none of it executes (the cell only echoes
# the text back as its output).
# NOTE(review): inside the text, `T=len(y1_test)` reads `y1_test` before the
# loop assigns it, and `pr` is only created in another disabled cell - both
# need fixing before this is re-enabled. Consider deleting the cell instead.
'''maxscore =0
good_p=0
precision_score_p=0
max_precision_score = 0
T=len(y1_test)
for p in range(0,101,1):
    y1_pred = [1 if i>p/100 else 0 for i in pr.predict(X_test)]
    y1_test = [int(x) for x in y_test]
    score =0
    for i in range(len(y1_test)):
        if y1_test[i] == y1_pred[i]:
            score +=1
    if maxscore < abs(score/T*2-1):
        maxscore = abs(score/T*2-1)
        good_p=p/100
    if max_precision_score < pr.precision_scored(y1_test,y1_pred):
        max_precision_score = pr.precision_scored(y1_test,y1_pred)
        precision_score_p=p/100
print(maxscore,good_p,max_precision_score,precision_score_p)
y1_pred = [1 if i>precision_score_p else 0 for i in pr.predict(X_test)]
count=0
for i in y1_pred:
    if i ==1:
        count+=1
print(count)
'''

'maxscore =0\ngood_p=0\nprecision_score_p=0\nmax_precision_score = 0\nT=len(y1_test)\nfor p in range(0,101,1):\n    y1_pred = [1 if i>p/100 else 0 for i in pr.predict(X_test)]\n    y1_test = [int(x) for x in y_test]\n    score =0\n    for i in range(len(y1_test)):\n        if y1_test[i] == y1_pred[i]:\n            score +=1\n    if maxscore < abs(score/T*2-1):\n        maxscore = abs(score/T*2-1)\n        good_p=p/100\n    if max_precision_score < pr.precision_scored(y1_test,y1_pred):\n        max_precision_score = pr.precision_scored(y1_test,y1_pred)\n        precision_score_p=p/100\nprint(maxscore,good_p,max_precision_score,precision_score_p)\ny1_pred = [1 if i>precision_score_p else 0 for i in pr.predict(X_test)]\ncount=0\nfor i in y1_pred:\n    if i ==1:\n        count+=1\nprint(count)\n'

In [210]:
# Initialize a regressor (a model) to learn from data
# Disabled experiment: everything below is a bare string literal, so none of
# it executes (the cell only echoes the text back as its output).
# NOTE(review): the text builds `y_pred` / `y_test1` but its scoring loop reads
# `y1_pred` / `y1_test` - the names look mismatched; fix before re-enabling.
'''
pr = PolynomialRegressionAnalysis(2)

# The regressor will learn from the input and output of training data
X_train, X_test, y_train, y_test = \
    dp.final_train_test_data(attributes_list=[2,4,5], test_size=0.2)
pr.fit(X_train, y_train)

# After learning from training data, the model will make a prediction based on input testing data
y_pred = [1 if i>0 else 0 for i in pr.predict(X_test)]
y_test1 = [int(x) for x in y_test]

# Comparision and visualization
print('First 10 instances prediction :', np.array([i for i in y_pred[:10]]))
print('Real output of first 10 instances :', y_test[:10])
print('Precision of prediction : ', pr.precision_scored(y_test1,y_pred))
score =0
for i in range(len(y1_test)):
    if y1_test[i] == y1_pred[i]:
        score +=1
print('Score : ', score/len(y1_test)*2-1)
'''

"\npr = PolynomialRegressionAnalysis(2)\n\n# The regressor will learn from the input and output of training data\nX_train, X_test, y_train, y_test =     dp.final_train_test_data(attributes_list=[2,4,5], test_size=0.2)\npr.fit(X_train, y_train)\n\n# After learning from training data, the model will make a prediction based on input testing data\ny_pred = [1 if i>0 else 0 for i in pr.predict(X_test)]\ny_test1 = [int(x) for x in y_test]\n\n# Comparision and visualization\nprint('First 10 instances prediction :', np.array([i for i in y_pred[:10]]))\nprint('Real output of first 10 instances :', y_test[:10])\nprint('Precision of prediction : ', pr.precision_scored(y_test1,y_pred))\nscore =0\nfor i in range(len(y1_test)):\n    if y1_test[i] == y1_pred[i]:\n        score +=1\nprint('Score : ', score/len(y1_test)*2-1)\n"