In [1]:
#Import Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures


In [3]:
class DataPreprocessing():
    def __init__(self):
        self.dataframe = None
        self.X = None
        self.y1 = None
        self.y2 = None
    def read_from_csv(self):
        df=pd.read_csv('datavn30fm.csv', index_col=False)
        self.dataframe = df
    def set_attributes_and_output(self):
        self.X = self.dataframe.values[:-1,1:-2]
        self.y2 = self.dataframe.values[1:, -1]
        self.y1 = self.dataframe.values[1:, -5]
    def final_train_test_data(self,attributes_list=[1,2,3,4,5], test_size=0.2):
        return \
            train_test_split(
                self.X[:, attributes_list],
                self.y1,
                shuffle=False,
                test_size=test_size)


In [18]:
dp = DataPreprocessing()
dp.read_from_csv()
dp.set_attributes_and_output()

X_train, X_test, y_train, y_test = \
    dp.final_train_test_data(attributes_list=[1,2,3,5], test_size=0.2)
print('Shape of X_train: ', X_train.shape)
print('Shape of y_train: ', y_train.shape)
print('Shape of X_test: ', X_test.shape)
print('Shape of y_test: ', y_test.shape)
print(y_test[0:5])


Shape of X_train:  (45969, 4)
Shape of y_train:  (45969,)
Shape of X_test:  (11493, 4)
Shape of y_test:  (11493,)
[1491.8 1490.8 1491.3 1491.1 1491.9]


In [5]:
from sklearn.metrics import mean_squared_error,precision_score

class BaseClassRegressionAnalysis():
    def __init__(self):
        # Initialize a regressor, which will handle the LinearRegression model 
        self.regressor = LinearRegression()
    
    def fit(self, X, y):
        # The regressor learn from the training data with input X and output y
        self.regressor.fit(X, y)
        
    def predict(self, X):
        # The regressor predict the result with input X (after being trained)
        # The output has the same size as output y
        return self.regressor.predict(X)
    
    def mean_square_error(self, y_real, y_predict):
        # Compare the 2 output vectors: real output and prediction, using mean square error
        return mean_squared_error(y_real, y_predict)
    def precision_scored(self, y_real, y_predict):
        # Return the value of the precision scores
        return precision_score(y_real, y_predict,zero_division=0)
    
    def visualize_prediction(self, y_real, y_predict):
        # Visualize the 2 output vectors: real output and prediction with each instances 
        x = np.array([i for i in range(len(y_predict))]) # Numbering the instances from 0 for x-axis of the plot
        plt.plot(x, y_real, label = 'Real')
        plt.plot(x, y_predict, label = 'Predict')
        plt.xlabel('Instances')
        plt.ylabel('Prediction & Real Prices')
        plt.legend()
        plt.show()

In [6]:
class LinearRegressionAnalysis(BaseClassRegressionAnalysis):
    pass
     

In [7]:
class PolynomialRegressionAnalysis(BaseClassRegressionAnalysis):
    def __init__(self, degree):
        super().__init__()
        self.degree = degree
    
    def __poly_transform(self, X):
        poly = PolynomialFeatures(degree=self.degree)
        Xt = poly.fit_transform(X)
        return Xt
    
    def fit(self, X, y, degree=2):
        Xt = self.__poly_transform(X)
        super().fit(Xt, y)
        
    def predict(self, X):
        Xt = self.__poly_transform(X)
        return super().predict(Xt)

In [26]:
# Initialize a regressor (a model) to learn from data
lr = LinearRegressionAnalysis()

# The regressor will learn from the input and output of training data
lr.fit(X_train, y_train)


# After learning from training data, the model will make a prediction based on input testing data
y_pred = [i for i in lr.predict(X_test)]
trend_pred = [int(y_pred[i]>y_pred[i+1]) for i in range(len(y_pred)-1)]
trend_test = [int(y_test[i]>y_test[i+1]) for i in range(len(y_test)-1)]
print(trend_pred[:10])
print(trend_test[:10])
score =0
for i in range(len(y_test)-1):
        if trend_test[i] == trend_pred[i]:
            score +=1
print(score/len(y_test)*2-1)

# Comparision and visualization
print('First 10 instances prediction :     ', np.array([i for i in y_pred[:10]]))
print('Real output of first 10 instances : ', y_test[:10])
#print('Precision of prediction : ', lr.precision_scored(y_test,y_pred))
#lr.visualize_prediction(y1_test, y1_pred)


[0, 1, 0, 1, 0, 1, 1, 0, 0, 1]
[1, 0, 1, 0, 1, 1, 0, 0, 1, 0]
-0.01470460280170538
First 10 instances prediction :      [1491.65463821 1491.84859792 1490.86261511 1491.3620427  1491.15461599
 1491.93587528 1490.95458472 1489.20540279 1491.31918891 1492.41804301]
Real output of first 10 instances :  [1491.8 1490.8 1491.3 1491.1 1491.9 1490.9 1489.2 1491.3 1492.4 1491.7]
