In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import sklearn.metrics as sm

# Lift pandas' display limits so DataFrames are shown with every column and row.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

# Wczytanie i obróbka danych

In [12]:
# Location of the combined dataset (CSV) in the project's GitHub repository.
url = "https://raw.githubusercontent.com/MarylaSosna/umwf_projekt/main/df_all.csv"

# Read the CSV, drop the "Unnamed: 0" column and parse "Date" as datetime.
df = (
    pd.read_csv(url)
    .drop(columns=["Unnamed: 0"])
    .assign(Date=lambda frame: pd.to_datetime(frame["Date"], format="%Y-%m-%d"))
)

In [13]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,Close,Comp,Date,ROR_month,ROR_qtr,ROR_half_year,ROR_year,Score,Oil,Gold,USD to Yuan,Category
0,6.163362,UBA,2004-03-24,0.083411,,,,0.741122,37.009998,417.200012,8.267201,8
1,7.158419,UBA,2005-06-15,0.087692,0.098211,0.03207,0.254842,0.803913,55.57,429.100006,8.266501,8
2,7.281594,UBA,2005-06-29,0.045381,0.13405,0.014216,0.223035,0.80662,57.259998,,8.266501,8
3,7.60522,UBA,2005-07-13,0.066313,0.228741,0.148893,0.219617,0.923381,60.009998,423.899994,8.266501,8
4,7.613519,UBA,2005-07-27,0.062501,0.249711,0.149382,0.305484,0.92327,59.110001,424.700012,8.1028,8


In [14]:
# Keep only complete rows. dropna() returns a new frame, so the raw `df`
# is left untouched and no explicit copy is needed beforehand.
df_clear = df.dropna()

# Show the shape before and after cleaning so the number of dropped rows is visible.
df.shape, df_clear.shape

# Regresja liniowa

W regresji liniowej za zmienne objaśniające przyjmujemy 'Score', 'Oil', 'Gold', 'USD to Yuan', 'Category' oraz ewentualnie 'Close', jeśli jej uwzględnienie wpływa pozytywnie na wyniki (w poniższym kodzie 'Close' jest uwzględniana dla horyzontów półrocznego i rocznego, a pomijana dla miesięcznego i kwartalnego).

Na próbę testową składa się 30% obserwacji.

Prognozujemy wysokość stopy zwrotu w odpowiednim horyzoncie (zmienne ROR_).

In [85]:
def choice_of_variables(y: str, data=None):
    """Return the explanatory-variable column names for target ``y``.

    Every column is kept except the target itself, any column whose name
    starts with "ROR_" (the rate-of-return targets), and the columns
    "Date" and "Comp".

    Parameters
    ----------
    y : str
        Name of the target column.
    data : pandas.DataFrame, optional
        Frame whose columns are inspected. Defaults to the notebook-level
        ``df_clear``.

    Returns
    -------
    list of str
        Selected column names, in their original column order. A fresh list
        is returned on every call, so callers may mutate it freely.
    """
    frame = df_clear if data is None else data
    # The exclusions are part of the filter instead of list.remove() calls, so
    # a frame without "Date"/"Comp" (or y being one of them) does not raise
    # ValueError.
    excluded = {y, "Date", "Comp"}
    return [
        col for col in frame.columns
        if col not in excluded and not col.startswith("ROR_")
    ]

In [86]:
# Sanity check: list the predictors selected for the annual horizon.
a = choice_of_variables(y="ROR_year")
print(a)

['Close', 'Score', 'Oil', 'Gold', 'USD to Yuan', 'Category']


In [87]:
def do_regression(y=None, x_cols=None, test_size=0.3, random_state=42):
    """Fit a linear regression on ``df_clear`` and predict the held-out part.

    The rows of the notebook-level ``df_clear`` are split into a training and
    a test part, the shapes of both parts are printed, a ``LinearRegression``
    is fitted on the training part and used to predict the test part.

    Parameters
    ----------
    y : str
        Name of the target column in ``df_clear``. Required; the ``None``
        default only exists so that the signature does not capture notebook
        globals at definition time.
    x_cols : list of str, optional
        Names of the explanatory columns. When omitted, the columns returned
        by ``choice_of_variables(y)`` are used.
    test_size : float, default 0.3
        Passed to ``train_test_split`` as ``test_size``.
    random_state : int, default 42
        Passed to ``train_test_split`` as ``random_state``.

    Returns
    -------
    tuple
        ``(y_test, y_pred)``: the held-out target values and the model's
        predictions for the same rows.

    Raises
    ------
    ValueError
        If ``y`` is not given.
    """
    # The former defaults `y=y, x_cols=x_cols` were evaluated when the `def`
    # statement ran, so the definition failed with NameError on a fresh kernel
    # (both globals are only assigned in later cells). Resolve them at call time.
    if y is None:
        raise ValueError("do_regression() requires the target column name `y`")
    if x_cols is None:
        x_cols = choice_of_variables(y)

    X_train, X_test, y_train, y_test = train_test_split(
        df_clear[x_cols],
        df_clear[y],
        test_size=test_size,
        random_state=random_state,
    )
    print("Dane treningowe:\n ", "X: ", X_train.shape, "y: ", y_train.shape,
          "\nDane tesowe: \n", "X: ", X_test.shape, "y: ", y_test.shape)

    regressor = LinearRegression()
    regressor.fit(X_train, y_train)
    y_pred = regressor.predict(X_test)
    return y_test, y_pred

In [88]:
def print_result(y_test, y_pred):
    """Print regression quality metrics for a set of predictions.

    Each metric from ``sklearn.metrics`` is evaluated on ``(y_test, y_pred)``
    and printed on its own line, rounded to the number of decimals listed
    in the table below.

    Parameters
    ----------
    y_test : array-like
        True target values.
    y_pred : array-like
        Predicted target values for the same observations.
    """
    # (label, metric function, number of decimals)
    report = (
        ("Mean absolute error =", sm.mean_absolute_error, 2),
        ("Mean squared error =", sm.mean_squared_error, 2),
        ("Median absolute error =", sm.median_absolute_error, 2),
        ("Explain variance score =", sm.explained_variance_score, 2),
        ("R2 score =", sm.r2_score, 4),
    )
    for label, metric, decimals in report:
        print(label, round(metric(y_test, y_pred), decimals))

## Horyzont miesięczny

In [122]:
# Choose the target and the explanatory variables for the monthly horizon;
# "Close" is left out of the predictors here.
y = "ROR_month"
x_cols = choice_of_variables(y=y)
del x_cols[x_cols.index("Close")]

In [123]:
# Fit the model and predict the monthly return on the test split.
y_test, y_pred = do_regression(y=y, x_cols=x_cols)

Dane treningowe:
  X:  (20020, 5) y:  (20020,) 
Dane tesowe: 
 X:  (8581, 5) y:  (8581,)


In [124]:
# Report the error metrics for the monthly horizon.
print_result(y_test=y_test, y_pred=y_pred)

Mean absolute error = 0.06
Mean squared error = 0.01
Median absolute error = 0.04
Explain variance score = 0.0
R2 score = 0.001


## Horyzont kwartalny

In [104]:
# Target and explanatory variables for the quarterly horizon;
# "Close" is left out of the predictors here.
y = "ROR_qtr"
x_cols = choice_of_variables(y=y)
del x_cols[x_cols.index("Close")]

In [105]:
# Fit the model and predict the quarterly return on the test split.
y_test, y_pred = do_regression(y=y, x_cols=x_cols)

Dane treningowe:
  X:  (20020, 5) y:  (20020,) 
Dane tesowe: 
 X:  (8581, 5) y:  (8581,)


In [106]:
# Report the error metrics for the quarterly horizon.
print_result(y_test=y_test, y_pred=y_pred)

Mean absolute error = 0.1
Mean squared error = 0.03
Median absolute error = 0.07
Explain variance score = 0.01
R2 score = 0.0121


## Horyzont półroczny

In [110]:
# Target and explanatory variables for the half-year horizon ("Close" is kept).
y = "ROR_half_year"
x_cols = choice_of_variables(y=y)

In [111]:
# Fit the model and predict the half-year return on the test split.
y_test, y_pred = do_regression(y=y, x_cols=x_cols)

Dane treningowe:
  X:  (20020, 6) y:  (20020,) 
Dane tesowe: 
 X:  (8581, 6) y:  (8581,)


In [112]:
# Report the error metrics for the half-year horizon.
print_result(y_test=y_test, y_pred=y_pred)

Mean absolute error = 0.13
Mean squared error = 0.04
Median absolute error = 0.1
Explain variance score = 0.03
R2 score = 0.0322


## Horyzont roczny

In [116]:
# Target and explanatory variables for the annual horizon ("Close" is kept).
y = "ROR_year"
x_cols = choice_of_variables(y=y)

In [117]:
# Fit the model and predict the annual return on the test split.
y_test, y_pred = do_regression(y=y, x_cols=x_cols)

Dane treningowe:
  X:  (20020, 6) y:  (20020,) 
Dane tesowe: 
 X:  (8581, 6) y:  (8581,)


In [118]:
# Report the error metrics for the annual horizon.
print_result(y_test=y_test, y_pred=y_pred)

Mean absolute error = 0.17
Mean squared error = 0.06
Median absolute error = 0.13
Explain variance score = 0.09
R2 score = 0.0861
