In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
%matplotlib inline
import yfinance
from utility import gain, plot_model_on_data, roi, prepare_data, print_eval

In [2]:
nasdaq = yfinance.Ticker("^IXIC")

In [3]:
data = nasdaq.history(start="2011-01-01", end="2021-12-31", actions=False)

In [4]:
open = data["Open"]
close = data["Close"]

features= {
    "Open": open
}

X = pd.DataFrame(features)
y = close

def split_data(X, y, split):
    lowerRange = X.index.year < split
    upperRange = (X.index.year > split-1) & (X.index.year < split+1)
    X_train = X.loc[lowerRange]
    y_train = y.loc[lowerRange]
    X_val = X.loc[upperRange]
    y_val = y.loc[upperRange]
    return X_train, X_val, y_train, y_val

X_train, X_val, y_train, y_val = split_data(X, y, 2012)

In [5]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train,y_train)
model.score(X_val, y_val)

0.9522082278765175

In [6]:
delta = close - open

In [7]:
features = {
    "Open": open,
    "OpenShift1": open.shift(1),
    "OpenShift2": open.shift(2),
    "OpenShift3": open.shift(3),
    "OpenShift4": open.shift(4),
}

In [8]:
X, y = prepare_data(features, delta)
X_train, X_val, y_train, y_val = split_data(X, y, 2021)
model = LinearRegression()
model.fit(X_train, y_train)
print_eval(X_val, y_val, model, open)

Gain: 58.50€
 ROI: 0.407%


## Aggiungiamo Features

In [9]:
def computeMACD (df, n_fast, n_slow, n_smooth):
    data = df['Open']
    
    fastEMA = data.ewm(span=n_fast, min_periods=n_slow).mean()
    slowEMA = data.ewm(span=n_slow, min_periods=n_slow).mean()
    MACD = pd.Series(fastEMA-slowEMA, name = 'MACD')
    df = df.join(MACD)
    
    return df

def calculateSMA(data, ndays): 
    SMA = pd.Series(data['Open'].rolling(ndays).mean(), name = 'SMA') 
    return SMA

def addSMA(first, second):
    sma1 = calculateSMA(data, first)
    sma2 = calculateSMA(data, second)
    
    sma = (np.array(sma2) - np.array(sma1)) / np.array(sma2)
    sma = pd.Series(data=sma, index=data.index)
    data[f"SMA{first}-{second}"] = sma
    features[f"SMA{first}-{second}"] = data[f"SMA{first}-{second}"]
    
    
def calculate_ema(prices, days, smoothing=2):
    ema = [sum(prices[:days]) / days]
    for price in prices[days:]:
        ema.append((price * (smoothing / (1 + days))) + ema[-1] * (1 - (smoothing / (1 + days))))
    return ema

def addEMA(first, second):
    ema1 = calculate_ema(data['Open'], first)
    ema2 = calculate_ema(data['Open'], second)

    ema = (np.array(ema2) - np.array(ema1[(second-first):])) / np.array(ema2)
    ema = pd.Series(data=ema, index=data[(second-1):].index)
    data[f"EMA{first}-{second}"] = ema
    features[f"EMA{first}-{second}"] = data[f"EMA{first}-{second}"]
    
def rsi(close, periods = 14): # il parametro close intende il valore di chiusura del giorno precedente ovvero quello di apertura del giorno corrente
    close_delta = close.diff()
    # Make two series: one for lower closes and one for higher closes
    up = close_delta.clip(lower=0)
    down = -1 * close_delta.clip(upper=0)
    ma_up = up.ewm(com = periods - 1, adjust=True, min_periods = periods).mean()
    ma_down = down.ewm(com = periods - 1, adjust=True, min_periods = periods).mean()
    rsi = ma_up / ma_down
    rsi = 100 - (100/(1 + rsi))
    return rsi

def gainForMFI(x):
    return ((x > 0) * x).sum()

def lossForMFI(x):
    return ((x < 0) * x).sum()

def mfi(high, low, close, volume, n=14): # il parametro close intende il valore di chiusura del giorno precedente ovvero quello di apertura del giorno corrente
    typical_price = (high + low + close)/3
    money_flow = typical_price * volume
    mf_sign = np.where(typical_price > typical_price.shift(1), 1, -1)
    signed_mf = money_flow * mf_sign
    mf_avg_gain = signed_mf.rolling(n).apply(gainForMFI, raw=True)
    mf_avg_loss = signed_mf.rolling(n).apply(lossForMFI, raw=True)
    return (100 - (100 / (1 + (mf_avg_gain / abs(mf_avg_loss))))).to_numpy()

def atr(high, low, close, n=14):
    tr = np.amax(np.vstack(((high - low).to_numpy(), (abs(high - close)).to_numpy(), (abs(low - close)).to_numpy())).T, axis=1)
    return pd.Series(tr).rolling(n).mean().to_numpy()

def forceIndex(data, ndays): 
    FI = pd.Series(data['Open'].diff(ndays) * data['Volume'], name = 'ForceIndex') 
    return FI


In [10]:
addEMA(5,20)
addEMA(8,15)
addEMA(20,50)

addSMA(5,20)
addSMA(8,15) #aggiungendo questo indice il guadagno diminuisce negli anni "positivi" e aumenta negli anni "negativi"
addSMA(20,50) #aggiungendo questo indice il guadagno diminuisce negli anni "positivi" e aumenta negli anni "negativi"

data["RSI"] = rsi(data["Open"])
data = computeMACD(data, 12, 26, 9)
data["MFI"] = mfi(data["High"], data["Low"], data["Open"], data["Volume"], 14)
data["ATR"] = atr(data["High"], data["Low"], data["Open"], 14)
data["FI-13"] = forceIndex(data, 13)
data["FI-50"] = forceIndex(data, 13)

features["MACD"] = data["MACD"]
features["RSI"] = data["RSI"] #abbassa i profitti (inutile per ora)
features["MFI"] = data["MFI"] #anche questa sembrerebbe inutile per ora
features["ATR"] = data["ATR"] #STRANO...
features["FI-13"] = data["FI-13"] #assolutamente NO! da risultati orribili (con regressione lineare)
features["FI-50"] = data["FI-50"] #assolutamente NO! da risultati orribili (con regressione lineare)

In [11]:
X, y = prepare_data(features, delta)
X_train, X_val, y_train, y_val = split_data(X, y, 2016)
model = LinearRegression()
model.fit(X_train, y_train)
print_eval(X_val, y_val, model, open)

Gain: 1032.82€
 ROI: 20.715%


Interessante da vedere https://medium.com/codex/this-python-library-will-help-you-get-stock-technical-indicators-in-one-line-of-code-c11ed2c8e45f

In [48]:
model_a = LinearRegression()
model_a.fit(X_train, y_train)
print_eval(X_val, y_val, model_a, open)

Gain: 1032.82€
 ROI: 20.715%


In [49]:
from sklearn.linear_model import Ridge

model_b = Ridge(alpha=10)
model_b.fit(X_train, y_train)
print_eval(X_val, y_val, model_b, open)

Gain: 716.44€
 ROI: 14.369%


In [53]:
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

model_c = Pipeline([
    ("scale", StandardScaler()),
    ("lr", LinearRegression())
])
model_c.fit(X_train, y_train)
print_eval(X_val, y_val, model_c, open)

Gain: 1032.82€
 ROI: 20.715%


Vediamo quanto influiscono i parametri relativi alle colonne

In [54]:
pd.DataFrame({
    "linear": model_a.coef_,
    "ridge": model_b.coef_,
    "scaled": model_c.named_steps["lr"].coef_
}, index=X.columns)

Unnamed: 0,linear,ridge,scaled
Open,-0.087697,-0.065245,-76.286958
OpenShift1,0.105303,0.097675,91.484772
OpenShift2,-0.029161,-0.039555,-25.306228
OpenShift3,-0.012912,-0.023388,-11.184627
OpenShift4,0.026191,0.030641,22.757247
EMA5-20,1648.46543,0.02787,21.946065
EMA8-15,-4889.39763,-0.023833,-31.896803
EMA20-50,479.925712,0.737086,6.318058
SMA5-20,533.587617,0.413188,9.582465
SMA8-15,-175.957028,-0.146954,-1.733501


Con la standardizzazione delle feature otteniamo valori su scale simili, che possiamo confrontare alla pari:
- ad es. negli altri modelli il coefficiente di EMA8-15
  è alto in valore assoluto perché i valori di tale variabile sono bassi 
 - nel modello con standardizzazione assumono invece più peso il numero di stanze (`RM`) e la distanza dagli uffici di collocamento (`DIS`)

In [55]:
X_train

Unnamed: 0_level_0,Open,OpenShift1,OpenShift2,OpenShift3,OpenShift4,EMA5-20,EMA8-15,EMA20-50,SMA5-20,SMA8-15,SMA20-50,MACD,RSI,MFI,ATR,FI-13,FI-50
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2011-03-14,2695.659912,2689.649902,2719.290039,2756.340088,2745.229980,0.012064,0.005678,-0.002447,0.017641,0.002947,-0.009129,-11.221753,40.465112,34.532266,38.312116,-1.061045e+11,-1.061045e+11
2011-03-15,2619.399902,2695.659912,2689.649902,2719.290039,2756.340088,0.019585,0.009015,0.000351,0.023363,0.005748,-0.005955,-20.066562,32.190392,34.783352,38.734270,-2.529506e+11,-2.529506e+11
2011-03-16,2652.919922,2619.399902,2695.659912,2689.649902,2719.290039,0.020614,0.009855,0.002103,0.028108,0.009882,-0.003285,-24.136462,38.174913,42.915266,40.754272,-2.576270e+11,-2.576270e+11
2011-03-17,2656.080078,2652.919922,2619.399902,2695.659912,2689.649902,0.020323,0.010049,0.003557,0.029898,0.014453,-0.000706,-26.800452,38.723970,43.006928,40.541417,-2.696374e+11,-2.696374e+11
2011-03-18,2665.540039,2656.080078,2652.919922,2619.399902,2695.659912,0.018680,0.009563,0.004616,0.028992,0.016020,0.001978,-27.837358,40.429464,42.776558,40.182861,-2.494355e+11,-2.494355e+11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2015-12-24,5046.189941,5025.549805,4988.680176,4957.529785,4982.580078,0.002604,0.001884,-0.003935,0.010491,0.002102,-0.003205,-9.005780,51.242905,43.679966,60.837856,-6.593071e+10,-6.593071e+10
2015-12-28,5032.290039,5046.189941,5025.549805,4988.680176,4957.529785,0.001719,0.001373,-0.003784,0.007643,0.000833,-0.001389,-7.625737,50.023784,37.270356,59.741455,-2.389312e+10,-2.389312e+10
2015-12-29,5066.520020,5032.290039,5046.189941,5025.549805,4988.680176,-0.000489,0.000326,-0.004020,0.002604,-0.000958,0.000105,-3.727007,52.989872,43.370579,58.677176,-1.478921e+10,-1.478921e+10
2015-12-30,5101.180176,5066.520020,5032.290039,5046.189941,5025.549805,-0.003439,-0.001063,-0.004596,-0.002139,-0.000633,0.001289,2.134940,55.847369,49.690186,53.707171,9.341551e+10,9.341551e+10
