In [1]:
# pip install seaborn scikit-learn scipy matplotlib yellowbrick statsmodels pandas_datareader ipywidgets

In [2]:
from sklearn import *
from ipywidgets import *
from decimal import Decimal
from matplotlib import style
from scipy.stats import f, chi2, t, norm
from yellowbrick.regressor import ResidualsPlot
import pandas_datareader.data as web
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
import datetime as dt
import pandas as pd
import numpy as np
import math
from sklearn.metrics import mean_squared_error, r2_score
class color:
    """ANSI escape sequences for colouring and emphasising printed text.

    Prefix a string with one of these attributes and append ``color.END``
    to switch the formatting off again.
    """

    # Reset all formatting
    END = '\033[0m'

    # Text emphasis
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Foreground colours
    DARKCYAN = '\033[36m'
    RED = '\033[91m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    BLUE = '\033[94m'
    PURPLE = '\033[95m'
    CYAN = '\033[96m'

# Ticker symbol -> human-readable index name (insertion order is the display order)
ts = dict([
    ('^DJI', 'Dow Jones Industrial Average (U.S.)'),
    ('^GSPC', 'S&P 500 (U.S.)'),
    ('^IXIC', 'NASDAQ Composite (U.S.)'),
    ('000001.SS', 'SSE Composite Index (China)'),
    ('^HSI', 'HANG SENG INDEX (China)'),
    ('^N225', 'Nikkei 225 (Japan)'),
    ('^GSPTSE', 'S&P/TSX Composite index (Canada)'),
    ('^N100', 'EURONEXT 100 (Europe)'),
])
# Reverse lookup: human-readable index name -> ticker symbol
ts_ = dict(zip(ts.values(), ts.keys()))

def get_log_return(symbol, start, end):
    """Download prices for ``symbol`` and return the daily open-to-close log-return.

    Parameters
    ----------
    symbol : ticker symbol passed to the 'yahoo' data reader.
    start, end : bounds of the requested period; ``start`` must be earlier than ``end``.

    Returns
    -------
    The series ``log(Close / Open)``, named ``'LogReturn'``.

    Raises
    ------
    ValueError
        If ``start`` is not strictly before ``end``.
    """
    if start >= end:
        raise ValueError('Invalid time period!')
    prices = web.DataReader(symbol, 'yahoo', start, end)
    open_to_close = prices['Close'] / prices['Open']
    return np.log(open_to_close).rename('LogReturn')

def mu_s_n(data) -> tuple:
    """Return ``(mean, variance, size)`` of a sample.

    The variance is the unbiased sample variance (``ddof=1``). The callers
    plug it into a t-based interval for the mean, a ``(n - 1) * s / chi2``
    interval for the variance and a standard error of the form ``s / n``,
    all of which assume the ``n - 1`` denominator; NumPy's default
    ``ddof=0`` would bias those results.

    Parameters
    ----------
    data : sequence or array-like of numbers.

    Returns
    -------
    tuple
        ``(mean, sample variance, number of observations)``.
    """
    return np.mean(data), np.var(data, ddof=1), len(data)

def sci(n):
    """Format ``n`` in scientific notation with two decimals, e.g. ``'1.23E+04'``.

    ``n`` is first converted through ``Decimal``, so anything ``Decimal``
    accepts (numbers, numeric strings) is allowed.
    """
    as_float = float(Decimal(n))
    return '{:.2E}'.format(as_float)
    
def draw_hist_normal(symbol, normal, logReturn, mu, s, start, end):
    """Draw a density histogram of the log-returns and a normal Q-Q plot.

    Parameters
    ----------
    symbol : ticker symbol; used in the legend/title and as a key into ``ts``
        for the footnote.
    normal : if truthy, overlay the normal density with mean ``mu`` and
        variance ``s`` on the histogram.
    logReturn : pandas Series of log-returns.
    mu, s : mean and variance used for the overlaid normal curve.
    start, end : period bounds; only their ``.date()`` is used, in the title.
    """
    style.use('ggplot')

    # Bin width from the Freedman-Diaconis rule: 2 * IQR / n^(1/3).
    iqr = logReturn.quantile(q=0.75) - logReturn.quantile(q=0.25)
    width = 2 * iqr / (float(logReturn.count()) ** (1 / 3))
    span = float(logReturn.max()) - float(logReturn.min())
    if width > 0 and span > 0:
        bins_ = math.ceil(span / width)
    else:
        # Degenerate sample (zero IQR or zero range): the rule above would
        # divide by zero, so fall back to a fixed bin count.
        bins_ = 10

    # Histogram normalised to a density so the normal pdf can share its axis.
    plt.hist([logReturn], bins=bins_, density=True, color=['steelblue'], edgecolor='white')
    plt.ylabel("Density")  # density=True plots a density, not raw counts
    plt.xlabel("Log-return")
    plt.legend([symbol])
    plt.title(f'Log-return for {symbol} during {start.date()} to {end.date()}')
    if normal:
        x = np.linspace(logReturn.min(), logReturn.max(), 100)
        plt.plot(x, norm.pdf(x, mu, math.sqrt(s)))

    # qqplot opens its own figure; the title and footnote below apply to it.
    sm.qqplot(logReturn, line='r')
    plt.title('Normal Probability Plot')
    txt = "Note: " + ts[symbol]
    plt.figtext(.5, -.05, txt, ha='center', fontsize=12)
    plt.show()
    
def confidence_interval(symbol, confidence, logReturn, mu, s, n):
    """Print two-sided confidence intervals for the mean and the variance.

    Parameters
    ----------
    symbol, logReturn : unused; kept so existing callers keep working.
    confidence : confidence level, e.g. ``0.95``.
    mu : sample mean.
    s : sample variance.
    n : sample size.

    The mean interval is ``mu +/- t(1 - alpha, n - 1) * sqrt(s / n)`` and the
    variance interval is ``[(n - 1) s / chi2(1 - alpha), (n - 1) s / chi2(alpha)]``.
    Both quantiles use ``n - 1`` degrees of freedom.
    """
    df = n - 1
    alpha = (1 - confidence) / 2  # probability mass in each tail

    half_width = t.ppf(1 - alpha, df) * math.sqrt(s / n)
    mean_ci = [sci(mu - half_width), sci(mu + half_width)]

    # The chi-square quantiles must use the sample's own degrees of freedom,
    # not a fixed value, or the interval is wrong whenever n != 100.
    var_ci = [sci(df * s / chi2.ppf(1 - alpha, df)), sci(df * s / chi2.ppf(alpha, df))]

    print(color.BOLD + "Confidence interval for mean:" + color.END)
    print(color.GREEN + f"{mean_ci}" + color.END)
    print(color.BOLD + "Confidence interval for variance:" + color.END)
    print(color.GREEN + f"{var_ci}" + color.END)
    
    
def test_mu_equality(symbol_1, symbol_2, mu_1, s_1, n_1, mu_2, s_2, n_2, confidence):
    test = (mu_1-mu_2) / np.sqrt(s_1/n_1 + s_2/n_2)
    z = -norm.ppf((1-confidence)/2)
    print(f"For indexes {symbol_1}, {symbol_2}:")
    if test>-z and test<z:
        print(color.GREEN + f"H0: beta1 = beta2, Do not reject H0 at the confidence level of {confidence}." + color.END)
    else:
        print(color.GREEN + f"H0: beta1 = beta2, Reject H0 at the confidence level of {confidence}.\n\n" + color.END)
        
# Integrated functions
def one_stock(Symbol, start=dt.datetime(2015,11,1), end=dt.datetime(2019,11,1), normal=True, confidence=0.95):
    """Run the single-index analysis: plots, confidence intervals and a time regression.

    Parameters
    ----------
    Symbol : human-readable index name (a key of ``ts_``).
    start, end : period bounds; ``start`` must be earlier than ``end``.
    normal : forwarded to ``draw_hist_normal`` to toggle the normal overlay.
    confidence : confidence level for the printed intervals.

    Raises
    ------
    ValueError
        If ``start`` is not strictly before ``end``.
    """
    if start >= end:
        raise ValueError('Invalid time period!')
    symbol = ts_[Symbol]
    logReturn = get_log_return(symbol, start, end)
    mu, s, n = mu_s_n(logReturn)
    draw_hist_normal(symbol, normal, logReturn, mu, s, start, end)
    print(color.BOLD + "- Confidence interval for mean and variance:" + color.END)
    confidence_interval(symbol, confidence, logReturn, mu, s, n)
    print(color.BOLD + "\n- Regression of the log-return on time:\n" + color.END)

    # Regress the log-return on the observation number 0, 1, 2, ...
    y = np.asarray(logReturn)
    time_index = np.arange(len(y))
    x = time_index[:, np.newaxis]  # column vector, as scikit-learn expects

    # statsmodels fit, for the full inference summary
    est2 = sm.OLS(y, sm.add_constant(x)).fit()
    print(est2.summary())

    # scikit-learn fit, for the fitted values and headline numbers
    lrModel = linear_model.LinearRegression()
    lrModel.fit(x, y)
    y_pred = lrModel.predict(x)
    coef = lrModel.coef_
    intercept = lrModel.intercept_
    r2 = r2_score(y, y_pred)
    print(color.BLUE + 'R2:' + color.END, r2)
    print(color.BLUE + 'Coefficient:' + color.END, coef[0])
    print(color.BLUE + 'Intercept:' + color.END, intercept)

    plt.figure()     # regression plot
    plt.xlabel('Time')
    plt.ylabel('Log Return')
    plt.title('Regression Plot', color='black')
    plt.plot(x, y, 'r.')
    plt.plot(x, y_pred, 'steelblue')
    plt.show()

    # x/y are passed by keyword: newer seaborn releases no longer accept
    # them positionally.
    sns.residplot(x=time_index, y=y, color="steelblue")  # residual plot
    plt.title('Residual Plot', color='black')
    plt.show()
    
def two_stock(Symbol_1, Symbol_2, start=dt.datetime(2015,11,1), end=dt.datetime(2019,11,1), confidence=0.95):
    """Compare two indexes: test equality of mean log-returns, then regress one on the other.

    Parameters
    ----------
    Symbol_1, Symbol_2 : human-readable index names (keys of ``ts_``).
    start, end : period bounds; ``start`` must be earlier than ``end``.
    confidence : confidence level for the test of equal means.

    The regression uses the log-return of ``Symbol_1`` as regressor and the
    log-return of ``Symbol_2`` as response, restricted to dates on which
    both indexes have a value.

    Raises
    ------
    ValueError
        If ``start`` is not strictly before ``end``, or the two series share
        no dates.
    """
    if start >= end:
        raise ValueError('Invalid time period!')
    symbol_1 = ts_[Symbol_1]
    symbol_2 = ts_[Symbol_2]

    # Download each series once and reuse it for both analyses.
    logReturn_1 = get_log_return(symbol_1, start, end)
    logReturn_2 = get_log_return(symbol_2, start, end)

    mu_1, s_1, n_1 = mu_s_n(logReturn_1)
    mu_2, s_2, n_2 = mu_s_n(logReturn_2)
    print(color.BOLD + '\n-Test the equality of the two population means:' + color.END)
    test_mu_equality(symbol_1, symbol_2, mu_1, s_1, n_1, mu_2, s_2, n_2, confidence)

    # The response is Symbol_2 and the regressor is Symbol_1, so this is the
    # regression of Symbol_2 on Symbol_1 (matching the axis labels below).
    print(color.BOLD + f'\n-Linear Regression of {Symbol_2} on {Symbol_1}\n' + color.END)

    # Pair observations by date. The two markets can have different trading
    # calendars, so truncating to a common length would pair returns from
    # different days.
    paired = pd.concat([logReturn_1, logReturn_2], axis=1, join='inner', keys=['x', 'y']).dropna()
    if paired.empty:
        raise ValueError('The two indexes have no common dates in this period!')
    x_flat = paired['x'].to_numpy()
    y = paired['y'].to_numpy()
    x = x_flat[:, np.newaxis]  # column vector, as scikit-learn expects

    # statsmodels fit, for the full inference summary
    est2 = sm.OLS(y, sm.add_constant(x)).fit()
    print(est2.summary())

    # scikit-learn fit, for the fitted values and headline numbers
    lrModel = linear_model.LinearRegression()
    lrModel.fit(x, y)
    y_pred = lrModel.predict(x)
    coef = lrModel.coef_
    intercept = lrModel.intercept_
    r2 = r2_score(y, y_pred)
    print(color.BLUE + 'R2:' + color.END, r2)
    print(color.BLUE + 'Coefficient:' + color.END, coef[0])
    print(color.BLUE + 'Intercept:' + color.END, intercept)

    plt.figure()     # regression plot
    plt.xlabel(f'Log Return of {symbol_1}')  # f-prefix needed to interpolate the ticker
    plt.ylabel(f'Log Return of {symbol_2}')
    plt.title('Regression Plot', color='black')
    plt.plot(x, y, 'r.')
    plt.plot(x, y_pred, 'steelblue')
    plt.show()

    # x/y are passed by keyword: newer seaborn releases no longer accept
    # them positionally.
    sns.residplot(x=x_flat, y=y, color="steelblue")  # residual plot
    plt.title('residual plot', color='black')
    plt.show()
    
def monday_effect(Symbol, start=dt.datetime(2015,11,1), end=dt.datetime(2019,11,1)):
    """Regress each Monday's log-return on that of the Friday three days earlier.

    Parameters
    ----------
    Symbol : human-readable index name (a key of ``ts_``).
    start, end : period bounds; ``start`` must be earlier than ``end``.

    Prints R2, intercept, slope and the slope's p-value, states whether the
    slope is significant at the 0.05 level, then shows the regression and
    residual plots.

    Raises
    ------
    ValueError
        If ``start`` is not strictly before ``end``, or fewer than three
        Monday/Friday pairs exist in the period.
    """
    if start >= end:
        raise ValueError('Invalid time period!')
    symbol = ts_[Symbol]
    style.use('ggplot')
    logReturn = get_log_return(symbol, start, end)

    # Work on calendar days and look values up by timestamp. Membership is
    # tested against the index in one vectorised call rather than scanning a
    # list for every Monday.
    by_day = logReturn.copy()
    by_day.index = pd.DatetimeIndex(by_day.index).normalize()
    trading_days = by_day.index
    mondays = trading_days[trading_days.weekday == 0]
    fridays = mondays - pd.Timedelta(days=3)
    has_friday = fridays.isin(trading_days)  # skip Mondays whose Friday has no data
    monday = by_day.loc[mondays[has_friday]].to_numpy()
    friday = by_day.loc[fridays[has_friday]].to_numpy()
    if len(monday) < 3:
        raise ValueError('Not enough Monday/Friday pairs in this period!')

    # statsmodels fit, for the slope's p-value
    est2 = sm.OLS(monday, sm.add_constant(friday)).fit()
    slope_pvalue = est2.pvalues[1]

    # scikit-learn fit, for the fitted values and headline numbers
    friday_col = friday.reshape(-1, 1)
    res = linear_model.LinearRegression()
    res.fit(friday_col, monday)
    monday_pred = res.predict(friday_col)
    print(color.BLUE + 'R2:' + color.END, r2_score(monday, monday_pred))
    print(color.BLUE + 'Intercept:' + color.END, res.intercept_)
    print(color.BLUE + 'Coefficient:' + color.END, res.coef_[0])
    print(color.BLUE + 'Coefficient_pvalue:' + color.END, slope_pvalue)
    if slope_pvalue > 0.05:
        print(f'\nSince the p value of beta1={round(slope_pvalue,3)}> 0.05, Do not reject H0: beta1 = 0.\nThere is {color.RED}{color.BOLD}NO{color.END} significant correlation between the Friday performance and Monday performance of {Symbol}')
    else:
        print(f'\nSince the p value of beta1={round(slope_pvalue,3)}< 0.05, Reject H0: beta1 = 0.\nThere is {color.RED}{color.BOLD}significant{color.END} correlation between the Friday performance and Monday performance of {Symbol}')

    plt.figure()     # regression plot
    plt.xlabel('Friday')
    plt.ylabel('Monday')
    plt.title('Regression Plot', color='black')
    plt.plot(friday, monday, 'r.')
    plt.plot(friday, monday_pred, 'steelblue')
    plt.show()

    # x/y are passed by keyword: newer seaborn releases no longer accept
    # them positionally.
    sns.residplot(x=friday, y=monday, color="steelblue")  # residual plot
    plt.title('Residual Plot', color='black')
    plt.show()



In [3]:
# Web app: expose the three analyses as interactive widgets.
indexes = [name for name in ts_]                                    # dropdown entries, in table order
Start = [dt.datetime(year, 12, 1) for year in range(2015, 2020)]   # selectable start dates
End = [dt.datetime(year, 12, 1) for year in range(2016, 2020)]     # selectable end dates

print(color.BOLD + "Given one stock index:" + color.END)
interact(
    one_stock,
    Symbol=indexes,
    start=Start,
    end=End,
    confidence=(0, 1, 0.05),
    normal=True,
)

print(color.BOLD + "Given two stock indexes:" + color.END)
interact(
    two_stock,
    Symbol_1=indexes,
    Symbol_2=sorted(indexes, reverse=True),
    start=Start,
    end=End,
    confidence=(0, 1, 0.05),
)

print(color.BOLD + '\nTesting Monday effect' + color.END)
interact(monday_effect, Symbol=indexes, start=Start, end=End)

[1mGiven one stock index:[0m


interactive(children=(Dropdown(description='Symbol', options=('Dow Jones Industrial Average (U.S.)', 'S&P 500 …

[1mGiven two stock indexes:[0m


interactive(children=(Dropdown(description='Symbol_1', options=('Dow Jones Industrial Average (U.S.)', 'S&P 50…

[1m
Testing Monday effect[0m


interactive(children=(Dropdown(description='Symbol', options=('Dow Jones Industrial Average (U.S.)', 'S&P 500 …

<function __main__.monday_effect(Symbol, start=datetime.datetime(2015, 11, 1, 0, 0), end=datetime.datetime(2019, 11, 1, 0, 0))>