In [1]:
# import all libraries
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

In [2]:
# every ticker symbol analysed in this notebook
tickers = [
    'LLY', 'NVO', 'JNJ', 'MRK', 'ABBV', 'MRNA', 'NVS',
    'AZN', 'PFE', 'AMGN', 'PPH', 'IHE', 'PJP',
]

# read each ticker's cleaned CSV into a DataFrame indexed by its parsed
# 'Date' column; frames are stored under the key "<symbol>_df"
dfs = {
    f"{ticker}_df": pd.read_csv(
        f"pharma-data/clean-data/stocks/{ticker}_df.csv",
        index_col='Date',
        parse_dates=True,
    )
    for ticker in tickers
}

In [3]:
# load the US covid deaths CSV, indexed by its parsed 'date' column
covid_df = pd.read_csv(
    "pharma-data/clean-data/us_covid_deaths.csv",
    parse_dates=True,
    index_col='date',
)

In [4]:
# quick look at the loaded stock frames: name, then the first 2 rows of each
for name, frame in dfs.items():
    print(f"{name}\n")
    print(frame.head(2))
    print()

LLY_df

                  Open        High         Low       Close   Adj Close  \
Date                                                                     
2020-01-06  131.419998  132.559998  130.940002  132.259995  124.553093   
2020-01-07  131.699997  132.929993  131.699997  132.509995  124.788521   

             Volume  
Date                 
2020-01-06  2102900  
2020-01-07  2448300  

NVO_df

                 Open   High        Low      Close  Adj Close   Volume
Date                                                                  
2020-01-06  28.495001  28.59  28.434999  28.504999  26.977007  2495000
2020-01-07  28.500000  28.50  28.280001  28.475000  26.948616  2080600

JNJ_df

                  Open        High         Low       Close   Adj Close  \
Date                                                                     
2020-01-06  144.000000  144.199997  142.850006  144.100006  128.432449   
2020-01-07  144.009995  145.449997  141.380005  144.979996  129.216751   

        

In [5]:
# sanity check: display the first five rows of the covid frame
covid_df.head(n=5)

Unnamed: 0_level_0,US covid deaths
date,Unnamed: 1_level_1
2020-01-06,0.0
2020-01-07,0.0
2020-01-08,0.0
2020-01-09,0.0
2020-01-10,0.0


In [6]:
# prep dict for the merged data: one stock + covid frame per entry, keyed like `dfs`
merged_dfs = {}

for df_name, df in dfs.items():
    # work on a copy so the frames stored in `dfs` are never modified by this cell
    stock_df = df.copy()

    # make sure the index is datetime so it lines up with covid_df's parsed date index
    stock_df.index = pd.to_datetime(stock_df.index)

    # merge on the date index; the inner join keeps only dates present in both
    # frames (i.e. the trading days)
    merged = pd.merge(stock_df, covid_df, how='inner', left_index=True, right_index=True)

    # an empty result means the two date indexes never matched; fail here with a
    # clear message instead of in a later correlation / regression cell
    if merged.empty:
        raise ValueError(f"{df_name}: no overlapping dates between stock data and covid data")

    merged_dfs[df_name] = merged

In [7]:
# send merged dfs to csv in /merged-dfs

directory = 'pharma-data/merged-dfs'

# create the output folder when it is missing so this cell also works on a
# fresh checkout; to_csv does not create parent directories itself
os.makedirs(directory, exist_ok=True)

# one file per merged frame, named "<df_name>-covid.csv"
for df_name, df in merged_dfs.items():
    df.to_csv(f"{directory}/{df_name}-covid.csv")

In [8]:
# confirm the merge: show the name and the first rows of every merged frame
for name in merged_dfs:
    print(f"{name}\n")
    print(merged_dfs[name].head())
    print()

LLY_df

                  Open        High         Low       Close   Adj Close  \
2020-01-06  131.419998  132.559998  130.940002  132.259995  124.553093   
2020-01-07  131.699997  132.929993  131.699997  132.509995  124.788521   
2020-01-08  132.460007  134.210007  132.009995  133.710007  125.918610   
2020-01-09  134.550003  136.360001  134.009995  135.919998  127.999825   
2020-01-10  135.779999  138.270004  135.529999  138.000000  129.958572   

             Volume  US covid deaths  
2020-01-06  2102900              0.0  
2020-01-07  2448300              0.0  
2020-01-08  5188600              0.0  
2020-01-09  4522800              0.0  
2020-01-10  4177600              0.0  

NVO_df

                 Open       High        Low      Close  Adj Close   Volume  \
2020-01-06  28.495001  28.590000  28.434999  28.504999  26.977007  2495000   
2020-01-07  28.500000  28.500000  28.280001  28.475000  26.948616  2080600   
2020-01-08  28.455000  28.594999  28.334999  28.424999  26.901297  182

In [9]:
# pairwise correlation matrix of all columns, printed per merged frame
for name, merged in merged_dfs.items():
    print(f"{name}\n")
    correlation_matrix = merged.corr()
    print(correlation_matrix)
    print()

LLY_df

                     Open      High       Low     Close  Adj Close    Volume  \
Open             1.000000  0.999693  0.999671  0.999406   0.999402 -0.156496   
High             0.999693  1.000000  0.999622  0.999743   0.999726 -0.146882   
Low              0.999671  0.999622  1.000000  0.999727   0.999725 -0.162601   
Close            0.999406  0.999743  0.999727  1.000000   0.999981 -0.153346   
Adj Close        0.999402  0.999726  0.999725  0.999981   1.000000 -0.153289   
Volume          -0.156496 -0.146882 -0.162601 -0.153346  -0.153289  1.000000   
US covid deaths -0.511921 -0.512251 -0.512182 -0.512172  -0.513137  0.061128   

                 US covid deaths  
Open                   -0.511921  
High                   -0.512251  
Low                    -0.512182  
Close                  -0.512172  
Adj Close              -0.513137  
Volume                  0.061128  
US covid deaths         1.000000  

NVO_df

                     Open      High       Low     Close  Adj C

In [13]:
# Pearson correlation coefficient between the closing price and that day's
# US covid deaths, printed for every merged frame
for name, merged in merged_dfs.items():
    close_prices = merged['Close']
    covid_deaths = merged['US covid deaths']
    # pearsonr also returns a p-value, which is not reported here
    coefficient, p_value = stats.pearsonr(close_prices, covid_deaths)
    print(f"{name} Pearson Coef: {coefficient}\n")

LLY_df Pearson Coef: -0.5121716540967605

NVO_df Pearson Coef: -0.4985533503789554

JNJ_df Pearson Coef: -0.008869413889813063

MRK_df Pearson Coef: -0.5728502386972149

ABBV_df Pearson Coef: -0.3733985681186126

MRNA_df Pearson Coef: 0.2442945848603412

NVS_df Pearson Coef: -0.3096125142158657

AZN_df Pearson Coef: -0.4539224121053649

PFE_df Pearson Coef: 0.1659947042720218

AMGN_df Pearson Coef: -0.3599498214909606

PPH_df Pearson Coef: -0.3095000838986556

IHE_df Pearson Coef: 0.04414501286893946

PJP_df Pearson Coef: 0.04660566762980671



In [11]:
# check for the best fit - linear, quadratic, or cubic between close and US covid deaths

# label -> polynomial degree, in the order the results are stored and printed
FIT_DEGREES = {"linear": 1, "quadratic": 2, "cubic": 3}


def polynomial_r_squared(x, y, degree):
    """Fit y as a polynomial of x by least squares and return the in-sample R-squared.

    Parameters
    ----------
    x : array of shape (n_samples, 1)
        The single predictor column.
    y : array of shape (n_samples,)
        The target values.
    degree : int
        Polynomial degree. Degree 1 fits on x directly; higher degrees first
        expand x with PolynomialFeatures.

    Returns
    -------
    float
        R-squared of the fitted model, scored on the same data it was fit on.
    """
    features = x if degree == 1 else PolynomialFeatures(degree=degree).fit_transform(x)
    model = LinearRegression().fit(features, y)
    return model.score(features, y)


# pearson correlation between close and deaths for each merged frame
pearson_corr = {
    df_name: df['Close'].corr(df['US covid deaths'])
    for df_name, df in merged_dfs.items()
}

# fit models and calculate r-squared; keys are "<df_name>_<label>"
r_squared_values = {}
for df_name, df in merged_dfs.items():
    # deaths are modelled as a function of the closing price
    x = df['Close'].values.reshape(-1, 1)
    y = df['US covid deaths'].values

    for label, degree in FIT_DEGREES.items():
        r_squared_values[f"{df_name}_{label}"] = polynomial_r_squared(x, y, degree)

# print r-squared values: the fits of one frame, then a blank line
print("\nR-squared values:")
for df_name in merged_dfs:
    for label in FIT_DEGREES:
        model_name = f"{df_name}_{label}"
        print(f"{model_name}: {r_squared_values[model_name]}")
    print("")


R-squared values:
LLY_df_linear: 0.26231980326020865
LLY_df_quadratic: 0.26232153722304064
LLY_df_cubic: 0.3381901550040588

NVO_df_linear: 0.24855544317407852
NVO_df_quadratic: 0.2501606451882905
NVO_df_cubic: 0.2939903282562214

JNJ_df_linear: 7.866650274879206e-05
JNJ_df_quadratic: 0.02584969483737176
JNJ_df_cubic: 0.027685548315376507

MRK_df_linear: 0.3281573959754519
MRK_df_quadratic: 0.3398543224812053
MRK_df_cubic: 0.3471855744494218

ABBV_df_linear: 0.1394264906730287
ABBV_df_quadratic: 0.20577959039479454
ABBV_df_cubic: 0.2530064167545637

MRNA_df_linear: 0.05967984419208561
MRNA_df_quadratic: 0.05992077986828359
MRNA_df_cubic: 0.06006910675619703

NVS_df_linear: 0.0958599089590686
NVS_df_quadratic: 0.17279071785931466
NVS_df_cubic: 0.1987409074367985

AZN_df_linear: 0.20604555621155074
AZN_df_quadratic: 0.2556886498114821
AZN_df_cubic: 0.31392569873269827

PFE_df_linear: 0.027554241846355576
PFE_df_quadratic: 0.03795114670210742
PFE_df_cubic: 0.08800814636740661

AMGN_df_li

In [None]:
# example check: (rows, columns) of one merged frame
merged_dfs["ABBV_df"].shape