In [None]:
# import all libraries
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

In [None]:
# all used tickers
tickers = [
    'LLY', 'NVO', 'JNJ', 'MRK', 'ABBV', 'MRNA', 'NVS',
    'AZN', 'PFE', 'AMGN', 'PPH', 'IHE', 'PJP',
]

# map "<symbol>_df" -> that symbol's stock data, read from its CSV file with
# the 'Date' column used as a parsed datetime index
dfs = {
    f"{symbol}_df": pd.read_csv(
        f"pharma-data/clean-data/stocks/{symbol}_df.csv",
        index_col='Date',
        parse_dates=True,
    )
    for symbol in tickers
}

In [None]:
# read in covid data, indexed by its parsed 'date' column
covid_csv_path = "pharma-data/clean-data/us_covid_deaths.csv"
covid_df = pd.read_csv(covid_csv_path, parse_dates=True, index_col='date')

In [None]:
# preview the first 2 rows of every per-ticker frame, each under its own name
for name, frame in dfs.items():
    print(f"{name}\n")
    print(frame.head(n=2))
    print()

In [None]:
# sanity check: display the first rows of the covid data frame
covid_df.head()

In [None]:
def _merge_with_covid(stock_df):
    """Inner-join one stock frame with ``covid_df`` on their date indexes.

    The stock frame's index is converted to datetime first; this assignment
    happens on the frame itself, so the entry stored in ``dfs`` is updated too.
    The inner join keeps only dates present in both frames (keeps trading days).
    """
    stock_df.index = pd.to_datetime(stock_df.index)
    return stock_df.merge(covid_df, left_index=True, right_index=True, how='inner')


# merged data, keyed by the same "<symbol>_df" names used in `dfs`
merged_dfs = {df_name: _merge_with_covid(df) for df_name, df in dfs.items()}

In [None]:
# send merged dfs to csv in /merged-dfs

directory = 'pharma-data/merged-dfs'

# create the output folder (and any missing parents) up front so a clean
# checkout / fresh run does not fail inside to_csv on a missing directory
Path(directory).mkdir(parents=True, exist_ok=True)

for df_name, df in merged_dfs.items():
    # NOTE(review): these frames hold covid-death data, yet the file suffix is
    # "-overdose" — possibly a leftover from another notebook. The name is kept
    # unchanged so anything already reading these files keeps working; confirm
    # the intended suffix before renaming.
    df.to_csv(f"{directory}/{df_name}-overdose.csv")

In [None]:
# check merge worked: show the first rows of each merged frame under its name
for name, frame in merged_dfs.items():
    print(f"{name}\n")
    preview = frame.head()
    print(preview)
    print()

In [None]:
# check for correlation among columns: print each merged frame's pairwise
# correlation matrix under the frame's name
for name, frame in merged_dfs.items():
    print(f"{name}\n")
    correlations = frame.corr()
    print(correlations)
    print()

In [None]:
# Pearson correlation coefficient between closing price and covid deaths,
# reported per merged frame (the p-value is computed but not reported)
for name, frame in merged_dfs.items():
    close_prices = frame['Close']
    covid_deaths = frame['US covid deaths']
    coef, p_value = stats.pearsonr(close_prices, covid_deaths)
    print(f"{name} Pearson Coef: {coef}\n")

In [None]:
# check for the best fit - linear, quadratic, or cubic between close and US covid deaths

# correlation between 'Close' and 'US covid deaths' for every merged frame,
# keyed by frame name
pearson_corr = {
    df_name: df['Close'].corr(df['US covid deaths'])
    for df_name, df in merged_dfs.items()
}

# fit models and calculate r-squared
def _polynomial_r_squared(x, y, degree):
    """Fit y on a polynomial of x and return the in-sample R-squared.

    Parameters
    ----------
    x : 2-D array with a single feature column (the predictor).
    y : 1-D array of target values.
    degree : polynomial degree; 1 fits a plain linear regression on ``x``,
        higher degrees expand ``x`` with ``PolynomialFeatures`` first.

    Returns
    -------
    The value of ``LinearRegression.score`` evaluated on the same data the
    model was fitted on.
    """
    if degree == 1:
        features = x
    else:
        features = PolynomialFeatures(degree=degree).fit_transform(x)
    return LinearRegression().fit(features, y).score(features, y)


# model label -> polynomial degree, in reporting order. The printing loop that
# follows inserts a blank line after every third entry, so keep three models
# here unless that grouping is changed as well.
POLY_DEGREES = {"linear": 1, "quadratic": 2, "cubic": 3}

# NOTE(review): in-sample R-squared cannot decrease as polynomial terms are
# added, so a higher cubic score on its own does not show the cubic model is
# the better description of the data.
r_squared_values = {}
for df_name, df in merged_dfs.items():
    # predictor: closing price as a single-column matrix; target: covid deaths
    x = df['Close'].values.reshape(-1, 1)
    y = df['US covid deaths'].values

    for label, degree in POLY_DEGREES.items():
        r_squared_values[f"{df_name}_{label}"] = _polynomial_r_squared(x, y, degree)

# print r-squared values
print("\nR-squared values:")
for count, (model_name, r_squared) in enumerate(r_squared_values.items(), start=1):
    print(f"{model_name}: {r_squared}")

    # every third entry closes one group of 3 possibilities: add a blank line
    if count % 3 == 0:
        print()

In [None]:
# check example shape: display the shape of the merged ABBV frame
merged_dfs['ABBV_df'].shape