In [2]:
# import libraries
from pathlib import Path

import pandas as pd

#### Read symbols into respective dataframes

In [3]:
# symbols for which a pickled price history is expected on disk
tickers = [
    'LLY', 'NVO', 'JNJ', 'MRK', 'ABBV', 'MRNA', 'NVS',
    'AZN', 'PFE', 'AMGN', 'PPH', 'IHE', 'PJP',
]

# maps "<SYMBOL>_df" -> the object loaded from that symbol's pickle file
dfs = {}

for symbol in tickers:
    # build the location of this symbol's pickle, then load and register it
    # NOTE(review): unpickling can execute arbitrary code — only load files
    # from a trusted source.
    source_file = f"pharma-data/raw-data/stocks-for-overdose/{symbol}.pkl"
    dfs[f"{symbol}_df"] = pd.read_pickle(source_file)

#### Sanity Check
##### Check each dataframe's head() to confirm it loaded correctly

#### Check for null values

In [4]:
# report the per-column count of missing values for every loaded dataframe
for frame_key, frame in dfs.items():
    print(f"Null values in {frame_key}")
    print(frame.isnull().sum())
    print("\n")

# the counts printed below are all zero, i.e. no nulls in the displayed frames
# (presumably already handled upstream by yfinance — confirm if it matters).

Null values in LLY_df
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64


Null values in NVO_df
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64


Null values in JNJ_df
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64


Null values in MRK_df
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64


Null values in ABBV_df
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64


Null values in MRNA_df
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64


Null values in NVS_df
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64


Null values in AZN_df
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume      

In [5]:
# spot-check the dimensions (rows, columns) of one loaded dataframe
dfs["LLY_df"].shape

(3000, 6)

In [6]:
# summary statistics (count, mean, std, min, quartiles, max) for the price and
# volume columns of every dataframe; 'Adj Close' is not part of the selection
for df_name, df in dfs.items():
    # keep only the wanted columns that this particular frame actually has
    quantitative_cols = [
        col
        for col in ('Open', 'High', 'Low', 'Close', 'Volume')
        if col in df.columns
    ]

    print(f"Statistics for {df_name}:")
    print(df[quantitative_cols].describe())
    print("\n")

Statistics for LLY_df:
              Open         High          Low        Close        Volume
count  3000.000000  3000.000000  3000.000000  3000.000000  3.000000e+03
mean    154.850007   156.551967   153.205803   154.960610  4.552904e+06
std     139.272646   140.997428   137.541626   139.385318  4.281182e+06
min      39.279999    39.599998    39.160000    39.180000  6.272000e+05
25%      71.152502    71.767500    70.414999    71.224998  2.765400e+06
50%      85.900002    86.485001    85.240002    85.939999  3.680200e+06
75%     190.102501   192.087498   188.095005   190.650002  5.063950e+06
max     790.000000   794.469971   769.119995   782.059998  7.482250e+07


Statistics for NVO_df:
              Open         High          Low        Close        Volume
count  3000.000000  3000.000000  3000.000000  3000.000000  3.000000e+03
mean     34.134067    34.392269    33.887149    34.156773  3.220037e+06
std      21.522346    21.739242    21.347328    21.571745  2.297689e+06
min      13.0080

In [7]:
# print the earliest date found in each dataframe's index
# NOTE(review): this cell only reports the dates; it does not verify or enforce
# a particular start such as January 6th, 2020. The recorded output shows
# 2012-03-27 for most symbols (2013-01-02 for ABBV, 2018-12-07 for MRNA) —
# confirm which start date the downstream analysis expects.
for df_name, df in dfs.items():
    # min() instead of index[0]: still the true first date if the index is not
    # sorted, and gives a missing-value marker rather than an IndexError when
    # a frame is empty
    print(f"Starting date for {df_name}: {df.index.min()}")
    print("\n")

Starting date for LLY_df: 2012-03-27 00:00:00


Starting date for NVO_df: 2012-03-27 00:00:00


Starting date for JNJ_df: 2012-03-27 00:00:00


Starting date for MRK_df: 2012-03-27 00:00:00


Starting date for ABBV_df: 2013-01-02 00:00:00


Starting date for MRNA_df: 2018-12-07 00:00:00


Starting date for NVS_df: 2012-03-27 00:00:00


Starting date for AZN_df: 2012-03-27 00:00:00


Starting date for PFE_df: 2012-03-27 00:00:00


Starting date for AMGN_df: 2012-03-27 00:00:00


Starting date for PPH_df: 2012-03-27 00:00:00


Starting date for IHE_df: 2012-03-27 00:00:00


Starting date for PJP_df: 2012-03-27 00:00:00




In [8]:
# export each dataframe as CSV into pharma-data/clean-data/stocks-for-overdose/
# (one file per dictionary key, e.g. LLY_df.csv)
output_dir = Path("pharma-data/clean-data/stocks-for-overdose")

# create the target folder first: to_csv does not create missing directories,
# so a fresh checkout would otherwise fail here
output_dir.mkdir(parents=True, exist_ok=True)

for df_name, df in dfs.items():
    df.to_csv(output_dir / f"{df_name}.csv")