In [22]:
# import libraries
from pathlib import Path

import pandas as pd

#### Read symbols into respective dataframes

In [31]:
# Symbols for which a pickled daily price file is expected under pharma-data/day/
tickers = [
    'LLY', 'NVO', 'JNJ', 'MRK', 'ABBV', 'ROG.SW', 'NVS',
    'AZN', 'PFE', 'AMGN', 'PPH', 'IHE', 'PJP',
]

# Map "<symbol>_df" -> DataFrame loaded from that symbol's pickle file.
# NOTE: read_pickle unpickles the file, so only load files from a trusted source.
dfs = {
    f"{symbol}_df": pd.read_pickle(f"pharma-data/day/{symbol}.pkl")
    for symbol in tickers
}

#### Sanity Check
##### Check each DataFrame's `.head()` to confirm it loaded correctly

#### Check for null values

In [32]:
# Print the per-column null count for every loaded frame.
# Iterate over (name, frame) pairs so `df` is bound to a DataFrame, as in the
# other cells, instead of to the dictionary key.
for df_name, df in dfs.items():
    print(f"Null values in {df_name}")
    print(df.isnull().sum())
    print("\n")

# we see there are no null values anywhere. yfinance dealt with them.

Null values in LLY_df
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64


Null values in NVO_df
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64


Null values in JNJ_df
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64


Null values in MRK_df
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64


Null values in ABBV_df
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64


Null values in ROG.SW_df
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64


Null values in NVS_df
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64


Null values in AZN_df
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume    

In [33]:
# (rows, columns) of the LLY frame
dfs['LLY_df'].shape

(1515, 6)

In [34]:
# per-column dtypes of the LLY frame
dfs['LLY_df'].dtypes

Open         float64
High         float64
Low          float64
Close        float64
Adj Close    float64
Volume         int64
dtype: object

In [28]:
# Summary statistics (count, mean, std, min, quartiles, max) for the price and
# volume columns of every frame
candidate_cols = ('Open', 'High', 'Low', 'Close', 'Volume')

for df_name, df in dfs.items():
    # keep only the candidate columns this frame actually has
    quantitative_cols = [name for name in candidate_cols if name in df.columns]
    summary = df[quantitative_cols].describe()

    print(f"Statistics for {df_name}:")
    print(summary)
    print("\n")

Statistics for LLY_df:
              Open         High          Low        Close        Volume
count  1515.000000  1515.000000  1515.000000  1515.000000  1.515000e+03
mean    240.682073   243.504465   237.973855   240.876096  4.062558e+06
std     152.736063   154.564030   150.833210   152.834251  4.961228e+06
min      75.610001    76.230003    74.510002    74.760002  6.272000e+05
25%     117.494999   118.400002   116.529999   117.395000  2.425900e+06
50%     187.029999   189.300003   186.559998   187.050003  3.108900e+06
75%     318.505005   323.384995   315.254990   318.065002  4.239850e+06
max     790.000000   794.469971   769.119995   782.059998  7.482250e+07


Statistics for NVO_df:
              Open         High          Low        Close        Volume
count  1515.000000  1515.000000  1515.000000  1515.000000  1.515000e+03
mean     46.484512    46.865819    46.134785    46.525393  2.918150e+06
std      24.257712    24.505824    24.063035    24.324931  1.641584e+06
min      20.8250

In [35]:
# Show the earliest date in each frame so the start of the data can be compared
# across tickers (the recorded run shows 2018-02-21 for every ticker, not
# January 5th, 2020).
for df_name, df in dfs.items():
    # index.min() is the earliest date even if the rows are not sorted chronologically
    print(f"Starting date for {df_name}: {df.index.min()}")
    print("\n")

Starting date for LLY_df: 2018-02-21 00:00:00


Starting date for NVO_df: 2018-02-21 00:00:00


Starting date for JNJ_df: 2018-02-21 00:00:00


Starting date for MRK_df: 2018-02-21 00:00:00


Starting date for ABBV_df: 2018-02-21 00:00:00


Starting date for ROG.SW_df: 2018-02-21 00:00:00


Starting date for NVS_df: 2018-02-21 00:00:00


Starting date for AZN_df: 2018-02-21 00:00:00


Starting date for PFE_df: 2018-02-21 00:00:00


Starting date for AMGN_df: 2018-02-21 00:00:00


Starting date for PPH_df: 2018-02-21 00:00:00


Starting date for IHE_df: 2018-02-21 00:00:00


Starting date for PJP_df: 2018-02-21 00:00:00




In [29]:
# export dataframes to pharma-data/clean-data/stocks (relative to the working directory)
export_dir = Path("pharma-data/clean-data/stocks")
# to_csv does not create missing parent directories, so make sure the target exists
export_dir.mkdir(parents=True, exist_ok=True)

for df_name, df in dfs.items():
    # one CSV per frame, named after its dictionary key (e.g. LLY_df.csv)
    df.to_csv(export_dir / f"{df_name}.csv")