In [4]:
# import libraries
from pathlib import Path

import pandas as pd

#### Read symbols into respective dataframes

In [5]:
# Symbols for which a daily-data pickle file exists under pharma-data/day/
tickers = [
    'LLY', 'NVO', 'JNJ', 'MRK', 'ABBV', 'MRNA', 'NVS',
    'AZN', 'PFE', 'AMGN', 'PPH', 'IHE', 'PJP',
]

# Maps "<SYMBOL>_df" -> DataFrame read from that symbol's pickle file.
# NOTE: pd.read_pickle can execute arbitrary code while unpickling, so only
# load files that come from a trusted source.
dfs = {}
for symbol in tickers:
    pkl_path = f"pharma-data/day/{symbol}.pkl"
    dfs[f"{symbol}_df"] = pd.read_pickle(pkl_path)

#### Sanity Check
##### Check each DataFrame's head() to confirm the data loaded correctly

#### Check for null values

In [6]:
# Print the number of missing values per column for every loaded frame.
# Iterate over (name, frame) pairs so that `df` is always a DataFrame and
# `df_name` is always its dictionary key, as in the other loops over `dfs`.
for df_name, df in dfs.items():
    print(f"Null values in {df_name}")
    print(df.isnull().sum())
    print("\n")

# Observed result: the printed counts are all zero, i.e. no null values.
# (Presumably missing values were already handled when the data was
# downloaded with yfinance -- not verified in this notebook.)

Null values in LLY_df
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64


Null values in NVO_df
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64


Null values in JNJ_df
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64


Null values in MRK_df
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64


Null values in ABBV_df
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64


Null values in MRNA_df
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64


Null values in NVS_df
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64


Null values in AZN_df
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume      

In [7]:
# Dimensions of the LLY frame as (rows, columns)
dfs['LLY_df'].shape

(1044, 6)

In [8]:
# Data type of each column in the LLY frame
dfs['LLY_df'].dtypes

Open         float64
High         float64
Low          float64
Close        float64
Adj Close    float64
Volume         int64
dtype: object

In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) of the
# quantitative columns, printed separately for every frame
for df_name, df in dfs.items():
    # Keep only those price/volume columns that this frame actually contains
    quantitative_cols = [
        col
        for col in ('Open', 'High', 'Low', 'Close', 'Volume')
        if col in df.columns
    ]

    print(f"Statistics for {df_name}:")
    print(df[quantitative_cols].describe())
    print("\n")

Statistics for LLY_df:
              Open         High          Low        Close        Volume
count  1044.000000  1044.000000  1044.000000  1044.000000  1.044000e+03
mean    301.082471   304.727878   297.585172   301.331355  3.320386e+06
std     148.374251   150.020194   146.573908   148.461820  1.735450e+06
min     120.120003   124.599998   117.059998   119.050003  6.272000e+05
25%     181.742496   183.789997   180.165001   182.207504  2.291475e+06
50%     263.994995   267.585007   261.409988   264.145004  2.836900e+06
75%     360.080002   363.422501   356.392494   360.500000  3.737775e+06
max     790.000000   794.469971   769.119995   782.059998  1.723140e+07


Statistics for NVO_df:
              Open         High          Low        Close        Volume
count  1044.000000  1044.000000  1044.000000  1044.000000  1.044000e+03
mean     56.301964    56.786992    55.859507    56.359741  2.906605e+06
std      23.291443    23.525835    23.122677    23.370771  1.787071e+06
min      25.1050

In [10]:
# Check that every frame starts on the same, expected date.
# The first row of each frame is 2020-01-06 (a Monday); 2020-01-05 was a
# Sunday, so the data contains no row for it.
EXPECTED_START = pd.Timestamp("2020-01-06")

# Frames whose first index entry differs from EXPECTED_START
unexpected_starts = {}

for df_name, df in dfs.items():
    print(f"Starting date for {df_name}: {df.index[0]}")
    print("\n")
    if df.index[0] != EXPECTED_START:
        unexpected_starts[df_name] = df.index[0]

# Fail loudly instead of relying on someone reading the printed dates
assert not unexpected_starts, (
    f"Frames not starting on {EXPECTED_START}: {unexpected_starts}"
)

Starting date for LLY_df: 2020-01-06 00:00:00


Starting date for NVO_df: 2020-01-06 00:00:00


Starting date for JNJ_df: 2020-01-06 00:00:00


Starting date for MRK_df: 2020-01-06 00:00:00


Starting date for ABBV_df: 2020-01-06 00:00:00


Starting date for MRNA_df: 2020-01-06 00:00:00


Starting date for NVS_df: 2020-01-06 00:00:00


Starting date for AZN_df: 2020-01-06 00:00:00


Starting date for PFE_df: 2020-01-06 00:00:00


Starting date for AMGN_df: 2020-01-06 00:00:00


Starting date for PPH_df: 2020-01-06 00:00:00


Starting date for IHE_df: 2020-01-06 00:00:00


Starting date for PJP_df: 2020-01-06 00:00:00




In [11]:
# Export every frame to pharma-data/clean-data/stocks/<name>.csv
# (a path relative to the notebook's working directory).
export_dir = Path("pharma-data/clean-data/stocks")

# DataFrame.to_csv does not create missing folders and would raise OSError,
# so make sure the target directory exists before writing.
export_dir.mkdir(parents=True, exist_ok=True)

for df_name, df in dfs.items():
    df.to_csv(export_dir / f"{df_name}.csv")

In [12]:
# Preview the first rows of the LLY frame (head() shows 5 rows by default)
dfs['LLY_df'].head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-01-06,131.419998,132.559998,130.940002,132.259995,124.553093,2102900
2020-01-07,131.699997,132.929993,131.699997,132.509995,124.788521,2448300
2020-01-08,132.460007,134.210007,132.009995,133.710007,125.91861,5188600
2020-01-09,134.550003,136.360001,134.009995,135.919998,127.999825,4522800
2020-01-10,135.779999,138.270004,135.529999,138.0,129.958572,4177600
