In [2]:
# import libraries
import pandas as pd

#### Read symbols into respective dataframes

In [3]:
# list of tickers we have data for in pkl file
tickers = ['LLY', 'NVO', 'JNJ', 'MRK', 'ABBV', 'ROG.SW',
           'NVS', 'AZN', 'PFE', 'AMGN', 'PPH', 'IHE', 'PJP']

dfs = {}  # dictionary to store all dataframes

for symbol in tickers:
    # for each symbol, load the pkl file and store it in the dictionary as a df
    dfs[f"{symbol}_df"] = pd.read_pickle(f"pharma-data/day/{symbol}.pkl")

#### Sanity Check
##### Check all the df.head() to ensure they worked

In [4]:
# for each dataframe in the dfs dictionary, print the first 5 rows & shape
for df in dfs:
    print(f"First 5 rows of {df}")
    print(dfs[df].shape)
    print(dfs[df].head())
    print("\n")


First 5 rows of LLY_df
(2501, 6)
                 Open       High        Low      Close  Adj Close   Volume
Date                                                                      
2014-03-13  59.240002  59.330002  58.230000  58.340000  47.321117  4103700
2014-03-14  58.310001  58.980000  58.230000  58.869999  47.751007  4201800
2014-03-17  59.119999  59.290001  58.650002  58.869999  47.751007  3016200
2014-03-18  58.910000  59.700001  58.810001  59.400002  48.180893  4082800
2014-03-19  59.349998  59.740002  58.630001  59.049999  47.897015  2865900


First 5 rows of NVO_df
(2501, 6)
                 Open       High        Low      Close  Adj Close   Volume
Date                                                                      
2014-03-13  23.195000  23.219999  22.750000  22.855000  15.467833  4562200
2014-03-14  22.610001  22.889999  22.584999  22.775000  15.413692  5663000
2014-03-17  22.879999  23.100000  22.860001  22.905001  15.501674  2049200
2014-03-18  22.955000  23.209999

#### Check for null values

In [5]:
for df in dfs:
    print(f"Null values in {df}")
    print(dfs[df].isnull().sum())
    print("\n")

# we see there are no null values anywhere. yfinance dealt with them.

Null values in LLY_df
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64


Null values in NVO_df
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64


Null values in JNJ_df
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64


Null values in MRK_df
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64


Null values in ABBV_df
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64


Null values in ROG.SW_df
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64


Null values in NVS_df
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64


Null values in AZN_df
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume    

In [6]:
dfs['LLY_df'].shape

(2501, 6)

In [7]:
dfs['LLY_df'].dtypes

Open         float64
High         float64
Low          float64
Close        float64
Adj Close    float64
Volume         int64
dtype: object

In [11]:
# find statistics (range, median, mean, std) for all quantitative columns
for df_name, df in dfs.items():
    # Selecting quantitative columns
    quantitative_cols = ['Open', 'High', 'Low', 'Close', 'Volume']
    quantitative_cols = [col for col in quantitative_cols if col in df.columns]

    print(f"Statistics for {df_name}:")
    print(df[quantitative_cols].describe())
    print("\n")

Statistics for LLY_df:
              Open         High          Low        Close        Volume
count  2501.000000  2501.000000  2501.000000  2501.000000  2.501000e+03
mean    173.807853   175.753342   171.938772   173.942543  4.223526e+06
std     140.319312   142.064415   138.604125   140.473430  4.228324e+06
min      57.389999    57.840000    56.810001    57.209999  6.272000e+05
25%      79.879997    80.449997    79.050003    79.779999  2.633000e+06
50%     112.260002   113.419998   111.099998   112.250000  3.424700e+06
75%     236.699997   239.369995   234.029999   236.690002  4.633500e+06
max     771.030029   794.469971   763.500000   782.059998  7.482250e+07


Statistics for NVO_df:
              Open         High          Low        Close        Volume
count  2501.000000  2501.000000  2501.000000  2501.000000  2.501000e+03
mean     37.312655    37.600112    37.039068    37.338405  3.184814e+06
std      21.527983    21.747451    21.354120    21.580409  2.156144e+06
min      15.5350

In [12]:
# export dataframes to /pharma-data/clean-data/stocks
for df_name, df in dfs.items():
    df.to_csv(f"pharma-data/clean-data/stocks/{df_name}.csv")

OSError: Cannot save file into a non-existent directory: 'pharma-data/clean-data/stocks'