In [1]:
import numpy as np
import pandas as pd
from scipy import stats
from matplotlib import pyplot as plt

In [10]:
united_performance = pd.read_csv('UNH.csv')
anthem_performance = pd.read_csv('ANTM.csv')

# Statistical Testing
#### Research question: Is Anthem Health or United Health a "better" investment? 
* Using complaint data to formulate a hypothesis
* Check this using historical stock prices and sharecounts to normalize values
    * Stock price * diluted average share count = current company value
* Look at average company value change from one week to the next
#### Considerations and Assumptions: 
* Stock prices are notoriously difficult to predict, and there are a myriad of other factors that are not taken into consideration here. 
* Investors look at factors unrelated to stock prices, which again are not considered here
* The purpose of the statistical testing is to demonstrate how a hypothesis might be formed and tested, using complaint data to form a prior distribution
* The 4 year historical stock data used is considered a sample from the population of stock data for each company since inception

## Hypothesis
H0: Company value growth will be approximately equal for Anthem and United. 

Ha: Anthem will have slower company value growth than United

In [20]:
# checking dataset for null values, weird rows
united_performance.info()
anthem_performance

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 155 entries, 0 to 154
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Date       155 non-null    datetime64[ns]
 1   Adj Close  155 non-null    float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 2.5 KB


Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2018-01-01,225.000000,234.539993,223.750000,234.339996,225.883575,5084000
1,2018-01-08,233.490005,240.399994,230.729996,239.820007,231.165848,7695900
2,2018-01-15,241.199997,250.679993,240.580002,250.059998,241.036346,5955100
3,2018-01-22,250.149994,258.529999,248.000000,258.190002,248.872971,6036600
4,2018-01-29,256.690002,267.950012,238.619995,239.179993,230.548950,13948800
...,...,...,...,...,...,...,...
149,2020-11-09,334.980011,337.679993,314.429993,332.959991,331.980804,7820700
150,2020-11-16,335.920013,338.200012,304.670013,305.619995,304.721191,7656300
151,2020-11-23,307.700012,317.880005,306.660004,312.799988,311.880066,4186500
152,2020-11-30,311.790009,326.899994,309.339996,323.269989,322.319275,6328000


In [13]:
# share counts taken from company 8k reports, if range was given, middle value used, sources in README
antm_share_counts = {2019: 260300, 2018: 264200, 2017: 267800, 2016: 268100}
UNH_share_counts = {2019: 966000, 2018: 983000, 2017: 985000, 2016: 968000}

In [19]:
# I plan to test the company value growth, which will be calculated by multiplying the adj close amount by
# the diluted average share count. Reformatting the dataframe to reflect this. 
united_performance = united_performance[['Date', 'Adj Close']]
united_performance


# import datetime as dt
# united_performance['Date'] = pd.to_datetime(united_performance['Date'])

# include = united_performance[united_performance['Date'].dt.year == 2018]
# # exclude = df[df['Date'].dt.year != year]
# include

Unnamed: 0,Date,Adj Close
0,2018-01-01,218.339996
1,2018-01-08,218.254105
2,2018-01-15,232.295914
3,2018-01-22,237.183350
4,2018-01-29,221.346939
...,...,...
150,2020-11-16,333.500153
151,2020-11-23,336.728516
152,2020-11-30,348.635681
153,2020-12-07,337.070007
