In [8]:
# Import libraries
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import scipy.stats as st
import seaborn as sns # for data visualization
import pingouin as pg # for post-host pairwise test
from statsmodels.stats.anova import AnovaRM # for anova test

In [15]:
#>>> Loading data from Nate's boxplot & anova code

#path for data
daily_master_path = 'Resources/daily_returns_master.csv'
daily_stats_path = 'Resources/average_daily_statistics.csv'
yearly_master_path = 'Resources/yearly_returns_master.csv'
yearly_stats_path = 'Resources/average_yearly_statistics.csv'
kaggle_clean_path = 'Resources/kaggle_clean.csv'

#load data into pandas
daily_master_df = pd.read_csv(daily_master_path)
daily_stats_df = pd.read_csv(daily_stats_path)
yearly_stats_df = pd.read_csv(yearly_stats_path)
yearly_master_df = pd.read_csv(yearly_master_path)
kaggle_clean_df = pd.read_csv(kaggle_clean_path)

#Generate symbol list for each classification
symbols = daily_master_df['symbol'].unique()
ai_symbols = kaggle_clean_df['symbol'].unique()
index_symbols = np.array([sym for sym in symbols if sym not in ai_symbols])

#segment dataframes for each box plot
# daily_master_df has all
ai_daily = daily_master_df[daily_master_df['symbol'].isin(ai_symbols)]
index_daily = daily_master_df[daily_master_df['symbol'].isin(index_symbols)]

ai_only_all_years = [daily_master_df[daily_master_df['symbol'].eq(sym)]['daily_return'] for sym in ai_symbols]
for stock_data in ai_only_all_years:
    stock_data.dropna(inplace= True)

   #Generate a yea column to group by
ai_daily['date'] = pd.to_datetime(ai_daily['date'])
ai_daily['year'] = ai_daily['date'].dt.year

#process the ai data by year to generate a list of daily returns for each year.
ai_comps_dr_grouped_year = ai_daily.groupby('year')
grp_name_yrs = ai_comps_dr_grouped_year.groups.keys()
years_titles = list(grp_name_yrs)

years_data_ai = []
for year in grp_name_yrs:
    temp_df = ai_comps_dr_grouped_year.get_group(year)
    temp_df.dropna(inplace= True)
    years_data_ai.append(temp_df)
    #Generate a yea column to group by
index_daily['date'] = pd.to_datetime(index_daily['date'])
index_daily['year'] = index_daily['date'].dt.year
print(index_daily.shape)

#process the ai data by year to generate a list of daily returns for each year.
index_dr_grouped_year_sym = index_daily.groupby(['year', 'symbol'])
idx_grp_names = index_dr_grouped_year_sym.groups.keys()
year_sym_titles = list(idx_grp_names)
year_sym_titles

#collect each index ETF data grouped by years into seperate lists
years_data_SPY = []
years_data_DOW = []
years_data_INDU = []

for label in year_sym_titles:
    # temp_df = ai_comps_dr_grouped_year.get_group(year)
    # years_data_ai.append(temp_df)
    temp_df = index_dr_grouped_year_sym.get_group(label)
    temp_df.dropna(inplace=True)
    if 'SPY' in label:
        years_data_SPY.append(temp_df)
    elif 'DOW' in label:
        years_data_DOW.append(temp_df)
    elif 'INDU' in label:
        years_data_INDU.append(temp_df)



(3921, 9)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stock_data.dropna(inplace= True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ai_daily['date'] = pd.to_datetime(ai_daily['date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ai_daily['year'] = ai_daily['date'].dt.year
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pa

In [7]:
## Merging data for ANOVA analysis. Nate's code: 
#declare the data structure of the dataframe
data_dict = {
        'Stock' : [],
        'Year' : [],
        'Avg_daily_return' : []
        }
# declare the dataframe to be added to with correct column names
combined_df = pd.DataFrame(data_dict)

for year_index in [0, 1 , 2, 3, 4, 5]:
    # make the means
    this_year_spy_mean = years_data_SPY[year_index]['daily_return'].mean()
    this_year_dow_mean = years_data_DOW[year_index]['daily_return'].mean()
    this_year_indu_mean = years_data_INDU[year_index]['daily_return'].mean()
    this_year_ai_mean = years_data_ai[year_index]['daily_return'].mean()
    #pick the year
    this_year = years_titles[year_index]
    #make the columns of data
    data_dict['Stock'] = ['SPY', 'DOW', 'INDU', 'AI_avg']
    data_dict['Year'] = [this_year] * 4
    data_dict['Avg_daily_return'] = [this_year_spy_mean, this_year_dow_mean, this_year_indu_mean, this_year_ai_mean]
    #generate the dataframe for this year
    partial_df = pd.DataFrame(data_dict)
    #concat this year with all prev years
    combined_df = pd.concat([combined_df, partial_df])
# display    
combined_df

Unnamed: 0,Stock,Year,Avg_daily_return
0,SPY,2019.0,0.000706
1,DOW,2019.0,0.000695
2,INDU,2019.0,-0.002639
3,AI_avg,2019.0,0.001057
0,SPY,2020.0,0.000816
1,DOW,2020.0,0.000835
2,INDU,2020.0,0.001153
3,AI_avg,2020.0,0.003802
0,SPY,2021.0,0.000984
1,DOW,2021.0,0.000254


In [11]:
####################################################
##--- Stat Analysis #1a: Repeated Measures ANOVA---##
# Null (H0): mu_AI_avg = mu_spy = mu_dow = mu_indu
# Alternative (Ha): at least one mu_i is different
####################################################
# Conduct the repeated measures ANOVA 
print(AnovaRM(data=combined_df, depvar='Avg_daily_return', 
              subject='Stock', within=['Year']).fit()) 



              Anova
     F Value Num DF  Den DF Pr > F
----------------------------------
Year  2.6513 5.0000 15.0000 0.0655



In [31]:
## Get data ready for analysis #2 (just AI companies)

ai_daily_grouped = ai_daily.groupby(["symbol","year"])['daily_return'].mean()
ai_yearly = ai_daily_grouped.reset_index()
ai_yearly = ai_yearly.rename(columns={'symbol': 'Stock', 'year': 'Year', 'daily_return':'Avg_daily_return'})
ai_yearly


Unnamed: 0,Stock,Year,Avg_daily_return
0,AI,2020,0.035769
1,AI,2021,-0.004756
2,AI,2022,-0.002896
3,AI,2023,0.00598
4,AI,2024,-0.000895
5,AMD,2019,0.003357
6,AMD,2020,0.003476
7,AMD,2021,0.002145
8,AMD,2022,-0.002435
9,AMD,2023,0.003724


In [49]:
####################################################
##--- Stat Analysis #1b: Repeated Measures ANOVA---##
# Null (H0): all 10 AI means are equal
# Alternative (Ha): at least one mu_i is different
####################################################
# This only works for balanced data, so remove AI and PATH
ai_yearly_balanced = ai_yearly[~ai_yearly['Stock'].isin(["AI","PATH"]) ]
#df[~df['pod'].isin(['hi','no']) ]
#df.loc[~((df['col_1'] == 1.0) & (df['col_2'] == 0.0)),:]
ai_yearly_balanced
# Conduct the repeated measures ANOVA 
print(AnovaRM(data=ai_yearly_balanced, depvar='Avg_daily_return', 
              subject='Stock', within=['Year']).fit()) 

              Anova
     F Value Num DF  Den DF Pr > F
----------------------------------
Year  8.6322 5.0000 35.0000 0.0000

