In [1]:
import os
import pandas as pd
import numpy as np
import datetime as dt
import wrds
import psycopg2
import matplotlib.pyplot as plt
from dateutil.relativedelta import *
from pandas.tseries.offsets import *
from scipy import stats
pd.set_option('display.max_columns', None)

In [2]:
def sz_bucket(row):
    if row['me']==np.nan:
        value=''
    elif row['me']<=row['sizemedn']:
        value='S'
    else:
        value='B'
    return value

def bm_bucket(row):
    if 0<=row['beme']<=row['bm30']:
        value = 'L'
    elif row['beme']<=row['bm70']:
        value='M'
    elif row['beme']>row['bm70']:
        value='H'
    else:
        value=''
    return value

def op_bucket(row):
    # 不需要 <= 0
    if  row['op']<=row['op30']:
        value = 'W'
    elif row['op']<=row['op70']:
        value='N'
    elif row['op']>row['op70']:
        value='R'
    else:
        value=''
    return value


def inv_bucket(row):
    # 不需要 <= 0
    if  row['inv']<=row['inv30']:
        value = 'C'
    elif row['inv']<=row['inv70']:
        value='N'
    elif row['inv']>row['inv70']:
        value='A'
    else:
        value=''
    return value


def wavg(group, avg_name, weight_name):
    d = group[avg_name]
    w = group[weight_name]
    try:
        return (d * w).sum() / w.sum()
    except ZeroDivisionError:
        return np.nan


In [3]:
comp = pd.read_csv("./comp_na_daily_all.csv", parse_dates=['datadate'])
crsp = pd.read_csv("./crsp_monthly.csv", parse_dates=['date'])

In [4]:
comp['datadate'] = pd.to_datetime(comp['datadate'])
comp['year'] = comp['datadate'].dt.year
comp['gvkey'] = comp['gvkey'].astype(str)

comp['ps']=np.where(comp['pstkrv'].isnull(), comp['pstkl'], comp['pstkrv'])
comp['ps']=np.where(comp['ps'].isnull(),comp['pstk'], comp['ps'])
comp['ps']=np.where(comp['ps'].isnull(),0,comp['ps'])
comp['txditc']=comp['txditc'].fillna(0)

comp['be'] = comp['seq'] + comp['txditc'] - comp['ps']
comp['be'] = np.where(comp['be'] > 0, comp['be'], np.nan)
print(comp['be'].head(50))

"""
加入operating income
首先把revt cogs xsga xint中的NaN变成0
"""
comp[['revt','cogs','xsga','xint']] = comp[['revt','cogs','xsga','xint']].fillna(0)
comp['op'] = comp['revt'] - comp['cogs'] - comp['xsga'] - comp['xint']
comp['op'] = comp['op'] / (comp['be'] + comp['mib'])
# print(comp)

"""
我需要计算年at growth rate
"""
comp['lat'] = comp.groupby(['gvkey'])['at'].shift(1)
comp['inv'] = comp['at'] / comp['lat']  -  1
# print(comp)

comp = comp.sort_values(by=['gvkey', 'datadate'])
comp['count'] = comp.groupby(['gvkey']).cumcount()

comp = comp[['gvkey', 'datadate', 'year', 'be', 'count','op','inv']]
print(comp.sort_values(['gvkey','year']).head(50))

0        NaN
1        NaN
2      0.561
3      0.627
4      0.491
5      0.834
6      0.744
7      2.571
8     10.211
9     10.544
10     8.382
11     7.309
12     8.798
13     8.279
14    11.020
15    15.527
16    17.673
17       NaN
18       NaN
19       NaN
20       NaN
21       NaN
22     7.823
23     9.319
24    13.665
25       NaN
26       NaN
27       NaN
28       NaN
29       NaN
30       NaN
31       NaN
32       NaN
33     5.972
34       NaN
35       NaN
36       NaN
37       NaN
38       NaN
39       NaN
40     1.983
41     6.095
42     6.482
43     6.665
44     7.458
45     7.643
46       NaN
47       NaN
48     1.443
49     2.712
Name: be, dtype: float64
        gvkey   datadate  year       be  count        op       inv
0        1000 1961-12-31  1961      NaN      0       NaN       NaN
1        1000 1962-12-31  1962      NaN      1       NaN       NaN
2        1000 1963-12-31  1963    0.561      2  0.046346       NaN
3        1000 1964-12-31  1964    0.627      3  0.149920 

In [5]:
crsp.columns = crsp.columns.str.lower()
crsp['jdate']=crsp['date']+MonthEnd(0)
crsp[['ret', 'retx','dlret']] = crsp[['ret', 'retx','dlret']].apply(pd.to_numeric, errors='coerce')
crsp[['permco','permno','shrcd','exchcd']]=crsp[['permco','permno','shrcd','exchcd']].astype(int)

crsp['retadj'] = (1 + crsp['ret']) * (1 + crsp['dlret']) - 1

crsp['me'] = crsp['prc'].abs() * crsp['shrout']

crsp = crsp.drop(['dlret', 'prc', 'shrout'], axis=1)
crsp = crsp.sort_values(by=['jdate', 'permco', 'me'])

crsp_summe = crsp.groupby(['jdate', 'permco'])['me'].sum().reset_index()

crsp_maxme = crsp.groupby(['jdate', 'permco'])['me'].max().reset_index()
print(crsp_maxme)

crsp1 = pd.merge(crsp, crsp_maxme, how='inner', on=['jdate', 'permco', 'me'])
# print(crsp1)

crsp1 = crsp1.drop(['me'], axis=1)

# print(crsp_summe)

crsp2 = pd.merge(crsp1, crsp_summe, how='inner', on=['jdate', 'permco'])

crsp2 = crsp2.sort_values(by=['permno', 'jdate']).drop_duplicates()
# print(crsp2)

             jdate  permco           me
0       1960-01-31      74   20749.5000
1       1960-01-31     267   42066.0000
2       1960-01-31     584   35659.0000
3       1960-01-31     921   31020.0000
4       1960-01-31     994   13369.8750
...            ...     ...          ...
4203667 2023-12-31   59749  551423.1000
4203668 2023-12-31   59750   90108.1975
4203669 2023-12-31   59751   13319.6749
4203670 2023-12-31   59752  152025.4800
4203671 2023-12-31   59753    1040.9750

[4203672 rows x 3 columns]


In [6]:
crsp2['jdate'] = pd.to_datetime(crsp2['jdate'])
crsp2['year'] = crsp2['jdate'].dt.year
crsp2['month'] = crsp2['jdate'].dt.month
decme = crsp2[crsp2['month'] == 12]
decme = decme[['permno', 'date', 'jdate', 'me', 'year']].rename(columns={'me': 'dec_me'})

In [7]:
crsp2['ffdate'] = crsp2['jdate'] + MonthEnd(-6)
crsp2['ffyear'] = crsp2['ffdate'].dt.year
crsp2['ffmonth'] = crsp2['ffdate'].dt.month
crsp2['1+retx'] = 1 + crsp2['retx']
crsp2 = crsp2.sort_values(by=['permno', 'date'])
print(crsp2)

crsp2['cumretx'] = crsp2.groupby(['permno', 'ffyear'])['1+retx'].cumprod()

crsp2['lcumretx'] = crsp2.groupby(['permno'])['cumretx'].shift(1)

crsp2['lme'] = crsp2.groupby(['permno'])['me'].shift(1)

crsp2['count'] = crsp2.groupby(['permno']).cumcount()
crsp2['lme'] = np.where(crsp2['count'] == 0, crsp2['me'] / crsp2['1+retx'], crsp2['lme'])
print(crsp2.sort_values(['permco','date']).head(60))

mebase = crsp2[crsp2['ffmonth'] == 1][['permno', 'ffyear', 'lme']].rename(columns={'lme': 'mebase'})

crsp3 = pd.merge(crsp2, mebase, how='left', on=['permno', 'ffyear'])

crsp3['wt'] = np.where(crsp3['ffmonth'] == 1, crsp3['lme'], crsp3['mebase'] * crsp3['lcumretx'])
crsp3 = crsp3.sort_values('ffyear')
print(crsp3)

         permno       date  shrcd  exchcd  permco       ret      retx  \
1169225   10000 1986-01-31     10       3    7952       NaN       NaN   
1175474   10000 1986-02-28     10       3    7952 -0.257143 -0.257143   
1181734   10000 1986-03-31     10       3    7952  0.365385  0.365385   
1188016   10000 1986-04-30     10       3    7952 -0.098592 -0.098592   
1194308   10000 1986-05-30     10       3    7952 -0.222656 -0.222656   
...         ...        ...    ...     ...     ...       ...       ...   
4175572   93436 2023-08-31     11       3   53453 -0.034962 -0.034962   
4181866   93436 2023-09-29     11       3   53453 -0.030456 -0.030456   
4188144   93436 2023-10-31     11       3   53453 -0.197346 -0.197346   
4194402   93436 2023-11-30     11       3   53453  0.195379  0.195379   
4200623   93436 2023-12-29     11       3   53453  0.034988  0.034988   

             jdate  retadj            me  year  month     ffdate  ffyear  \
1169225 1986-01-31     NaN  1.610000e+04  1986 

In [8]:
decme['year'] = decme['year'] + 1
decme = decme[['permno', 'year', 'dec_me']]

crsp3_jun = crsp3[crsp3['month'] == 6]

crsp_jun = pd.merge(crsp3_jun, decme, how='inner', on=['permno', 'year'])
crsp_jun = crsp_jun[
    ['permno', 'date', 'jdate', 'shrcd', 'exchcd', 'retadj', 'me', 'wt', 'cumretx', 'mebase', 'lme', 'dec_me']]
crsp_jun = crsp_jun.sort_values(by=['permno', 'jdate'])

In [12]:
ccm = pd.read_csv("./ccm.csv")
ccm.columns = ccm.columns.str.lower()

ccm['gvkey'] = ccm['gvkey'].astype(str)
ccm['linkenddt'] = ccm['linkenddt'].fillna(pd.to_datetime('today'))
ccm['permno'] = ccm['lpermno'].astype(int)

ccm1 = pd.merge(comp[['gvkey', 'datadate', 'be', 'count','op','inv']], ccm, how='left', on=['gvkey'])
ccm1['yearend'] = ccm1['datadate'] + YearEnd(0)
ccm1['jdate'] = ccm1['yearend'] + MonthEnd(6)

ccm1['linkdt'] = pd.to_datetime(ccm1['linkdt'],errors='coerce')
ccm1['linkenddt'] = pd.to_datetime(ccm1['linkenddt'], errors='coerce')
ccm1['jdate'] = pd.to_datetime(ccm1['jdate'])

ccm2 = ccm1[(ccm1['jdate'] >= ccm1['linkdt']) & (ccm1['jdate'] <= ccm1['linkenddt'])]
print(ccm2.head(50))


      gvkey   datadate        be  count        op       inv linkprim linktype  \
9      1000 1970-12-31    10.544      9  0.430197  0.165018        P       LU   
10     1000 1971-12-31     8.382     10  0.166428 -0.123169        P       LU   
11     1000 1972-12-31     7.309     11  0.454919 -0.321275        P       LU   
12     1000 1973-12-31     8.798     12  0.432939  0.093635        P       LU   
13     1000 1974-12-31     8.279     13  0.483513  0.177622        P       LU   
14     1000 1975-12-31    11.020     14  0.504265 -0.067595        P       LU   
15     1000 1976-12-31    15.527     15  0.499517  0.614139        P       LU   
16     1000 1977-12-31    17.673     16  0.246534  0.140958        P       LU   
82   100004 2001-12-31  1073.960      1  0.432506  0.126266        P       LC   
83   100004 2002-12-31  1338.202      2  0.490188  0.256755        P       LC   
84   100004 2003-12-31  1862.740      3  0.407857  0.340720        P       LC   
85   100004 2004-12-31  2259

In [13]:
ccm2 = ccm2[['gvkey', 'permno', 'datadate', 'yearend', 'jdate', 'be', 'count','op','inv']]

ccm2['permno'] = ccm2['permno'].astype(int)
crsp_jun['jdate'] = pd.to_datetime(crsp_jun['jdate'])

ccm_jun = pd.merge(crsp_jun, ccm2, how='inner', on=['permno', 'jdate'])

ccm_jun['beme'] = ccm_jun['be'] * 1000 / ccm_jun['dec_me']


In [14]:
nyse = ccm_jun[(ccm_jun['exchcd'].isin([1])) & (ccm_jun['beme'] > 0) & (ccm_jun['me'] > 0) & \
               (ccm_jun['count'] >= 1) & ((ccm_jun['shrcd'] == 10) | (ccm_jun['shrcd'] == 11))]

nyse_sz = nyse.groupby(['jdate'])['me'].median().to_frame().reset_index().rename(columns={'me': 'sizemedn'})

nyse_bm = nyse.groupby(['jdate'])['beme'].describe(percentiles=[0.3, 0.7]).reset_index()
nyse_bm = nyse_bm[['jdate', '30%', '70%']].rename(columns={'30%': 'bm30', '70%': 'bm70'})


  sqr = _ensure_numeric((avg - values) ** 2)


In [15]:
nyse_op = nyse.groupby(['jdate'])['op'].describe(percentiles=[0.3, 0.7]).reset_index()
nyse_op = nyse_op[['jdate', '30%', '70%']].rename(columns={'30%': 'op30', '70%': 'op70'})

nyse_inv = nyse.groupby(['jdate'])['inv'].describe(percentiles=[0.3, 0.7]).reset_index()
nyse_inv = nyse_inv[['jdate', '30%', '70%']].rename(columns={'30%': 'inv30', '70%': 'inv70'})

nyse_breaks = pd.merge(nyse_sz, nyse_bm, how='inner', on='jdate')
nyse_breaks = pd.merge(nyse_breaks, nyse_op, how='inner', on='jdate')
nyse_breaks = pd.merge(nyse_breaks, nyse_inv, how='inner', on='jdate')

ccm1_jun = pd.merge(ccm_jun, nyse_breaks, how='left', on=['jdate'])

ccm1_jun['szport'] = np.where((ccm1_jun['beme'] > 0) & (ccm1_jun['me'] > 0) & (ccm1_jun['count'] >= 1),
                              ccm1_jun.apply(sz_bucket, axis=1), '')

ccm1_jun['bmport'] = np.where((ccm1_jun['beme'] > 0) & (ccm1_jun['me'] > 0) & (ccm1_jun['count'] >= 1),
                              ccm1_jun.apply(bm_bucket, axis=1), '')

ccm1_jun['opport'] = np.where((ccm1_jun['beme'] > 0) & (ccm1_jun['me'] > 0) & (ccm1_jun['count'] >= 1),
                              ccm1_jun.apply(op_bucket, axis=1), '')

ccm1_jun['invport'] = np.where((ccm1_jun['beme'] > 0) & (ccm1_jun['me'] > 0) & (ccm1_jun['count'] >= 1),
                               ccm1_jun.apply(inv_bucket, axis=1), '')

ccm1_jun['posbm'] = np.where((ccm1_jun['beme'] > 0) & (ccm1_jun['me'] > 0) & (ccm1_jun['count'] >= 1), 1, 0)
ccm1_jun['nonmissport'] = np.where((ccm1_jun['bmport'] != ''), 1, 0)

june = ccm1_jun[['permno', 'date', 'jdate', 'bmport', 'szport', 'posbm','opport','invport', 'nonmissport']]
june['ffyear'] = june['jdate'].dt.year

print(june)

        permno       date      jdate bmport szport  posbm opport invport  \
0        10000 1987-06-30 1987-06-30                    0                  
1        10001 1987-06-30 1987-06-30                    0                  
2        10001 1988-06-30 1988-06-30      H      S      1      W       C   
3        10001 1989-06-30 1989-06-30      H      S      1      W       C   
4        10001 1990-06-29 1990-06-30      M      S      1      N       A   
...        ...        ...        ...    ...    ...    ...    ...     ...   
241099   93433 2013-06-28 2013-06-30      M      S      1      R       C   
241100   93433 2014-06-30 2014-06-30      L      S      1      W       C   
241101   93433 2015-06-30 2015-06-30                    0                  
241102   93433 2016-06-30 2016-06-30                    0                  
241103   93435 2011-06-30 2011-06-30      M      S      1      N       A   

        nonmissport  ffyear  
0                 0    1987  
1                 0    1987

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  june['ffyear'] = june['jdate'].dt.year


In [17]:
crsp3 = crsp3[['date', 'permno', 'shrcd', 'exchcd', 'retadj', 'me', 'wt', 'cumretx', 'ffyear', 'jdate']]
ccm3 = pd.merge(crsp3,
                june[['permno', 'ffyear', 'szport', 'bmport','opport','invport', 'posbm', 'nonmissport']], how='left',
                on=['permno', 'ffyear'])

ccm4 = ccm3[(ccm3['wt'] > 0) & (ccm3['posbm'] == 1) & (ccm3['nonmissport'] == 1) &
            ((ccm3['shrcd'] == 10) | (ccm3['shrcd'] == 11))]

vwret_szbmopinv = ccm4.groupby(['jdate', 'szport', 'bmport','opport','invport']).apply(wavg, 'retadj', 'wt').to_frame().reset_index().rename(
    columns={0: 'vwret'})

vwret_szbm =ccm4.groupby(['jdate', 'szport', 'bmport']).apply(wavg, 'retadj', 'wt').to_frame().reset_index().rename(
    columns={0: 'vwret'})
vwret_szbm['szbmport'] = vwret_szbm['szport'] + vwret_szbm['bmport']

vwret_szop = ccm4.groupby(['jdate', 'szport', 'opport']).apply(wavg, 'retadj', 'wt').to_frame().reset_index().rename(
    columns={0: 'vwret'})
vwret_szop['szopport'] = vwret_szop['szport'] + vwret_szop['opport']

vwret_szinv = ccm4.groupby(['jdate', 'szport', 'invport']).apply(wavg, 'retadj', 'wt').to_frame().reset_index().rename(
    columns={0: 'vwret'})
vwret_szinv['szinvport'] = vwret_szinv['szport'] + vwret_szinv['invport']


szbm_factors = vwret_szbm.pivot(index='jdate', columns='szbmport', values='vwret').reset_index()
szop_factors = vwret_szop.pivot(index='jdate', columns='szopport', values='vwret').reset_index()
szinv_factors = vwret_szinv.pivot(index='jdate', columns='szinvport', values='vwret').reset_index()

vwret_n_szbm =ccm4.groupby(['jdate', 'szport', 'bmport'])['retadj'].count().reset_index().rename(columns={'retadj':'n_firms'})
vwret_n_szbm['szbmport']=vwret_n_szbm['szport']+vwret_n_szbm['bmport']
print(vwret_n_szbm)

vwret_n_szop =ccm4.groupby(['jdate', 'szport', 'opport'])['retadj'].count().reset_index().rename(columns={'retadj':'n_firms'})
vwret_n_szop['szopport']=vwret_n_szop['szport']+vwret_n_szop['opport']

vwret_n_szinv =ccm4.groupby(['jdate', 'szport', 'invport'])['retadj'].count().reset_index().rename(columns={'retadj':'n_firms'})
vwret_n_szinv['szinvport']=vwret_n_szinv['szport']+vwret_n_szinv['invport']

szbm_n_firms = vwret_n_szbm.pivot(index='jdate', columns='szbmport', values='n_firms').reset_index()
szop_n_firms = vwret_n_szop.pivot(index='jdate', columns='szopport', values='n_firms').reset_index()
szinv_n_firms = vwret_n_szinv.pivot(index='jdate', columns='szinvport', values='n_firms').reset_index()

  vwret_szbmopinv = ccm4.groupby(['jdate', 'szport', 'bmport','opport','invport']).apply(wavg, 'retadj', 'wt').to_frame().reset_index().rename(
  vwret_szbm =ccm4.groupby(['jdate', 'szport', 'bmport']).apply(wavg, 'retadj', 'wt').to_frame().reset_index().rename(
  vwret_szop = ccm4.groupby(['jdate', 'szport', 'opport']).apply(wavg, 'retadj', 'wt').to_frame().reset_index().rename(
  vwret_szinv = ccm4.groupby(['jdate', 'szport', 'invport']).apply(wavg, 'retadj', 'wt').to_frame().reset_index().rename(


          jdate szport bmport  n_firms szbmport
0    1961-07-31      S      L        0       SL
1    1961-08-31      S      L        0       SL
2    1961-09-30      S      L        0       SL
3    1961-10-31      S      L        0       SL
4    1961-11-30      S      L        0       SL
...         ...    ...    ...      ...      ...
4435 2023-12-31      B      L        0       BL
4436 2023-12-31      B      M        0       BM
4437 2023-12-31      S      H        0       SH
4438 2023-12-31      S      L        0       SL
4439 2023-12-31      S      M        0       SM

[4440 rows x 5 columns]


In [18]:
ff_nfirms = pd.DataFrame()
ff_factors = pd.DataFrame()

In [19]:
#SMB(B/M) = 1/3(small value + small Neutral + Small Growth) - 1/3(big value +big neutral + big growth)

ff_factors['SMB(B/M)'] = 1/3 * (szbm_factors['SH'] + szbm_factors["SM"] + szbm_factors['SL'])- 1/3 * (szbm_factors['BH'] + szbm_factors["BM"] + szbm_factors['BL'])

#SMB(OP) = 1/3(small robust + small Neutral + Small weak) - 1/3(big robust +big neutral + big weak)

ff_factors['SMB(OP)'] = 1 / 3 * (szop_factors['SR'] + szop_factors["SN"] + szop_factors['SW']) - 1 / 3 * (
        szop_factors['BR'] + szop_factors["BN"] + szop_factors['BW'])

#SMB(INV) = 1/3(small conservative + small Neutral + Small aggresive) - 1/3(big conservative +big neutral + big aggresive)
ff_factors['SMB(INV)'] = 1 / 3 * (szinv_factors['SC'] + szinv_factors["SN"] + szinv_factors['SA']) - 1 / 3 * (
        szinv_factors['BC'] + szinv_factors["BN"] + szinv_factors['BA'])

ff_factors['WSMB'] = 1/3 *  ff_factors['SMB(B/M)'] + 1/3 * ff_factors['SMB(OP)'] + 1/3 *  ff_factors['SMB(INV)']
ff_factors['date'] = szbm_factors['jdate']

#HML = 1/2(small value + big value) - 1/2(small growth + big growth)
ff_factors['WHML'] = 1/2 * (szbm_factors['SH'] + szbm_factors['BH']) - 1/2 * (szbm_factors['SL'] + szbm_factors['BL'])

#RMW = 1/2(small robust + big robust) - 1/2(small weak + big weak)

ff_factors['WRMW'] = 1/2 * (szop_factors['SR'] + szop_factors['BR']) - 1/2 * (szop_factors['SW'] + szop_factors['BW'])

#CMA = 1/2(small conservative + big conservative) - 1/2(small aggresive + big aggresive)
ff_factors['WCMA'] = 1 / 2 * (szinv_factors['SC'] + szinv_factors['BC']) - 1 / 2 * (
        szinv_factors['SA'] + szinv_factors['BA'])

#n_firms
ff_nfirms['B'] = szbm_n_firms['BH'] + szbm_n_firms['BM'] + szbm_n_firms['BL']
ff_nfirms['S'] = szbm_n_firms['SH'] + szbm_n_firms['SM'] + szbm_n_firms['SL']
ff_nfirms['SMB'] =  ff_nfirms['B'] +  ff_nfirms['B']

ff_nfirms['H'] = szbm_n_firms['BH'] + szbm_n_firms['SH']
ff_nfirms['L'] = szbm_n_firms['BL'] + szbm_n_firms['SL']
ff_nfirms['HML'] = ff_nfirms['H'] + ff_nfirms['L']

ff_nfirms['R'] = szop_n_firms['BR'] + szop_n_firms['SR']
ff_nfirms['W'] = szop_n_firms['BW'] + szop_n_firms['SW']
ff_nfirms['RMW'] = ff_nfirms['R'] + ff_nfirms['W']

ff_nfirms['C'] = szinv_n_firms['BC'] + szinv_n_firms['SC']
ff_nfirms['A'] = szinv_n_firms['BA'] + szinv_n_firms['SA']
ff_nfirms['CMA'] = ff_nfirms['C'] + ff_nfirms['A']

print(ff_nfirms)

       B    S  SMB    H    L  HML    R    W  RMW    C    A  CMA
0    NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN
1    NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN
2    NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN
3    NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN
4    NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN
..   ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...
745  2.0  0.0  4.0  0.0  1.0  1.0  1.0  0.0  1.0  0.0  2.0  2.0
746  0.0  1.0  0.0  0.0  0.0  0.0  0.0  1.0  1.0  1.0  0.0  1.0
747  1.0  0.0  2.0  1.0  0.0  1.0  1.0  0.0  1.0  0.0  1.0  1.0
748  2.0  2.0  4.0  2.0  0.0  2.0  2.0  0.0  2.0  0.0  1.0  1.0
749  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  NaN  0.0  NaN

[750 rows x 12 columns]


In [29]:
# data_file4 = os.path.join('.', "F-F_Research_Data_5_Factors_2x3.csv")
_ff = pd.read_csv('ff_5.csv', header=None, skiprows=4, sep=',', skipinitialspace=True)
_ff.columns = ['date', 'Mkt-RF', 'SMB', 'HML', 'RMW','CMA','RF']

_ff[['Mkt-RF', 'SMB', 'HML','RMW','CMA', 'RF']] = _ff[['Mkt-RF', 'SMB', 'HML', 'RMW','CMA','RF']].apply(pd.to_numeric, errors='coerce')
_ff[['Mkt-RF', 'SMB', 'HML', 'RMW','CMA','RF']] = _ff[['Mkt-RF', 'SMB', 'HML','RMW','CMA', 'RF']].astype(float)
_ff[['sdate']] = _ff[['date']].astype(str)
_ff70_24 = _ff[(_ff['sdate'] <= '202408') & (_ff['sdate'] >= '197001')&(_ff['sdate'].str.len() == 6) ]

_ff70_24['date'] = pd.to_datetime(_ff70_24['date'],format='%Y%m')
_ff70_24 = _ff70_24[['date', 'SMB', 'HML','RMW','CMA',]]
_ff70_24['date'] = _ff70_24['date'] + MonthEnd(0)

_ffcomp70_24 = pd.merge(_ff70_24, ff_factors[['date', 'WSMB','WHML','WRMW','WCMA']], how='inner', on=['date'])
_ffcomp70_24['WSMB'] = _ffcomp70_24['WSMB'] * 100
_ffcomp70_24['WHML'] = _ffcomp70_24['WHML'] * 100
_ffcomp70_24['WRMW'] = _ffcomp70_24['WRMW'] * 100
_ffcomp70_24['WCMA'] = _ffcomp70_24['WCMA'] * 100

diff_SMB = abs(_ffcomp70_24['WSMB'] - _ffcomp70_24['SMB'])
diff_HML = abs(_ffcomp70_24['WHML'] - _ffcomp70_24['HML'])
diff_RMW = abs(_ffcomp70_24['WRMW'] - _ffcomp70_24['RMW'])
diff_CMA = abs(_ffcomp70_24['WCMA'] - _ffcomp70_24['CMA'])

mean_abs_diff_SMB = diff_SMB.mean()
print(mean_abs_diff_SMB)
mean_abs_diff_HML = diff_HML.mean()
print(mean_abs_diff_HML)
mean_abs_diff_RMW = diff_RMW.mean()
print(mean_abs_diff_RMW)
mean_abs_diff_CMA = diff_CMA.mean()
print(mean_abs_diff_CMA)


_ffcomp70_24 = _ffcomp70_24.dropna(subset=['SMB', 'WSMB', 'HML', 'WHML', 'RMW', 'WRMW'])
_ffcomp70_24 = _ffcomp70_24[~_ffcomp70_24[['SMB', 'WSMB', 'HML', 'WHML', 'RMW', 'WRMW']].isin([np.inf, -np.inf]).any(axis=1)]

print(stats.pearsonr(_ffcomp70_24['SMB'], _ffcomp70_24['WSMB']))
print(stats.pearsonr(_ffcomp70_24['HML'], _ffcomp70_24['WHML']))
print(stats.pearsonr(_ffcomp70_24['RMW'], _ffcomp70_24['WRMW']))
print(stats.pearsonr(_ffcomp70_24['CMA'], _ffcomp70_24['WCMA']))

2.2417840912239946
2.2647314876821083
1.5702304250464232
1.5821229614476224
PearsonRResult(statistic=0.010594959569945247, pvalue=0.787943628929086)
PearsonRResult(statistic=0.1031687440471157, pvalue=0.008635508864960176)
PearsonRResult(statistic=0.022481784883094536, pvalue=0.5681234213664652)
PearsonRResult(statistic=0.10702832045484173, pvalue=0.006430932477593392)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  _ff70_24['date'] = pd.to_datetime(_ff70_24['date'],format='%Y%m')


In [26]:
_ff.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731 entries, 0 to 730
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   date    731 non-null    int64  
 1   Mkt-RF  731 non-null    float64
 2   SMB     731 non-null    float64
 3   HML     731 non-null    float64
 4   RMW     731 non-null    float64
 5   CMA     731 non-null    float64
 6   RF      731 non-null    float64
dtypes: float64(6), int64(1)
memory usage: 40.1 KB
