In [40]:
import pandas as pd
import numpy as np
from datetime import datetime
import warnings

warnings.filterwarnings("ignore")

In [41]:
# the stock data
df = pd.read_csv('data/return_predictability_data.csv')

# The industry dataframe
# ind=pd.read_csv('data/industry.csv')

In [None]:
df = df[~np.isnan(df['bm'])]

In [71]:
def calcSMB(df):
    """
    Returns SMB for FF
    """
    # Define Quantile
    SQuantile = 0.3
    LQuantile = 0.7
    df["SMB"] = ""

    # Assigns stock size based on market cap
    df.SMB[df.mvel1.quantile(q=SQuantile) > df.mvel1] = "SCap"
    df.SMB[(df.mvel1 > df.mvel1.quantile(SQuantile)) & (df.mvel1 < df.mvel1.quantile(q=LQuantile))] = "MCap"
    df.SMB[df.mvel1.quantile(q=LQuantile) < df.mvel1] = "LCap"

    # Calculate average return of stocks in portfolio subset based on size
    SmallCapReturn = df.risk_premium.loc[df['SMB'] == "SCap"].mean()
    LargeCapReturn = df.risk_premium.loc[df['SMB'] == "LCap"].mean()

    # Returns SMB based on definition
    SMB = SmallCapReturn - LargeCapReturn
    print(round(SMB, 4))
    return round(SMB, 4)




In [72]:
def calcHML(df):
    """
    Returns HML for FF
    """

    # Assigns stock size based on market cap
    SQuantile = 0.3
    LQuantile = 0.7
    df["HML"] = ""

    #Assign stock size based on market cap
    df.HML[df.bm <= df.bm.quantile(q=SQuantile)] = "SValue"
    df.HML[(df.bm > df.bm.quantile(q=SQuantile)) & (df.bm < df.bm.quantile(q=LQuantile))] = "MValue"
    df.HML[df.bm >= df.bm.quantile(q=LQuantile)] = "LValue"

    #Calculates average return of stocks in portfolio subset based on size
    SmallValueReturn = df.risk_premium.loc[df['HML'] == "SValue"].mean()
    LargeValueReturn = df.risk_premium.loc[df["HML"] == "LValue"].mean()

    # Return SMB based on definition
    HML = SmallValueReturn - LargeValueReturn
    print(round(HML, 4))
    return round(HML, 4)


In [45]:
df = df.assign(RM_RF=lambda x: (x['risk_premium'] - x['macro_tbl'])/x['beta'])

In [46]:
df.groupby('DATE')['RM_RF'].mean()

DATE
1962-07-31   -5.441999
1962-08-31   -0.646829
1962-09-28   -7.485565
1962-10-31   -7.014487
1962-11-30    6.929655
                ...   
2021-07-30   -0.733862
2021-08-31    4.917440
2021-09-30    1.309428
2021-10-29    8.932851
2021-11-30   -2.861788
Name: RM_RF, Length: 713, dtype: float64

In [73]:
dateList = list(df['DATE'].unique())

FFA = pd.DataFrame(columns=
    ['Date',
     'HML',
     'SMB',
     'MktPrem',
     'Rf',
     'Equity_Premium',
     ]
)

FFAIndex = 0
for i in dateList:
    FFA.loc[FFAIndex] = [i,
        calcHML(df.loc[df['DATE'] == i]),
        calcSMB(df.loc[df['DATE'] == i]),
        df.loc[df['DATE'] == i]['RM_RF'].mean(),
        df.loc[df['DATE'] == i]['macro_tbl'].mean(),
        df.loc[df['DATE'] == i]['risk_premium'].mean(),

    ]
    FFAIndex += 1

nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
-2.3033
4.8829
-3.0822
-2.6332
-1.755
-1.5756
-4.6442
0.7038
-2.2953
2.0675
1.4554
1.1627
2.7908
-1.2658
0.0294
-0.8295
-0.7222
0.7571
-0.1468
-0.067
-0.8816
-0.3119
0.6605
-1.3638
-0.841
1.935
-1.9976
0.8379
-4.8326
3.5556
-1.7523
-2.1131
0.6991
0.858
0.5614
1.2672
1.197
0.2726
-1.2462
1.337
-0.6108
-0.0489
-1.2145
2.7357
2.4648
-0.1965
2.651
0.2467
-0.6773
1.9117
-1.2104
3.1252
-1.6358
2.5378
-2.8782
3.5282
0.2042
-0.3653
2.4904
-4.358
-2.0371
2.2576
0.6667
2.7798
-1.0242
3.0104
-2.5861
3.02
-1.2444
3.7267
-3.5149
4.0571
-4.1622
4.9321
-2.3583
2.8013
3.346
-0.8921
-1.3881
1.0934
3.6703
-2.9759
-2.0959
-0.27
0.3641
0.273
0.7454
-2.6893
0.1909
-0.6598
2.6998
-5.7726
3.1866
2.3266
0.8106
-1.1766
-5.7349
12.1411
0.3556
4.9311
-0.4824
2.3851
-0.4286
1.5017
-1.413
5.3374
-4.0852
10.1701
-4.2983
5.3456
-1.9072
1.8797
-0.5439
7.4501
-0.1328
4.9977
2.1884
-1.2615
-1.8028
8.4361
-7.2059
12.5313
-0.4339
-3.2271
-0.018
-1.3842
-1.0922
7.5987
-2.861

In [74]:
FFA

Unnamed: 0,Date,HML,SMB,MktPrem,Rf,Equity_Premium
0,1962-07-31,,,-5.441999,0.0292,-6.781600
1,1962-08-31,,,-0.646829,0.0282,-0.769200
2,1962-09-28,,,-7.485565,0.0278,-8.740000
3,1962-10-31,,,-7.014487,0.0274,-8.393800
4,1962-11-30,,,6.929655,0.0283,8.418200
...,...,...,...,...,...,...
708,2021-07-30,-0.1612,-5.4062,-0.733862,0.0005,-3.285524
709,2021-08-31,0.2147,0.5585,4.917440,0.0005,1.918121
710,2021-09-30,-3.3684,1.4522,1.309428,0.0004,-2.473316
711,2021-10-29,0.7635,-4.9848,8.932851,0.0005,2.907306


In [76]:
FFA = FFA.interpolate(limit_direction='backward')



In [77]:
np.any(np.isfinite(FFA))

True

In [None]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(FFA, test_size=0.25, random_state=42)

train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

In [78]:
X_train, X_test, y_train, y_test = train_test_split(FFA.iloc[:,:4], FFA['Equity_Premium'], test_size=0.20, random_state=42)

In [79]:
FFA

Unnamed: 0,HML,SMB,MktPrem,Rf,Equity_Premium
0,-2.3033,4.8829,-5.441999,0.0292,-6.781600
1,-2.3033,4.8829,-0.646829,0.0282,-0.769200
2,-2.3033,4.8829,-7.485565,0.0278,-8.740000
3,-2.3033,4.8829,-7.014487,0.0274,-8.393800
4,-2.3033,4.8829,6.929655,0.0283,8.418200
...,...,...,...,...,...
708,-0.1612,-5.4062,-0.733862,0.0005,-3.285524
709,0.2147,0.5585,4.917440,0.0005,1.918121
710,-3.3684,1.4522,1.309428,0.0004,-2.473316
711,0.7635,-4.9848,8.932851,0.0005,2.907306


In [80]:
from sklearn.linear_model import LinearRegression


lm = LinearRegression()


In [81]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import r2_score
from sklearn.metrics import make_scorer

cv = KFold(n_splits=5, shuffle=True, random_state=45)
r2 = make_scorer(r2_score)
r2_val_score = cross_val_score(lm, X_train, y_train, cv=cv, scoring=r2)
scores=[r2_val_score.mean()]
scores



ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\drebi\miniconda3\envs\statclass\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\drebi\miniconda3\envs\statclass\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\drebi\miniconda3\envs\statclass\Lib\site-packages\sklearn\linear_model\_base.py", line 609, in fit
    X, y = self._validate_data(
           ^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\drebi\miniconda3\envs\statclass\Lib\site-packages\sklearn\base.py", line 650, in _validate_data
    X, y = check_X_y(X, y, **check_params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\drebi\miniconda3\envs\statclass\Lib\site-packages\sklearn\utils\validation.py", line 1301, in check_X_y
    X = check_array(
        ^^^^^^^^^^^^
  File "c:\Users\drebi\miniconda3\envs\statclass\Lib\site-packages\sklearn\utils\validation.py", line 1064, in check_array
    _assert_all_finite(
  File "c:\Users\drebi\miniconda3\envs\statclass\Lib\site-packages\sklearn\utils\validation.py", line 123, in _assert_all_finite
    _assert_all_finite_element_wise(
  File "c:\Users\drebi\miniconda3\envs\statclass\Lib\site-packages\sklearn\utils\validation.py", line 172, in _assert_all_finite_element_wise
    raise ValueError(msg_err)
ValueError: Input X contains infinity or a value too large for dtype('float64').
