In [1]:
import pandas as pd

# Load the merged dataset (check the path after uploading the file) and parse
# its 'date' column as datetimes in one chain. With errors='coerce',
# unparseable entries become NaT instead of raising.
df = pd.read_csv("merged_compustat_crsp.csv").assign(
    date=lambda frame: pd.to_datetime(frame['date'], errors='coerce')
)

In [2]:
# Columns excluded from the feature matrix.
drop_columns = [
    'datadate',
    'gvkey',
    'cusip',
    'cusip8',
    'cik',
    'PERMNO',
    'PERMCO',
    'CUSIP',
    'HdrCUSIP',
    'Ticker',
    'TradingSymbol',
    'MthCalDt',
    'sprtrn',
    'SICCD',
    'NAICS',
    'date',
]

# Target (y): the monthly return column.
y = df['MthRet']

# Features (X): everything except the excluded columns and the target itself,
# with missing values replaced by 0.
# NOTE(review): every column not listed above becomes a feature — confirm none
# of them is derived from the same-month return, which would leak the target.
X = df.drop(columns=[*drop_columns, 'MthRet']).fillna(0)

In [7]:
# Boolean mask selecting the rows whose target value is present.
valid_mask = y.notna()

# Apply the same row filter to the features, the target and the source frame
# so that all three keep matching rows.
X, y, df = X[valid_mask], y[valid_mask], df[valid_mask]

In [8]:
# Chronological split on 'date':
#   train      : strictly before 2017-01-01
#   validation : 2017-01-01 up to (not including) 2021-01-01
#   test       : 2021-01-01 onwards
train_mask = df['date'].lt('2017-01-01')
val_mask = df['date'].ge('2017-01-01') & df['date'].lt('2021-01-01')
test_mask = df['date'].ge('2021-01-01')

# Slice the features and the target with each mask.
X_train, y_train = X[train_mask], y[train_mask]
X_val, y_val = X[val_mask], y[val_mask]
X_test, y_test = X[test_mask], y[test_mask]

In [9]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

In [10]:
# Linear-regression baseline with default settings, fitted on the training split.
lr_model = LinearRegression()
lr_model.fit(X=X_train, y=y_train)

In [11]:
# Predictions of the linear baseline on the validation split.
y_val_pred = lr_model.predict(X=X_val)

In [12]:
# Validation metrics for the linear baseline: RMSE (square root of the MSE) and R².
rmse, r2 = (
    np.sqrt(mean_squared_error(y_val, y_val_pred)),
    r2_score(y_val, y_val_pred),
)

# Report both metrics, one per line, with five decimals.
print(f"Validation RMSE: {rmse:.5f}", f"Validation R²: {r2:.5f}", sep="\n")

Validation RMSE: 0.12648
Validation R²: 0.14559


In [13]:
from sklearn.neural_network import MLPRegressor

In [15]:
# First MLP attempt: two ReLU hidden layers (64 then 32 units), Adam solver,
# at most 500 iterations, fixed seed for repeatable runs.
mlp = MLPRegressor(
    random_state=42,
    max_iter=500,
    solver='adam',
    activation='relu',
    hidden_layer_sizes=(64, 32),
)

# Fit on the training split (kept as the cell's final expression).
mlp.fit(X=X_train, y=y_train)

In [16]:
# Validation-split predictions from the MLP currently bound to `mlp`.
y_val_pred = mlp.predict(X=X_val)

# RMSE (square root of the MSE) and R² on the validation split.
rmse, r2 = (
    np.sqrt(mean_squared_error(y_val, y_val_pred)),
    r2_score(y_val, y_val_pred),
)

# One metric per line, five decimals each.
print(f"MLP Validation RMSE: {rmse:.5f}", f"MLP Validation R²: {r2:.5f}", sep="\n")

MLP Validation RMSE: 0.13921
MLP Validation R²: -0.03505


In [17]:
# Second MLP attempt: smaller hidden layers (32 then 16 units), early stopping
# switched on, iteration cap raised to 1000; same solver, activation and seed.
mlp = MLPRegressor(
    random_state=42,
    early_stopping=True,
    max_iter=1000,
    solver='adam',
    activation='relu',
    hidden_layer_sizes=(32, 16),
)

# Fit on the training split (kept as the cell's final expression).
mlp.fit(X=X_train, y=y_train)

In [18]:
# Predict on the validation split with the MLP currently bound to `mlp`.
y_val_pred = mlp.predict(X=X_val)

# Validation error (RMSE = sqrt of MSE) and goodness of fit (R²).
rmse, r2 = (
    np.sqrt(mean_squared_error(y_val, y_val_pred)),
    r2_score(y_val, y_val_pred),
)

# Print both figures on separate lines with five decimals.
print(f"MLP Validation RMSE: {rmse:.5f}", f"MLP Validation R²: {r2:.5f}", sep="\n")

MLP Validation RMSE: 0.14743
MLP Validation R²: -0.16075


In [23]:
# Third MLP attempt: hidden layers of 16 then 8 units with the `alpha`
# hyperparameter set to 0.01; early stopping, 1000-iteration cap and seed
# as in the previous attempt.
mlp = MLPRegressor(
    random_state=42,
    early_stopping=True,
    max_iter=1000,
    alpha=0.01,
    solver='adam',
    activation='relu',
    hidden_layer_sizes=(16, 8),
)

# Fit on the training split (kept as the cell's final expression).
mlp.fit(X=X_train, y=y_train)

In [24]:
# Validation-split predictions from the MLP currently bound to `mlp`.
y_val_pred = mlp.predict(X=X_val)

# Compute RMSE (sqrt of MSE) and R² against the validation targets.
rmse, r2 = (
    np.sqrt(mean_squared_error(y_val, y_val_pred)),
    r2_score(y_val, y_val_pred),
)

# Emit the two metrics line by line, five decimals each.
print(f"MLP Validation RMSE: {rmse:.5f}", f"MLP Validation R²: {r2:.5f}", sep="\n")

MLP Validation RMSE: 0.13229
MLP Validation R²: 0.06530


In [22]:
# Display the training feature matrix as a visual sanity check of the model inputs.
X_train

Unnamed: 0,fqtr,cstkey,cicurry,dilady,niy,nopiy,invchy,esubcy,apalchy,aolochy,...,o_score,prc_highprc_252d,qmj,qmj_growth,resff3_6_1,ret_12_1,ret_3_1,ret_6_1,sale_me,z_score
0,1.358545,0.101646,0.078191,-0.102176,1.952645,0.225753,-0.520806,0.037341,3.525858,0.017512,...,-0.344697,0.554630,0.518792,0.616769,0.893431,0.865238,-0.388681,0.693485,0.750629,-0.260626
1,0.460614,0.101646,0.078191,-0.102176,-0.293872,-0.007937,0.048488,0.037341,0.056810,0.023267,...,-0.344697,0.554630,0.518792,0.616769,0.893431,0.865238,-0.388681,0.693485,0.750629,-0.260626
2,1.358545,0.101646,0.078191,-0.102176,-0.191674,-0.011185,-0.021842,0.037341,-0.011498,0.072934,...,-0.344697,0.554630,0.518792,0.616769,0.893431,0.865238,-0.388681,0.693485,0.750629,-0.260626
3,0.460614,0.101646,0.078191,-0.102176,-0.319465,-0.007781,0.049002,0.037341,-0.011498,0.023405,...,-0.344697,0.554630,0.518792,0.616769,0.893431,0.865238,-0.388681,0.693485,0.750629,-0.260626
4,-0.437317,0.101646,0.366522,-0.102176,-0.245127,-0.011185,0.064250,0.037341,0.102768,0.017188,...,-0.344697,0.554630,0.518792,0.616769,0.893431,0.865238,-0.388681,0.693485,0.750629,-0.260626
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25815,0.460614,0.101646,0.098154,-0.102176,-0.563366,0.095700,0.081983,0.037341,-0.265970,0.002074,...,-0.068555,-0.514243,-0.975949,0.496225,0.648934,-1.069316,2.118806,0.088851,1.656021,-0.710144
25816,0.460614,0.101646,-0.056201,-0.102176,-0.144454,-0.011185,0.052075,0.295235,-0.027975,0.131331,...,-0.068555,-0.514243,-0.975949,0.496225,0.648934,-1.069316,2.118806,0.088851,1.656021,-0.710144
25817,1.358545,0.101646,-0.043984,-0.102176,-0.130552,0.012435,0.067077,0.037341,-0.011498,0.083313,...,-0.068555,-0.514243,-0.975949,0.496225,0.648934,-1.069316,2.118806,0.088851,1.656021,-0.710144
25818,0.460614,0.101646,-0.173489,-0.102176,-0.181898,-0.006756,0.057511,0.037341,-0.011498,0.042733,...,-0.068555,-0.514243,-0.975949,0.496225,0.648934,-1.069316,2.118806,0.088851,1.656021,-0.710144


In [25]:
# Fourth MLP attempt: same 16 -> 8 architecture, `alpha` raised to 0.1.
# The model is built and fitted on the training split in one expression
# (`fit` returns the estimator itself).
mlp = MLPRegressor(
    random_state=42,
    early_stopping=True,
    max_iter=1000,
    alpha=0.1,
    solver='adam',
    activation='relu',
    hidden_layer_sizes=(16, 8),
).fit(X_train, y_train)

# Validation-split predictions.
y_val_pred = mlp.predict(X_val)

# RMSE (square root of the MSE) and R² on the validation split.
rmse, r2 = (
    np.sqrt(mean_squared_error(y_val, y_val_pred)),
    r2_score(y_val, y_val_pred),
)

# One metric per line, five decimals each.
print(f"MLP Validation RMSE: {rmse:.5f}", f"MLP Validation R²: {r2:.5f}", sep="\n")

MLP Validation RMSE: 0.12658
MLP Validation R²: 0.14434
