In [None]:
!pip install numerapi
import numerapi
NAPI = numerapi.NumerAPI(verbosity="info")
import numpy as np
import random as rn
import pandas as pd
import seaborn as sns
import lightgbm as lgb
import matplotlib.pyplot as plt
from scipy.stats import spearmanr, pearsonr
from sklearn.metrics import mean_absolute_error
import os

# Data directory
DIR = "kaggle/working"
#def download_current_data(directory: str):
#        """
#        現在のラウンドのデータをダウンロードします
#        ：param directory：データを保存する必要があるディレクトリへのパス
#        """
#        current_round = NAPI.get_current_round()
#        if os.path.isdir(f'{directory}/numerai_dataset_{current_round}/'):
#            print(f"You already have the newest data! Current round is: {current_round}")
#        else:
#            print(f"Downloading new data for round: {current_round}!")
#            NAPI.download_current_dataset(dest_path=directory, unzip=True)

def load_data(directory: str, reduce_memory: bool=True) -> tuple:
    """
    Load the current round's training and tournament data from disk.

    The files are read from
    ``{directory}/numerai_dataset_{round}/numerai_training_data.csv`` and
    ``{directory}/numerai_dataset_{round}/numerai_tournament_data.csv``,
    where ``round`` is the value returned by ``NAPI.get_current_round()``.

    :param directory: Path to the directory that contains the dataset folder
    :param reduce_memory: When True, every column of the training file whose name
        starts with "feature" is parsed as a 32-bit float in both files
    :return: Tuple ``(train, val, test)``. ``val`` holds the tournament rows whose
        ``data_type`` equals "validation"; ``test`` holds all other tournament rows
    """
    print('------------------------------------------Loading the data')
    full_path = f'{directory}/numerai_dataset_{NAPI.get_current_round()}/'
    train_path = full_path + 'numerai_training_data.csv'
    test_path = full_path + 'numerai_tournament_data.csv'

    dtypes = None
    if reduce_memory:
        # Read only the header so the feature columns can be parsed straight into
        # float32. Loading as float64 and casting afterwards needs the full-size
        # frame in memory first, which defeats the purpose of the option.
        header = pd.read_csv(train_path, nrows=0)
        dtypes = {f: np.float32 for f in header.columns if f.startswith("feature")}

    train = pd.read_csv(train_path, dtype=dtypes)
    tournament = pd.read_csv(test_path, dtype=dtypes)
    print('------------------------------------------End of loading the data')

    # The tournament file mixes labelled validation rows with unlabelled test rows.
    # Earlier experiment (disabled): flag validation eras > 180 as the hard subset
    # and eras <= 180 as the easy subset.
    is_validation = tournament['data_type'] == 'validation'
    val = tournament[is_validation]
    test = tournament[~is_validation]
    print('------------------------------------------END')
    return train, val, test
    
    
# Download, unzip and load data
# download_current_data(DIR)
# train, val, test = load_data(DIR, reduce_memory=True)


def sharpe_ratio(corrs: pd.Series) -> np.float32:
    """
    Compute the Numerai Sharpe ratio from per-era correlation values.

    :param corrs: Pandas Series holding one correlation coefficient per era
    :return: Mean of ``corrs`` divided by ``corrs.std()``
    """
    average = corrs.mean()
    spread = corrs.std()
    return average / spread


def evaluate(df: pd.DataFrame) -> tuple:
    """
    Compute and print the Numerai-related metrics for a frame of predictions.

    :param df: Pandas DataFrame containing the columns "era", "target" and "prediction"
    :return: Tuple ``(spearman, payout, numerai_sharpe, mae)``, each rounded to 4 decimals
    """
    def _score(sub_df: pd.DataFrame) -> np.float32:
        """Spearman correlation between target and prediction for one era."""
        return spearmanr(sub_df["target"], sub_df["prediction"])[0]

    # One correlation per era. Selecting the two needed columns keeps the era
    # key out of the frames handed to _score.
    corrs = df.groupby("era")[["target", "prediction"]].apply(_score)
    print(corrs)
    payout_raw = (corrs / 0.2).clip(-1, 1)
    spearman = round(corrs.mean(), 4)

    payout = round(payout_raw.mean(), 4)
    numerai_sharpe = round(sharpe_ratio(corrs), 4)
    # Use the built-in round like the other metrics: a ``.round`` method only
    # exists on NumPy scalars, so it breaks if the metric returns a plain float.
    mae = round(float(mean_absolute_error(df["target"], df["prediction"])), 4)

    # Display metrics
    print(f"Spearman Correlation: {spearman}")
    print(f"Average Payout: {payout}")
    print(f"Sharpe Ratio: {numerai_sharpe}")
    print(f"Mean Absolute Error (MAE): {mae}")
    return spearman, payout, numerai_sharpe, mae

def neutralize(series, by, proportion=1.0):
    """
    Remove a proportion of the linear exposure of ``series`` to ``by``.

    A least-squares fit of ``series`` on the columns of ``by`` plus a constant
    column (filled with the mean of ``series``) is computed, and ``proportion``
    times the fitted values is subtracted from ``series``. Rows are paired by
    position, not by index label.

    :param series: Pandas Series of scores to neutralize
    :param by: Pandas Series (one exposure) or DataFrame (one exposure per column)
        with the same number of rows as ``series``
    :param proportion: Fraction of the fitted exposure to remove; 0 returns the
        scores unchanged, 1 removes the fitted component completely
    :return: Pandas Series of neutralized scores carrying the index of ``series``
    """
    scores = series.values.reshape(-1, 1)
    exposures = np.asarray(by)
    if exposures.ndim == 1:
        # A single exposure vector becomes one column; a 2-D input keeps one
        # column per exposure instead of being flattened into a single column.
        exposures = exposures.reshape(-1, 1)
    constant = np.full((len(exposures), 1), np.mean(series))
    exposures = np.hstack((exposures, constant))
    # rcond=None selects NumPy's documented default cut-off explicitly, which
    # avoids the FutureWarning older NumPy versions emit when it is omitted.
    coefficients = np.linalg.lstsq(exposures, scores, rcond=None)[0]
    correction = proportion * exposures.dot(coefficients)
    corrected_scores = scores - correction
    neutralized = pd.Series(corrected_scores.ravel(), index=series.index)

    return neutralized





In [None]:
train, val, test = load_data(DIR, reduce_memory=True)

------------------------------------------Loading the data
------------------------------------------End of loading the data
------------------------------------------END


In [None]:
train.head()

In [None]:
def get_group_stats(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add row-wise mean, standard deviation and skewness features per feature group.

    For every group name, the source columns are the string-labelled columns
    whose name contains the group name, excluding the three statistic columns
    this function itself creates. That makes the function safe to call again on
    a frame that already carries the statistics.

    :param df: Pandas DataFrame containing all features; it is modified in place
    :return: The same DataFrame with ``feature_{group}_mean``, ``feature_{group}_std``
        and ``feature_{group}_skew`` columns added for each group
    """
    stats = ("mean", "std", "skew")
    for group in ["intelligence", "wisdom", "charisma", "dexterity", "strength", "constitution"]:
        derived = {f"feature_{group}_{stat}" for stat in stats}
        # Skip non-string labels (`in` would raise on them) and previously
        # derived columns (they would otherwise be averaged into themselves).
        cols = [
            col for col in df.columns
            if isinstance(col, str) and group in col and col not in derived
        ]
        group_values = df[cols]
        df[f"feature_{group}_mean"] = group_values.mean(axis=1)
        df[f"feature_{group}_std"] = group_values.std(axis=1)
        df[f"feature_{group}_skew"] = group_values.skew(axis=1)
    return df

    # Add group statistics features


# Add the group statistics features to every split
train, val, test = (get_group_stats(split) for split in (train, val, test))
print('------------------------------------------END')

In [None]:
from sklearn import preprocessing

# Despite the variable name, this is a plain StandardScaler: it standardises the
# selected columns and does not build interaction terms.
interactions = preprocessing.StandardScaler()
# Alternative selection tried earlier:
#ft_corr_list=['feature_dexterity7', 'feature_charisma18', 'feature_charisma63', 'feature_dexterity14']
# ft_corr_list holds the features that the scaled copies are built from.
ft_corr_list=['feature_constitution96', 'feature_wisdom32', 'feature_constitution32', 'feature_strength14', 'feature_intelligence3', 'feature_dexterity7']
# Give the scaled columns string labels '0', '1', ... A bare DataFrame would
# label them with the integers 0, 1, ..., which do not match a feature list
# that refers to them as strings.
scaled_cols = [str(i) for i in range(len(ft_corr_list))]

# StandardScaler ignores any target argument, so only the features are passed.
interactions.fit(train[ft_corr_list])
X_train_interact = pd.DataFrame(interactions.transform(train[ft_corr_list]), columns=scaled_cols)
X_best_val_inter = pd.DataFrame(interactions.transform(val[ft_corr_list]), columns=scaled_cols)
X_best_test_inter = pd.DataFrame(interactions.transform(test[ft_corr_list]), columns=scaled_cols)

# reset_index(drop=True) lines the rows up with the freshly built scaled frames.
# Dropping any existing scaled columns first keeps the cell re-runnable without
# piling up duplicate columns.
train = pd.concat([train.reset_index(drop=True).drop(columns=scaled_cols, errors="ignore"), X_train_interact], axis=1)

val = pd.concat([val.reset_index(drop=True).drop(columns=scaled_cols, errors="ignore"), X_best_val_inter], axis=1)

test = pd.concat([test.reset_index(drop=True).drop(columns=scaled_cols, errors="ignore"), X_best_test_inter], axis=1)
print('------------------------------------------END')

In [None]:
# Use every column except the identifier, era, data-type and target columns
feature_list = train.columns.drop(['id', 'era', 'data_type', 'target'])

In [None]:
# Group statistics, followed by the six selected raw features and the six
# columns labelled '0'..'5'
feature_list = (
    [
        f"feature_{group}_{stat}"
        for group in ("intelligence", "wisdom", "charisma", "dexterity", "strength", "constitution")
        for stat in ("mean", "std", "skew")
    ]
    + [
        'feature_constitution96',
        'feature_wisdom32',
        'feature_constitution32',
        'feature_strength14',
        'feature_intelligence3',
        'feature_dexterity7',
    ]
    + [str(position) for position in range(6)]
)

In [None]:
# Restrict the model to the six selected raw features
feature_list = [
    'feature_constitution96',
    'feature_wisdom32',
    'feature_constitution32',
    'feature_strength14',
    'feature_intelligence3',
    'feature_dexterity7',
]

In [None]:
print(feature_list)

In [None]:
#

#
# Build the LightGBM datasets for the training and validation splits;
# missing feature values are replaced with 0 before they are handed over.
dtrain, dvalid = (
    lgb.Dataset(split[feature_list].fillna(0), label=split["target"])
    for split in (train, val)
)
print('------------------------------------------END')

In [None]:
# Candidate configuration 1.
# NOTE(review): this sets both num_iterations and n_estimators, which LightGBM
# documents as aliases of the same parameter -- confirm which value is intended.
best_config = dict(
    objective="regression",
    metric="l2",
    verbosity=10,
    feature_pre_filter=False,
    lambda_l1=0.0163973329416619,
    lambda_l2=6.592372824860872e-08,
    num_leaves=31,
    feature_fraction=1.0,
    bagging_fraction=1.0,
    bagging_freq=0,
    min_child_samples=20,
    num_iterations=1000,
    learning_rate=0.01,
    n_estimators=1750,
    max_depth=4,
    random_state=0,
)

In [None]:
# Candidate configuration 2: regularised trees with early stopping configured
best_config = dict(
    objective='regression',
    metric='l2',
    verbosity=10,
    feature_pre_filter=False,
    lambda_l1=0.0163973329416619,
    lambda_l2=6.592372824860872e-08,
    num_leaves=31,
    feature_fraction=1.0,
    bagging_fraction=1.0,
    bagging_freq=0,
    min_child_samples=20,
    num_iterations=1000,
    early_stopping_round=100,
)

In [None]:
# Candidate configuration 3: 1750 trees of depth 4
best_config = {
    "objective": "regression",
    "num_leaves": 31,
    "learning_rate": 0.01,
    "n_estimators": 1750,
    "max_depth": 4,
    "metric": "mse",
    "verbosity": 10,
    "random_state": 0,
}

In [None]:
# Candidate configuration 4: 3389 trees of depth 2
best_config = {
    "objective": "regression",
    "num_leaves": 31,
    "learning_rate": 0.01,
    "n_estimators": 3389,
    "max_depth": 2,
    "metric": "mse",
    "verbosity": 10,
    "random_state": 0,
}

In [None]:
# Candidate configuration 5: depth-5 GBDT that samples 10% of the features per tree
best_config = {
    "objective": "regression",
    "learning_rate": 0.01,
    "max_depth": 5,
    'boosting_type': 'gbdt',
    'feature_fraction': 0.1,
    'seed': 42,
}

In [None]:
# Plain LightGBM (native training API)
#best_config ={"objective":"regression", "num_leaves":31,"learning_rate":0.01,"n_estimators":2000,"max_depth":5,"metric":"mse","verbosity": 10, "random_state": 0} 
#

model = lgb.train(best_config, dtrain)
# dtrain was built from features with missing values replaced by 0, so the
# prediction inputs are prepared the same way.
train.loc[:, "prediction"] = model.predict(train[feature_list].fillna(0))
# The prediction column is written directly. Seeding it with the true targets
# first would leave the labels in place if predict() failed, and a later
# evaluation would then score the labels against themselves.
val.loc[:, "prediction"] = model.predict(val[feature_list].fillna(0))
print('------------------------------------------END')

In [None]:
# LightGBM through the scikit-learn style estimator
model = lgb.LGBMRegressor(**best_config)
model.fit(train[feature_list], train["target"])
train.loc[:, "prediction"] = model.predict(train[feature_list])
# The prediction column is written directly. Seeding it with the true targets
# first would leave the labels in place if predict() failed, and a later
# evaluation would then score the labels against themselves.
val.loc[:, "prediction"] = model.predict(val[feature_list])
print('------------------------------------------END')

------------------------------------------END


In [None]:
# Hyperparameter optimisation
import optuna
def opt(trialO):
    """
    Optuna objective: fit a LightGBM regressor and score it on the validation split.

    :param trialO: Optuna trial used to sample ``n_estimators`` and ``max_depth``
    :return: ``1 - R^2`` of the fitted model on the validation split (lower is better)
    """
    n_estimators = trialO.suggest_int('n_estimators', 500, 4000)
    max_depth = trialO.suggest_int('max_depth', 1, 20)
    # Further parameters that were tried and are currently disabled:
    #min_child_weight = trialO.suggest_int('min_child_weight', 1, 20)
    #subsample = trialO.suggest_discrete_uniform('subsample', 0.5, 0.9, 0.1)
    #colsample_bytree = trialO.suggest_discrete_uniform('colsample_bytree', 0.5, 0.9, 0.1)
    model_opt = lgb.LGBMRegressor(
        random_state=0,
        # The number of trees comes from the trial only. LightGBM documents
        # num_boost_round as an alias of n_estimators, so also passing a fixed
        # num_boost_round=1000 conflicted with the value being tuned.
        n_estimators = n_estimators,
        max_depth = max_depth,
        #min_child_weight = min_child_weight,
        #subsample = subsample,
        #colsample_bytree = colsample_bytree,
        learning_rate=0.01,
        metric="mse",
        verbosity=10
    )
    model_opt.fit(train[feature_list], train["target"])
    return (1.0 - (model_opt.score(val[feature_list], val["target"])))

# Kept so the module-level name stays defined; the study does not use it.
model_opt=lgb.LGBMRegressor()
# A seeded sampler makes the sequence of suggested parameters repeatable.
study = optuna.create_study(sampler=optuna.samplers.TPESampler(seed=0))
study.optimize(opt, n_trials=100)
print(study.best_params)
print(1-study.best_value)
print('------------------------------------------END')

In [None]:
# Hyperparameter optimisation with Optuna's LightGBM tuner
import optuna.integration.lightgbm as lgb_optuna

# The comma after 'regression' was missing, which made the dict literal a SyntaxError.
param = {
        'objective': 'regression',
        'metric': 'mse',
        'verbosity': 10,
    }

# Early stopping is requested through the callback API rather than the
# early_stopping_rounds keyword, which newer LightGBM releases no longer accept.
best = lgb_optuna.train(param, 
                 dtrain,
                 valid_sets=dvalid,
                 callbacks=[lgb.early_stopping(stopping_rounds=100)])

In [None]:
print(best.params)

In [None]:
print(best.best_score)

In [None]:
# Spearman correlation between the validation predictions and each model feature
feature_spearman_val = [
    spearmanr(val["prediction"], val[feature_name])[0]
    for feature_name in feature_list
]
# Feature exposure: spread of those correlations, rounded to 4 decimals
feature_exposure_val = np.std(feature_spearman_val).round(4)
# Print and collect the validation metrics
(spearman, payout, numerai_sharpe, mae) = evaluate(val)

era
era121    0.025718
era122    0.030021
era123    0.054352
era124    0.067290
era125    0.051013
era126    0.025927
era127   -0.020346
era128    0.064889
era129    0.001306
era130    0.043659
era131    0.022118
era132    0.067764
era197    0.024853
era198   -0.004059
era199   -0.042764
era200    0.006934
era201   -0.021094
era202    0.036511
era203    0.013599
era204    0.005637
era205   -0.010685
era206   -0.005159
era207    0.066488
era208    0.036663
era209    0.051599
era210   -0.010853
era211   -0.050050
era212    0.010810
dtype: float64
Spearman Correlation: 0.0194
Average Payout: 0.0968
Sharpe Ratio: 0.5883
Mean Absolute Error (MAE): 0.1507


In [None]:
import gc

# Release the training frame. Deleting the name and collecting garbage frees the
# memory without the fixed sleeps used before.
del train
gc.collect()
val[['id', "prediction"]].to_csv("submission_val.csv", index=False)
del val  # free memory once the validation predictions are on disk
gc.collect()
# The prediction column is created directly by the assignment; no placeholder
# value is written first.
test.loc[:, "prediction"] = model.predict(test[feature_list])
test[['id', "prediction"]].to_csv("submission_test.csv", index=False)
del test  # free memory once the test predictions are on disk
gc.collect()
print('------------------------------------------END')

In [None]:
directory = "kaggle/working"
full_path = f'{directory}/numerai_dataset_{NAPI.get_current_round()}/'
test_path = full_path + 'numerai_tournament_data.csv'
# Only the id column and one feature column are used from this file, so read
# just those two instead of loading the whole tournament data set again.
tournament_data = pd.read_csv(test_path, usecols=['id', 'feature_dexterity7'])
tournament_data_id=tournament_data['id']
tournament_data_id2=tournament_data['feature_dexterity7']


In [None]:
tournament_data_id2.head(15)

In [None]:
# Put the id and the helper feature side by side in one frame
tournament_data_id = pd.concat(
    [tournament_data_id, tournament_data_id2],
    axis=1,
)

In [None]:
tournament_data_id.head(15)

In [None]:
# Reload the validation and test predictions that were written to disk
val, test = map(pd.read_csv, ("submission_val.csv", "submission_test.csv"))

In [None]:
# Stack validation and test predictions, keyed by id
test_val_concat = pd.concat(
    [frame[['id', "prediction"]] for frame in (val, test)],
    axis=0,
).set_index('id')
# Key the tournament ids the same way so the two frames can be joined on id
tournament_data_id = tournament_data_id.set_index('id')


In [None]:
tournament_data_id.head(15)

In [None]:
# Join the predictions onto the tournament ids, drop the helper feature column
# and turn the id index back into a regular column
conc_submit = (
    pd.concat([tournament_data_id, test_val_concat], axis=1)
    .drop(columns='feature_dexterity7')
    .reset_index()
    .rename(columns={'index': 'id'})
)
conc_submit.to_csv("submission_file"+".csv", index=False)

In [None]:
from datetime import datetime

# Example predictions from the dataset folder; used as the neutralisation reference
by=pd.read_csv('kaggle/working/numerai_dataset_'+str(NAPI.get_current_round())+'/example_predictions.csv')
neut=pd.read_csv("submission_file.csv")
# The last argument is the neutralisation proportion; change it to adjust how
# much of the exposure to the example predictions is removed.
# NOTE(review): neutralize() pairs rows by position, so this assumes both files
# list the ids in the same order -- confirm.
neut=pd.DataFrame({'prediction':neutralize(neut['prediction'],by['prediction'], 0.3)})
conc=pd.concat([by.drop(columns="prediction"),neut],axis=1)
# `now` was never defined in this notebook, so the timestamped write raised NameError.
now = datetime.now()
conc.to_csv("neutralized_submission_file"+ now.strftime('%Y%m%d_%H%M%S') + ".csv", index=False)  # timestamped submission file
# Also write the fixed name that the upload and download cells refer to.
conc.to_csv("neutralized_submission_file.csv", index=False)

print('------------------------------------------END')

In [None]:
# Get your API keys and model_id from https://numer.ai/submit
#public_id = ""
#secret_key = ""
#model_id = ""
#napi = numerapi.NumerAPI(public_id=public_id, secret_key=secret_key)
#submission_id = napi.upload_predictions("neutralized_submission_file.csv", model_id=model_id)

In [None]:
from google.colab import files
files.download('neutralized_submission_file.csv')

In [None]:
ls kaggle/working/numerai_dataset_249/

In [None]:
# NOTE(review): this cell repeats the scaler set-up from earlier in the notebook
# with a different column list. An earlier cell discards `train` to free memory,
# so this only runs if `train` has been reloaded first -- confirm whether the
# cell is still needed.
from sklearn import preprocessing
interactions = preprocessing.StandardScaler()
ft_corr_list=['feature_dexterity7', 'feature_charisma18', 'feature_charisma63', 'feature_dexterity14']# ft_corr_list holds the features to build interaction features from.
interactions.fit(train[ft_corr_list], train["target"])

In [None]:
ls