## Import Necessary Packages

In [1]:
import os
import pandas as pd
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn import preprocessing
from sklearn.preprocessing import KBinsDiscretizer 
import numpy as np
import matplotlib.pyplot as plt

In [2]:
names = ['soykeyword_metric1', 'soykeyword_metric2', 'soykeyword_metric3'] + ['baseline1', 'baseline2', 'baseline3', 'baseline4', 'baseline5', 'baseline6'] + ['ctfidf'] + ['rev_metric1', 'rev_metric2', 'rev_metric3']
dates = pd.date_range(start='2019-08-01', end='2022-02-28', freq='D').tolist()
for i in range(len(dates)):
    dates[i] = dates[i].strftime('%Y_%m_%d')

In [3]:
train_dates = pd.date_range(start='2019-08-01', end='2022-08-31', freq='D').tolist()
val_dates = pd.date_range(start='2022-09-01', end='2022-11-30', freq='D').tolist() 
test_dates = pd.date_range(start='2022-12-01', end='2023-02-28', freq='D').tolist() 

In [4]:
for i in range(len(train_dates)):
    train_dates[i] = train_dates[i].strftime('%Y_%m_%d')

for i in range(len(val_dates)):
    val_dates[i] = val_dates[i].strftime('%Y_%m_%d')

for i in range(len(test_dates)):
    test_dates[i] = test_dates[i].strftime('%Y_%m_%d')

In [5]:
dates_split = {"train":train_dates, "val":val_dates, "test":test_dates}

In [9]:
print(dates_split['train'][0])

2019_08_01


In [6]:
datas = {}
Ns = range(5, 301, 5)
for name in names:
    datas[name] = {}
    mat = pd.read_csv('./TF-IDF_Kimgihu/frequency_matrix_{}.csv'.format(name), index_col=0)
    for split in ['train', 'val', 'test']:
        datas[name][split] = dict()
        for N in Ns:
            datas[name][split][N] = []
            for i in range(13, len(dates_split[split])):
                tmp = []
                for date in dates_split[split][i-13:i+1]:
                    tmp.append(mat.loc[date].values[:N])
                tmp = np.concatenate(tmp) # list tmp -> numpy tmp
                datas[name][split][N].append(tmp)
                   
            datas[name][split][N] = np.array(datas[name][split][N])


KeyError: '2022_03_29'

In [8]:
t = pd.read_csv('./TF-IDF_Kimgihu/key_dates_US.csv', index_col='date')['label']
labels = {}
for split in ['train', 'val', 'test']:
    labels[split] = []
    for date in dates_split[split][13:]:
        if date not in t.index:
            labels[split].append(0)
        else:
            labels[split].append(t[date])

    labels[split] = np.array(labels[split])
print(labels['train'].shape, labels['val'].shape, labels['test'].shape)


(458,) (149,) (124,)


In [9]:
datasets = {}
for name in names:
    datasets[name] = {}
    for split in ['train', 'val', 'test']:
        datasets[name][split] = {}
        for N in Ns:
            datasets[name][split][N] = (datas[name][split][N], labels[split])


In [19]:
train_X, train_y = datasets['soykeyword_metric1']['train'][5]
print(train_X.shape)

(458, 5)


In [10]:
auc_results = {}
for N in Ns:
    auc_results[N] = {}
    for name in names:
        train_dataset = datasets[name]['train'][N]
        val_dataset = datasets[name]['val'][N]
        test_dataset = datasets[name]['test'][N]
        
        train_X, train_y = train_dataset
        val_X, val_y = val_dataset
        test_X, test_y = test_dataset

        pre = KBinsDiscretizer(n_bins=7, encode='ordinal', strategy='uniform')
        train_X = pre.fit_transform(train_X)
        val_X = pre.transform(val_X)
        test_X = pre.transform(test_X)

        clf = RandomForestClassifier(n_estimators=100, max_depth=50, n_jobs=-1, class_weight='balanced_subsample', random_state=0)

        clf.fit(train_X, train_y)
        val_pred = clf.predict_proba(val_X)[:,1]
        test_pred = clf.predict_proba(test_X)[:,1]

        val_auc = roc_auc_score(val_y, val_pred)
        test_auc = roc_auc_score(test_y, test_pred)
        auc_results[N][name] = (val_auc, test_auc)



In [15]:
# the AUC of validation dataset
for name in names:
    print(name, end=' ')
    for N in Ns:
        print("%.2f"%auc_results[N][name][0], end=' ')
    print()

soykeyword_metric1 0.52 0.56 0.56 0.55 0.55 0.56 0.56 0.55 0.57 0.57 0.57 0.58 0.61 0.64 0.60 0.61 0.57 0.46 0.47 0.62 0.62 0.51 0.61 0.62 0.61 0.63 0.61 0.60 0.61 0.62 0.59 0.62 0.58 0.61 0.59 0.56 0.58 0.50 0.56 0.41 0.60 0.46 0.56 0.55 0.52 0.55 0.47 0.52 0.50 0.56 0.49 0.54 0.48 0.64 0.63 0.49 0.65 0.63 0.59 0.61 
soykeyword_metric2 0.53 0.56 0.54 0.50 0.51 0.56 0.57 0.54 0.58 0.54 0.51 0.50 0.42 0.49 0.49 0.59 0.52 0.59 0.59 0.70 0.69 0.67 0.73 0.63 0.72 0.69 0.76 0.65 0.73 0.68 0.75 0.74 0.71 0.69 0.71 0.69 0.71 0.70 0.67 0.68 0.69 0.68 0.68 0.65 0.64 0.65 0.67 0.68 0.65 0.64 0.61 0.62 0.71 0.65 0.63 0.54 0.65 0.69 0.55 0.64 
soykeyword_metric3 0.52 0.56 0.56 0.55 0.55 0.56 0.56 0.56 0.57 0.57 0.57 0.60 0.61 0.63 0.59 0.61 0.58 0.61 0.47 0.50 0.61 0.64 0.63 0.62 0.58 0.63 0.49 0.57 0.60 0.55 0.58 0.56 0.61 0.56 0.55 0.62 0.58 0.61 0.61 0.54 0.49 0.43 0.53 0.56 0.59 0.58 0.50 0.45 0.56 0.57 0.45 0.61 0.54 0.58 0.56 0.57 0.57 0.54 0.59 0.59 
baseline1 0.49 0.71 0.56 0.65 0.62 0.61 

In [16]:
# the AUC of test dataset
for name in names:
    print(name, end=' ')
    for N in Ns:
        print("%.2f"%auc_results[N][name][1], end=' ')
    print()

soykeyword_metric1 0.49 0.48 0.48 0.48 0.48 0.46 0.46 0.44 0.45 0.45 0.45 0.45 0.45 0.44 0.45 0.43 0.41 0.41 0.43 0.43 0.45 0.47 0.43 0.43 0.46 0.46 0.47 0.43 0.46 0.50 0.50 0.47 0.47 0.46 0.48 0.48 0.47 0.50 0.52 0.02 0.03 0.17 0.01 0.07 0.10 0.07 0.11 0.11 0.07 0.08 0.07 0.12 0.10 0.03 0.24 0.03 0.25 0.04 0.05 0.04 
soykeyword_metric2 0.49 0.47 0.46 0.47 0.47 0.50 0.50 0.50 0.48 0.47 0.46 0.46 0.50 0.50 0.50 0.01 0.02 0.02 0.05 0.93 0.91 0.91 0.99 0.98 0.96 0.92 0.98 0.83 0.96 0.88 0.95 0.80 0.95 0.89 0.96 0.89 0.84 0.84 0.81 0.88 0.77 0.78 0.90 0.85 0.74 0.83 0.85 0.79 0.94 0.85 0.84 0.90 0.96 0.78 0.90 0.83 0.91 0.87 0.89 0.98 
soykeyword_metric3 0.49 0.48 0.48 0.48 0.48 0.46 0.46 0.44 0.45 0.45 0.45 0.46 0.45 0.43 0.43 0.43 0.43 0.43 0.43 0.45 0.43 0.47 0.45 0.42 0.44 0.44 0.46 0.43 0.45 0.48 0.50 0.49 0.46 0.46 0.48 0.47 0.50 0.46 0.52 0.02 0.07 0.07 0.02 0.19 0.10 0.07 0.04 0.12 0.05 0.02 0.03 0.03 0.07 0.02 0.07 0.09 0.18 0.10 0.22 0.08 
baseline1 0.66 0.46 0.56 0.30 0.59 0.31 