In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns 
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import preprocessing, tree, metrics
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
from sklearn.metrics import roc_auc_score

In [2]:
data_X = pd.read_csv('ref_train_x.csv')
data_Y = pd.read_csv('ref_train_y.csv', header = None, names = ['Y'])
data = pd.concat([data_X, data_Y], axis = 1)
X_test = pd.read_csv('ref_test_x.csv')

In [3]:
from sklearn.impute import KNNImputer
def fill_na(df, knn=5):
    df_num = df[list(df.describe())]
    
    columns_full = list(df)
    columns_numerical = list(df_num)
    columns_categorical = [x for x in columns_full if x not in columns_numerical]
    df_categorical = df[columns_categorical]

    imputer = KNNImputer(n_neighbors=5)
    df_num_treated = imputer.fit_transform(df_num)
    df_num_treated = pd.DataFrame(df_num_treated, columns = columns_numerical)
    
    
    df_full_treated = pd.concat([df_num_treated, df_categorical], axis=1)[columns_full]
    return(df_full_treated)

In [4]:
data = data.drop('raw_id', axis = 1)
X_test = X_test.drop('raw_id', axis = 1)

In [5]:
le = preprocessing.LabelEncoder()
le_exchange = le.fit_transform(data['exchange'])
data['exchange'] = le_exchange

le_bis = preprocessing.LabelEncoder()
le_exchange_bis = le_bis.fit_transform(X_test['exchange'])
X_test['exchange'] = le_exchange_bis

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15249 entries, 0 to 15248
Data columns (total 23 columns):
sector                    15249 non-null int64
earnings_implied_obs      15229 non-null float64
delta_vol_1w              15239 non-null float64
delta_vol_1y              15021 non-null float64
return_1w                 15248 non-null float64
return_1m                 15248 non-null float64
return_1y                 15248 non-null float64
implied_vol_3m            15244 non-null float64
realised_vol_1w           15246 non-null float64
realised_vol_1m           15246 non-null float64
realised_vol_1y           15247 non-null float64
ratio_put_call            15154 non-null float64
publication_date_funda    15234 non-null float64
exchange                  15249 non-null int32
net_income                15232 non-null float64
shareholders_equity       15228 non-null float64
net_debt                  15229 non-null float64
ebitda                    13380 non-null float64
ebit         

In [7]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4191 entries, 0 to 4190
Data columns (total 22 columns):
sector                    4191 non-null int64
earnings_implied_obs      4189 non-null float64
delta_vol_1w              4191 non-null float64
delta_vol_1y              4130 non-null float64
return_1w                 4191 non-null float64
return_1m                 4191 non-null float64
return_1y                 4191 non-null float64
implied_vol_3m            4191 non-null float64
realised_vol_1w           4191 non-null float64
realised_vol_1m           4191 non-null float64
realised_vol_1y           4191 non-null float64
ratio_put_call            4173 non-null float64
publication_date_funda    4179 non-null float64
exchange                  4191 non-null int32
net_income                4179 non-null float64
shareholders_equity       4178 non-null float64
net_debt                  4179 non-null float64
ebitda                    3612 non-null float64
ebit                      3612 no

In [8]:
X_train = data.drop('Y', axis = 1)
y_train = data['Y']

In [9]:
X_train = fill_na(X_train)
X_test = fill_na(X_test)

In [10]:
clf = tree.DecisionTreeClassifier(random_state = 42)
clf.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=42, splitter='best')

In [11]:
RF = RandomForestClassifier(n_jobs=-1, criterion= 'gini')
RF.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [12]:
LGBM = lgb.LGBMClassifier(n_jobs=-1, n_estimators = 10000)
LGBM.fit(X_train, y_train)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=10000, n_jobs=-1, num_leaves=31, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [13]:
y_predict_real_LGBM = LGBM.predict_proba(X_test)[:,1]
y_predict_real_RF = RF.predict_proba(X_test)[:,1]
y_predict_real_CLF = clf.predict_proba(X_test)[:,1]

In [14]:
y_predict_real_LGBM

array([7.97396569e-02, 2.45309541e-09, 2.22201776e-07, ...,
       2.03976508e-03, 5.48714567e-03, 5.10276505e-06])

In [15]:
np.savetxt("out_LGBM.csv", y_predict_real_LGBM, delimiter = ".")
np.savetxt("out_RF.csv", y_predict_real_RF, delimiter = ".")
np.savetxt("out_CLF.csv", y_predict_real_CLF, delimiter = ".")

In [43]:
import xgboost
from imblearn.over_sampling import SMOTE

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [44]:
s =  SMOTE()
X_train, y_train = s.fit_resample(X_train, y_train)

In [45]:
xgb = xgboost.XGBRegressor(n_estimator = 200, eval_metric="auc", silent = True)
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)

  if getattr(data, 'base', None) is not None and \


In [46]:
np.savetxt("out_xgb.csv", y_pred, delimiter = ".")