In [1]:
%cd ..

/home/nikita/edu/competitions/admet


In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from catboost import CatBoostClassifier

from imblearn.over_sampling import RandomOverSampler

In [3]:
train_df = pd.read_csv("data/train_admet_pumped_with_all_new_features.csv")
test_df = pd.read_csv("data/test_data_with_all_new_features.csv")

In [4]:
feature_cols = train_df.iloc[:, 4:].columns

scaler = StandardScaler()
train_df[feature_cols] = scaler.fit_transform(train_df[feature_cols])
test_df[feature_cols] = scaler.transform(test_df[feature_cols])

In [5]:
properties = train_df.property.unique()

df_trains = [
    train_df[train_df.property == prop] for prop in properties
]

In [6]:
df_vals = []

for i, df_train in enumerate(df_trains):
    df_train, df_val = train_test_split(df_train, test_size=0.2, stratify=df_train.Y)

    df_vals.append(df_val)
    df_trains[i] = df_train

In [7]:
# for i, df_train in enumerate(df_trains):
#     sampler = RandomOverSampler()
#     df_train, _ = sampler.fit_resample(df_train, df_train.Y)
#     df_trains[i] = df_train

# df_train_pcas = []
# df_val_pcas = []
# for i, (df_train, df_val) in enumerate(zip(df_trains, df_vals)):
#     pca = PCA(n_components=0.8)

#     df_train_pca = pca.fit_transform(df_train[feature_cols])
#     df_val_pca = pca.transform(df_val[feature_cols])
#     df_train_pcas.append(df_train_pca)
#     df_val_pcas.append(df_val_pca)

In [9]:
models = []
for i in range(len(df_trains)):
    model = CatBoostClassifier(
        n_estimators=2000, eval_metric="AUC", use_best_model=True
    )
    model.fit(
        df_trains[i][feature_cols],
        df_trains[i].Y,
        eval_set=(df_vals[i][feature_cols], df_vals[i].Y),
    )

Learning rate set to 0.035383
0:	test: 0.7719565	best: 0.7719565 (0)	total: 209ms	remaining: 6m 57s
1:	test: 0.7925323	best: 0.7925323 (1)	total: 313ms	remaining: 5m 12s
2:	test: 0.8092280	best: 0.8092280 (2)	total: 414ms	remaining: 4m 35s
3:	test: 0.8165764	best: 0.8165764 (3)	total: 538ms	remaining: 4m 28s
4:	test: 0.8190153	best: 0.8190153 (4)	total: 657ms	remaining: 4m 22s
5:	test: 0.8204918	best: 0.8204918 (5)	total: 793ms	remaining: 4m 23s
6:	test: 0.8258963	best: 0.8258963 (6)	total: 923ms	remaining: 4m 22s
7:	test: 0.8266146	best: 0.8266146 (7)	total: 1.04s	remaining: 4m 18s
8:	test: 0.8292667	best: 0.8292667 (8)	total: 1.14s	remaining: 4m 11s
9:	test: 0.8303419	best: 0.8303419 (9)	total: 1.26s	remaining: 4m 10s
10:	test: 0.8334717	best: 0.8334717 (10)	total: 1.35s	remaining: 4m 4s
11:	test: 0.8335218	best: 0.8335218 (11)	total: 1.45s	remaining: 3m 59s
12:	test: 0.8346347	best: 0.8346347 (12)	total: 1.54s	remaining: 3m 55s
13:	test: 0.8350383	best: 0.8350383 (13)	total: 1.63s	r

In [38]:
models = []
for i, df_train in enumerate(df_trains):
    print(f"### Training model for property {properties[i]}")
    
    model = RandomForestClassifier(n_estimators=600)
    

    model.fit(df_train[feature_cols], df_train.Y)
    models.append(model)

### Training model for property 1
### Training model for property 2
### Training model for property 3


In [39]:
all_preds = []
for i, prop in enumerate(properties):
    preds = models[i].predict_proba(test_df[test_df["property"] == prop][feature_cols])[:, 1]
    all_preds.extend(preds)

In [42]:
sample = pd.read_csv("data/sample.csv")
sample["Y"] = all_preds
sample.to_csv("submissions/forest_and_lots_of_data_no_oversampling.csv", index=False)

In [43]:
sample

Unnamed: 0,id,Y
0,0,0.816667
1,1,0.953333
2,2,0.981667
3,3,0.953333
4,4,0.153333
...,...,...
1216,1216,0.003333
1217,1217,0.875000
1218,1218,0.991667
1219,1219,0.953333
