In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold
import numpy as np
from sklearn.metrics import f1_score, confusion_matrix
from tqdm.notebook import tqdm
from scipy import stats

from autogluon.tabular import TabularDataset, TabularPredictor

from IPython.display import clear_output
from sklearn.utils import class_weight

from autogluon.features.generators import AutoMLPipelineFeatureGenerator


from warnings import filterwarnings
filterwarnings("ignore")

import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'

## Read data

In [2]:
train = pd.read_csv("data/train_dataset_train.csv")
test = pd.read_csv("data/test_dataset_test.csv")
submission = pd.read_csv("data/sample_submission.csv")

train.shape, test.shape, submission.shape

((13584, 24), (6691, 23), (6691, 2))

## Preprocessing

In [3]:
%%time
from preprocessing import preprocessing

train = pd.read_csv("data/train_dataset_train.csv")
test = pd.read_csv("data/test_dataset_test.csv")
print(train.shape, test.shape)
train_id = train["ID"]
test_id = test["ID"]

full_data = pd.concat([train.copy(), test.copy()]).reset_index(drop=True)
full_data = preprocessing(full_data.copy())
train, test = full_data[full_data["ID"].isin(train_id)], full_data[full_data["ID"].isin(test_id)]

print(train.shape, test.shape)

(13584, 24) (6691, 23)


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3011/3011 [00:16<00:00, 188.03it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2025/2025 [00:10<00:00, 201.43it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2740/2740 [00:05<00:00, 507.53it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1490/1490 [00:07<00:00, 203.35it/s]

(13580, 26) (6691, 26)
CPU times: user 40.2 s, sys: 177 ms, total: 40.4 s
Wall time: 39.9 s





In [4]:
for col_name, col_dtype in train.dtypes.to_dict().items():
    if col_dtype == "object":
        train[col_name] = train[col_name].astype("category")
        test[col_name] = test[col_name].astype("category")

In [5]:
# sample weights
classes = np.array([-1.,  3.,  4.])
cw = class_weight.compute_class_weight("balanced", classes=classes, y=train["Статус"])
print(cw)
# {'c1': 11.806451416671381, 'c2': 10.196222523950034, 'c3': 2.4239850749887486}
train["sample_weight"] = train["Статус"].map(dict(zip(classes, cw))).values

[7.37242128 0.95944609 0.54881992]


In [6]:
inf = pd.concat([train.dtypes, train.nunique(), train.isna().sum(), test.nunique(), test.isna().sum()], axis=1)
inf.columns = ["dtypes", "TrainUnique", "TrainNans", "TestUnique", "TestNans"]
inf

Unnamed: 0,dtypes,TrainUnique,TrainNans,TestUnique,TestNans
ID,int64,13580,0,6691.0,0.0
Код_группы,int64,3651,0,2677.0,0.0
Год_Поступления,int64,17,0,14.0,0.0
Пол,category,2,7,2.0,3.0
Основания,category,5,0,5.0,0.0
Изучаемый_Язык,category,4,825,5.0,396.0
Дата_Рождения,int64,52,0,52.0,0.0
Уч_Заведение,category,436,3639,423.0,1763.0
Где_Находится_УЗ,category,879,1293,595.0,606.0
Год_Окончания_УЗ,float64,42,1298,42.0,617.0


## Split

In [7]:
# split
df_train, df_test = train_test_split(train, random_state=42, stratify=train["Статус"])

print(df_train.shape, df_test.shape)

(10185, 27) (3395, 27)


In [8]:
%%time
# train model
predictor = TabularPredictor(label='Статус', problem_type="multiclass",
                             eval_metric="f1_macro", verbosity=0,
                            sample_weight="sample_weight", weight_evaluation=False,)
predictor.fit(train_data=df_train, time_limit=None, presets="best_quality",
              hyperparameter_tune_kwargs={'num_trials': 100}, )

# metric
pred = predictor.predict(df_test)
f1 = f1_score(df_test["Статус"], pred, average='macro', zero_division = 0)
clear_output()
print(f"F1: {f1}")

F1: 0.7773026454523474
CPU times: user 5.75 s, sys: 192 ms, total: 5.94 s
Wall time: 37.8 s


In [9]:
confusion_matrix(df_test["Статус"], pred, labels=[4, 3, -1], normalize="true").round(3)

array([[0.964, 0.034, 0.002],
       [0.152, 0.794, 0.054],
       [0.104, 0.338, 0.558]])

## Full model

In [10]:
%%time
# train automl
predictor = TabularPredictor(label='Статус', problem_type="multiclass",
                             eval_metric="f1_macro", verbosity=0,
                             sample_weight="sample_weight", weight_evaluation=False,
                            )
predictor.fit(train_data=train, time_limit=None, presets='best_quality',
              hyperparameter_tune_kwargs={'num_trials': 100},)
preds = predictor.predict(test).astype(int).values
clear_output()

CPU times: user 16.1 s, sys: 117 ms, total: 16.2 s
Wall time: 32.4 s


In [11]:
print(predictor.get_model_best())
predictor.leaderboard(train, silent=True)

WeightedEnsemble_L3


Unnamed: 0,model,score_test,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,LightGBMLarge_BAG_L1,0.993823,0.788373,1.435565,1.146141,12.408465,1.435565,1.146141,12.408465,1,True,1
1,WeightedEnsemble_L2,0.993823,0.788373,1.437189,1.148874,12.412447,0.001625,0.002733,0.003982,2,True,2
2,LightGBMLarge_BAG_L2,0.923321,0.790703,2.205524,1.569319,26.029482,0.76996,0.423178,13.621017,2,True,3
3,WeightedEnsemble_L3,0.923321,0.790703,2.206711,1.571971,26.033446,0.001187,0.002652,0.003964,3,True,4


In [12]:
submission["Статус"] = preds
submission.to_csv("submission.csv", index=False)

display(train["Статус"].value_counts(normalize=True))
display(submission["Статус"].value_counts(normalize=True))

 4.0    0.607364
 3.0    0.347423
-1.0    0.045214
Name: Статус, dtype: float64

 4    0.638171
 3    0.314452
-1    0.047377
Name: Статус, dtype: float64

In [13]:
%%time
predictor.feature_importance(train).round(4)

CPU times: user 17min 41s, sys: 977 ms, total: 17min 42s
Wall time: 1min 31s


Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
Код_группы,0.4548,0.0146,0.0,5,0.4848,0.4247
Основания,0.1433,0.0073,0.0,5,0.1584,0.1283
ID,0.0911,0.0072,0.0,5,0.1058,0.0763
Год_Поступления,0.0526,0.0028,0.0,5,0.0584,0.0467
СрБаллАттестата,0.0477,0.0089,0.0001,5,0.0661,0.0293
КодФакультета,0.0444,0.0049,0.0,5,0.0546,0.0342
Возраст_Окончания_УЗ,0.0372,0.0078,0.0002,5,0.0533,0.0211
Уч_Заведение,0.0261,0.0064,0.0004,5,0.0394,0.0129
Город_ПП,0.0132,0.0025,0.0001,5,0.0183,0.008
Год_Окончания_УЗ,0.0121,0.0036,0.0008,5,0.0195,0.0048


## CrossVal

## Multiclass

In [14]:
%%time
n_splits = 4

preds = []
preds_prob = {-1.0:[], 3.0:[], 4.0:[]}
metrics = []


# FULL DATA
predictor = TabularPredictor(label='Статус', problem_type="multiclass",
                             eval_metric="f1_macro", verbosity=0,
                             sample_weight="sample_weight", weight_evaluation=False,
                            )
predictor.fit(train_data=train, time_limit=None, presets='best_quality',
              hyperparameter_tune_kwargs={'num_trials': 100},)
pred = predictor.predict(test).astype(int).values
preds.append(pred)

pred = predictor.predict_proba(test)
preds_prob[-1.0].append(pred.values[:, 0])
preds_prob[3.0].append(pred.values[:, 1])
preds_prob[4.0].append(pred.values[:, 2])


# SPLITS
cv = KFold(shuffle=True, random_state=42, n_splits=n_splits)

for train_idx, test_idx in tqdm(cv.split(train.copy(), groups=train["Статус"]), total=n_splits):
    train_batch, test_batch = train.iloc[train_idx].copy(), train.iloc[test_idx].copy()  

    # train automl
    predictor = TabularPredictor(label='Статус', problem_type="multiclass",
                                 eval_metric="f1_macro", verbosity=0,
                                 sample_weight="sample_weight", weight_evaluation=False,
                                )
    predictor.fit(train_data=train_batch, time_limit=None, presets='best_quality',
                  hyperparameter_tune_kwargs={'num_trials': 100},)

    # metrics
    f1 = f1_score(test_batch["Статус"], predictor.predict(test_batch), average='macro', zero_division = 0)
    f12 = predictor.evaluate(test_batch)["f1_macro"]

    # preds
    pred = predictor.predict_proba(test)
    preds_prob[-1.0].append(pred.values[:, 0])
    preds_prob[3.0].append(pred.values[:, 1])
    preds_prob[4.0].append(pred.values[:, 2])

    preds.append(predictor.predict(test).astype(int).values)
    metrics.append(f1)

    clear_output()
    print(f"F1: {round(f1, 3)}, {round(np.mean(metrics), 3)}")
    display(confusion_matrix(test_batch["Статус"], predictor.predict(test_batch), 
                             normalize="true", labels=[4, 3, -1]).round(3))
    print(f"{len(metrics)}: {metrics}")
        
    
    
#     break
    
print(np.mean(metrics))

F1: 0.78, 0.783


array([[0.953, 0.044, 0.002],
       [0.147, 0.783, 0.07 ],
       [0.094, 0.277, 0.629]])

4: [0.7827883862263049, 0.7779487096572856, 0.7907165028680306, 0.7801762127769049]
0.7829074528821315
CPU times: user 3min 46s, sys: 1.42 s, total: 3min 48s
Wall time: 3min 2s


In [15]:
res = stats.mode(np.stack(preds, axis=0))[0][0]
submission["Статус"] = res
submission.to_csv("submission.csv", index=False)

display(train["Статус"].value_counts(normalize=True))
display(submission["Статус"].value_counts(normalize=True))

 4.0    0.607364
 3.0    0.347423
-1.0    0.045214
Name: Статус, dtype: float64

 4    0.638320
 3    0.316545
-1    0.045135
Name: Статус, dtype: float64

In [16]:
a = np.stack(preds, axis=0)
changed, not_changed = [], []
final_preds = []
changes = {-1:[], 3:[], 4:[]}

for idx, p in enumerate(a.T):
    full_pred = p[0]
    ensemble_pred = stats.mode(p)[0][0]
    if full_pred != ensemble_pred:
        ens_conf = stats.mode(p)[1][0]
        if ens_conf >= 4:
            p_idx = ensemble_pred
            changed.append(idx)
            changes[full_pred].append(p_idx)
        else:
            p_idx = full_pred
            not_changed.append(idx)
    else:
        p_idx = p[0]
    
    final_preds.append(p_idx)
    
print(len(changed), len(not_changed))
print(f"-1: {len(changes[-1])}, 3: {len(changes[3])}, 4: {len(changes[4])}")


submission["Статус"] = final_preds
submission.to_csv("submission.csv", index=False)

display(train["Статус"].value_counts(normalize=True))
display(submission["Статус"].value_counts(normalize=True))

32 86
-1: 10, 3: 15, 4: 7


 4.0    0.607364
 3.0    0.347423
-1.0    0.045214
Name: Статус, dtype: float64

 4    0.638470
 3    0.314602
-1    0.046929
Name: Статус, dtype: float64