In [68]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [69]:
# Read the labelled training table and the test table from the working directory.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [104]:
# Separate the 'target' label column from the remaining (feature) columns.
y = train_df['target']
X = train_df.drop(columns='target')

In [105]:
# Test features: every column of the test table except 'ID'.
X_test = test_df.drop(columns=['ID'])

In [72]:
from sklearn.preprocessing import MinMaxScaler

In [106]:
# Fit the min-max scaler on the training features only, then apply that same
# fitted scaling to both the training and the test features.
scaler = MinMaxScaler().fit(X)
X_scaled, X_test_scaled = scaler.transform(X), scaler.transform(X_test)

In [74]:
from sklearn.preprocessing import PolynomialFeatures

In [107]:
# Expand the scaled features with all degree-2 terms (squares and pairwise
# products, no constant column) and wrap the results in labelled DataFrames.
poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
X_poly = pd.DataFrame(
    poly.fit_transform(X_scaled),
    columns=poly.get_feature_names_out(X.columns),
)
X_test_poly = pd.DataFrame(
    poly.transform(X_test_scaled),
    columns=poly.get_feature_names_out(X_test.columns),
)

In [76]:
from sklearn.model_selection import train_test_split

In [237]:
# Hand-picked subset of the polynomial feature columns (all pairwise products)
# that is kept for modelling; the same columns are taken from both tables.
important_columns = [
    'col2 col16',
    'col16 col18',
    'col16 col21',
    'col2 col18',
    'col13 col16',
    'col2 col13',
    'col18 col21',
    'col13 col21',
    'col16 col25',
    'col3 col16',
    'col1 col14',
]
combined_data_2, combined_test_data_2 = (
    frame[important_columns] for frame in (X_poly, X_test_poly)
)

In [214]:
# Hold out 10% of the labelled rows for evaluation (fixed seed).
# NOTE(review): this rebinds X_test, which until here referred to the features
# derived from test_df; from this point on X_test is the hold-out split.
X_train, X_test, y_train, y_test = train_test_split(
    combined_data_2,
    y,
    test_size=0.1,
    random_state=101,
)

In [79]:
from catboost import CatBoostClassifier

In [80]:
from sklearn.model_selection import StratifiedKFold

In [175]:
# Six shuffled, stratified folds with a fixed seed.
cv = StratifiedKFold(n_splits=6, shuffle=True, random_state=101)

In [176]:
# One (features, labels) pair per fold, taken from each fold's validation rows
# of the training split; the per-fold training indices are not used.
folds = [
    (X_train.iloc[val_rows], y_train.iloc[val_rows])
    for _, val_rows in cv.split(X_train, y_train)
]


In [83]:
from sklearn.metrics import f1_score

In [222]:
# CatBoost classifier optimising log-loss, with F1 and accuracy tracked as
# additional metrics during training.
catboost_params = dict(
    loss_function='Logloss',
    iterations=660,
    learning_rate=0.01,
    custom_loss=['F1', 'Accuracy'],
)
model = CatBoostClassifier(**catboost_params)
# NOTE(review): every eval set in `folds` is a subset of X_train, so the
# iteration chosen by use_best_model is scored on rows the model was trained
# on — confirm this is intended.
model.fit(X_train, y_train, eval_set=folds, use_best_model=True, verbose=False)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x2dc5150e5d0>

In [243]:
# Column 1 of predict_proba for the training and hold-out rows
# (presumably the positive class — confirm against model.classes_).
probabilities_train, probabilities_test = (
    model.predict_proba(features)[:, 1] for features in (X_train, X_test)
)

In [56]:
from sklearn.metrics import precision_recall_curve, confusion_matrix

In [231]:
# Precision/recall pairs for every candidate probability cut-off, computed on
# the hold-out labels and hold-out probabilities.
precision, recall, thresholds = precision_recall_curve(
    y_test,
    probabilities_test,
)

In [232]:
# F1 for every point of the precision-recall curve.
# Where precision + recall == 0 the ratio is 0/0; define F1 as 0 there instead
# of NaN, because np.argmax would otherwise select the NaN entry.
pr_sum = precision + recall
f1_scores = np.divide(
    2 * (precision * recall),
    pr_sum,
    out=np.zeros_like(pr_sum, dtype=float),
    where=pr_sum > 0,
)

# precision/recall hold one more entry than thresholds (the final point has no
# associated threshold), so exclude it to keep the argmax index valid.
best_threshold = thresholds[np.argmax(f1_scores[:-1])]

print(f"Best Threshold: {best_threshold}")

Best Threshold: 0.402124435588858


In [244]:
# Convert probabilities to hard 0/1 labels at the tuned cut-off.
# precision_recall_curve counts a sample as positive when score >= threshold,
# so the comparison must be inclusive; a strict '>' would drop the very sample
# that defines the chosen threshold and yield a different F1 than the one optimised.
new_threshold = best_threshold
predictions_train = (probabilities_train >= new_threshold).astype(int)
predictions_test = (probabilities_test >= new_threshold).astype(int)

# NOTE(review): the threshold was selected on this same hold-out split
# (y_test / probabilities_test), so the hold-out F1 below is optimistic.
print(f"F1-score на обучающих данных: {f1_score(y_train, predictions_train)}")
print(f"F1-score на тестовых данных: {f1_score(y_test, predictions_test)}")
conf_matrix = confusion_matrix(y_test, predictions_test)
print(conf_matrix)

F1-score на обучающих данных: 0.7774599542334096


In [245]:
# Build the submission from predictions on the real test set.
# predictions_test holds labels for the hold-out split of the training data
# (X_test was rebound by train_test_split), so it must not be paired with
# test_df['ID']; score the selected test features instead.
submission_probabilities = model.predict_proba(combined_test_data_2)[:, 1]
# Inclusive comparison, matching precision_recall_curve's score >= threshold rule.
submission_predictions = (submission_probabilities >= new_threshold).astype(int)
assert len(submission_predictions) == len(test_df), "one prediction per test row expected"

submission = pd.DataFrame({'ID': test_df['ID'], 'target': submission_predictions})
submission.to_csv('submission.csv', index=False)