In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

from tqdm.notebook import tqdm
from itertools import combinations

In [67]:
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Using cached catboost-1.0.6-cp37-none-manylinux1_x86_64.whl (76.6 MB)
Installing collected packages: catboost
Successfully installed catboost-1.0.6


In [68]:
from catboost import CatBoostClassifier

In [4]:
data = pd.read_csv('train.csv')
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,target
0,-6.130724,7.429628,3.651437,-1.950971,-3.384415,1.095934,-3.077774,-2.619091,5.128201,1.228476,...,0.392978,4.920467,16.61046,-2.430804,-1.405573,18.610209,-4.140715,6.027816,-20.288145,0
1,5.161836,-6.514011,-7.474612,-4.544336,-13.081404,1.637562,-1.094672,-1.253545,-2.955342,-10.9582,...,-4.840233,-0.514308,4.613289,2.391302,-4.795664,4.208278,-2.017168,-8.510424,10.806639,0
2,-2.97155,-21.508184,-1.125457,1.524129,3.027444,1.045879,1.55105,1.512075,-1.955564,3.683893,...,-0.385303,2.647917,-2.200556,1.058671,1.076312,-7.802389,-7.553953,0.636639,14.27495,0
3,6.724419,0.566489,0.509764,-4.524162,10.367236,2.08327,0.74179,-2.077787,-2.912744,-4.040637,...,4.731346,15.378418,-14.031666,2.65941,5.12362,-8.500321,3.41796,-14.79849,-6.1328,1
4,-2.213659,-4.678213,-0.135845,2.375933,0.916649,1.027195,-0.353265,-0.220609,-3.416823,-5.964181,...,1.59833,-4.996614,4.504269,1.918961,-2.076223,0.154039,-2.016779,10.803205,5.942927,0


In [5]:
data.target.value_counts()

0    4963
1    2000
Name: target, dtype: int64

In [None]:
data.isna().sum()

In [None]:
data.duplicated().sum()

In [51]:
# train
X_data = data.drop('target', axis=1)
for col1, col2 in tqdm(list(combinations(X_data.columns, 2))):
  X_data[f'{col1}*{col2}'] = X_data[col1]*X_data[col2]
X_data.head()

  0%|          | 0/465 [00:00<?, ?it/s]

  


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,26*27,26*28,26*29,26*30,27*28,27*29,27*30,28*29,28*30,29*30
0,-6.130724,7.429628,3.651437,-1.950971,-3.384415,1.095934,-3.077774,-2.619091,5.128201,1.228476,...,-26.158006,5.820076,-8.472535,28.516467,-77.059564,112.178913,-377.566616,-24.959465,84.007417,-122.293201
1,5.161836,-6.514011,-7.474612,-4.544336,-13.081404,1.637562,-1.094672,-1.253545,-2.955342,-10.9582,...,-20.181485,9.67366,40.813131,-51.825007,-8.488804,-35.814227,45.477338,17.166956,-21.798808,-91.969078
2,-2.97155,-21.508184,-1.125457,1.524129,3.027444,1.045879,1.55105,1.512075,-1.955564,3.683893,...,-8.397804,-8.13041,0.685222,15.3643,58.938881,-4.967304,-111.378712,-4.80914,-107.832306,9.087988
3,6.724419,0.566489,0.509764,-4.524162,10.367236,2.08327,0.74179,-2.077787,-2.912744,-4.040637,...,-43.552416,17.51233,-75.821846,-31.422137,-29.053758,125.791914,52.130765,-50.580652,-20.961666,90.756178
4,-2.213659,-4.678213,-0.135845,2.375933,0.916649,1.027195,-0.353265,-0.220609,-3.416823,-5.964181,...,-0.319819,4.187283,-22.429866,-12.338842,-0.310663,1.664115,0.915443,-21.787678,-11.98557,64.202657


In [None]:
# test
X_test = pd.read_csv('test.csv')
for col1, col2 in tqdm(list(combinations(X_test.columns, 2))):
  X_test[f'{col1}*{col2}'] = X_test[col1]*X_test[col2]
X_test.head()

In [52]:
# normalization
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X_data))
X_scaled.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,486,487,488,489,490,491,492,493,494,495
0,-2.002574,1.076649,1.3235,-0.692922,-0.464151,-0.999811,-1.15591,-0.923755,1.855145,0.180985,...,-1.234749,0.696323,-0.154589,1.482561,-4.53369,3.020885,-8.011366,-1.241582,4.438955,-2.61306
1,1.691025,-0.956572,-2.804358,-1.630647,-1.799562,0.227885,-0.438007,-0.42576,-1.085952,-2.053293,...,-0.882907,1.191261,2.492235,-2.642367,-0.400164,-0.951736,1.339236,0.943389,-1.011968,-1.958932
2,-0.969264,-3.142978,-0.448768,0.563626,0.418852,-1.11327,0.519771,0.582822,-0.722196,0.631156,...,-0.189194,-1.095418,0.337217,0.807297,3.664457,-0.123704,-2.127776,-0.196445,-5.444241,0.220992
3,2.202118,0.075886,0.157913,-1.623353,1.429645,1.238165,0.226811,-0.726349,-1.070454,-0.785043,...,-2.258769,2.198026,-3.771497,-1.594833,-1.639844,3.386303,1.486298,-2.570475,-0.96884,1.982674
4,-0.721371,-0.688881,-0.081614,0.871627,0.128166,-1.155619,-0.16961,-0.049063,-1.253857,-1.137701,...,0.286362,0.486614,-0.904149,-0.61505,0.092824,0.054305,0.354278,-1.07707,-0.50641,1.409882


In [70]:
X_test = pd.DataFrame(scaler.fit_transform(X_test))
X_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,486,487,488,489,490,491,492,493,494,495
0,-1.555662,-0.256397,0.000699,-2.392189,1.796684,0.132136,0.580354,-1.851579,0.963426,-0.527007,...,-1.673945,-0.158229,0.788813,4.355036,0.232183,-0.520303,-3.937643,-0.024545,-0.192581,1.144081
1,0.998291,1.144929,-0.912588,-0.474173,-2.308459,1.580129,-0.266707,-0.240752,-0.90586,-2.171932,...,-1.981674,2.077931,-1.241638,-2.153957,-2.330003,1.774127,2.393873,-1.708802,-2.11304,1.666238
2,-0.492302,-0.58364,-0.636056,0.037364,0.179105,-1.361813,1.842102,0.761388,1.024542,0.781989,...,-0.070335,0.135758,-0.94279,0.547056,-0.016087,0.841827,-0.006512,-0.48939,0.250417,-1.205871
3,-0.919907,-0.281494,0.460699,1.434917,-1.222606,1.15824,-1.193187,-0.774382,1.303721,0.830735,...,-0.392933,0.557084,-0.911879,0.761932,-0.522716,1.270521,-0.361692,-1.205689,0.734373,-1.286721
4,-0.687304,0.275409,0.935117,0.569364,0.744664,0.990835,0.402966,0.154753,0.448111,3.191817,...,-0.767407,-0.292942,-1.383047,0.166044,0.405884,2.076614,0.135887,0.494918,0.016041,-0.218238


Обучение, подбор моделей

In [59]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, data.target)

In [12]:
logreg = LogisticRegression()
naive = GaussianNB()
forest = RandomForestClassifier()
cat = CatBoostClassifier(verbose = 200)

In [None]:
# Выбираем модель
res = []
for i in [logreg, naive, forest, cat]:
  res.append((cross_val_score(i, X_scaled, data.target, scoring = 'f1', verbose=200), i))
res

In [None]:
# Тюним модель
res = []
for param in [100, 300, 800]:
  forest = RandomForestClassifier(n_estimators=param, verbose = 10)
  res.append(cross_val_score(forest, X_scaled, data.target, scoring='f1', cv=2))
res

In [66]:
res = []
for i in [0.001, 0.0001, 0.01, 0.1]:
  logreg = LogisticRegression(tol = i, C = 1)
  res.append(cross_val_score(logreg, X_scaled, data.target, scoring='f1', cv=3))
res

[array([0.98353293, 0.98570354, 0.98636364]),
 array([0.98353293, 0.98570354, 0.98636364]),
 array([0.98353293, 0.98570354, 0.98636364]),
 array([0.98426966, 0.98417483, 0.98484848])]

Стандартные параметры подходят лучше всего

Получаем итоговые результаты

In [None]:
forest = RandomForestClassifier()
forest.fit(X_scaled, data.target)
y_pred = forest.predict(X_test)

In [54]:
naive = GaussianNB()
naive.fit(X_scaled, data.target)
y_pred = naive.predict(X_test)

In [45]:
logreg = LogisticRegression()
logreg.fit(X_scaled, data.target)
y_pred = logreg.predict(X_test)

In [None]:
cat = CatBoostClassifier(iterations=100)
cat.fit(X_scaled, data.target)
y_pred = cat.predict(X_test)

In [73]:
ans = pd.DataFrame(y_pred, columns=['target'])

In [74]:
ans.to_csv('sub.csv', index=False)

In [57]:
ans.value_counts()

target
0         2732
1         1188
dtype: int64