In [2]:
# Kernel smoke test: prints a fixed string; nothing below depends on it.
print("Hello, world")

Hello, world


In [3]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import RobustScaler
from catboost import CatBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, roc_curve, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

## Предварительная настройка

In [4]:
# Feature frames prepared by an earlier step (base items, train queries, validation queries).
df_base = pd.read_parquet("../tmp/cleared_df_base_new_features.parquet")
df_train = pd.read_parquet("../tmp/cleared_df_train_new_features.parquet")
df_validation = pd.read_parquet("../tmp/cleared_df_validation_new_features.parquet")

# Candidate lists per query; the `predictions` column is exploded further down.
df_ptargets = pd.read_parquet("../tmp/predicted_targets.parquet")
df_validation_predicts = pd.read_parquet("../tmp/predicted_validations.parquet")

# Validation answers, keyed by the query id.
df_validation_target = pd.read_csv("../data/validation_answer.csv").set_index("Id")




In [5]:
def checkRobustScaler(base, train, validation):
    """Scale the three frames with a single RobustScaler fitted on ``base``.

    Args:
        base: frame whose columns define the feature set; the scaler is fitted on it.
        train: frame containing the columns of ``base`` plus a ``Target`` column.
        validation: frame containing the columns of ``base``.

    Returns:
        Tuple ``(tbase, ttrain, tvalidation, scaler)``: the scaled frames, each indexed by
        its source frame's index under the name ``Id`` (``ttrain`` additionally carries
        ``Target``), followed by the fitted scaler.
    """
    feature_columns = base.columns

    scaler = RobustScaler()
    scaler.fit(base)

    def _as_frame(scaled_values, source):
        # Wrap the scaler's array output back into a frame indexed by the source's index as "Id".
        frame = pd.DataFrame(scaled_values, columns=feature_columns)
        frame["Id"] = source.index
        return frame.set_index(["Id"])

    tbase = _as_frame(scaler.transform(base), base)

    # Select the columns in base order so train is transformed with the same layout the scaler saw.
    ttrain = _as_frame(scaler.transform(train[feature_columns]), train)
    # Index-aligned assignment: labels are matched to the scaled rows by their Id.
    ttrain["Target"] = train["Target"]

    tvalidation = _as_frame(scaler.transform(validation[feature_columns]), validation)

    return tbase, ttrain, tvalidation, scaler

In [6]:
# Rescale all three frames with the scaler fitted on df_base; the fitted scaler itself is discarded (`_`).
# NOTE: the raw frames are overwritten by their scaled versions, so this cell is not idempotent:
# re-running it without reloading the parquet files scales already-scaled data.
df_base, df_train, df_validation, _ = checkRobustScaler(df_base, df_train, df_validation)

## Ранжирование
### Формирование тренировочной выборки

Первым делом необходимо составить новый датафрейм, состоящий из двух половин:

1) Искомые векторы
2) Несколько кандидатов на сопоставление для каждого искомого вектора.

Последним столбцом этого датафрейма станет `is_match`, содержащий правильный ответ: 1 — вектор подходит, 0 — вектор не подходит.

In [7]:
# One row per (query, candidate): `explode` unpacks each list-like in `predictions`
# and repeats the query's index label for every element.
exploded_predictions = df_ptargets["predictions"].explode()

In [33]:
# Pair every query's true `Target` with each of its exploded candidates. Only `Target` is taken
# from df_ptargets, so the un-exploded `predictions` column is not carried through the merge and
# no `_x`/`_y` suffix clean-up is needed afterwards.
prepare_train = pd.merge(
    df_ptargets[["Target"]],
    exploded_predictions,
    left_index=True,
    right_index=True,
)
# Label: 1 when the candidate is the true match, 0 otherwise. A vectorized comparison replaces the
# row-wise `apply(..., axis=1)`, which invoked a Python lambda once per row.
prepare_train["is_match"] = (prepare_train["Target"] == prepare_train["predictions"]).astype("int64")
prepare_train.head(10)

Unnamed: 0_level_0,Target,predictions,is_match
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0-query,675816-base,500710-base,0
0-query,675816-base,675816-base,1
0-query,675816-base,2725256-base,0
0-query,675816-base,13406-base,0
0-query,675816-base,170933-base,0
0-query,675816-base,297061-base,0
0-query,675816-base,361564-base,0
0-query,675816-base,823686-base,0
0-query,675816-base,926408-base,0
0-query,675816-base,1396458-base,0


In [9]:
# Work on a copy so prepare_train keeps its `Target` column; `pop` removes it from df_ranging in place
# (the merge with df_train in the next cell brings a `Target` column back in).
# `pop` also returns the removed Series, which is why this cell echoes the Target column as its output.
df_ranging = prepare_train.copy()
df_ranging.pop("Target")

Id
0-query         675816-base
0-query         675816-base
0-query         675816-base
0-query         675816-base
0-query         675816-base
                   ...     
99999-query    2769109-base
99999-query    2769109-base
99999-query    2769109-base
99999-query    2769109-base
99999-query    2769109-base
Name: Target, Length: 20000000, dtype: object

In [10]:
# Attach the query's scaled feature vector (plus its Target) to every candidate row;
# both frames are joined on their index.
df_ranging_with_train = df_ranging.merge(df_train, left_index=True, right_index=True)
df_ranging_with_train.head()

Unnamed: 0_level_0,predictions,is_match,0,1,2,3,4,5,7,8,...,71,6,21,25,33,44,59,65,70,Target
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0-query,500710-base,0,0.956813,1.468961,0.034035,-1.398175,1.222708,-0.390937,0.203191,-1.744734,...,0.404513,0.127371,-1.165876,0.0,0.676024,0.310089,1.44861,-0.74787,1.106625,675816-base
0-query,675816-base,1,0.956813,1.468961,0.034035,-1.398175,1.222708,-0.390937,0.203191,-1.744734,...,0.404513,0.127371,-1.165876,0.0,0.676024,0.310089,1.44861,-0.74787,1.106625,675816-base
0-query,2725256-base,0,0.956813,1.468961,0.034035,-1.398175,1.222708,-0.390937,0.203191,-1.744734,...,0.404513,0.127371,-1.165876,0.0,0.676024,0.310089,1.44861,-0.74787,1.106625,675816-base
0-query,13406-base,0,0.956813,1.468961,0.034035,-1.398175,1.222708,-0.390937,0.203191,-1.744734,...,0.404513,0.127371,-1.165876,0.0,0.676024,0.310089,1.44861,-0.74787,1.106625,675816-base
0-query,170933-base,0,0.956813,1.468961,0.034035,-1.398175,1.222708,-0.390937,0.203191,-1.744734,...,0.404513,0.127371,-1.165876,0.0,0.676024,0.310089,1.44861,-0.74787,1.106625,675816-base


In [11]:
# Append the candidate's own feature vector, looked up in df_base by the id stored in `predictions`.
# Overlapping column names from df_base get a "_base" suffix; the query-side columns keep their names.
df_ranging_with_train = df_ranging_with_train.merge(
    df_base,
    left_on="predictions",
    right_index=True,
    suffixes=("", "_base"),
)

In [12]:
# Split the merged frame into the label and the model inputs without the intermediate
# `temp = df_ranging_with_train.copy()`: that duplicated the whole merged frame and the
# copy was never released afterwards.
df_ranging_target = df_ranging_with_train["is_match"].copy()
# `predictions` and `Target` hold ids rather than features, and `is_match` is the label itself.
df_ranging_features = df_ranging_with_train.drop(columns=["is_match", "predictions", "Target"])

In [36]:
# The label only takes the values 0 and 1, so downcast it to int8 to save memory.
df_ranging_target = df_ranging_target.astype('int8')

In [37]:
# Echo the label Series to check its length and dtype (pandas truncates the repr).
df_ranging_target

Id
0-query        0
10300-query    0
12012-query    0
13887-query    0
18253-query    0
              ..
99994-query    0
99994-query    0
99996-query    0
99996-query    0
99996-query    0
Name: is_match, Length: 20000000, dtype: int8

In [None]:
# Drop the names of the large intermediates that the cells below no longer reference.
del df_ranging_with_train, df_ranging, df_train, prepare_train

### Validation dataset

In [17]:
# One row per (validation query, candidate). `explode` already returns a new Series, so the
# previous `.copy()` of the whole frame was removed: it only duplicated the data.
exploded_predictions_validation = df_validation_predicts["predictions"].explode()
exploded_predictions_validation

Id
100000-query     907667-base
100000-query    1300806-base
100000-query    1542803-base
100000-query     598166-base
100000-query    2756886-base
                    ...     
199999-query    1639086-base
199999-query    1643873-base
199999-query    1649728-base
199999-query    1662028-base
199999-query    1694474-base
Name: predictions, Length: 20000000, dtype: object

In [18]:
# Display the scaled validation features, one row per query (pandas truncates the repr).
df_validation

Unnamed: 0_level_0,0,1,2,3,4,5,7,8,9,10,...,69,71,6,21,25,33,44,59,65,70
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100000-query,0.853586,-0.656829,0.612866,0.784502,-0.018218,-0.240307,1.381604,-1.297504,-0.565011,0.177526,...,-0.031716,-1.224035,0.127371,-1.165876,0.0,0.676024,0.310089,1.448610,-0.747870,1.106625
100001-query,0.960482,0.702699,0.010976,0.465026,0.041653,-0.477616,0.501399,0.380092,-0.823477,-0.216906,...,0.264382,0.421479,0.645274,-1.798307,0.0,0.917652,0.984351,-0.369892,-0.081698,0.502892
100002-query,0.652385,-1.777674,0.723699,-0.879850,-0.249358,0.245117,0.774008,0.869364,0.531550,-1.420424,...,0.899212,-0.399942,1.036803,0.000000,0.0,-1.424487,0.366366,0.422263,0.390129,0.000000
100003-query,-0.386152,1.195725,1.074489,-0.048298,-0.455432,-1.587295,-0.462637,-1.042815,-0.565117,0.495986,...,0.977133,-0.729207,-1.597013,0.000000,0.0,-1.424487,0.348640,-0.152976,0.462025,0.000000
100004-query,0.198131,-1.242776,0.556516,-0.907091,0.866076,1.120492,0.157958,0.590553,-0.250556,1.183802,...,-0.514030,-1.371098,0.173708,0.000000,0.0,-0.530862,-0.494542,0.304914,-0.911585,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995-query,1.170070,-0.206277,1.025210,1.153250,-0.514679,0.607969,1.013998,-0.136983,-0.232336,-0.659871,...,-0.134600,-0.611465,-0.152810,0.000000,0.0,-0.530862,0.046929,-0.482846,0.000765,0.000000
199996-query,-0.253035,0.416130,1.151852,0.796068,-1.460035,-0.454642,-0.781498,-0.036944,-0.578852,-0.223344,...,-1.067438,-0.266284,0.590352,0.000000,0.0,0.900627,0.387660,-0.567673,-0.603674,-0.863420
199997-query,0.798816,0.820355,1.331185,0.445897,0.538786,0.221994,-0.052558,0.501713,0.530557,-0.021421,...,-0.649532,0.366577,-0.009113,-1.302426,0.0,0.099872,0.248006,-1.073456,0.049112,0.788698
199998-query,1.564024,0.537205,0.245098,0.725535,-0.586622,0.909480,-1.172819,-0.021251,-1.059704,-0.029611,...,1.205213,-0.220387,-0.599704,0.000000,0.0,0.577276,-0.836854,0.565303,-0.675991,0.000000


In [19]:
# Build the validation counterpart of the training frame in three joins:
#   1. query features  x exploded candidates   (on the query index)
#   2. ... x candidate features from df_base   (on the candidate id, suffixed "_base")
#   3. ... x expected answer                   (on the query index)
validation_with_predict = (
    df_validation
    .merge(exploded_predictions_validation, left_index=True, right_index=True)
    .merge(df_base, left_on="predictions", right_index=True, suffixes=("", "_base"))
    .merge(df_validation_target, left_index=True, right_index=True)
)
validation_with_predict[["predictions", "Expected"]]

Unnamed: 0_level_0,predictions,Expected
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
100000-query,907667-base,2676668-base
100000-query,1300806-base,2676668-base
100000-query,1542803-base,2676668-base
100000-query,598166-base,2676668-base
100000-query,2756886-base,2676668-base
...,...,...
199999-query,1211728-base,336472-base
199999-query,4300260-base,336472-base
199999-query,2639563-base,336472-base
199999-query,3591968-base,336472-base


In [38]:
# Label: 1 when the candidate equals the expected answer, 0 otherwise. The vectorized comparison
# replaces a row-wise `apply(..., axis=1)` that built a full row object for each of the 20M rows,
# and casts straight to int8 instead of going through a wider integer column first.
validation_with_predict["is_matching"] = (
    validation_with_predict["Expected"] == validation_with_predict["predictions"]
).astype("int8")
validation_with_predict.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20000000 entries, 100000-query to 199999-query
Columns: 147 entries, 0 to is_matching
dtypes: float32(144), int8(1), object(2)
memory usage: 11.2+ GB


In [39]:
# `pop` removes `is_matching` from validation_with_predict in place, so no copy of the frame is made.
# Consequence: this cell is not re-runnable; a second run raises KeyError until the column is rebuilt.
validation_with_predict_target = validation_with_predict.pop("is_matching")
# `drop` returns a new frame with the model inputs only; validation_with_predict itself keeps
# `predictions` and `Expected`, which the ranking evaluation further down reads.
validation_with_predict_features = validation_with_predict.drop(columns=["predictions", "Expected"])

In [40]:
# Display the merged validation frame after `is_matching` was popped (pandas truncates the repr).
validation_with_predict

Unnamed: 0_level_0,0,1,2,3,4,5,7,8,9,10,...,71_base,6_base,21_base,25_base,33_base,44_base,59_base,65_base,70_base,Expected
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100000-query,0.853586,-0.656829,0.612866,0.784502,-0.018218,-0.240307,1.381604,-1.297504,-0.565011,0.177526,...,-0.702531,0.059143,0.000000,0.000000,-1.424487,0.771361,0.319378,0.330624,1.804213,2676668-base
100000-query,0.853586,-0.656829,0.612866,0.784502,-0.018218,-0.240307,1.381604,-1.297504,-0.565011,0.177526,...,-1.257295,0.548947,0.000000,0.000000,-0.469311,0.143271,-0.122453,-1.683778,1.501388,2676668-base
100000-query,0.853586,-0.656829,0.612866,0.784502,-0.018218,-0.240307,1.381604,-1.297504,-0.565011,0.177526,...,-0.750859,-0.164953,-0.888499,0.000000,0.000000,0.771361,0.550960,-0.886501,0.943525,2676668-base
100000-query,0.853586,-0.656829,0.612866,0.784502,-0.018218,-0.240307,1.381604,-1.297504,-0.565011,0.177526,...,-0.380756,-0.518201,-0.888499,0.000000,0.000000,0.771361,-0.262996,-1.078255,0.702799,2676668-base
100000-query,0.853586,-0.656829,0.612866,0.784502,-0.018218,-0.240307,1.381604,-1.297504,-0.565011,0.177526,...,-1.102533,0.955181,-0.975797,0.000000,-0.683738,0.562298,1.155880,-1.240554,0.589936,2676668-base
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199999-query,1.345030,-0.178396,0.276827,-0.358665,-0.048915,0.323115,-0.734725,-0.518630,-0.812421,-0.012155,...,-0.481667,0.049632,-0.813250,-2.898052,-0.295251,-0.620245,-0.798755,-1.518359,2.240414,336472-base
199999-query,1.345030,-0.178396,0.276827,-0.358665,-0.048915,0.323115,-0.734725,-0.518630,-0.812421,-0.012155,...,-0.345055,0.047311,-0.950591,-1.169161,-0.168675,0.112891,-1.187357,-0.381964,0.000000,336472-base
199999-query,1.345030,-0.178396,0.276827,-0.358665,-0.048915,0.323115,-0.734725,-0.518630,-0.812421,-0.012155,...,-0.764711,0.910881,0.000000,-1.555251,-0.431130,0.605086,-0.084612,-0.274142,0.814018,336472-base
199999-query,1.345030,-0.178396,0.276827,-0.358665,-0.048915,0.323115,-0.734725,-0.518630,-0.812421,-0.012155,...,0.064853,0.970682,-0.888499,0.000000,0.000000,0.745191,-0.779386,-0.785652,1.350116,336472-base


## Логистическая регрессия

In [41]:
def checkModel(model, train_features, train_target, valid_feauters, valid_target):
    """Fit ``model`` on the training data and report its quality on the validation data.

    Prints the validation accuracy, draws a confusion matrix normalized over the true
    labels, and plots the ROC curve.

    Args:
        model: estimator exposing ``fit``, ``predict`` and ``predict_proba``.
        train_features: training feature matrix.
        train_target: training labels.
        valid_feauters: validation feature matrix (parameter spelling kept so that
            existing keyword callers continue to work).
        valid_target: validation labels.

    Returns:
        Column 1 of ``model.predict_proba(valid_feauters)``.
    """
    model.fit(train_features, train_target)

    valid_predictions = model.predict(valid_feauters)
    valid_probabilities = model.predict_proba(valid_feauters)[:, 1]
    valid_accuracy = accuracy_score(valid_target, valid_predictions)

    print('valid_accuracy: {:.2%}'.format(valid_accuracy))
    # Confusion matrix. Built from the predictions computed above: `from_estimator` would call
    # `model.predict` on the whole validation set a second time.
    ConfusionMatrixDisplay.from_predictions(valid_target, valid_predictions, cmap=plt.cm.Blues, normalize='true');

    # ROC curve
    fpr, tpr, thresholds = roc_curve(valid_target, valid_probabilities)

    plt.figure(figsize = (6, 6))
    plt.step(fpr, tpr, where='post')
    plt.xlim(0, 1)
    plt.ylim(0, 1)
    plt.xlabel('FPR')
    plt.ylabel('TPR')
    plt.title('ROC-кривая')
    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], linestyle='--')
    plt.grid(True)
    plt.show()

    return valid_probabilities


In [48]:
# Check the size, column count and dtypes of the training feature matrix before fitting.
df_ranging_features.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20000000 entries, 0-query to 99996-query
Columns: 144 entries, 0 to 70_base
dtypes: float32(144)
memory usage: 10.9+ GB


In [49]:
# Check that the label Series has no nulls and the expected dtype.
df_ranging_target.info()

<class 'pandas.core.series.Series'>
Index: 20000000 entries, 0-query to 99996-query
Series name: is_match
Non-Null Count     Dtype
--------------     -----
20000000 non-null  int8 
dtypes: int8(1)
memory usage: 203.9+ MB


In [None]:
# Logistic regression ranker with balanced class weights; checkModel fits it, prints and plots the
# validation diagnostics, and returns the class-1 probability for every validation row.
logreg_valid_probabilities = checkModel(
    LogisticRegression(solver="liblinear", class_weight='balanced', random_state=42, C=1.5),
    df_ranging_features,
    df_ranging_target,
    validation_with_predict_features,
    validation_with_predict_target,
)

In [None]:
# Quick look at the returned probability array.
print(logreg_valid_probabilities)

In [None]:
logreg_valid_proba_series = pd.Series(logreg_valid_probabilities)

# Rows are assumed to come in equally sized consecutive blocks, one block per query (the CatBoost
# cell below makes the same assumption). The block size is derived from the data instead of being
# hard-coded: the previous step of 100 did not match 20,000,000 rows / 100,000 queries = 200
# candidates per query, so every query was cut in two and no longer lined up with its target.
candidates_per_query, leftover = divmod(len(logreg_valid_proba_series), len(df_validation_target))
assert candidates_per_query > 0 and leftover == 0, "row count is not a multiple of the number of queries"

logres_valid_candidates = []
for start in range(0, len(logreg_valid_proba_series), candidates_per_query):
    query = logreg_valid_proba_series.iloc[start : start + candidates_per_query]
    # Labels of the 5 highest-probability rows; the Series has a default integer index,
    # so these labels are also row positions in the validation frame.
    index = query.sort_values(ascending=False).iloc[:5].index
    logres_valid_candidates.append(index)

# sanity check
print('Кол-во предсказаний:', len(logres_valid_candidates))
print('Кол-во кандидатов:', len(logres_valid_candidates[0]))


### ---------------------------------------------------------------------------------------------------------

In [None]:
# Accuracy@5: share of candidate groups whose expected answer is among the 5 top-ranked candidates.
# The candidate labels are row positions in validation_with_predict, which still carries both the
# candidate id (`predictions`) and the ground truth (`Expected`) on every row. Each hit is therefore
# checked against the rows' own expected answer instead of relying on df_validation_target being in
# the same order (and of the same length) as the candidate groups.
candidate_ids = validation_with_predict["predictions"].to_numpy()
expected_ids = validation_with_predict["Expected"].to_numpy()

acc = 0
for candidates in logres_valid_candidates:
    rows = candidates.tolist()
    acc += int((candidate_ids[rows] == expected_ids[rows]).any())

print('valid_accuracy@{}: {:.2f}%'.format(5, acc / len(logres_valid_candidates) * 100))


In [None]:
# Raw hit count behind the percentage printed by the previous cell.
acc

### CatBoost

In [50]:
# Gradient-boosted ranker: balanced class weights, 1000 trees, progress logged every 100 iterations.
cbc = CatBoostClassifier(
    auto_class_weights = 'Balanced',
    verbose=100,
    random_state=42,
    n_estimators=1000,
    bootstrap_type="Bernoulli",
)

In [51]:
# Train on the underlying NumPy arrays. No evaluation set is passed, so the log below only
# reports the training loss (`learn`).
cbc.fit(df_ranging_features.values, df_ranging_target.values)

Learning rate set to 0.5
0:	learn: 0.6421981	total: 1.44s	remaining: 24m
100:	learn: 0.1458611	total: 2m 4s	remaining: 18m 23s
200:	learn: 0.1093717	total: 3m 57s	remaining: 15m 43s
300:	learn: 0.0840929	total: 5m 46s	remaining: 13m 25s
400:	learn: 0.0714349	total: 7m 37s	remaining: 11m 23s
500:	learn: 0.0608779	total: 9m 27s	remaining: 9m 24s
600:	learn: 0.0509595	total: 11m 18s	remaining: 7m 30s
700:	learn: 0.0456659	total: 13m 11s	remaining: 5m 37s
800:	learn: 0.0403791	total: 15m 6s	remaining: 3m 45s
900:	learn: 0.0363781	total: 17m	remaining: 1m 52s
999:	learn: 0.0323958	total: 18m 51s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x1dad02f00a0>

In [52]:
# Class probabilities for every validation (query, candidate) row; the next cell keeps column 1 only.
predictions = cbc.predict_proba(validation_with_predict_features.values)

In [53]:
# Keep only the second probability column (index 1) as a one-column frame named `predictions`.
df_predictions = pd.DataFrame({"predictions": predictions[:, 1]})

0.999999999844134

In [64]:
cbc_valid_proba_series = pd.Series(df_predictions['predictions'])

# Rows are assumed to come in equally sized consecutive blocks, one block per query. The block
# size is derived from the data (rows / queries) instead of the hard-coded 200, and the assert
# fails loudly if the two frames stop lining up.
candidates_per_query, leftover = divmod(len(cbc_valid_proba_series), len(df_validation_target))
assert candidates_per_query > 0 and leftover == 0, "row count is not a multiple of the number of queries"

cbc_valid_candidates = []
for start in range(0, len(cbc_valid_proba_series), candidates_per_query):
    query = cbc_valid_proba_series.iloc[start : start + candidates_per_query]
    # Labels of the 5 highest-probability rows; with the default integer index these are
    # also row positions in the validation frame.
    index = query.sort_values(ascending=False).iloc[:5].index
    cbc_valid_candidates.append(index)

# sanity check
print('Кол-во предсказаний:', len(cbc_valid_candidates))
print('Кол-во кандидатов:', len(cbc_valid_candidates[0]))


# Accuracy@5: a group counts as a hit when one of its 5 top-ranked rows has a candidate id equal
# to that row's own expected answer. Plain NumPy arrays replace the per-query `DataFrame.iloc`
# lookup, and the check no longer depends on df_validation_target sharing the groups' order.
candidate_ids = validation_with_predict["predictions"].to_numpy()
expected_ids = validation_with_predict["Expected"].to_numpy()

acc = 0
for candidates in cbc_valid_candidates:
    rows = candidates.tolist()
    acc += int((candidate_ids[rows] == expected_ids[rows]).any())

print('valid_accuracy@{}: {:.2f}%'.format(5, acc / len(cbc_valid_candidates) * 100))



Кол-во предсказаний: 100000
Кол-во кандидатов: 5
valid_accuracy@5: 57.99%
