In [83]:
import pandas as pd
import numpy as np

import yaml

import re

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, \
                            recall_score, f1_score, log_loss, precision_recall_curve

import matplotlib.pyplot as plt

from lightgbm import LGBMRanker

RAND=42

In [2]:
def extract_words_in_quotes(string):
    return set(re.findall(r"'([^']*)'", string))

In [55]:
def get_metrics(y_test, y_pred, y_score, name = "Default"):
    """Метрики для задачи классификации"""
    df_metrics = pd.DataFrame()

    df_metrics['model'] = [name]
    df_metrics['Accuracy'] = accuracy_score(y_test, y_pred)
    df_metrics['ROC_AUC'] = roc_auc_score(y_test, y_score[:, 1])
    df_metrics['Precision'] = precision_score(y_test, y_pred)
    df_metrics['Recall'] = recall_score(y_test, y_pred)
    df_metrics['f1'] = f1_score(y_test, y_pred)
    df_metrics['Logloss'] = log_loss(y_test, y_score)

    return df_metrics

In [9]:
config_path = "../config/params.yaml"
config = yaml.load(open(config_path), Loader=yaml.FullLoader)

baseline = config["baseline"]

# Baseline

In [4]:
df = pd.read_csv("eda.csv")
df[:5]

Unnamed: 0,index,purchase,min_publish_date,price,customer,supplier,is_winner,reg_code_small,tokens,month,flag_won
0,8,3,2021-02-02,290000.0,2,1,1,58.2_2_1,"frozenset({'год', 'заказчик', 'оказание', 'мун...",2,0
1,8151,2834,2021-02-02,105000.0,218,1,1,62.0_2_1,"frozenset({'замена', 'подобный', 'ш.', 'объект...",2,0
2,457484,154949,2021-02-16,98967.5,11235,1,1,63.9_2_1,"frozenset({'сточный', 'вода', 'рассказово', 'п...",2,0
3,437783,147950,2021-03-10,77460.03,11061,1,1,62.0_2_0,"frozenset({'год', 'центр', 'поставка', 'вентил...",3,0
4,488119,165990,2021-03-26,138000.0,11558,1,1,62.0_2_0,"frozenset({'замена', 'стартер', 'резьбовой', '...",3,0


In [5]:
df = df.drop(columns=['index'])
df = df.set_index('purchase')

In [6]:
df['tokens'] = df['tokens'].apply(extract_words_in_quotes)

In [10]:
df = df.astype(baseline['change_type_columns'])

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 958129 entries, 3 to 464151
Data columns (total 9 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   min_publish_date  958129 non-null  datetime64[ns]
 1   price             958129 non-null  float64       
 2   customer          958129 non-null  category      
 3   supplier          958129 non-null  category      
 4   is_winner         958129 non-null  int64         
 5   reg_code_small    958129 non-null  category      
 6   tokens            958129 non-null  object        
 7   month             958129 non-null  int64         
 8   flag_won          958129 non-null  int64         
dtypes: category(3), datetime64[ns](1), float64(1), int64(3), object(1)
memory usage: 62.2+ MB


In [12]:
df_train = df[df['min_publish_date'] < '2022-11-01']
df_train[:5]

Unnamed: 0_level_0,min_publish_date,price,customer,supplier,is_winner,reg_code_small,tokens,month,flag_won
purchase,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
3,2021-02-02,290000.0,2,1,1,58.2_2_1,"{нужда, экземпляр, услуга, технический, год, я...",2,0
2834,2021-02-02,105000.0,218,1,1,62.0_2_1,"{транспортный, автомагистраль, услуга, парковы...",2,0
154949,2021-02-16,98967.5,11235,1,1,63.9_2_1,"{переулок, прокладка, реконструкция, работа, р...",2,0
147950,2021-03-10,77460.03,11061,1,1,62.0_2_0,"{нужда, гбуз, центр, поставка, вентиляция, год...",3,0
165990,2021-03-26,138000.0,11558,1,1,62.0_2_0,"{патрон, сдо-5, защелки, стартер, восстановлен...",3,0


In [34]:
df_test = df[df['min_publish_date'] >= '2022-11-01']
df_test[:5]

Unnamed: 0_level_0,min_publish_date,price,customer,supplier,is_winner,reg_code_small,tokens,month,flag_won
purchase,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1064,2022-11-15,181720.0,70,1,1,58.2_2_1,"{гбук, услуга, фонд, работа, государственный, ...",11,1
1729,2022-12-06,167448.0,105,1,1,62.0_2_1,"{нужда, многолетник, формовочный, текущий, мца...",12,0
2430,2022-12-08,200248.16,201,1,1,62.0_2_1,"{решётка, столб, мм, столбик, ограждение, сетк...",12,1
156898,2022-12-12,190740.0,11266,1,1,62.0_2_0,"{журнальный, нужда, гбуз, паллиативный, 2021, ...",12,0
412017,2022-11-07,2886156.0,9651,100009,0,27.4_77_1,"{метронидазол, гбуз, дексаметазон, поставка, х...",11,1


In [36]:
df_test_Y = df_test.reset_index().groupby('supplier')['purchase'].apply(list).to_frame(name='purchases')
df_test_Y['is_winner'] = df_test.groupby('supplier')['is_winner'].apply(list)
df_test_Y[:5]

Unnamed: 0_level_0,purchases,is_winner
supplier,Unnamed: 1_level_1,Unnamed: 2_level_1
1,"[1064, 1729, 2430, 156898]","[1, 1, 1, 1]"
2,"[141478, 536503]","[0, 1]"
3,[149878],[0]
4,[],[]
5,[154789],[1]


In [37]:
df_test_X = df_test.drop(columns=['is_winner', 'supplier'])

In [38]:
df_test_X[:5]

Unnamed: 0_level_0,min_publish_date,price,customer,reg_code_small,tokens,month,flag_won
purchase,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1064,2022-11-15,181720.0,70,58.2_2_1,"{гбук, услуга, фонд, работа, государственный, ...",11,1
1729,2022-12-06,167448.0,105,62.0_2_1,"{нужда, многолетник, формовочный, текущий, мца...",12,0
2430,2022-12-08,200248.16,201,62.0_2_1,"{решётка, столб, мм, столбик, ограждение, сетк...",12,1
156898,2022-12-12,190740.0,11266,62.0_2_0,"{журнальный, нужда, гбуз, паллиативный, 2021, ...",12,0
412017,2022-11-07,2886156.0,9651,27.4_77_1,"{метронидазол, гбуз, дексаметазон, поставка, х...",11,1


In [75]:
df_temp = df_train[df_train['reg_code_small'].isin(df_train[df_train['supplier'] == 1]['reg_code_small'].unique())]
df_temp['target'] = df_temp['supplier'].apply(lambda x: 1 if x == 1 else 0)
df_temp = df_temp.drop(columns=['is_winner', 'supplier', 'tokens', 'min_publish_date', 'customer', 'reg_code_small'])

x_train, x_test, y_train, y_test = train_test_split(df_temp[df_temp.columns[:-1]],
                                                    df_temp[df_temp.columns[-1]],
                                                    test_size=0.3,
                                                    stratify=df_temp[df_temp.columns[-1]],
                                                    random_state=RAND)

mms = MinMaxScaler()
x_train = mms.fit_transform(x_train)
x_test = mms.transform(x_test)

model = LogisticRegression(random_state=RAND, class_weight='balanced')
model.fit(x_train, y_train)

y_score = model.predict_proba(x_test)

precision, recall, thresholds = precision_recall_curve(y_test, y_score[:, 1])
optimal_threshold = thresholds[np.argwhere(recall==1).flatten()[-1]]

y_pred = np.where(y_score[:, 1] >= optimal_threshold, 1, 0)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_temp['target'] = df_temp['supplier'].apply(lambda x: 1 if x == 1 else 0)
