In [1]:
import pandas as pd
import numpy as np

# Load the raw training data
df = pd.read_parquet("Задание/train_data.pqt")

# Lookup city -> index_city_code, built only from rows where the code is present;
# when a city occurs in several rows, the first such row supplies the code
rows_with_code = df[df['index_city_code'].notna()]
first_row_per_city = rows_with_code.drop_duplicates(subset='city')
city_code_dict = first_row_per_city.set_index('city')['index_city_code'].to_dict()

def fill_index_city_code(row):
    """Return the value to store in ``index_city_code`` for one row.

    A code that is already present is returned unchanged. A missing code is
    replaced by the entry for the row's ``city`` in the module-level
    ``city_code_dict``; if the city has no entry, the (missing) original
    value is returned as is.
    """
    current_code = row['index_city_code']
    if pd.notnull(current_code):
        return current_code
    # Missing code: fall back to the city lookup, keeping the original value on a miss
    return city_code_dict.get(row['city'], current_code)

# Fill missing index_city_code values from the city lookup in one vectorized pass
# (replaces a row-by-row df.apply(..., axis=1), which is very slow on large frames).
# Cities absent from city_code_dict map to NaN and therefore leave the gap untouched;
# rows whose city itself is missing are never filled, since no code can be inferred.
code_from_city = df['city'].map(city_code_dict).where(df['city'].notna())
df['index_city_code'] = df['index_city_code'].fillna(code_from_city)

# Columns left out of the generic gap filling below
columns_to_exclude = ['index_city_code']
numeric_columns = df.select_dtypes(include=[np.number]).drop(columns=columns_to_exclude, errors='ignore').columns
non_numeric_columns = df.select_dtypes(exclude=[np.number]).drop(columns=columns_to_exclude, errors='ignore').columns

# Gap filling is applied only to rows whose date is one of these months
months = ['month_1', 'month_2', 'month_3']
mask = df['date'].isin(months)
if mask.any():
    # Linear interpolation of the numeric columns over the selected rows, in row order.
    # NOTE(review): both fills run over consecutive rows of the whole frame; if the data
    # holds several rows per client, values can bleed from one client into the next —
    # consider grouping by the client id first. TODO confirm.
    df.loc[mask, numeric_columns] = df.loc[mask, numeric_columns].interpolate(method='linear')

    # Forward-fill the non-numeric columns. DataFrame.ffill() is the supported
    # spelling; fillna(method='ffill') is deprecated in recent pandas releases.
    df.loc[mask, non_numeric_columns] = df.loc[mask, non_numeric_columns].ffill()
else:
    print('Упс, указанные месяцы не найдены в столбце "date".')



from pathlib import Path

# Persist the cleaned training data for the modelling cells
output_file_path = "results/updated_train_data.csv"
# to_csv raises if the target directory is missing (e.g. on a fresh checkout)
Path(output_file_path).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_file_path, index=False)
print(f'DataFrame сохранен в файл: {output_file_path}')

DataFrame сохранен в файл: results/updated_train_data.csv


In [2]:
import pandas as pd
import numpy as np

# Load the raw test data
df = pd.read_parquet("Задание/test_data.pqt")

# Lookup city -> index_city_code from rows of this frame where the code is present;
# the first such row per city supplies the code
rows_with_code = df[df['index_city_code'].notna()]
first_row_per_city = rows_with_code.drop_duplicates(subset='city')
city_code_dict = first_row_per_city.set_index('city')['index_city_code'].to_dict()

def fill_index_city_code(row):
    """Pick the ``index_city_code`` for a single row.

    Rows that already carry a code keep it. Rows with a missing code take the
    value stored for their ``city`` in the module-level ``city_code_dict``;
    when the city is not a key of that dict, the missing value is returned
    unchanged.
    """
    existing = row['index_city_code']
    needs_fill = pd.isnull(existing)
    if not needs_fill:
        return existing
    # Only consult the lookup when the code is actually missing
    return city_code_dict.get(row['city'], existing)

# Fill missing index_city_code values from the city lookup in one vectorized pass
# (replaces a row-by-row df.apply(..., axis=1), which is very slow on large frames).
# Cities absent from city_code_dict map to NaN and therefore leave the gap untouched;
# rows whose city itself is missing are never filled, since no code can be inferred.
code_from_city = df['city'].map(city_code_dict).where(df['city'].notna())
df['index_city_code'] = df['index_city_code'].fillna(code_from_city)

# Columns left out of the generic gap filling below. Defined here so this cell
# does not depend on a variable created by the training-data cell.
columns_to_exclude = ['index_city_code']

# Numeric columns, without the excluded ones
numeric_columns = df.select_dtypes(include=[np.number]).drop(columns=columns_to_exclude, errors='ignore').columns

# Non-numeric columns, without the excluded ones
non_numeric_columns = df.select_dtypes(exclude=[np.number]).drop(columns=columns_to_exclude, errors='ignore').columns

# Gap filling runs only if 'month_6' occurs in the date column; when it does,
# it is applied to every row of the frame, not just the month_6 rows.
if 'month_6' in df['date'].values:
    # Linear interpolation of the numeric columns, in row order.
    # NOTE(review): both fills run over consecutive rows of the whole frame; if the data
    # holds several rows per client, values can bleed from one client into the next —
    # consider grouping by the client id first. TODO confirm.
    df.loc[:, numeric_columns] = df.loc[:, numeric_columns].interpolate(method='linear')

    # Forward-fill the non-numeric columns. DataFrame.ffill() is the supported
    # spelling; fillna(method='ffill') is deprecated in recent pandas releases.
    df.loc[:, non_numeric_columns] = df.loc[:, non_numeric_columns].ffill()
else:
    print('Упс, "month_6" не найден в столбце "date".')

from pathlib import Path

# Persist the cleaned test data for the prediction cells
output_file_path = "results/updated_test_data.csv"
# to_csv raises if the target directory is missing (e.g. on a fresh checkout)
Path(output_file_path).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_file_path, index=False)
print(f'DataFrame сохранен в файл: {output_file_path}')

DataFrame сохранен в файл: results/updated_test_data.csv


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, log_loss

# Read the preprocessed training data written by the first cell
df = pd.read_csv("results/updated_train_data.csv")

# Columns fed to the scaler: general aggregates, then the 1-month and
# 3-month operation sums (same order as they are passed to the model)
numeric_features = [
    'balance_amt_avg', 'max_founderpres', 'sum_of_paym_2m', 'sum_of_paym_1y',
    # 1-month operation sums
    'sum_a_oper_1m', 'sum_b_oper_1m', 'sum_c_oper_1m',
    'sum_deb_d_oper_1m', 'sum_cred_d_oper_1m',
    'sum_deb_e_oper_1m', 'sum_cred_e_oper_1m',
    'sum_deb_f_oper_1m', 'sum_cred_f_oper_1m',
    'sum_deb_g_oper_1m', 'sum_cred_g_oper_1m',
    'sum_deb_h_oper_1m', 'sum_cred_h_oper_1m',
    # 3-month operation sums
    'sum_a_oper_3m', 'sum_b_oper_3m', 'sum_c_oper_3m',
    'sum_deb_d_oper_3m', 'sum_cred_d_oper_3m',
    'sum_deb_e_oper_3m', 'sum_cred_e_oper_3m',
    'sum_deb_f_oper_3m', 'sum_cred_f_oper_3m',
    'sum_deb_g_oper_3m', 'sum_cred_g_oper_3m',
    'sum_deb_h_oper_3m', 'sum_cred_h_oper_3m',
]

# Columns fed to the one-hot encoder
categorical_features = [
    'start_cluster',
    'channel_code',
    'city_type',
    'ogrn_month',
    'ogrn_year',
    'ft_registration_date',
    'okved',
    'segment',
]

# Preprocessing: standardize the numeric columns, one-hot encode the categorical
# ones (categories not seen during fit are encoded as all zeros)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ]
)

# Logistic regression on top of the preprocessing step.
# - multi_class is no longer passed: it is deprecated in recent scikit-learn, and
#   the default already fits a multinomial model with the saga solver.
# - random_state is fixed because saga shuffles the data, so runs were not
#   reproducible without it.
log_reg = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(solver='saga', max_iter=10000, n_jobs=-1, random_state=42)),
])

# Feature matrix / target and an 80/20 hold-out split
X = df[numeric_features + categorical_features]
y = df['end_cluster']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit on the training part, predict labels and class probabilities for the hold-out part
log_reg.fit(X_train, y_train)
y_pred = log_reg.predict(X_test)
y_pred_proba = log_reg.predict_proba(X_test)

# Evaluation. The probability columns follow log_reg.classes_; passing them as
# `labels` keeps log_loss correct even if a rare class is absent from y_test
# (without it the labels are inferred from y_test alone).
accuracy = accuracy_score(y_test, y_pred)
logloss = log_loss(y_test, y_pred_proba, labels=log_reg.classes_)

print(f"Accuracy: {accuracy}")
print(f"Log Loss: {logloss}")

Accuracy: 0.6679166666666667
Log Loss: 0.9703665317602224




In [7]:
from sklearn.metrics import roc_auc_score

def weighted_roc_auc(y_true, y_pred, labels, weights_dict):
    """Weighted average of the per-class one-vs-rest ROC AUC scores.

    Parameters
    ----------
    y_true : true class labels, passed through to ``roc_auc_score``.
    y_pred : predicted class scores, passed through to ``roc_auc_score``.
    labels : class labels; used both to look up the weights and as the
        ``labels`` argument of ``roc_auc_score``.
    weights_dict : mapping from class label to its unnormalized weight.

    Returns
    -------
    The sum of the per-class AUC values, each multiplied by its weight after
    the weights have been normalized to sum to 1.
    """
    raw_weights = np.array([weights_dict[class_label] for class_label in labels])
    normalized_weights = raw_weights / raw_weights.sum()
    # average=None makes roc_auc_score return one score per class
    per_class_auc = roc_auc_score(
        y_true,
        y_pred,
        labels=labels,
        multi_class="ovr",
        average=None,
    )
    weighted_terms = normalized_weights * per_class_auc
    return sum(weighted_terms)

# Unnormalized per-cluster weights for the metric, keyed by cluster label
cluster_weights = pd.read_excel("Задание/cluster_weights.xlsx").set_index("cluster")
weights_dict = cluster_weights.loc[:, "unnorm_weight"].to_dict()

# Weighted ROC AUC of the fitted pipeline on the hold-out split
y_pred_proba = log_reg.predict_proba(X_test)
weighted_roc_auc(
    y_true=y_test,
    y_pred=y_pred_proba,
    labels=log_reg.classes_,
    weights_dict=weights_dict,
)


0.8684799360032465

In [29]:
def replace_missing_with_mode(df):
    """Fill missing values in each column of ``df`` with that column's mode.

    The frame is modified in place and also returned, so both
    ``replace_missing_with_mode(df)`` and ``df = replace_missing_with_mode(df)``
    work. When a column has several modes, the first one returned by
    ``Series.mode()`` is used. Columns that are entirely missing have no mode
    and are left unchanged.
    """
    # Only columns that actually contain missing values need any work
    columns_with_missing_values = df.columns[df.isnull().any()].tolist()

    for column in columns_with_missing_values:
        modes = df[column].mode()
        if modes.empty:
            # All values are missing: nothing to fill with (mode()[0] would raise here)
            continue
        # Assign the result back instead of calling fillna(inplace=True) on df[column]:
        # the chained in-place form acts on a temporary object and does not update
        # the frame when pandas Copy-on-Write is active.
        df[column] = df[column].fillna(modes.iloc[0])

    return df

In [33]:
# Load the preprocessed test set and fill any remaining gaps with per-column modes
# (the modes are computed over all months, before the month filter below)
test_df = replace_missing_with_mode(pd.read_csv("results/updated_test_data.csv"))

# Keep only the month_6 rows
is_month_6 = test_df['date'] == 'month_6'
test_df = test_df[is_month_6]

# Class probabilities for every remaining row
test_probabilities = log_reg.predict_proba(test_df)
test_probabilities

array([[9.82506554e-03, 3.85750432e-05, 1.40399353e-02, ...,
        2.34921861e-03, 9.21443003e-01, 1.38961907e-05],
       [1.02449871e-02, 5.46093109e-01, 4.39886335e-03, ...,
        1.25310840e-03, 4.03561164e-01, 7.32896510e-05],
       [5.64930853e-01, 1.73378921e-14, 3.94951514e-03, ...,
        7.81195192e-02, 8.12498598e-02, 8.80301140e-07],
       ...,
       [1.79791263e-02, 2.06984084e-01, 8.47213947e-03, ...,
        3.14374561e-03, 7.16515482e-01, 7.99096318e-05],
       [2.22140495e-02, 7.41810207e-01, 2.50165663e-03, ...,
        3.05611073e-03, 1.90108222e-01, 3.09388105e-04],
       [1.03872036e-02, 4.56641043e-01, 5.28150014e-03, ...,
        9.66652280e-04, 4.99868540e-01, 7.85152817e-05]])

In [46]:
# One row per test row, one probability column per cluster label
cluster_probabilities_df = pd.DataFrame(data=test_probabilities, columns=log_reg.classes_)

# Attach the ids by position: test_df still carries its pre-filter index,
# so it is reset to 0..n-1 to line up with the probability rows
client_ids = test_df['id'].reset_index(drop=True)
cluster_probabilities_df['id'] = client_ids
cluster_probabilities_df.to_csv('results/sample_submission.csv', index=False, encoding='utf-8')
cluster_probabilities_df

Unnamed: 0,{other},{},"{α, β}","{α, γ}","{α, δ}","{α, ε, η}","{α, ε, θ}","{α, ε, ψ}","{α, ε}","{α, η}","{α, θ}","{α, λ}","{α, μ}","{α, π}","{α, ψ}",{α},{λ},id
0,0.009825,3.857504e-05,0.014040,0.027646,0.003922,0.000415,0.000462,0.000199,0.003167,0.008965,0.005587,0.000312,0.001595,0.000018,0.002349,0.921443,1.389619e-05,200000
1,0.010245,5.460931e-01,0.004399,0.010519,0.002681,0.000406,0.000487,0.000128,0.003497,0.011416,0.003362,0.000417,0.001421,0.000041,0.001253,0.403561,7.328965e-05,200001
2,0.564931,1.733789e-14,0.003950,0.189217,0.012915,0.003325,0.004981,0.006748,0.023666,0.003765,0.023499,0.002003,0.001592,0.000039,0.078120,0.081250,8.803011e-07,200002
3,0.024838,4.710185e-01,0.003606,0.009958,0.002600,0.000960,0.000459,0.000093,0.006418,0.033287,0.003878,0.000399,0.002937,0.000036,0.000713,0.438744,5.534539e-05,200003
4,0.042701,5.777475e-01,0.002485,0.008398,0.001208,0.001603,0.000763,0.000303,0.002313,0.088135,0.004523,0.000416,0.001988,0.000117,0.001302,0.265770,2.250203e-04,200004
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0.023574,1.994607e-01,0.017666,0.027742,0.008802,0.000275,0.002018,0.000273,0.007455,0.008702,0.006888,0.002320,0.001389,0.000093,0.001916,0.691166,2.602203e-04,299995
99996,0.020333,1.989606e-01,0.020152,0.024191,0.010095,0.000300,0.001845,0.000302,0.006066,0.005166,0.008237,0.003034,0.001635,0.000110,0.002156,0.697120,2.954727e-04,299996
99997,0.017979,2.069841e-01,0.008472,0.019465,0.004781,0.000353,0.000494,0.000189,0.005862,0.006863,0.006295,0.000456,0.002016,0.000053,0.003144,0.716515,7.990963e-05,299997
99998,0.022214,7.418102e-01,0.002502,0.011207,0.001427,0.001104,0.000960,0.000494,0.002816,0.015857,0.004202,0.001009,0.000784,0.000139,0.003056,0.190108,3.093881e-04,299998


In [41]:
# Inspect the id column of the filtered test frame (note the non-contiguous index)
test_df.loc[:, 'id']

2         200000
5         200001
8         200002
11        200003
13        200004
           ...  
290108    299995
290111    299996
290114    299997
290116    299998
290119    299999
Name: id, Length: 100000, dtype: int64