In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

### Примечание:
В итоговой выборке метрик нельзя использовать start_cluster, так как его нет в проверочных данных в 6ом месяце. Орентироваться на стартовый кластер бессмысленно

In [2]:
# Считываем данные из файла
df = pd.read_parquet("Задание/train_data.pqt")

# Выбираем метрики для модели
col = ['balance_amt_avg', 'channel_code', 'city_type', 'ogrn_month', 'ogrn_year', 'ft_registration_date', 'max_founderpres', 'okved', 'segment', 'sum_of_paym_2m', 'sum_of_paym_1y', 'sum_a_oper_3m', 'sum_c_oper_3m', 'sum_cred_e_oper_3m', 'sum_cred_g_oper_3m', 'sum_cred_h_oper_3m']
# col = ['start_cluster'] # Тестовая (чем меньше метрик, тем быстрее модель)

# Приводим данные к числовому виду
X = pd.DataFrame()
for i in col:
    if df[i].dtype.name != 'object':
        X[i]=df[i].copy()
        X.loc[X[i].isna(), i]=X[i].median()
    else:
        X[i]=pd.factorize(df[i])[0]

# Разделяем данные на признаки (X) и целевую переменную (y)
y = df['end_cluster']  # целевая переменная

# Разделяем данные на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Создаем и Обучаем модель случайного леса
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

#Считаем значимость метрик
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]
ar_f=[]
for f, idx in enumerate(indices):
    ar_f.append([round(importances[idx],4), col[idx]])
ar_f.sort(reverse=True)
for i in ar_f:
    print(i)

# Предсказываем значения на тестовой выборке
y_pred = model.predict(X_test)

# Оцениваем качество модели
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)




# Считываем данные из тестовой метрики
test_df = pd.read_parquet("Задание/test_data.pqt")
test_df = test_df[test_df.date == 'month_6']

# Преобразовываем тестовую таблицу к численным значениям
X = pd.DataFrame()
for i in col:
    if test_df[i].dtype.name != 'object':
        X[i]=test_df[i].copy()
        X.loc[X[i].isna(), i]=X[i].median()
    else:
        X[i]=pd.factorize(test_df[i])[0]

# Предсказываем значения
model.predict(X)
test_probabilities = model.predict_proba(X)

test_probabilities

[0.119, 'balance_amt_avg']
[0.1158, 'sum_cred_e_oper_3m']
[0.1033, 'ft_registration_date']
[0.0889, 'sum_of_paym_1y']
[0.0851, 'okved']
[0.076, 'sum_of_paym_2m']
[0.0649, 'sum_cred_h_oper_3m']
[0.0648, 'max_founderpres']
[0.0639, 'channel_code']
[0.059, 'ogrn_month']
[0.0375, 'ogrn_year']
[0.0301, 'sum_cred_g_oper_3m']
[0.0257, 'city_type']
[0.0239, 'segment']
[0.0228, 'sum_c_oper_3m']
[0.0192, 'sum_a_oper_3m']
Accuracy: 0.745725


array([[0.16, 0.02, 0.04, ..., 0.03, 0.33, 0.  ],
       [0.03, 0.34, 0.  , ..., 0.02, 0.49, 0.  ],
       [0.3 , 0.03, 0.02, ..., 0.03, 0.34, 0.  ],
       ...,
       [0.22, 0.04, 0.  , ..., 0.01, 0.48, 0.  ],
       [0.1 , 0.21, 0.03, ..., 0.  , 0.45, 0.  ],
       [0.01, 0.04, 0.  , ..., 0.  , 0.74, 0.  ]])

In [15]:
# Создание DataFrame с вероятностями перехода в каждый кластер для каждой строчки
cluster_probabilities_df = pd.DataFrame(test_probabilities, columns=model.classes_)

# Добавление целевой переменной к DataFrame с вероятностями
# cluster_probabilities_df['end_cluster'] = y_test.values
cluster_probabilities_df['id'] = test_df['id']
cluster_probabilities_df = cluster_probabilities_df[['id', '{other}', '{}', '{α, β}', '{α, γ}', '{α, δ}', '{α, ε, η}',
       '{α, ε, θ}', '{α, ε, ψ}', '{α, ε}', '{α, η}', '{α, θ}', '{α, λ}',
       '{α, μ}', '{α, π}', '{α, ψ}', '{α}', '{λ}']]
cluster_probabilities_df.head(9)
# probabilities = model.predict_proba(X_test)

Unnamed: 0,id,{other},{},"{α, β}","{α, γ}","{α, δ}","{α, ε, η}","{α, ε, θ}","{α, ε, ψ}","{α, ε}","{α, η}","{α, θ}","{α, λ}","{α, μ}","{α, π}","{α, ψ}",{α},{λ}
0,200000,0.16,0.02,0.04,0.22,0.01,0.01,0.01,0.0,0.05,0.08,0.03,0.01,0.0,0.0,0.03,0.33,0.0
1,200001,0.03,0.34,0.0,0.02,0.01,0.0,0.0,0.0,0.01,0.08,0.0,0.0,0.0,0.0,0.02,0.49,0.0
2,200002,0.3,0.03,0.02,0.18,0.04,0.0,0.0,0.01,0.04,0.01,0.0,0.0,0.0,0.0,0.03,0.34,0.0
3,200003,0.01,0.33,0.0,0.02,0.01,0.0,0.0,0.0,0.01,0.04,0.01,0.0,0.0,0.0,0.01,0.56,0.0
4,200004,0.03,0.3,0.02,0.06,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0.0,0.55,0.0
5,200005,0.21,0.04,0.02,0.15,0.05,0.0,0.03,0.01,0.01,0.05,0.04,0.01,0.0,0.0,0.03,0.35,0.0
6,200006,0.030042,0.397908,0.0,0.09,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.45205,0.0
7,200007,0.03,0.01,0.01,0.16,0.01,0.01,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.76,0.0
8,200008,0.03,0.28,0.0,0.02,0.0,0.0,0.01,0.0,0.01,0.06,0.03,0.0,0.0,0.0,0.0,0.56,0.0


In [16]:
cluster_probabilities_df.to_csv('results/sample_submission.csv', encoding='utf-8', index=False)

In [17]:
pd.read_csv('results/sample_submission.csv')

Unnamed: 0,id,{other},{},"{α, β}","{α, γ}","{α, δ}","{α, ε, η}","{α, ε, θ}","{α, ε, ψ}","{α, ε}","{α, η}","{α, θ}","{α, λ}","{α, μ}","{α, π}","{α, ψ}",{α},{λ}
0,200000,0.16,0.02,0.04,0.22,0.01,0.01,0.01,0.00,0.05,0.08,0.03,0.01,0.0,0.0,0.03,0.33,0.0
1,200001,0.03,0.34,0.00,0.02,0.01,0.00,0.00,0.00,0.01,0.08,0.00,0.00,0.0,0.0,0.02,0.49,0.0
2,200002,0.30,0.03,0.02,0.18,0.04,0.00,0.00,0.01,0.04,0.01,0.00,0.00,0.0,0.0,0.03,0.34,0.0
3,200003,0.01,0.33,0.00,0.02,0.01,0.00,0.00,0.00,0.01,0.04,0.01,0.00,0.0,0.0,0.01,0.56,0.0
4,200004,0.03,0.30,0.02,0.06,0.00,0.00,0.00,0.00,0.00,0.00,0.04,0.00,0.0,0.0,0.00,0.55,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,299995,0.00,0.37,0.00,0.00,0.01,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.0,0.0,0.01,0.61,0.0
99996,299996,0.02,0.04,0.01,0.09,0.01,0.00,0.00,0.00,0.00,0.02,0.00,0.00,0.0,0.0,0.00,0.81,0.0
99997,299997,0.22,0.04,0.00,0.17,0.07,0.00,0.00,0.00,0.00,0.00,0.01,0.00,0.0,0.0,0.01,0.48,0.0
99998,299998,0.10,0.21,0.03,0.10,0.01,0.01,0.00,0.00,0.00,0.08,0.01,0.00,0.0,0.0,0.00,0.45,0.0


In [13]:
test_df.index = range(test_df.shape[0])
test_df

Unnamed: 0,id,date,balance_amt_avg,balance_amt_max,balance_amt_min,balance_amt_day_avg,channel_code,city,city_type,index_city_code,...,sum_cred_g_oper_3m,cnt_cred_g_oper_3m,cnt_days_cred_g_oper_3m,sum_deb_h_oper_3m,cnt_deb_h_oper_3m,cnt_days_deb_h_oper_3m,sum_cred_h_oper_3m,cnt_cred_h_oper_3m,cnt_days_cred_h_oper_3m,start_cluster
0,200000,month_6,0.045988,0.049418,-0.125995,0.047079,channel_code_12,city_14,city_type_0,,...,0.006812,0.945281,0.396267,-0.152800,0.549468,0.541020,0.387566,0.268543,0.836079,
1,200001,month_6,-0.156722,-0.204920,-0.125856,-0.156258,channel_code_9,city_76,city_type_0,,...,-0.028584,,,-0.165588,,,-0.201123,,,
2,200002,month_6,-0.048015,0.448252,-0.125995,-0.047215,channel_code_12,city_14,city_type_0,index_city_code_78,...,0.123154,0.946850,0.453739,2.614870,0.565087,0.818798,4.449125,0.258723,0.627287,
3,200003,month_6,-0.156579,-0.204813,-0.125501,-0.156115,channel_code_7,city_31,city_type_0,,...,-0.028584,,,-0.165588,,,-0.201123,,,
4,200004,month_6,-0.153379,-0.201932,-0.125995,-0.154155,channel_code_7,,,,...,-0.027573,0.944889,0.396267,-0.165324,0.547032,0.418798,-0.201123,0.250924,0.374540,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,299995,month_6,-0.153707,-0.202806,-0.125498,-0.153234,channel_code_14,city_22,city_type_0,index_city_code_29,...,-0.028584,,,-0.165588,,,-0.201123,,,
99996,299996,month_6,-0.154929,-0.197878,-0.125873,-0.154459,channel_code_1,city_96,city_type_0,index_city_code_66,...,-0.028584,0.944497,0.384773,-0.155776,0.549755,0.507687,-0.191186,0.252657,0.440474,
99997,299997,month_6,-0.105294,-0.141429,-0.104590,-0.104671,channel_code_17,city_85,city_type_0,index_city_code_103,...,-0.028584,0.944497,0.384773,0.087901,0.551904,0.685465,0.090620,0.258723,0.594320,
99998,299998,month_6,-0.155350,-0.203711,-0.125995,-0.155980,channel_code_9,city_25,city_type_0,index_city_code_30,...,-0.028584,0.944497,0.384773,-0.165588,0.546889,0.407687,-0.201123,0.250924,0.374540,
