In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
# Считываем данные из файла
df = pd.read_parquet("Задание/train_data.pqt")

# Выбираем метрики для модели
col = ['balance_amt_avg', 'channel_code', 'city_type', 'ogrn_month', 'ogrn_year', 'ft_registration_date', 'max_founderpres', 'okved', 'segment', 'sum_of_paym_2m', 'sum_of_paym_1y', 'sum_a_oper_3m', 'sum_c_oper_3m', 'sum_cred_e_oper_3m', 'sum_cred_g_oper_3m', 'sum_cred_h_oper_3m', 'start_cluster']
# col = ['start_cluster'] # Тестовая (чем меньше метрик, тем быстрее модель)

# Приводим данные к числовому виду
X = pd.DataFrame()
for i in col:
    if df[i].dtype.name != 'object':
        X[i]=df[i].copy()
        X.loc[X[i].isna(), i]=X[i].median()
    else:
        X[i]=pd.factorize(df[i])[0]

# Разделяем данные на признаки (X) и целевую переменную (y)
y = df['end_cluster']  # целевая переменная

# Разделяем данные на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Создаем и Обучаем модель случайного леса
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

#Считаем значимость метрик
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]
ar_f=[]
for f, idx in enumerate(indices):
    ar_f.append([round(importances[idx],4), col[idx]])
ar_f.sort(reverse=True)
for i in ar_f:
    print(i)

# Предсказываем значения на тестовой выборке
y_pred = model.predict(X_test)

# Оцениваем качество модели
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)




# Считываем данные из тестовой метрики
test_df = pd.read_parquet("Задание/test_data.pqt")

# Преобразовываем тестовую таблицу к численным значениям
X = pd.DataFrame()
for i in col:
    if test_df[i].dtype.name != 'object':
        X[i]=test_df[i].copy()
        X.loc[X[i].isna(), i]=X[i].median()
    else:
        X[i]=pd.factorize(test_df[i])[0]

# Предсказываем значения
model.predict(X)
test_probabilities = model.predict_proba(X)

test_probabilities

[0.2038, 'start_cluster']
[0.1023, 'balance_amt_avg']
[0.0967, 'sum_cred_e_oper_3m']
[0.086, 'ft_registration_date']
[0.0654, 'sum_of_paym_1y']
[0.0653, 'okved']
[0.0565, 'sum_of_paym_2m']
[0.0525, 'max_founderpres']
[0.0505, 'channel_code']
[0.0479, 'sum_cred_h_oper_3m']
[0.0471, 'ogrn_month']
[0.031, 'ogrn_year']
[0.0221, 'city_type']
[0.0216, 'sum_cred_g_oper_3m']
[0.0194, 'segment']
[0.0176, 'sum_c_oper_3m']
[0.0144, 'sum_a_oper_3m']
Accuracy: 0.7898333333333334


array([[0.34, 0.03, 0.01, ..., 0.  , 0.17, 0.  ],
       [0.29, 0.02, 0.01, ..., 0.  , 0.18, 0.  ],
       [0.21, 0.02, 0.03, ..., 0.  , 0.16, 0.  ],
       ...,
       [0.19, 0.03, 0.04, ..., 0.  , 0.27, 0.  ],
       [0.11, 0.03, 0.03, ..., 0.  , 0.25, 0.  ],
       [0.09, 0.04, 0.  , ..., 0.  , 0.26, 0.  ]])

In [13]:
train_df.start_cluster.unique()

NameError: name 'train_df' is not defined

In [12]:
# Создание DataFrame с вероятностями перехода в каждый кластер для каждой строчки
cluster_probabilities_df = pd.DataFrame(test_probabilities, columns=model.classes_)

# Добавление целевой переменной к DataFrame с вероятностями
# cluster_probabilities_df['end_cluster'] = y_test.values
cluster_probabilities_df['id'] = test_df['id']
cluster_probabilities_df = cluster_probabilities_df[[]]
# probabilities = model.predict_proba(X_test)

Unnamed: 0,{other},{},"{α, β}","{α, γ}","{α, δ}","{α, ε, η}","{α, ε, θ}","{α, ε, ψ}","{α, ε}","{α, η}","{α, θ}","{α, λ}","{α, μ}","{α, π}","{α, ψ}",{α},{λ},id
0,0.340000,0.030000,0.010000,0.430000,0.000000,0.0,0.0,0.000000,0.010000,0.010000,0.000000,0.0,0.00,0.0,0.00,0.170000,0.0,200000
1,0.290000,0.020000,0.010000,0.460000,0.000000,0.0,0.0,0.000000,0.020000,0.020000,0.000000,0.0,0.00,0.0,0.00,0.180000,0.0,200000
2,0.210000,0.020000,0.030000,0.550000,0.000000,0.0,0.0,0.000000,0.010000,0.020000,0.000000,0.0,0.00,0.0,0.00,0.160000,0.0,200000
3,0.251333,0.220000,0.010000,0.175333,0.000000,0.0,0.0,0.000000,0.010000,0.020000,0.020000,0.0,0.00,0.0,0.01,0.283333,0.0,200001
4,0.251333,0.230000,0.010000,0.175333,0.000000,0.0,0.0,0.000000,0.010000,0.020000,0.020000,0.0,0.00,0.0,0.01,0.273333,0.0,200001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
290115,0.057744,0.262486,0.009193,0.036357,0.013477,0.0,0.0,0.004286,0.004659,0.074165,0.004745,0.0,0.00,0.0,0.00,0.532889,0.0,299998
290116,0.121818,0.110000,0.040000,0.325455,0.000000,0.0,0.0,0.000000,0.010000,0.010000,0.000000,0.0,0.01,0.0,0.00,0.372727,0.0,299998
290117,0.190000,0.030000,0.040000,0.460000,0.000000,0.0,0.0,0.000000,0.010000,0.000000,0.000000,0.0,0.00,0.0,0.00,0.270000,0.0,299999
290118,0.110000,0.030000,0.030000,0.570000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.010000,0.0,0.00,0.0,0.00,0.250000,0.0,299999


In [11]:
cluster_probabilities_df.to_csv('results/sample_submission.csv', encoding='utf-8', index=False)