In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Считываем данные из файла
df = pd.read_parquet("Задание/train_data.pqt")
# col = ['balance_amt_avg', 'channel_code', 'city_type', 'ogrn_month', 'ogrn_year', 'ft_registration_date', 'max_founderpres', 'okved', 'segment', 'sum_of_paym_2m', 'sum_of_paym_1y', 'sum_a_oper_3m', 'sum_c_oper_3m', 'sum_cred_e_oper_3m', 'sum_cred_g_oper_3m', 'sum_cred_h_oper_3m', 'start_cluster']
col = ['start_cluster']
X = pd.DataFrame()
for i in col:
    if df[i].dtype.name != 'object':
        X[i]=df[i].copy()
        X.loc[X[i].isna(), i]=X[i].median()
    else:
        X[i]=pd.factorize(df[i])[0]
# Разделяем данные на признаки (X) и целевую переменную (y)
y = df['end_cluster']  # целевая переменная

# Разделяем данные на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Создаем модель случайного леса
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Обучаем модель на обучающих данных
model.fit(X_train, y_train)
probabilities = model.predict_proba(X_test)

# Создание DataFrame с вероятностями перехода в каждый кластер для каждой строчки
cluster_probabilities_df = pd.DataFrame(probabilities, columns=model.classes_)

# Добавление целевой переменной к DataFrame с вероятностями
cluster_probabilities_df['end_cluster'] = y_test.values

cluster_probabilities_df

Unnamed: 0,{other},{},"{α, β}","{α, γ}","{α, δ}","{α, ε, η}","{α, ε, θ}","{α, ε, ψ}","{α, ε}","{α, η}","{α, θ}","{α, λ}","{α, μ}","{α, π}","{α, ψ}",{α},{λ},end_cluster
0,0.031518,0.543116,0.006361,0.016867,0.001667,0.001221,0.000492,0.000077,0.003941,0.040917,0.005079,0.000491,0.001965,0.000000,0.001604,0.344486,0.000200,{}
1,0.019280,0.202818,0.012692,0.020549,0.004486,0.000425,0.000721,0.000103,0.005233,0.014770,0.005003,0.001247,0.002658,0.000032,0.001637,0.708224,0.000127,{α}
2,0.177704,0.031348,0.009159,0.006803,0.002805,0.000250,0.037276,0.000000,0.007534,0.004182,0.581317,0.002656,0.000264,0.000000,0.002276,0.136426,0.000000,"{α, θ}"
3,0.680016,0.009117,0.017573,0.123508,0.008177,0.002617,0.004342,0.002347,0.013115,0.030508,0.023167,0.002155,0.005410,0.000000,0.012411,0.065251,0.000285,{other}
4,0.019280,0.202818,0.012692,0.020549,0.004486,0.000425,0.000721,0.000103,0.005233,0.014770,0.005003,0.001247,0.002658,0.000032,0.001637,0.708224,0.000127,{α}
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119995,0.019280,0.202818,0.012692,0.020549,0.004486,0.000425,0.000721,0.000103,0.005233,0.014770,0.005003,0.001247,0.002658,0.000032,0.001637,0.708224,0.000127,{α}
119996,0.019280,0.202818,0.012692,0.020549,0.004486,0.000425,0.000721,0.000103,0.005233,0.014770,0.005003,0.001247,0.002658,0.000032,0.001637,0.708224,0.000127,{α}
119997,0.019280,0.202818,0.012692,0.020549,0.004486,0.000425,0.000721,0.000103,0.005233,0.014770,0.005003,0.001247,0.002658,0.000032,0.001637,0.708224,0.000127,{α}
119998,0.019280,0.202818,0.012692,0.020549,0.004486,0.000425,0.000721,0.000103,0.005233,0.014770,0.005003,0.001247,0.002658,0.000032,0.001637,0.708224,0.000127,{α}


In [21]:
cluster_probabilities_df.to_excel('result.xlsx')

Считаем точность модели и значимость каждого столбца

In [12]:
importances = random_forest.feature_importances_
indices = np.argsort(importances)[::-1]
ar_f=[]
for f, idx in enumerate(indices):
    ar_f.append([round(importances[idx],4), col[idx]])
print("Значимость признака:")
ar_f.sort(reverse=True)
print(ar_f)

# Предсказываем значения на тестовой выборке
y_pred = random_forest.predict(X_test)

# Оцениваем качество модели
accuracy = accuracy_score(y_test, y_pred)
"Accuracy:", accuracy

Значимость признака:
[[1.0, 'start_cluster']]


('Accuracy:', 0.6679916666666667)

In [7]:
test_df = pd.read_parquet("Задание/test_data.pqt")
test_df['start_cluster'].unique()

array(['{α}', None, '{other}', '{}', '{α, μ}', '{α, γ}', '{α, β}',
       '{α, ε, η}', '{α, η}', '{α, δ}', '{α, ε}', '{α, ψ}', '{α, θ}',
       '{α, λ}', '{α, ε, θ}', '{λ}', '{α, ε, ψ}', '{α, π}'], dtype=object)

In [10]:
X = pd.DataFrame()
for i in col:
    if test_df[i].dtype.name != 'object':
        X[i]=test_df[i].copy()
        X.loc[X[i].isna(), i]=X[i].median()
    else:
        X[i]=pd.factorize(test_df[i])[0]

random_forest.predict(X)
# probabilities = model.predict_proba(X_test)

array(['{α, γ}', '{α, γ}', '{α, γ}', ..., '{α, γ}', '{α, γ}', '{α, γ}'],
      dtype=object)

In [11]:
df = pd.DataFrame(probabilities, columns=['{α}', '{other}', '{}', '{α, μ}', '{α, γ}', '{α, β}',
       '{α, ε, η}', '{α, η}', '{α, δ}', '{α, ε}', '{α, ψ}', '{α, θ}',
       '{α, λ}', '{α, ε, θ}', '{λ}', '{α, ε, ψ}', '{α, π}'], index=range(1, len(df)+1))
df

Unnamed: 0,{α},{other},{},"{α, μ}","{α, γ}","{α, β}","{α, ε, η}","{α, η}","{α, δ}","{α, ε}","{α, ψ}","{α, θ}","{α, λ}","{α, ε, θ}",{λ},"{α, ε, ψ}","{α, π}"
1,0.031518,0.543116,0.006361,0.016867,0.001667,0.001221,0.000492,0.000077,0.003941,0.040917,0.005079,0.000491,0.001965,0.000000,0.001604,0.344486,0.000200
2,0.019280,0.202818,0.012692,0.020549,0.004486,0.000425,0.000721,0.000103,0.005233,0.014770,0.005003,0.001247,0.002658,0.000032,0.001637,0.708224,0.000127
3,0.177704,0.031348,0.009159,0.006803,0.002805,0.000250,0.037276,0.000000,0.007534,0.004182,0.581317,0.002656,0.000264,0.000000,0.002276,0.136426,0.000000
4,0.680016,0.009117,0.017573,0.123508,0.008177,0.002617,0.004342,0.002347,0.013115,0.030508,0.023167,0.002155,0.005410,0.000000,0.012411,0.065251,0.000285
5,0.019280,0.202818,0.012692,0.020549,0.004486,0.000425,0.000721,0.000103,0.005233,0.014770,0.005003,0.001247,0.002658,0.000032,0.001637,0.708224,0.000127
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119996,0.019280,0.202818,0.012692,0.020549,0.004486,0.000425,0.000721,0.000103,0.005233,0.014770,0.005003,0.001247,0.002658,0.000032,0.001637,0.708224,0.000127
119997,0.019280,0.202818,0.012692,0.020549,0.004486,0.000425,0.000721,0.000103,0.005233,0.014770,0.005003,0.001247,0.002658,0.000032,0.001637,0.708224,0.000127
119998,0.019280,0.202818,0.012692,0.020549,0.004486,0.000425,0.000721,0.000103,0.005233,0.014770,0.005003,0.001247,0.002658,0.000032,0.001637,0.708224,0.000127
119999,0.019280,0.202818,0.012692,0.020549,0.004486,0.000425,0.000721,0.000103,0.005233,0.014770,0.005003,0.001247,0.002658,0.000032,0.001637,0.708224,0.000127
