# Классификация методом RandomForest

Этот пример иллюстрирует классификацию методом случайного леса (RandomForest) и оценку значимости признаков: модель предсказывает, изменится ли кластер клиента (`start_cluster` ≠ `end_cluster`).

In [1]:
from itertools import combinations
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

In [2]:
# Locations of the Parquet files holding the train and test splits.
TRAIN_PATH = "Задание/train_data.pqt"
TEST_PATH = "Задание/test_data.pqt"

# Load both splits into DataFrames; later cells read `train_df` as a global.
train_df = pd.read_parquet(TRAIN_PATH)
test_df = pd.read_parquet(TEST_PATH)

In [5]:
# Preview the first 10 rows of the training frame (shown as the cell output).
train_df.head(10)

Unnamed: 0,id,date,balance_amt_avg,balance_amt_max,balance_amt_min,balance_amt_day_avg,channel_code,city,city_type,index_city_code,...,cnt_cred_g_oper_3m,cnt_days_cred_g_oper_3m,sum_deb_h_oper_3m,cnt_deb_h_oper_3m,cnt_days_deb_h_oper_3m,sum_cred_h_oper_3m,cnt_cred_h_oper_3m,cnt_days_cred_h_oper_3m,start_cluster,end_cluster
0,0,month_1,0.744845,0.705492,1.287207,0.748101,channel_code_5,city_23,city_type_0,index_city_code_39,...,0.951166,0.568681,0.897565,0.553624,0.774354,0.936506,0.295984,0.967947,"{α, γ}",{other}
1,0,month_2,1.049605,0.831916,2.458609,1.053805,channel_code_5,city_23,city_type_0,index_city_code_39,...,0.948812,0.499716,0.785029,0.551904,0.696576,0.990157,0.298873,0.945969,"{α, γ}",{other}
2,0,month_3,0.692653,0.740253,0.430042,0.695747,channel_code_5,city_23,city_type_0,index_city_code_39,...,0.946458,0.442244,0.87705,0.551044,0.663243,0.810065,0.294829,0.956958,"{α, γ}",{other}
3,1,month_1,-0.081586,-0.09186,-0.11404,-0.08089,channel_code_2,city_14,city_type_0,,...,0.945281,0.407762,0.369318,0.567093,0.785465,-0.184002,0.253523,0.462452,{other},{other}
4,1,month_2,-0.094962,-0.100504,-0.119302,-0.094307,channel_code_2,city_14,city_type_0,,...,0.946066,0.43075,0.067275,0.559928,0.696576,-0.183854,0.255545,0.495419,{other},{other}
5,1,month_3,-0.090605,-0.114275,-0.114119,-0.089937,channel_code_2,city_14,city_type_0,,...,0.948027,0.488221,0.043221,0.560788,0.707687,-0.167905,0.259011,0.605309,{other},{other}
6,2,month_1,-0.154685,-0.186795,-0.122805,-0.154215,channel_code_12,city_613,city_type_306,,...,0.944497,0.384773,-0.165588,0.546889,0.407687,-0.178674,0.252657,0.440474,{α},{α}
7,2,month_2,-0.152784,-0.193686,-0.122805,-0.152308,channel_code_12,city_613,city_type_306,,...,0.944497,0.384773,-0.165588,0.546889,0.407687,-0.177854,0.252657,0.440474,{α},{α}
8,2,month_3,-0.148737,-0.187003,-0.112416,-0.148249,channel_code_12,city_613,city_type_306,,...,0.944497,0.384773,-0.165588,0.546889,0.407687,-0.176302,0.252368,0.429485,{α},{α}
9,3,month_1,-0.156643,-0.204861,-0.12566,-0.156179,channel_code_14,city_21,city_type_0,index_city_code_46,...,0.944497,0.384773,-0.165588,0.546889,0.407687,-0.201123,0.250924,0.37454,{α},{α}


In [4]:
# Show 5 randomly drawn rows of the training frame. No seed is passed, so the
# rows displayed differ from run to run.
train_df.sample(5)

Unnamed: 0,id,date,balance_amt_avg,balance_amt_max,balance_amt_min,balance_amt_day_avg,channel_code,city,city_type,index_city_code,...,cnt_cred_g_oper_3m,cnt_days_cred_g_oper_3m,sum_deb_h_oper_3m,cnt_deb_h_oper_3m,cnt_days_deb_h_oper_3m,sum_cred_h_oper_3m,cnt_cred_h_oper_3m,cnt_days_cred_h_oper_3m,start_cluster,end_cluster
124114,41371,month_2,,,,,channel_code_26,city_0,city_type_0,index_city_code_3,...,,,-0.165588,,,-0.201123,,,{},{}
558801,186267,month_1,-0.156411,-0.204687,-0.125084,-0.155947,channel_code_18,city_1608,city_type_0,index_city_code_136,...,0.944497,0.384773,-0.165588,0.546889,0.407687,-0.201123,0.250924,0.37454,"{α, ε}","{α, ε}"
349484,116494,month_3,,,,,channel_code_21,city_114,city_type_0,index_city_code_73,...,,,-0.165588,,,-0.201123,,,{},{}
556120,185373,month_2,-0.146654,-0.179566,-0.125464,-0.146159,channel_code_14,city_1526,city_type_1004,,...,0.944889,0.396267,-0.101665,0.55477,0.652131,-0.168612,0.256701,0.561353,{α},{α}
79683,26561,month_1,-0.126639,-0.166524,-0.123982,-0.126083,channel_code_8,city_0,city_type_0,index_city_code_20,...,0.94685,0.442244,-0.160631,0.548322,0.485465,-0.174919,0.252946,0.429485,{α},{α}


In [5]:
def write_to_file(data_list, accuracy):
    """Write feature importances and the model accuracy to a text report.

    The report is saved as ``files/output_<accuracy rounded to 4 places>.txt``.
    Each entry of ``data_list`` becomes one ``<item[0]>:<item[1]>`` line,
    followed by a blank line and an ``Accuracy: <accuracy>`` line.

    Args:
        data_list: Iterable of two-element sequences, e.g. the
            ``[importance, feature_name]`` pairs returned by ``create_model``.
        accuracy: Accuracy score; a plain ``float`` or a NumPy float scalar.
    """
    # The builtin round() works for Python floats and NumPy scalars alike,
    # whereas the ``.round`` method exists only on NumPy scalars.
    rounded = round(float(accuracy), 4)
    out_path = Path("files") / f"output_{rounded}.txt"

    # Create the output directory on first use instead of failing in open().
    out_path.parent.mkdir(parents=True, exist_ok=True)

    # Explicit UTF-8 keeps non-ASCII feature names portable across platforms.
    with out_path.open("w", encoding="utf-8") as file:
        file.writelines(f"{item[0]}:{item[1]}\n" for item in data_list)
        file.write(f"\nAccuracy: {accuracy}")

In [6]:
def generate_arrays(arr):
    """Yield feature-name subsets with one, then two, entries left out.

    Args:
        arr: Sequence of two-element items whose second element is the
            feature name, e.g. ``[importance, name]`` pairs.

    Yields:
        Lists of names (``item[1]``) in the original order: first every
        leave-one-out subset, then every leave-two-out subset. Entries are
        excluded by position, so exactly one (or two) entries are dropped
        even when ``arr`` contains equal items.
    """
    names = [item[1] for item in arr]
    positions = range(len(names))

    # Leave-one-out subsets.
    for skipped in positions:
        yield [name for pos, name in enumerate(names) if pos != skipped]

    # Leave-two-out subsets, one per unordered pair of positions.
    for first, second in combinations(positions, 2):
        yield [
            name
            for pos, name in enumerate(names)
            if pos != first and pos != second
        ]


def create_model(col, *, data=None, random_state=42):
    """Train a random forest predicting whether a row's cluster changes.

    The target is 1 when ``start_cluster`` differs from ``end_cluster`` and 0
    otherwise. Non-object feature columns have missing values replaced by the
    column median; object columns are integer-encoded with ``pd.factorize``.
    The data is split 80/20 into train and hold-out parts.

    Args:
        col: Sequence of column names to use as features.
        data: DataFrame holding the feature columns plus ``start_cluster``
            and ``end_cluster``. Defaults to the module-level ``train_df``.
        random_state: Seed used for both the train/test split and the forest,
            so repeated runs give the same result.

    Returns:
        A tuple ``(ar_f, accuracy, probabilities)``:
        ``ar_f`` is a list of ``[importance rounded to 4 places, name]``
        pairs sorted in descending order, ``accuracy`` is the accuracy on the
        hold-out part, and ``probabilities`` is the ``predict_proba`` output
        for the hold-out part.

    Raises:
        ValueError: If ``col`` is empty.
    """
    if data is None:
        data = train_df
    if len(col) == 0:
        raise ValueError("col must contain at least one feature name")

    # Encode features: median-fill non-object columns, factorize object ones.
    features = {}
    for name in col:
        column = data[name]
        if column.dtype.name != 'object':
            features[name] = column.fillna(column.median())
        else:
            features[name] = pd.factorize(column)[0]
    # Anchor the frame to the source index so Series and plain arrays line up.
    X = pd.DataFrame(features, index=data.index)

    # Element-wise Python comparison gives the same result as a row-wise
    # apply(axis=1) without building a Series per row.
    Y = pd.Series(
        [
            int(start != end)
            for start, end in zip(data['start_cluster'], data['end_cluster'])
        ],
        index=data.index,
    )

    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=0.2, random_state=random_state
    )

    # Fit the model; hyper-parameters could be tuned separately for accuracy.
    model = RandomForestClassifier(n_estimators=100, random_state=random_state)
    model.fit(X_train, y_train)
    probabilities = model.predict_proba(X_test)

    # Pair each importance with its column and order from most to least
    # important (ties fall back to the name).
    ar_f = sorted(
        (
            [round(importance, 4), name]
            for importance, name in zip(model.feature_importances_, X.columns)
        ),
        reverse=True,
    )
    accuracy = metrics.accuracy_score(y_test, model.predict(X_test))
    return ar_f, accuracy, probabilities
    

In [7]:
# Wider feature set from an earlier experiment, kept for reference. create_model
# returns three values, so the call must unpack all three if it is re-enabled:
# ar_f, acc, probabilities = create_model(['date', 'balance_amt_avg', 'channel_code', 'city_type', 'ogrn_month', 'ogrn_year', 'ft_registration_date', 'max_founderpres', 'okved', 'segment', 'sum_of_paym_2m', 'sum_of_paym_1y', 'sum_a_oper_3m', 'sum_c_oper_3m', 'sum_cred_e_oper_3m', 'sum_cred_g_oper_3m', 'sum_cred_h_oper_3m', 'start_cluster'])
# Current run: train on `start_cluster` alone and save the report to disk.
ar_f, acc, probabilities = create_model(['start_cluster'])
write_to_file(ar_f, acc)
# Display the feature importances and the accuracy as the cell output.
ar_f, acc

([[1.0, 'start_cluster']], 0.6737666666666666)

In [8]:
# Display the predict_proba output returned by create_model for the hold-out rows.
probabilities

array([[0.54315841, 0.45684159],
       [0.70830998, 0.29169002],
       [0.58288822, 0.41711178],
       ...,
       [0.70830998, 0.29169002],
       [0.70830998, 0.29169002],
       [0.70830998, 0.29169002]])

In [9]:
# Display the feature importances and the accuracy again.
ar_f, acc

([[1.0, 'start_cluster']], 0.6737666666666666)