# Классификация методом RandomForest

Этот пример иллюстрирует классификацию методом RandomForest и оценку значимости признаков.

In [1]:
import os
from itertools import combinations

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

In [2]:
# Load the training and test tables from the task directory, in that order.
train_df, test_df = (
    pd.read_parquet(path)
    for path in ("Задание/train_data.pqt", "Задание/test_data.pqt")
)

In [3]:
# Preview five randomly drawn rows of the training table.
train_df.sample(n=5)

Unnamed: 0,id,date,balance_amt_avg,balance_amt_max,balance_amt_min,balance_amt_day_avg,channel_code,city,city_type,index_city_code,...,cnt_cred_g_oper_3m,cnt_days_cred_g_oper_3m,sum_deb_h_oper_3m,cnt_deb_h_oper_3m,cnt_days_deb_h_oper_3m,sum_cred_h_oper_3m,cnt_cred_h_oper_3m,cnt_days_cred_h_oper_3m,start_cluster,end_cluster
256211,85403,month_3,-0.156107,-0.204418,-0.124766,-0.155642,channel_code_5,city_49,city_type_0,index_city_code_40,...,0.944497,0.384773,-0.165588,0.546889,0.407687,-0.19171,0.251212,0.385529,{α},{α}
99462,33154,month_1,-0.155212,-0.203966,-0.125399,-0.154743,channel_code_4,city_35,city_type_0,index_city_code_32,...,,,-0.165588,,,-0.201123,,,{α},{α}
348821,116273,month_3,-0.156458,-0.204722,-0.125199,-0.155993,channel_code_8,city_16,city_type_0,index_city_code_26,...,,,-0.165588,,,-0.201123,,,{},{}
210844,70281,month_2,-0.156712,-0.204913,-0.125831,-0.156248,channel_code_9,city_114,city_type_0,,...,,,-0.165588,,,-0.201123,,,{},{}
437171,145723,month_3,-0.109141,-0.116772,-0.091355,-0.10853,channel_code_14,city_65,city_type_0,,...,0.944497,0.384773,-0.156434,0.547032,0.418798,-0.043728,0.252657,0.429485,{α},{α}


In [4]:
# Draw another five random rows of the training table.
train_df.sample(n=5)

Unnamed: 0,id,date,balance_amt_avg,balance_amt_max,balance_amt_min,balance_amt_day_avg,channel_code,city,city_type,index_city_code,...,cnt_cred_g_oper_3m,cnt_days_cred_g_oper_3m,sum_deb_h_oper_3m,cnt_deb_h_oper_3m,cnt_days_deb_h_oper_3m,sum_cred_h_oper_3m,cnt_cred_h_oper_3m,cnt_days_cred_h_oper_3m,start_cluster,end_cluster
124114,41371,month_2,,,,,channel_code_26,city_0,city_type_0,index_city_code_3,...,,,-0.165588,,,-0.201123,,,{},{}
558801,186267,month_1,-0.156411,-0.204687,-0.125084,-0.155947,channel_code_18,city_1608,city_type_0,index_city_code_136,...,0.944497,0.384773,-0.165588,0.546889,0.407687,-0.201123,0.250924,0.37454,"{α, ε}","{α, ε}"
349484,116494,month_3,,,,,channel_code_21,city_114,city_type_0,index_city_code_73,...,,,-0.165588,,,-0.201123,,,{},{}
556120,185373,month_2,-0.146654,-0.179566,-0.125464,-0.146159,channel_code_14,city_1526,city_type_1004,,...,0.944889,0.396267,-0.101665,0.55477,0.652131,-0.168612,0.256701,0.561353,{α},{α}
79683,26561,month_1,-0.126639,-0.166524,-0.123982,-0.126083,channel_code_8,city_0,city_type_0,index_city_code_20,...,0.94685,0.442244,-0.160631,0.548322,0.485465,-0.174919,0.252946,0.429485,{α},{α}


In [5]:
def write_to_file(data_list, accuracy):
    """Write the given pairs and the accuracy to ``files/output_<accuracy>.txt``.

    Args:
        data_list: Iterable of indexable items; each one is written on its
            own line as ``<item[0]>:<item[1]>``.
        accuracy: Score used in the file name (rounded to 4 decimal places)
            and, unrounded, in the trailing ``Accuracy:`` line. May be a
            built-in ``float`` or a NumPy scalar.
    """
    # The built-in round() works for plain floats as well as NumPy scalars,
    # whereas the ``.round`` method exists only on the latter.
    file_name = f'files/output_{round(accuracy, 4)}.txt'

    # Create the output directory on first use instead of failing in open().
    os.makedirs('files', exist_ok=True)

    # Explicit encoding keeps non-ASCII feature names portable across platforms.
    with open(file_name, 'w', encoding='utf-8') as file:
        file.writelines(f'{item[0]}:{item[1]}\n' for item in data_list)
        file.write(f'\nAccuracy: {accuracy}')

In [6]:
def generate_arrays(arr):
    """Yield leave-one-out, then leave-two-out, lists built from ``arr``.

    Each element of ``arr`` must be indexable; only its second field
    (``item[1]``) is emitted. Presumably ``arr`` is the ``ar_f`` list of
    ``[importance, column_name]`` pairs returned by ``create_model`` -- confirm
    against the caller.

    Items are dropped by position rather than by equality, so duplicate
    entries in ``arr`` are removed one at a time instead of all at once.

    Args:
        arr: Sequence of indexable items.

    Yields:
        First, for every position, the ``item[1]`` values of all other items;
        then, for every pair of positions, the ``item[1]`` values of the
        remaining items. Original order is preserved within each list.
    """
    names = [item[1] for item in arr]
    positions = range(len(names))

    # Phase 1: drop exactly one item.
    for skipped in positions:
        yield [name for pos, name in enumerate(names) if pos != skipped]

    # Phase 2: drop every unordered pair of items.
    for skipped_pair in combinations(positions, 2):
        yield [name for pos, name in enumerate(names) if pos not in skipped_pair]


def create_model(col):
    """Train a RandomForest on selected ``train_df`` columns and evaluate it.

    The target is binary: 1 when a row's ``start_cluster`` differs from its
    ``end_cluster``, otherwise 0. Features are taken from the module-level
    ``train_df`` and split 80/20 with a fixed seed.

    Args:
        col: Sequence of ``train_df`` column names to use as features.

    Returns:
        A tuple ``(ar_f, accuracy, probabilities)`` where

        * ``ar_f`` is a list of ``[importance, column_name]`` pairs with the
          importance rounded to 4 decimals, sorted in descending order;
        * ``accuracy`` is the accuracy score on the hold-out split;
        * ``probabilities`` is the ``predict_proba`` output for the hold-out
          rows.
    """
    # Encode features. Numeric columns get missing values replaced by the
    # column median; every other column (strings, categoricals, ...) is
    # integer-coded with pd.factorize, which marks missing values as -1.
    # NOTE(review): medians and codes are computed on the full table before
    # the split, so the hold-out rows influence the preprocessing.
    X = pd.DataFrame(index=train_df.index)
    for name in col:
        column = train_df[name]
        if pd.api.types.is_numeric_dtype(column):
            X[name] = column.fillna(column.median())
        else:
            X[name] = pd.factorize(column)[0]

    # Element-wise Python comparison: same result as a row-wise apply, without
    # materialising a Series for every row.
    Y = pd.Series(
        [
            int(start != end)
            for start, end in zip(train_df['start_cluster'], train_df['end_cluster'])
        ],
        index=train_df.index,
    )

    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

    # Build and fit the model; hyper-parameters can be tuned separately to
    # improve accuracy.
    # NOTE: the forest has no fixed random_state, so importances and accuracy
    # can vary between runs even though the split itself is seeded.
    model = RandomForestClassifier(n_estimators=100)
    model.fit(X_train, y_train)
    probabilities = model.predict_proba(X_test)
    accuracy = metrics.accuracy_score(y_test, model.predict(X_test))

    # Pair importances with X's own column labels (not ``col``) so the names
    # stay aligned even if ``col`` contains a duplicate.
    ar_f = sorted(
        (
            [round(importance, 4), name]
            for importance, name in zip(model.feature_importances_, X.columns)
        ),
        reverse=True,
    )
    return ar_f, accuracy, probabilities
    

In [7]:
# Alternative, wider feature set (currently disabled):
# ar_f, acc, probabilities = create_model(['date', 'balance_amt_avg', 'channel_code', 'city_type', 'ogrn_month', 'ogrn_year', 'ft_registration_date', 'max_founderpres', 'okved', 'segment', 'sum_of_paym_2m', 'sum_of_paym_1y', 'sum_a_oper_3m', 'sum_c_oper_3m', 'sum_cred_e_oper_3m', 'sum_cred_g_oper_3m', 'sum_cred_h_oper_3m', 'start_cluster'])
# Active run: start_cluster is the only feature.
ar_f, acc, probabilities = create_model(['start_cluster'])
# Save the importance table and the accuracy to files/output_<accuracy>.txt.
write_to_file(ar_f, acc)
# Last expression of the cell, so the notebook displays it.
ar_f, acc

([[1.0, 'start_cluster']], 0.6737666666666666)

In [8]:
# Display the predict_proba output for the hold-out rows returned by create_model.
probabilities

array([[0.54315841, 0.45684159],
       [0.70830998, 0.29169002],
       [0.58288822, 0.41711178],
       ...,
       [0.70830998, 0.29169002],
       [0.70830998, 0.29169002],
       [0.70830998, 0.29169002]])

In [9]:
# Re-display the importance table and the hold-out accuracy from the run above.
ar_f, acc

([[1.0, 'start_cluster']], 0.6737666666666666)