# Определение уровня навыков игрока на основе анализа результатов бизнес-симуляции

## Московская область. Региональный чемпионат

### 27 сентября — 21 октября 2022

## Задача

### Условие задачи

Разработать и обучить модель машинного обучения на основе метаданных поведения пользователя в игре (лог принятых решений
пользователей в играх) и его итоговой экспертной оценки. В будущем это позволит проставлять оценки участникам симуляции без привлечения экспертов.


### Описание входных значений

    user.csv — таблица, где хранится информация об участнике
    ● user_id — уникальный номер участника
    ● game_id — уникальный номер игры, в которой принимали участие
    ● team_id — уникальный номер команды участника
    user_decision.csv — таблица с действиями игрока за время игры
    ● user_id — уникальный номер участника
    ● period_id — номер периода игры (стадии)
    ● decision_id — номер действия, которое совершил участник
    team_point.csv — таблица с баллами команд
    ● team_id — уникальный номер команды
    ● category_id — номер категории, с точки зрения которой оцениваются команды
    ● period_id — номер периода игры (стадии)
    ● score — баллы команды
    ● place — рейтинговый номер команды в конкретном периоде
    decision — таблица с расшифровкой действий участников
    ● decision_id — уникальный номер действия
    ● decision_name — имя действия
    train.csv — файл, содержащий данные игроков для тренировки
    sample_solution.csv — пример файла для отправки, содержит id игроков, для которых требуется предсказать значения


### Метрика
В качестве метрики выступает сумма Recall по четырем навыкам.

## Загрузка библиотек и данных

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
import numpy as np
from sklearn.metrics import recall_score, precision_score

%matplotlib inline

In [2]:
# Load the competition tables from the working directory. Every table is a
# CSV file except the decision dictionary, which is read from an Excel workbook.
df_train, df_user, df_user_dec, df_point, df_submission = (
    pd.read_csv(file_name)
    for file_name in (
        "train.csv",
        "user.csv",
        "user_decision.csv",
        "team_point.csv",
        "sample_solution.csv",
    )
)
df_dec = pd.read_excel("decision.xlsx")

## Исследование данных

In [3]:
# EDA

def eda(df):
    """Print a quick overview of a dataframe.

    Prints, in order: the first rows, the ``info()`` summary, the
    ``describe()`` statistics and the per-column number of unique values.

    Args:
        df: pandas DataFrame to summarise.

    Returns:
        None. Everything is written to standard output.
    """
    print(df.head())
    print('\n info()')
    # DataFrame.info() writes its report itself and returns None, so it must
    # not be wrapped in print() - that would emit a stray "None" line.
    df.info()
    print('\n describe()')
    print(df.describe())
    print('\n nunique()')
    print(df.nunique())

def EDA(data):
    """Print an extended text report about a dataframe.

    The report has six sections, each followed by a dashed rule: shape,
    column names, dtypes, ``describe()`` statistics, number of unique values
    in the object-typed columns, and percentage of missing values per column.

    Args:
        data: pandas DataFrame to describe.

    Returns:
        None. The report is written to standard output.
    """
    n_rows, n_cols = data.shape

    print(f'\nКоличество строк: {n_rows} \nКоличество столбцов:  {n_cols}\n',
          '\n------------------------------------------------------------------------------------\n')

    column_names = list(data.columns)
    print(f'Названия столбцов: {column_names}\n',
          '\n------------------------------------------------------------------------------------\n')

    print('Типы данных:\n')
    column_types = data.dtypes
    print(column_types,
          '\n\n------------------------------------------------------------------------------------\n')

    print('Статистика для численных данных:\n')
    numeric_summary = data.describe()
    print(numeric_summary,
          '\n\n------------------------------------------------------------------------------------\n')

    print('Количество уникальных значений для категориальных данных:\n')
    # Only object-typed columns are treated as categorical here.
    object_columns = data.columns[(data.dtypes == object)]
    unique_counts = data[object_columns].apply(lambda column: column.nunique())
    print(unique_counts,
          '\n\n------------------------------------------------------------------------------------\n')

    print('Пропущенные значения:\n')
    # Share of nulls per column, as a percentage rounded to two decimals.
    missing_share = data.isnull().sum() / n_rows
    missing_percent = round(100 * missing_share, 2).apply(lambda value: str(value) + '%')
    print(missing_percent,
          '\n\n------------------------------------------------------------------------------------\n')

In [4]:
# Print the overview of every loaded table, with a ruler line and a blank line after each.
for frame in (df_user, df_user_dec, df_dec, df_point, df_train, df_submission):
    eda(frame)
    print('===============================================\n')

   user_id  team_id  game_id
0    10189     1664      235
1    10018     1690      237
2    10580     1394      194
3    10600     1908      262
4    10112     1592      219

 info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 973 entries, 0 to 972
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   user_id  973 non-null    int64
 1   team_id  973 non-null    int64
 2   game_id  973 non-null    int64
dtypes: int64(3)
memory usage: 22.9 KB
None

 describe()
            user_id      team_id     game_id
count    973.000000   973.000000  973.000000
mean   10487.000000  1616.429599  224.618705
std      281.025206   179.501299   22.920859
min    10001.000000  1248.000000  182.000000
25%    10244.000000  1474.000000  206.000000
50%    10487.000000  1654.000000  234.000000
75%    10730.000000  1758.000000  243.000000
max    10973.000000  1926.000000  264.000000

 nunique()
user_id    973
team_id    214
game_id     43
dtype: int64



In [5]:
#EDA(df_point)

In [6]:
#df_point[df_point['team_id'] == 1950]

In [7]:
#len(set(df_user_dec.user_id) ^ set(df_user.user_id))

In [8]:
#df_user_dec.user_id.sort_values(), df_user.user_id.sort_values()

In [9]:
#display(df_user, df_user_dec, df_dec, df_point, df_train, df_submission)

## Подготовка данных

### Предобработка df_dec

In [10]:
df_dec[pd.notnull(df_dec['Unnamed: 2'])]

Unnamed: 0,decision_id,decision_name,Unnamed: 2
309,310,Implementation of ICS &quot,"\n311"",""Opening of first-aid station"""


In [11]:
df_dec.iloc[309]

decision_id                                        310
decision_name              Implementation of ICS &quot
Unnamed: 2       \n311","Opening of first-aid station"
Name: 309, dtype: object

In [12]:
# Row 309 of the decision table is malformed: the record for decision 311 ended
# up inside the 'Unnamed: 2' cell of decision 310. Re-create it as a proper row.
# The spare column gets a real missing value (np.nan) instead of the string
# 'NaN', which pd.notnull() would treat as data. The membership check keeps the
# cell safe to re-run without appending a duplicate row.
new_row = {'decision_id':311, 'decision_name':'Opening of first-aid station', 'Unnamed: 2':np.nan}
if 311 not in df_dec['decision_id'].values:
    df_dec.loc[len(df_dec)] = new_row

In [13]:
# Overwrite the malformed row 309 with a clean record for decision 310:
# the name loses its trailing '&quot' fragment and the spare column is emptied.
new_row = {'decision_id':310, 'decision_name':'Implementation of ICS', 'Unnamed: 2':np.nan}
df_dec.loc[309] = new_row

In [14]:
# Remove the spare 'Unnamed: 2' column from the decision table.
df_dec = df_dec.drop('Unnamed: 2', axis=1)
df_dec

Unnamed: 0,decision_id,decision_name
0,1,Dividend payout in cash
1,2,Dividend payout in stocks
2,3,Zero dividend policy
3,4,The distribution of the residue after the rein...
4,5,Fixed size of dividend payouts sum
...,...,...
433,435,The volume of of transportation - National com...
434,436,The volume of of transportation - Worldwide co...
435,437,Liquidation equipment
436,438,Redeem production room


In [15]:
df_user_dec[df_user_dec['decision_id'] == 311]

Unnamed: 0,user_id,period,decision_id
306,10948,1,311
325,10929,1,311
513,10208,2,311
961,10583,3,311
992,10609,2,311
...,...,...,...
58239,10735,4,311
58271,10423,3,311
58342,10382,4,311
59002,10665,2,311


In [16]:
lst_dec = df_dec['decision_name'].tolist()
lst_dec

['Dividend payout in cash',
 'Dividend payout in stocks',
 'Zero dividend policy',
 'The distribution of the residue after the reinvestment',
 'Fixed size of dividend payouts sum',
 'Minimal fixed + % by the results of a period',
 'Fixed % from net profit',
 'Policty of constant growth of dividend payoutsвЂ™s value',
 'Policy of constant growth of dividend payoutsвЂ™ value (Growth of payouts) (buttons with percentage)',
 'Bicycles Production line 10000 units',
 'Bicycles Production line 20000 units',
 'Bicycles Production line 40000 units',
 'Quadrocopters Production line 5000 units',
 'Quadrocopters Production line 10000 units',
 'Quadrocopters Production line 20000 units',
 'Hoverbikes Production line 1250 units',
 'Hoverbikes Production line 2500 units',
 'Hoverbikes Production line 5000 units',
 'Equipment leasing',
 'The lease term. half-year',
 'Start of a project',
 'Venture fund investments in exchange for patents',
 'Fundraising and kickstarter',
 'Pre-orders',
 'Create a team

### df_user_dec . period > sum_user_dec

In [17]:
df_user_dec

Unnamed: 0,user_id,period,decision_id
0,10625,1,409
1,10318,3,203
2,10775,4,420
3,10236,2,284
4,10130,3,72
...,...,...,...
60223,10312,3,329
60224,10189,1,33
60225,10346,3,123
60226,10424,2,140


In [18]:
p_count = df_user_dec.groupby(['user_id', 'period']).count()
p_count.columns

Index(['decision_id'], dtype='object')

In [19]:
# One-hot encode the period: one indicator column per period value.
gb_user_dec = pd.get_dummies(data=df_user_dec, columns=['period'])
gb_user_dec

Unnamed: 0,user_id,decision_id,period_1,period_2,period_3,period_4
0,10625,409,1,0,0,0
1,10318,203,0,0,1,0
2,10775,420,0,0,0,1
3,10236,284,0,1,0,0
4,10130,72,0,0,1,0
...,...,...,...,...,...,...
60223,10312,329,0,0,1,0
60224,10189,33,1,0,0,0
60225,10346,123,0,0,1,0
60226,10424,140,0,1,0,0


In [20]:
gb_user_dec[gb_user_dec['user_id'] == 10625]

Unnamed: 0,user_id,decision_id,period_1,period_2,period_3,period_4
0,10625,409,1,0,0,0
33,10625,67,0,0,1,0
162,10625,148,0,1,0,0
219,10625,176,1,0,0,0
228,10625,146,0,1,0,0
...,...,...,...,...,...,...
58880,10625,408,1,0,0,0
58907,10625,127,0,0,1,0
58936,10625,407,0,0,1,0
59693,10625,109,0,1,0,0


In [21]:
# Collapse the one-hot rows to one row per user: each period_N column becomes
# the number of decisions the user made in that period. decision_id is summed
# as well, which produces a meaningless number.
sum_user_dec = gb_user_dec.groupby('user_id').sum()
sum_user_dec

Unnamed: 0_level_0,decision_id,period_1,period_2,period_3,period_4
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10001,13677,29,23,39,15
10002,11931,0,9,25,22
10003,34362,21,41,60,84
10004,18534,3,19,41,27
10005,12423,14,34,31,0
...,...,...,...,...,...
10969,3578,0,1,5,8
10970,20701,11,22,32,0
10971,19457,22,26,12,25
10972,1713,0,0,0,4


In [22]:
# sum_user_dec['period_sum'] = sum_user_dec['period_1'] + sum_user_dec['period_2'] + sum_user_dec['period_3'] + sum_user_dec['period_4']
# sum_user_dec['period_12'] = sum_user_dec['period_1'] + sum_user_dec['period_2']
# sum_user_dec['period_23'] = sum_user_dec['period_2'] + sum_user_dec['period_3']
# sum_user_dec['period_34'] = sum_user_dec['period_3'] + sum_user_dec['period_4']
# sum_user_dec['period_41'] = sum_user_dec['period_4'] + sum_user_dec['period_1']
# sum_user_dec

In [23]:
#sum_user_dec

### df_user_dec . decision_id > sumdec_user_dec

In [24]:
d_count = df_user_dec.groupby(['user_id', 'decision_id']).count()
d_count

Unnamed: 0_level_0,Unnamed: 1_level_0,period
user_id,decision_id,Unnamed: 2_level_1
10001,1,1
10001,4,1
10001,6,1
10001,11,2
10001,12,2
...,...,...
10973,304,1
10973,306,1
10973,344,1
10973,386,1


In [25]:
# One-hot encode the decision id: one indicator column per distinct decision.
gbdec_user_dec = pd.get_dummies(data=df_user_dec, columns=['decision_id'])
gbdec_user_dec

Unnamed: 0,user_id,period,decision_id_1,decision_id_3,decision_id_4,decision_id_5,decision_id_6,decision_id_7,decision_id_8,decision_id_9,...,decision_id_427,decision_id_428,decision_id_429,decision_id_430,decision_id_432,decision_id_433,decision_id_434,decision_id_435,decision_id_436,decision_id_437
0,10625,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,10318,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,10775,4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,10236,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,10130,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60223,10312,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
60224,10189,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
60225,10346,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
60226,10424,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
# Per-user totals: every decision_id_N column counts how often the user took
# that decision. The 'period' column is summed too and carries no meaning.
sumdec_user_dec = gbdec_user_dec.groupby('user_id').sum()
sumdec_user_dec

Unnamed: 0_level_0,period,decision_id_1,decision_id_3,decision_id_4,decision_id_5,decision_id_6,decision_id_7,decision_id_8,decision_id_9,decision_id_10,...,decision_id_427,decision_id_428,decision_id_429,decision_id_430,decision_id_432,decision_id_433,decision_id_434,decision_id_435,decision_id_436,decision_id_437
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10001,252,1,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10002,181,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10003,619,0,0,0,0,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,0
10004,272,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
10005,175,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10969,49,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10970,151,0,0,0,0,0,0,0,0,0,...,3,1,1,1,2,1,1,0,3,0
10971,210,0,0,0,0,0,0,0,0,0,...,1,0,2,0,0,0,0,0,0,0
10972,16,0,0,0,0,0,0,0,0,0,...,1,1,0,0,1,0,0,0,0,0


### train

In [27]:
df_train

Unnamed: 0,id,Analytical thinking,Systemic thinking,Adaptability,Focus
0,10884,4.0,4.0,4.0,4.0
1,10106,5.0,4.0,5.0,5.0
2,10438,6.0,5.0,5.0,5.0
3,10130,5.0,4.0,5.0,5.0
4,10667,4.0,5.0,5.0,5.0
...,...,...,...,...,...
675,10563,5.0,4.0,5.0,4.0
676,10386,5.0,4.0,5.0,5.0
677,10675,5.0,4.0,5.0,4.0
678,10366,4.0,4.0,4.0,4.0


In [28]:
sum_user_dec

Unnamed: 0_level_0,decision_id,period_1,period_2,period_3,period_4
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10001,13677,29,23,39,15
10002,11931,0,9,25,22
10003,34362,21,41,60,84
10004,18534,3,19,41,27
10005,12423,14,34,31,0
...,...,...,...,...,...
10969,3578,0,1,5,8
10970,20701,11,22,32,0
10971,19457,22,26,12,25
10972,1713,0,0,0,4


In [29]:
# Attach the per-period decision counts to the labelled players and discard
# the summed decision_id column, which is not a feature.
train = (
    df_train
    .merge(sum_user_dec, how='left', left_on='id', right_on='user_id')
    .drop(columns=['decision_id'])
)
train.head(10)

Unnamed: 0,id,Analytical thinking,Systemic thinking,Adaptability,Focus,period_1,period_2,period_3,period_4
0,10884,4.0,4.0,4.0,4.0,7,14,28,10
1,10106,5.0,4.0,5.0,5.0,21,21,12,8
2,10438,6.0,5.0,5.0,5.0,62,45,63,38
3,10130,5.0,4.0,5.0,5.0,26,28,34,66
4,10667,4.0,5.0,5.0,5.0,58,70,194,35
5,10908,4.0,4.0,5.0,5.0,0,4,10,7
6,10378,4.0,5.0,5.0,4.0,3,1,0,0
7,10220,5.0,4.0,5.0,5.0,1,0,2,1
8,10612,5.0,4.0,4.0,5.0,26,22,12,15
9,10776,4.0,4.0,5.0,5.0,16,13,37,36


In [30]:
sumdec_user_dec

Unnamed: 0_level_0,period,decision_id_1,decision_id_3,decision_id_4,decision_id_5,decision_id_6,decision_id_7,decision_id_8,decision_id_9,decision_id_10,...,decision_id_427,decision_id_428,decision_id_429,decision_id_430,decision_id_432,decision_id_433,decision_id_434,decision_id_435,decision_id_436,decision_id_437
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10001,252,1,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10002,181,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10003,619,0,0,0,0,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,0
10004,272,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
10005,175,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10969,49,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10970,151,0,0,0,0,0,0,0,0,0,...,3,1,1,1,2,1,1,0,3,0
10971,210,0,0,0,0,0,0,0,0,0,...,1,0,2,0,0,0,0,0,0,0
10972,16,0,0,0,0,0,0,0,0,0,...,1,1,0,0,1,0,0,0,0,0


In [31]:
# Drop the summed 'period' column, then add the per-decision counts to train.
# NOTE: this cell rebinds sumdec_user_dec and train, so it is meant to run once.
sumdec_user_dec = sumdec_user_dec.drop('period', axis=1)
train = train.merge(sumdec_user_dec, how='left', left_on='id', right_on='user_id')
train.head(10)

Unnamed: 0,id,Analytical thinking,Systemic thinking,Adaptability,Focus,period_1,period_2,period_3,period_4,decision_id_1,...,decision_id_427,decision_id_428,decision_id_429,decision_id_430,decision_id_432,decision_id_433,decision_id_434,decision_id_435,decision_id_436,decision_id_437
0,10884,4.0,4.0,4.0,4.0,7,14,28,10,0,...,2,0,0,0,0,0,0,0,0,0
1,10106,5.0,4.0,5.0,5.0,21,21,12,8,0,...,0,0,0,0,0,0,0,0,0,0
2,10438,6.0,5.0,5.0,5.0,62,45,63,38,0,...,2,0,0,0,0,0,0,0,0,0
3,10130,5.0,4.0,5.0,5.0,26,28,34,66,0,...,0,1,0,0,0,0,0,0,2,0
4,10667,4.0,5.0,5.0,5.0,58,70,194,35,1,...,2,0,1,0,1,1,1,1,1,0
5,10908,4.0,4.0,5.0,5.0,0,4,10,7,0,...,0,0,0,0,0,0,0,0,0,0
6,10378,4.0,5.0,5.0,4.0,3,1,0,0,1,...,1,1,0,0,0,0,0,0,0,0
7,10220,5.0,4.0,5.0,5.0,1,0,2,1,0,...,0,0,0,0,0,0,0,0,0,0
8,10612,5.0,4.0,4.0,5.0,26,22,12,15,0,...,0,1,1,2,0,0,0,0,0,0
9,10776,4.0,4.0,5.0,5.0,16,13,37,36,0,...,0,0,0,0,0,0,0,0,0,0


### df_user

In [32]:
df_user

Unnamed: 0,user_id,team_id,game_id
0,10189,1664,235
1,10018,1690,237
2,10580,1394,194
3,10600,1908,262
4,10112,1592,219
...,...,...,...
968,10205,1826,249
969,10325,1628,231
970,10527,1730,241
971,10371,1704,238


In [33]:
# Add each player's team_id and game_id; the duplicated key column user_id is removed.
train = (
    train
    .merge(df_user, how='left', left_on='id', right_on='user_id')
    .drop(columns=['user_id'])
)
train.head(10)

Unnamed: 0,id,Analytical thinking,Systemic thinking,Adaptability,Focus,period_1,period_2,period_3,period_4,decision_id_1,...,decision_id_429,decision_id_430,decision_id_432,decision_id_433,decision_id_434,decision_id_435,decision_id_436,decision_id_437,team_id,game_id
0,10884,4.0,4.0,4.0,4.0,7,14,28,10,0,...,0,0,0,0,0,0,0,0,1440,202
1,10106,5.0,4.0,5.0,5.0,21,21,12,8,0,...,0,0,0,0,0,0,0,0,1536,211
2,10438,6.0,5.0,5.0,5.0,62,45,63,38,0,...,0,0,0,0,0,0,0,0,1742,242
3,10130,5.0,4.0,5.0,5.0,26,28,34,66,0,...,0,0,0,0,0,0,2,0,1272,184
4,10667,4.0,5.0,5.0,5.0,58,70,194,35,1,...,1,0,1,1,1,1,1,0,1382,193
5,10908,4.0,4.0,5.0,5.0,0,4,10,7,0,...,0,0,0,0,0,0,0,0,1750,243
6,10378,4.0,5.0,5.0,4.0,3,1,0,0,1,...,0,0,0,0,0,0,0,0,1468,205
7,10220,5.0,4.0,5.0,5.0,1,0,2,1,0,...,0,0,0,0,0,0,0,0,1668,235
8,10612,5.0,4.0,4.0,5.0,26,22,12,15,0,...,1,2,0,0,0,0,0,0,1346,190
9,10776,4.0,4.0,5.0,5.0,16,13,37,36,0,...,0,0,0,0,0,0,0,0,1688,237


### df_point . period > point_per

In [34]:
df_point = pd.read_csv("team_point.csv")
df_point.head(3)

Unnamed: 0,team_id,category_id,period,score,place
0,1948,2,0,16,2
1,1934,3,4,45,6
2,1894,4,8,24,4


In [35]:
df_point

Unnamed: 0,team_id,category_id,period,score,place
0,1948,2,0,16,2
1,1934,3,4,45,6
2,1894,4,8,24,4
3,1688,4,0,27,4
4,1592,4,2,21,5
...,...,...,...,...,...
10981,1312,4,1,32,4
10982,1974,1,0,4,6
10983,1822,1,4,31,1
10984,1390,2,0,16,2


In [36]:
#df_point[df_point['team_id'] == 1414].sort_values(['category_id','period'], ascending=True)

In [37]:
# Keep only the first four periods (period < 4).
df_point_4 = df_point.loc[df_point['period'].lt(4)]
df_point_4

Unnamed: 0,team_id,category_id,period,score,place
0,1948,2,0,16,2
3,1688,4,0,27,4
4,1592,4,2,21,5
5,1344,5,0,8,6
6,1982,6,1,146,3
...,...,...,...,...,...
10980,1236,5,1,15,1
10981,1312,4,1,32,4
10982,1974,1,0,4,6
10984,1390,2,0,16,2


In [38]:
# Total score of every team in every period, summed over all categories.
point_per = df_point_4.groupby(["team_id", "period"], as_index=False)["score"].sum()
point_per

Unnamed: 0,team_id,period,score
0,381,0,316
1,381,1,264
2,381,2,220
3,381,3,26
4,382,0,316
...,...,...,...
1415,1988,3,54
1416,1990,0,316
1417,1990,1,270
1418,1990,2,332


In [39]:
# Rename the column; presumably so the team-level period_team_N features do not
# clash with the per-user period_N columns after merging.
point_per = point_per.rename({'period': 'period_team'}, axis='columns')
point_per

Unnamed: 0,team_id,period_team,score
0,381,0,316
1,381,1,264
2,381,2,220
3,381,3,26
4,382,0,316
...,...,...,...
1415,1988,3,54
1416,1990,0,316
1417,1990,1,270
1418,1990,2,332


In [40]:
# One indicator column per team period value.
point_per = pd.get_dummies(data=point_per, columns=['period_team'])
point_per

Unnamed: 0,team_id,score,period_team_0,period_team_1,period_team_2,period_team_3
0,381,316,1,0,0,0
1,381,264,0,1,0,0
2,381,220,0,0,1,0
3,381,26,0,0,0,1
4,382,316,1,0,0,0
...,...,...,...,...,...,...
1415,1988,54,0,0,0,1
1416,1990,316,1,0,0,0
1417,1990,270,0,1,0,0
1418,1990,332,0,0,1,0


In [41]:
# Turn each period indicator into the score earned in that period (0 elsewhere).
for period_col in ('period_team_0', 'period_team_1', 'period_team_2', 'period_team_3'):
    point_per[period_col] = point_per[period_col] * point_per['score']
point_per

Unnamed: 0,team_id,score,period_team_0,period_team_1,period_team_2,period_team_3
0,381,316,316,0,0,0
1,381,264,0,264,0,0
2,381,220,0,0,220,0
3,381,26,0,0,0,26
4,382,316,316,0,0,0
...,...,...,...,...,...,...
1415,1988,54,0,0,0,54
1416,1990,316,316,0,0,0
1417,1990,270,0,270,0,0
1418,1990,332,0,0,332,0


In [42]:
# One row per team, with its score in each of the four periods as separate columns.
point_per = point_per.groupby("team_id", as_index=False)[
    ["period_team_0", "period_team_1", "period_team_2", "period_team_3"]
].sum()
point_per

Unnamed: 0,team_id,period_team_0,period_team_1,period_team_2,period_team_3
0,381,316,264,220,26
1,382,316,264,332,178
2,383,316,230,378,392
3,384,316,404,246,154
4,1232,316,232,74,16
...,...,...,...,...,...
350,1982,316,292,464,76
351,1984,316,342,580,248
352,1986,316,262,176,170
353,1988,316,210,260,54


In [43]:
point_per.describe()

Unnamed: 0,team_id,period_team_0,period_team_1,period_team_2,period_team_3
count,355.0,355.0,355.0,355.0,355.0
mean,1602.957746,316.0,300.619718,324.507042,303.250704
std,256.207699,0.0,111.97467,144.582125,180.700307
min,381.0,316.0,-36.0,-30.0,-36.0
25%,1421.0,316.0,226.0,237.0,154.0
50%,1620.0,316.0,282.0,316.0,290.0
75%,1809.0,316.0,351.0,415.0,426.0
max,1990.0,316.0,600.0,680.0,794.0


### df_point . category_id > point_cat

In [44]:
df_point['place'].max()

6

In [45]:
# Invert the ranking in place: the largest place value becomes 1 and vice versa.
# NOTE: the column is overwritten, so running this cell twice inverts it again.
df_point['place'] = df_point['place'].rsub(df_point['place'].max() + 1)
df_point.head(3)

Unnamed: 0,team_id,category_id,period,score,place
0,1948,2,0,16,5
1,1934,3,4,45,1
2,1894,4,8,24,3


In [46]:
#df_point[df_point['team_id'] == 1414].sort_values(['category_id','period'], ascending=True)

In [47]:
# Per team and category: total score and the number of rows contributing to it.
point_sum2 = (
    df_point
    .groupby(["team_id", "category_id"])
    .agg(score=("score", "sum"), period=("period", "count"))
    .reset_index()
)
point_sum2

Unnamed: 0,team_id,category_id,score,period
0,381,1,-31,5
1,381,2,-49,5
2,381,3,320,5
3,381,4,116,5
4,381,5,112,5
...,...,...,...,...
2125,1990,2,38,5
2126,1990,3,339,5
2127,1990,4,133,5
2128,1990,5,156,5


In [48]:
# Total score of every team in every category, summed over all periods.
point_sum = df_point.groupby(["team_id", "category_id"], as_index=False)["score"].sum()
point_sum

Unnamed: 0,team_id,category_id,score
0,381,1,-31
1,381,2,-49
2,381,3,320
3,381,4,116
4,381,5,112
...,...,...,...
2125,1990,2,38
2126,1990,3,339
2127,1990,4,133
2128,1990,5,156


In [49]:
# One indicator column per score category.
point_cat = pd.get_dummies(data=point_sum, columns=['category_id'])
point_cat

Unnamed: 0,team_id,score,category_id_1,category_id_2,category_id_3,category_id_4,category_id_5,category_id_6
0,381,-31,1,0,0,0,0,0
1,381,-49,0,1,0,0,0,0
2,381,320,0,0,1,0,0,0
3,381,116,0,0,0,1,0,0
4,381,112,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...
2125,1990,38,0,1,0,0,0,0
2126,1990,339,0,0,1,0,0,0
2127,1990,133,0,0,0,1,0,0
2128,1990,156,0,0,0,0,1,0


In [50]:
# Turn each category indicator into the score earned in that category (0 elsewhere).
for category_col in (
    'category_id_1',
    'category_id_2',
    'category_id_3',
    'category_id_4',
    'category_id_5',
    'category_id_6',
):
    point_cat[category_col] = point_cat[category_col] * point_cat['score']
point_cat

Unnamed: 0,team_id,score,category_id_1,category_id_2,category_id_3,category_id_4,category_id_5,category_id_6
0,381,-31,-31,0,0,0,0,0
1,381,-49,0,-49,0,0,0,0
2,381,320,0,0,320,0,0,0
3,381,116,0,0,0,116,0,0
4,381,112,0,0,0,0,112,0
...,...,...,...,...,...,...,...,...
2125,1990,38,0,38,0,0,0,0
2126,1990,339,0,0,339,0,0,0
2127,1990,133,0,0,0,133,0,0
2128,1990,156,0,0,0,0,156,0


In [51]:
# One row per team, with its total score in each of the six categories as separate columns.
point_cat2 = point_cat.groupby("team_id", as_index=False)[
    [
        "category_id_1",
        "category_id_2",
        "category_id_3",
        "category_id_4",
        "category_id_5",
        "category_id_6",
    ]
].sum()
point_cat2

Unnamed: 0,team_id,category_id_1,category_id_2,category_id_3,category_id_4,category_id_5,category_id_6
0,381,-31,-49,320,116,112,468
1,382,-3,28,347,141,115,628
2,383,28,138,363,155,93,777
3,384,-8,5,371,128,127,623
4,1232,-75,-98,319,98,145,389
...,...,...,...,...,...,...,...
350,1982,-1,-21,417,132,107,634
351,1984,91,173,404,156,116,940
352,1986,-10,-20,383,150,114,617
353,1988,-33,-49,332,128,111,489


In [52]:
train

Unnamed: 0,id,Analytical thinking,Systemic thinking,Adaptability,Focus,period_1,period_2,period_3,period_4,decision_id_1,...,decision_id_429,decision_id_430,decision_id_432,decision_id_433,decision_id_434,decision_id_435,decision_id_436,decision_id_437,team_id,game_id
0,10884,4.0,4.0,4.0,4.0,7,14,28,10,0,...,0,0,0,0,0,0,0,0,1440,202
1,10106,5.0,4.0,5.0,5.0,21,21,12,8,0,...,0,0,0,0,0,0,0,0,1536,211
2,10438,6.0,5.0,5.0,5.0,62,45,63,38,0,...,0,0,0,0,0,0,0,0,1742,242
3,10130,5.0,4.0,5.0,5.0,26,28,34,66,0,...,0,0,0,0,0,0,2,0,1272,184
4,10667,4.0,5.0,5.0,5.0,58,70,194,35,1,...,1,0,1,1,1,1,1,0,1382,193
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
675,10563,5.0,4.0,5.0,4.0,10,2,14,4,0,...,0,0,0,0,0,0,0,0,1274,184
676,10386,5.0,4.0,5.0,5.0,1,14,24,1,0,...,0,0,0,0,0,0,0,0,1526,210
677,10675,5.0,4.0,5.0,4.0,4,17,23,12,0,...,3,0,1,2,4,3,0,0,1670,235
678,10366,4.0,4.0,4.0,4.0,10,44,45,38,1,...,0,0,0,0,0,0,0,0,1808,248


In [53]:
# Join the per-category team scores (point_cat2) onto the training frame.
train = train.merge(point_cat2, how='left', on='team_id')
train.head(10)

Unnamed: 0,id,Analytical thinking,Systemic thinking,Adaptability,Focus,period_1,period_2,period_3,period_4,decision_id_1,...,decision_id_436,decision_id_437,team_id,game_id,category_id_1,category_id_2,category_id_3,category_id_4,category_id_5,category_id_6
0,10884,4.0,4.0,4.0,4.0,7,14,28,10,0,...,0,0,1440,202,-5,-25,391,128,105,594
1,10106,5.0,4.0,5.0,5.0,21,21,12,8,0,...,0,0,1536,211,54,120,467,160,113,914
2,10438,6.0,5.0,5.0,5.0,62,45,63,38,0,...,0,0,1742,242,55,60,383,158,146,802
3,10130,5.0,4.0,5.0,5.0,26,28,34,66,0,...,2,0,1272,184,31,24,366,139,107,667
4,10667,4.0,5.0,5.0,5.0,58,70,194,35,1,...,1,0,1382,193,71,70,416,166,155,878
5,10908,4.0,4.0,5.0,5.0,0,4,10,7,0,...,0,0,1750,243,26,68,477,150,70,791
6,10378,4.0,5.0,5.0,4.0,3,1,0,0,1,...,0,0,1468,205,81,156,498,167,82,984
7,10220,5.0,4.0,5.0,5.0,1,0,2,1,0,...,0,0,1668,235,25,80,440,139,65,749
8,10612,5.0,4.0,4.0,5.0,26,22,12,15,0,...,0,0,1346,190,75,156,504,165,70,970
9,10776,4.0,4.0,5.0,5.0,16,13,37,36,0,...,0,0,1688,237,0,-5,334,119,73,521


In [54]:
# Join the per-period team scores (point_per) onto the training frame.
train = train.merge(point_per, how='left', on='team_id')
train.head(10)

Unnamed: 0,id,Analytical thinking,Systemic thinking,Adaptability,Focus,period_1,period_2,period_3,period_4,decision_id_1,...,category_id_1,category_id_2,category_id_3,category_id_4,category_id_5,category_id_6,period_team_0,period_team_1,period_team_2,period_team_3
0,10884,4.0,4.0,4.0,4.0,7,14,28,10,0,...,-5,-25,391,128,105,594,316,284,166,120
1,10106,5.0,4.0,5.0,5.0,21,21,12,8,0,...,54,120,467,160,113,914,316,338,468,398
2,10438,6.0,5.0,5.0,5.0,62,45,63,38,0,...,55,60,383,158,146,802,316,184,598,340
3,10130,5.0,4.0,5.0,5.0,26,28,34,66,0,...,31,24,366,139,107,667,316,164,160,300
4,10667,4.0,5.0,5.0,5.0,58,70,194,35,1,...,71,70,416,166,155,878,316,558,576,150
5,10908,4.0,4.0,5.0,5.0,0,4,10,7,0,...,26,68,477,150,70,791,316,320,370,230
6,10378,4.0,5.0,5.0,4.0,3,1,0,0,1,...,81,156,498,167,82,984,316,408,532,360
7,10220,5.0,4.0,5.0,5.0,1,0,2,1,0,...,25,80,440,139,65,749,316,228,324,340
8,10612,5.0,4.0,4.0,5.0,26,22,12,15,0,...,75,156,504,165,70,970,316,420,478,432
9,10776,4.0,4.0,5.0,5.0,16,13,37,36,0,...,0,-5,334,119,73,521,316,216,258,128


In [55]:
train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,680.0,10486.089706,281.738218,10001.0,10233.75,10491.5,10729.25,10973.0
Analytical thinking,680.0,4.277941,0.647266,1.0,4.00,4.0,5.00,6.0
Systemic thinking,680.0,4.119118,0.596197,1.0,4.00,4.0,4.00,6.0
Adaptability,680.0,4.447059,0.664804,1.0,4.00,4.0,5.00,6.0
Focus,680.0,4.308824,0.608176,1.0,4.00,4.0,5.00,6.0
...,...,...,...,...,...,...,...,...
category_id_6,680.0,790.436765,223.485989,258.0,636.00,791.5,914.00,1520.0
period_team_0,680.0,316.000000,0.000000,316.0,316.00,316.0,316.00,316.0
period_team_1,680.0,303.488235,120.746072,-10.0,227.50,284.0,358.00,596.0
period_team_2,680.0,333.641176,146.344483,-30.0,246.00,336.0,422.00,676.0


In [56]:
#df_user_dec.head(2), point_sum.head(2)

In [57]:
# df_user_train = df_user_dec[df_user_dec.user_id.map(lambda x:x in df_train.id.values)]
# df_decision_count = df_user_train.groupby(by=["user_id",	"period"]).count()
# df_user_train

In [58]:
# df_train_per = df_train.copy()
# for id in df_train_per.id.values:
#     for period in range(1,5):
#         try :
#             index = df_train[df_train.id == id].index[0]
#             df_train_per.loc[index,"period_"+str(period)] = df_decision_count.loc[pd.IndexSlice[id, period:period], :].values[0][0]
#         except IndexError:
#             continue

### test

In [59]:
# Объединение df

In [60]:
test = df_submission.copy()

In [61]:
# Attach the per-period decision counts to the submission players and discard
# the summed decision_id column, which is not a feature.
test = (
    test
    .merge(sum_user_dec, how='left', left_on='id', right_on='user_id')
    .drop(columns=['decision_id'])
)
test.head(10)

Unnamed: 0,id,Analytical thinking,Systemic thinking,Adaptability,Focus,period_1,period_2,period_3,period_4
0,10199,0,0,0,0,2,2,3,2
1,10539,0,0,0,0,1,9,15,17
2,10174,0,0,0,0,0,28,31,20
3,10465,0,0,0,0,15,18,32,18
4,10066,0,0,0,0,8,23,16,6
5,10425,0,0,0,0,2,0,7,4
6,10398,0,0,0,0,2,44,40,30
7,10346,0,0,0,0,16,19,24,4
8,10281,0,0,0,0,2,6,12,9
9,10158,0,0,0,0,3,6,1,7


In [62]:
# Add the per-decision counts. The 'period' column of sumdec_user_dec was
# already dropped when the training frame was assembled.
test = test.merge(sumdec_user_dec, how='left', left_on='id', right_on='user_id')
test.head(10)

Unnamed: 0,id,Analytical thinking,Systemic thinking,Adaptability,Focus,period_1,period_2,period_3,period_4,decision_id_1,...,decision_id_427,decision_id_428,decision_id_429,decision_id_430,decision_id_432,decision_id_433,decision_id_434,decision_id_435,decision_id_436,decision_id_437
0,10199,0,0,0,0,2,2,3,2,0,...,0,0,0,0,0,0,0,0,0,0
1,10539,0,0,0,0,1,9,15,17,1,...,1,0,0,0,0,0,0,0,0,0
2,10174,0,0,0,0,0,28,31,20,0,...,3,1,3,0,0,0,0,0,0,0
3,10465,0,0,0,0,15,18,32,18,0,...,0,0,0,0,0,0,0,0,0,0
4,10066,0,0,0,0,8,23,16,6,0,...,0,0,0,0,0,0,0,0,0,0
5,10425,0,0,0,0,2,0,7,4,0,...,0,0,0,0,0,0,0,0,0,0
6,10398,0,0,0,0,2,44,40,30,0,...,0,0,0,0,0,0,0,0,0,0
7,10346,0,0,0,0,16,19,24,4,0,...,0,0,0,0,0,0,0,0,0,0
8,10281,0,0,0,0,2,6,12,9,0,...,0,0,0,0,0,0,0,0,0,0
9,10158,0,0,0,0,3,6,1,7,0,...,0,0,0,0,0,0,0,0,0,0


In [63]:
# Add each player's team_id and game_id; the duplicated key column user_id is removed.
test = (
    test
    .merge(df_user, how='left', left_on='id', right_on='user_id')
    .drop(columns=['user_id'])
)
test.head(10)

Unnamed: 0,id,Analytical thinking,Systemic thinking,Adaptability,Focus,period_1,period_2,period_3,period_4,decision_id_1,...,decision_id_429,decision_id_430,decision_id_432,decision_id_433,decision_id_434,decision_id_435,decision_id_436,decision_id_437,team_id,game_id
0,10199,0,0,0,0,2,2,3,2,0,...,0,0,0,0,0,0,0,0,1912,262
1,10539,0,0,0,0,1,9,15,17,1,...,0,0,0,0,0,0,0,0,1458,204
2,10174,0,0,0,0,0,28,31,20,0,...,3,0,0,0,0,0,0,0,1348,190
3,10465,0,0,0,0,15,18,32,18,0,...,0,0,0,0,0,0,0,0,1760,244
4,10066,0,0,0,0,8,23,16,6,0,...,0,0,0,0,0,0,0,0,1260,183
5,10425,0,0,0,0,2,0,7,4,0,...,0,0,0,0,0,0,0,0,1724,241
6,10398,0,0,0,0,2,44,40,30,0,...,0,0,0,0,0,0,0,0,1908,262
7,10346,0,0,0,0,16,19,24,4,0,...,0,0,0,0,0,0,0,0,1656,234
8,10281,0,0,0,0,2,6,12,9,0,...,0,0,0,0,0,0,0,0,1660,234
9,10158,0,0,0,0,3,6,1,7,0,...,0,0,0,0,0,0,0,0,1708,238


In [64]:
# Join the per-category team scores (point_cat2) onto the test frame.
test = test.merge(point_cat2, how='left', on='team_id')
test.head(10)

Unnamed: 0,id,Analytical thinking,Systemic thinking,Adaptability,Focus,period_1,period_2,period_3,period_4,decision_id_1,...,decision_id_436,decision_id_437,team_id,game_id,category_id_1,category_id_2,category_id_3,category_id_4,category_id_5,category_id_6
0,10199,0,0,0,0,2,2,3,2,0,...,0,0,1912,262,11,56,387,151,107,712
1,10539,0,0,0,0,1,9,15,17,1,...,0,0,1458,204,-18,-3,356,118,131,584
2,10174,0,0,0,0,0,28,31,20,0,...,0,0,1348,190,124,206,481,201,122,1134
3,10465,0,0,0,0,15,18,32,18,0,...,0,0,1760,244,-23,-19,422,132,114,626
4,10066,0,0,0,0,8,23,16,6,0,...,0,0,1260,183,-48,-80,314,115,94,395
5,10425,0,0,0,0,2,0,7,4,0,...,0,0,1724,241,59,145,532,155,71,962
6,10398,0,0,0,0,2,44,40,30,0,...,0,0,1908,262,22,134,353,178,94,781
7,10346,0,0,0,0,16,19,24,4,0,...,0,0,1656,234,48,119,367,144,142,820
8,10281,0,0,0,0,2,6,12,9,0,...,0,0,1660,234,-3,-30,368,129,73,537
9,10158,0,0,0,0,3,6,1,7,0,...,0,0,1708,238,-15,-45,315,108,75,438


In [65]:
# Join with point_per on the team key; rows without a match keep NaN
# because this is a left join.
test = test.merge(point_per, how='left', on='team_id')
test.head(10)

Unnamed: 0,id,Analytical thinking,Systemic thinking,Adaptability,Focus,period_1,period_2,period_3,period_4,decision_id_1,...,category_id_1,category_id_2,category_id_3,category_id_4,category_id_5,category_id_6,period_team_0,period_team_1,period_team_2,period_team_3
0,10199,0,0,0,0,2,2,3,2,0,...,11,56,387,151,107,712,316,404,330,182
1,10539,0,0,0,0,1,9,15,17,1,...,-18,-3,356,118,131,584,316,214,362,82
2,10174,0,0,0,0,0,28,31,20,0,...,124,206,481,201,122,1134,316,554,636,410
3,10465,0,0,0,0,15,18,32,18,0,...,-23,-19,422,132,114,626,316,322,288,160
4,10066,0,0,0,0,8,23,16,6,0,...,-48,-80,314,115,94,395,316,266,24,32
5,10425,0,0,0,0,2,0,7,4,0,...,59,145,532,155,71,962,316,436,410,342
6,10398,0,0,0,0,2,44,40,30,0,...,22,134,353,178,94,781,316,426,54,468
7,10346,0,0,0,0,16,19,24,4,0,...,48,119,367,144,142,820,316,304,254,236
8,10281,0,0,0,0,2,6,12,9,0,...,-3,-30,368,129,73,537,316,128,236,152
9,10158,0,0,0,0,3,6,1,7,0,...,-15,-45,315,108,75,438,316,188,10,62


In [66]:
#==============================================================================================

Узнаем количество действий (решений) участника в каждом периоде

In [67]:
# df_user_train = df_user_dec[df_user_dec.user_id.map(lambda x:x in df_train.id.values)]
# df_decision_count = df_user_train.groupby(by=["user_id", "period"]).count()
# df_decision_count.head(5)

In [68]:
# for id in df_train.id.values:
#     for period in range(1,5):
#         try :
#             index = df_train[df_train.id == id].index[0]
#             df_train.loc[index,"period_"+str(period)] = df_decision_count.loc[pd.IndexSlice[id, period:period], :].values[0][0]
#         except IndexError:
#             continue
# df_train.head(3)

In [69]:
# df_train = df_train.fillna(0)

# plt.rcParams['figure.figsize']=(15,15)
# g = sns.heatmap(df_train.corr(), square = True, annot=True)

In [70]:
# plt.rcParams['figure.figsize']=(15,15)
# g = sns.heatmap(train.corr(), square = True, annot=True)

## Выделим выборки

In [71]:
# X = df_train.drop(["period_1",	"period_2",	"period_3",	"period_4","id"], axis = 1)
# y = df_train[["period_1",	"period_2",	"period_3",	"period_4"]]
# X.shape

In [72]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [73]:
# Display the training frame (skill targets plus feature columns) before it is split.
train

Unnamed: 0,id,Analytical thinking,Systemic thinking,Adaptability,Focus,period_1,period_2,period_3,period_4,decision_id_1,...,category_id_1,category_id_2,category_id_3,category_id_4,category_id_5,category_id_6,period_team_0,period_team_1,period_team_2,period_team_3
0,10884,4.0,4.0,4.0,4.0,7,14,28,10,0,...,-5,-25,391,128,105,594,316,284,166,120
1,10106,5.0,4.0,5.0,5.0,21,21,12,8,0,...,54,120,467,160,113,914,316,338,468,398
2,10438,6.0,5.0,5.0,5.0,62,45,63,38,0,...,55,60,383,158,146,802,316,184,598,340
3,10130,5.0,4.0,5.0,5.0,26,28,34,66,0,...,31,24,366,139,107,667,316,164,160,300
4,10667,4.0,5.0,5.0,5.0,58,70,194,35,1,...,71,70,416,166,155,878,316,558,576,150
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
675,10563,5.0,4.0,5.0,4.0,10,2,14,4,0,...,-4,-27,349,127,91,536,316,226,174,102
676,10386,5.0,4.0,5.0,5.0,1,14,24,1,0,...,37,108,471,154,62,832,316,364,282,396
677,10675,5.0,4.0,5.0,4.0,4,17,23,12,0,...,31,86,399,136,81,733,316,338,244,230
678,10366,4.0,4.0,4.0,4.0,10,44,45,38,1,...,45,108,425,164,100,842,316,426,410,392


In [74]:
# The four expert-graded skills are the prediction targets; every other
# column of `train` is used as a feature.
col_name = ['Analytical thinking', 'Systemic thinking', 'Adaptability', 'Focus']
y = train[col_name]
X = train.drop(col_name, axis=1)

In [75]:
# Display the feature matrix.
# NOTE(review): `id` is still among the feature columns — confirm that an
# identifier is meant to be a model input.
X

Unnamed: 0,id,period_1,period_2,period_3,period_4,decision_id_1,decision_id_3,decision_id_4,decision_id_5,decision_id_6,...,category_id_1,category_id_2,category_id_3,category_id_4,category_id_5,category_id_6,period_team_0,period_team_1,period_team_2,period_team_3
0,10884,7,14,28,10,0,0,0,0,0,...,-5,-25,391,128,105,594,316,284,166,120
1,10106,21,21,12,8,0,0,0,0,0,...,54,120,467,160,113,914,316,338,468,398
2,10438,62,45,63,38,0,0,0,0,0,...,55,60,383,158,146,802,316,184,598,340
3,10130,26,28,34,66,0,0,0,0,0,...,31,24,366,139,107,667,316,164,160,300
4,10667,58,70,194,35,1,0,0,0,1,...,71,70,416,166,155,878,316,558,576,150
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
675,10563,10,2,14,4,0,0,0,0,0,...,-4,-27,349,127,91,536,316,226,174,102
676,10386,1,14,24,1,0,0,0,0,0,...,37,108,471,154,62,832,316,364,282,396
677,10675,4,17,23,12,0,0,0,0,0,...,31,86,399,136,81,733,316,338,244,230
678,10366,10,44,45,38,1,0,0,0,0,...,45,108,425,164,100,842,316,426,410,392


In [76]:
# Display the target frame: one column per graded skill.
y

Unnamed: 0,Analytical thinking,Systemic thinking,Adaptability,Focus
0,4.0,4.0,4.0,4.0
1,5.0,4.0,5.0,5.0
2,6.0,5.0,5.0,5.0
3,5.0,4.0,5.0,5.0
4,4.0,5.0,5.0,5.0
...,...,...,...,...
675,5.0,4.0,5.0,4.0
676,5.0,4.0,5.0,5.0
677,5.0,4.0,5.0,4.0
678,4.0,4.0,4.0,4.0


In [77]:
# Hold out 20% of the rows for validation; random_state pins the shuffle so
# the split is repeatable.
# NOTE(review): the score log kept with the recall computation differs between
# "rs8" and "rs2" (presumably this random_state), so the reported recall is
# sensitive to the chosen seed.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=2)

## Обучение модели

In [78]:
# Random forest with library-default hyperparameters; only the seed is fixed
# so repeated runs build the same forest.
clf = RandomForestClassifier(random_state=0)

In [79]:
# Fit on the training split. y_train carries four target columns, so a single
# estimator is trained to predict all four skills at once.
clf.fit(X_train, y_train)

RandomForestClassifier(random_state=0)

## Оценка точности

In [80]:
# Predict the four skill grades for the held-out validation rows.
y_pred = clf.predict(X_val)

In [81]:
# Display the raw validation predictions: one row per validation sample, one
# column per skill in `col_name` order.
y_pred

array([[4., 4., 5., 4.],
       [5., 4., 5., 4.],
       [4., 4., 5., 4.],
       [5., 4., 5., 5.],
       [4., 4., 5., 5.],
       [4., 4., 5., 4.],
       [5., 4., 5., 5.],
       [4., 4., 4., 4.],
       [4., 4., 5., 4.],
       [4., 4., 4., 4.],
       [4., 4., 4., 4.],
       [4., 4., 5., 5.],
       [4., 4., 4., 4.],
       [4., 4., 5., 4.],
       [4., 4., 5., 5.],
       [4., 4., 5., 4.],
       [4., 4., 4., 4.],
       [4., 4., 4., 4.],
       [4., 4., 4., 4.],
       [4., 4., 4., 4.],
       [4., 4., 4., 4.],
       [4., 4., 5., 4.],
       [4., 4., 5., 5.],
       [4., 4., 5., 4.],
       [4., 4., 4., 4.],
       [4., 4., 4., 4.],
       [5., 4., 5., 5.],
       [4., 4., 4., 4.],
       [4., 4., 5., 4.],
       [4., 4., 5., 4.],
       [4., 4., 4., 4.],
       [4., 4., 4., 4.],
       [4., 4., 5., 4.],
       [4., 4., 5., 4.],
       [5., 4., 5., 5.],
       [4., 4., 4., 4.],
       [4., 4., 4., 4.],
       [4., 4., 4., 4.],
       [4., 4., 4., 4.],
       [4., 4., 4., 4.],


In [82]:
# Show the first ten ground-truth rows of the validation split for a quick
# visual comparison against y_pred.
y_val.head(10)

Unnamed: 0,Analytical thinking,Systemic thinking,Adaptability,Focus
284,4.0,4.0,4.0,4.0
448,5.0,5.0,5.0,5.0
199,4.0,3.0,4.0,4.0
340,6.0,5.0,5.0,4.0
258,5.0,4.0,4.0,4.0
30,4.0,4.0,5.0,5.0
273,4.0,5.0,4.0,5.0
457,4.0,3.0,4.0,4.0
536,5.0,4.0,5.0,5.0
157,5.0,4.0,4.0,5.0


In [83]:
# Validation metric: macro-averaged recall is computed separately for each of
# the four skill columns, the four values are summed, and the mean is printed.
# NOTE(review): the task statement describes the metric as the *sum* of the
# four recalls, whereas the printed figure is sum / 4 — confirm which of the
# two the leaderboard reports.
# NOTE(review): zero_division=True presumably behaves like 1, i.e. a class
# that is predicted but never occurs in y_val would count as recall 1.0 in
# the macro average — confirm this is intended.
result = sum(
    recall_score(y_val[skill], y_pred[:, pos], average='macro', zero_division=True)
    for pos, skill in enumerate(col_name)
)
print("Recall score", result/4)
# Score log of earlier runs (trailing number = experiment id, "rs" = seed tag):
#Recall score 0.2523583639392419
#Recall score 0.2591790004180355
#Recall score 0.3049374818042581  04
#Recall score 0.31133326529708105 05
#Recall score 0.28124723503835347 06
#Recall score 0.27147606627047416 07
#Recall score 0.27042691561606036 08
#Recall score 0.2661326902061551  09
#Recall score 0.2713601435723146  10
#Recall score 0.27476065743061356 11
#Recall score 0.2673355337798696  12 rs8
#Recall score 0.29545897470026444 12 rs2

Recall score 0.29545897470026444


## Предсказание

In [84]:
# Peek at a single test row to check the column layout before prediction; the
# four skill columns are still present at this point and hold placeholder zeros.
test.head(1)

Unnamed: 0,id,Analytical thinking,Systemic thinking,Adaptability,Focus,period_1,period_2,period_3,period_4,decision_id_1,...,category_id_1,category_id_2,category_id_3,category_id_4,category_id_5,category_id_6,period_team_0,period_team_1,period_team_2,period_team_3
0,10199,0,0,0,0,2,2,3,2,0,...,11,56,387,151,107,712,316,404,330,182


In [85]:
# Remove the four placeholder skill columns so that `test` holds features only,
# mirroring how X was derived from `train`.
test = test.drop(col_name, axis=1)
test

Unnamed: 0,id,period_1,period_2,period_3,period_4,decision_id_1,decision_id_3,decision_id_4,decision_id_5,decision_id_6,...,category_id_1,category_id_2,category_id_3,category_id_4,category_id_5,category_id_6,period_team_0,period_team_1,period_team_2,period_team_3
0,10199,2,2,3,2,0,0,0,0,0,...,11,56,387,151,107,712,316,404,330,182
1,10539,1,9,15,17,1,0,1,0,0,...,-18,-3,356,118,131,584,316,214,362,82
2,10174,0,28,31,20,0,0,0,0,0,...,124,206,481,201,122,1134,316,554,636,410
3,10465,15,18,32,18,0,0,0,0,0,...,-23,-19,422,132,114,626,316,322,288,160
4,10066,8,23,16,6,0,0,0,0,0,...,-48,-80,314,115,94,395,316,266,24,32
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
288,10433,18,7,1,0,0,0,0,0,0,...,126,262,472,239,126,1225,316,512,292,654
289,10893,21,52,34,20,1,1,1,0,0,...,58,157,413,200,105,933,316,238,350,630
290,10909,0,0,0,1,0,0,0,0,0,...,27,85,379,152,122,765,316,358,328,376
291,10889,6,2,5,1,0,0,0,0,0,...,102,197,512,199,58,1068,316,238,528,610


In [86]:
# Predict the four skill grades for every test participant.
# The columns are selected through the training frame's column list so the
# forest receives features with the same names and in the same order it was
# fitted on, instead of relying on the separate train/test merge pipelines
# having produced identical layouts. When the layouts already match this is
# equivalent to predicting on `test` directly; a missing column fails loudly
# with a KeyError.
test_pred = clf.predict(test[X_train.columns])
test_pred

array([[4., 4., 4., 4.],
       [4., 4., 4., 4.],
       [4., 4., 4., 4.],
       ...,
       [4., 4., 4., 4.],
       [4., 4., 4., 4.],
       [4., 4., 4., 4.]])

In [87]:
# Inspect the predictions for the third target only (index 2 corresponds to
# 'Adaptability' in `col_name` order).
test_pred[:,2]

array([4., 4., 4., 4., 4., 5., 5., 5., 4., 4., 4., 5., 5., 4., 4., 5., 5.,
       4., 5., 5., 5., 5., 4., 4., 4., 5., 4., 4., 4., 5., 4., 5., 4., 5.,
       5., 4., 4., 4., 5., 5., 5., 4., 4., 5., 5., 5., 5., 4., 4., 4., 4.,
       5., 4., 4., 4., 5., 4., 5., 5., 4., 5., 4., 5., 4., 4., 5., 4., 5.,
       5., 5., 3., 4., 5., 4., 5., 5., 4., 5., 5., 5., 4., 4., 5., 5., 4.,
       4., 5., 4., 5., 4., 5., 4., 4., 4., 5., 4., 5., 4., 5., 5., 5., 4.,
       4., 4., 4., 4., 5., 5., 5., 4., 5., 5., 5., 5., 4., 5., 5., 4., 4.,
       5., 5., 5., 5., 4., 4., 5., 5., 5., 4., 5., 4., 4., 5., 4., 5., 4.,
       4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 5., 4., 5., 5., 4., 4.,
       4., 4., 4., 4., 4., 5., 5., 4., 5., 4., 5., 4., 5., 4., 4., 4., 5.,
       5., 5., 4., 5., 4., 5., 4., 4., 5., 4., 4., 4., 5., 5., 5., 5., 5.,
       4., 5., 5., 4., 5., 5., 4., 4., 4., 4., 4., 4., 5., 4., 5., 5., 4.,
       5., 4., 4., 4., 4., 5., 4., 4., 5., 4., 4., 5., 5., 5., 4., 4., 5.,
       4., 4., 4., 4., 5.

In [88]:
# Wrap the raw prediction array in a frame labelled with the skill names,
# then cast the float grades to integers.
df_test_pred = pd.DataFrame(data=test_pred, columns=col_name)
df_test_pred = df_test_pred.astype(int)
df_test_pred

Unnamed: 0,Analytical thinking,Systemic thinking,Adaptability,Focus
0,4,4,4,4
1,4,4,4,4
2,4,4,4,4
3,4,4,4,4
4,4,4,4,4
...,...,...,...,...
288,4,4,5,4
289,4,4,5,5
290,4,4,4,4
291,4,4,4,4


In [89]:
# Start from the sample solution file so the output keeps its `id` column and
# layout, then overwrite the four skill columns with the predictions.
# Rows are matched by index label, so this assumes sample_solution.csv lists
# the participants in the same order as `test` — TODO confirm.
result = pd.read_csv("sample_solution.csv")
result[col_name] = df_test_pred
result

Unnamed: 0,id,Analytical thinking,Systemic thinking,Adaptability,Focus
0,10199,4,4,4,4
1,10539,4,4,4,4
2,10174,4,4,4,4
3,10465,4,4,4,4
4,10066,4,4,4,4
...,...,...,...,...,...
288,10433,4,4,5,4
289,10893,4,4,5,5
290,10909,4,4,4,4
291,10889,4,4,4,4


In [90]:
# Write the submission twice: a dated, numbered copy of this run and the
# canonical upload file. The row index is not written.
for out_path in ('221015_12 submission.csv', 'submission.csv'):
    result.to_csv(out_path, index=False)

In [91]:
#0.000000 game 221005_00_0  all 0   err
#err      game 221005_00_1  all 0.25 
#0.245833 game 221005_00_2  all 5 
#0.245833 game 221005_00_3  all 4  
#0.245833 game 221005_00_4  mode   
#0.261236 game 221005_01           
#0.251555 game 221005_01_1  +decision_id  [-]
#0.262223 game 221005_01_2  test_size=0.2
#0.243821 221006_02   +period_sum
#0.262754 221008_04   +df_user
#0.264872 221008_05   +df_user -period_sum
#0.220088 221008_06   +sumdec_user_dec -period
#0.264023 221008_07   +sumdec_user_dec +period
#0.269453 221008_08   +sumdec_user_dec +period +df_user
#0.279554 221008_09   +sumdec_user_dec +period +df_user +df_point
#0.281100 221015_10   +sumdec_user_dec +period +df_user +df_point +point_per 
#0.277475 221015_11   -sumdec_user_dec +period +df_user +df_point +point_per random_state=42
#0.283424 221015_12   +sumdec_user_dec +period +df_user +df_point +point_per random_state=2