In [19]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import cross_val_score
from scipy.stats import pearsonr

### Датасет с базовыми признаками

In [3]:
# Load the preprocessed dataset with the base features.
# The CSV column named 'Unnamed: 0' is used as the DataFrame index.
data = pd.read_csv(
    'data_preprocessed/data_3511.csv',
    index_col='Unnamed: 0',
)
# Bare last expression: rendered by the notebook as the cell output.
data

Unnamed: 0,WeightClass,Gender,NumberOfRounds,RedAge,RedHeightCms,RedReachCms,RedWeightLbs,RedStance,RedWins,RedWinsBySubmission,...,BlueAvgSigStrPct,BlueAvgSubAtt,BlueAvgTDLanded,BlueAvgTDPct,BlueTotalRoundsFought,Result,RedWinsByDecision,RedWinsByKO/TKO,BlueWinsByDecision,BlueWinsByKO/TKO
1,4,1,0.0,30,185.42,195.58,170,1.0,6,5,...,0.550,0.30,0.7700,0.550,20,0,0,1,5,3
2,7,1,0.0,34,193.04,205.74,245,1.0,9,2,...,0.570,0.20,0.4500,0.630,44,0,3,4,5,6
3,2,1,0.0,30,177.80,177.80,145,0.0,7,1,...,0.440,0.50,0.4700,0.250,7,0,6,0,0,0
4,2,1,0.0,36,175.26,182.88,145,1.0,5,2,...,0.530,0.80,0.7500,0.370,15,1,2,1,0,4
5,6,1,0.0,34,193.04,195.58,205,0.0,7,1,...,0.490,0.60,0.4600,0.260,63,0,2,4,2,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6508,7,1,1.0,30,190.50,200.66,264,0.0,11,7,...,0.437,0.00,0.3333,0.167,3,1,1,2,0,3
6512,4,1,0.0,24,182.88,193.04,170,0.0,6,5,...,0.275,0.00,0.0000,0.000,2,0,1,0,0,1
6518,7,1,0.0,26,193.04,195.58,238,1.0,4,0,...,0.548,0.70,1.2000,0.572,16,0,0,4,0,5
6521,3,1,0.0,28,170.18,177.80,155,1.0,5,1,...,0.333,1.25,1.2500,0.663,7,0,3,1,0,0


Расчет взаимной информации

In [13]:
# Split the frame into the feature matrix and the target column.
X = data.drop('Result', axis=1)
y = data['Result']

feature_names = X.columns.tolist()  # feature names, in column order

# mutual_info_classif is a randomized estimator (it accepts a random_state),
# so the seed is pinned to make the selected feature set reproducible on
# every "Restart & Run All".
mi_seed = 0

# Keep the k features with the highest mutual information with the target.
k = 12
selector = SelectKBest(
    lambda features, target: mutual_info_classif(
        features, target, random_state=mi_seed
    ),
    k=k,
)
X_selected = selector.fit_transform(X, y)

# Per-feature mutual information. Taken from the fitted selector instead of a
# second, independent mutual_info_classif call, so these are exactly the
# scores the selection above was ranked on (and the work is done only once).
mi = selector.scores_

# Positional indices of the selected columns.
indices = selector.get_support(indices=True)

# Print the names of the selected features.
print("Отобранные признаки:")
for index in indices:
    print(feature_names[index])

Отобранные признаки:
RedAge
RedReachCms
RedWeightLbs
RedWins
RedAvgSigStrPct
RedAvgTDPct
BlueHeightCms
BlueReachCms
BlueAvgSigStrLanded
BlueAvgTDLanded
BlueTotalRoundsFought
BlueWinsByKO/TKO


Корреляционный анализ

In [18]:
# Pearson correlation coefficient between every feature column and the target.
# Only the coefficient (first element of the pearsonr result) is kept.
corrs = [pearsonr(X.iloc[:, i], y)[0] for i in range(X.shape[1])]

# Select the k features with the largest absolute correlation.
k = 12
abs_corrs = np.abs(np.asarray(corrs, dtype=float))

# A constant column yields a NaN coefficient, and np.argsort places NaN at the
# END of the ordering -- i.e. exactly in the "top-k" slice. Rank NaNs as -inf
# and never keep more columns than there are valid coefficients, so undefined
# correlations cannot be reported as the strongest ones.
is_valid = ~np.isnan(abs_corrs)
order = np.argsort(np.where(is_valid, abs_corrs, -np.inf))
n_keep = min(k, int(is_valid.sum()))

# Ascending by |r|: the most correlated feature is the last column.
idx = order[len(order) - n_keep:]
X_selected = X.iloc[:, idx]
X_selected

Unnamed: 0,RedAvgTDPct,RedCurrentLoseStreak,BlueCurrentWinStreak,BlueAvgSigStrPct,BlueLosses,RedCurrentWinStreak,RedAvgTDLanded,BlueAvgTDLanded,RedAvgSigStrPct,RedLosses,RedAge,BlueAge
1,0.290,0,8,0.550,0,6,1.4900,0.7700,0.610,0,30,27
2,0.210,0,4,0.570,4,1,0.5800,0.4500,0.600,2,34,36
3,0.410,1,0,0.440,2,0,3.4500,0.4700,0.580,2,30,36
4,0.410,0,1,0.530,3,1,1.0000,0.7500,0.460,3,36,33
5,0.280,0,0,0.490,12,1,0.3500,0.4600,0.500,4,34,36
...,...,...,...,...,...,...,...,...,...,...,...,...
6508,0.351,0,3,0.437,0,1,0.8000,0.3333,0.520,4,30,35
6512,0.476,1,0,0.275,1,0,1.3333,0.0000,0.409,3,24,28
6518,0.000,0,1,0.548,3,4,0.0000,1.2000,0.538,0,26,30
6521,0.382,2,0,0.333,2,0,3.4000,1.2500,0.394,5,28,28


### Датасет с созданными признаками

In [21]:
# Load the dataset with the engineered features.
# The CSV column named 'Unnamed: 0' is used as the DataFrame index.
# NOTE: this rebinds `data`, which earlier held the base-feature dataset.
data = pd.read_csv(
    'data_preprocessed/data_featured_3511.csv',
    index_col='Unnamed: 0',
)
# Bare last expression: rendered by the notebook as the cell output.
data

Unnamed: 0,WeightClass,Gender,NumberOfRounds,RedStance,BlueStance,Result,Age_diff,Height_diff,Reach_diff,Weight_diff,...,Blue_wins_loss_diff,Red_Blue_Wins_diff,Red_Blue_Loss_diff,Red_Blue_wins_loss_diff,AvgSigStrLanded_diff,AvgSigStrPct_diff,TDAvgLanded_diff,AvgTDPct_diff,AvgSubAtt_diff,Red_Blue_curr_streak_diff
1,4,1,0.0,1.0,1.0,0,3,-5.08,7.62,0,...,8,-2,0,-2,-1.38,0.060,0.7200,-0.260,1.5000,-2
2,7,1,0.0,1.0,1.0,0,-2,-7.62,2.54,-5,...,8,-3,-2,-1,0.36,0.030,0.1300,-0.420,0.3000,-3
3,2,1,0.0,0.0,0.0,0,-6,2.54,0.00,0,...,-1,6,0,6,-1.44,0.140,2.9800,0.160,1.1000,1
4,2,1,0.0,1.0,1.0,1,3,-2.54,5.08,0,...,1,1,0,1,1.84,-0.070,0.2500,0.040,0.2000,0
5,6,1,0.0,0.0,1.0,0,-2,0.00,2.54,0,...,3,-8,-8,0,1.63,0.010,-0.1100,0.020,-0.3000,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6508,7,1,1.0,0.0,1.0,1,-5,2.54,-2.54,-1,...,3,8,4,4,3.00,0.083,0.4667,0.184,1.2667,-2
6512,4,1,0.0,0.0,1.0,0,-4,0.00,12.70,0,...,0,5,2,3,0.57,0.134,1.3333,0.476,1.4444,0
6518,7,1,0.0,1.0,1.0,0,-4,5.08,2.54,-4,...,4,-3,-3,0,-9.04,-0.010,-1.2000,-0.572,-0.7000,3
6521,3,1,0.0,1.0,1.0,0,0,-7.62,-2.54,0,...,0,3,3,0,-8.18,0.061,2.1500,-0.281,-0.6500,-1


Расчет взаимной информации

In [22]:
# Split the frame into the feature matrix and the target column.
X = data.drop('Result', axis=1)
y = data['Result']

feature_names = X.columns.tolist()  # feature names, in column order

# mutual_info_classif is a randomized estimator (it accepts a random_state),
# so the seed is pinned to make the selected feature set reproducible on
# every "Restart & Run All".
mi_seed = 0

# Keep the k features with the highest mutual information with the target.
k = 12
selector = SelectKBest(
    lambda features, target: mutual_info_classif(
        features, target, random_state=mi_seed
    ),
    k=k,
)
X_selected = selector.fit_transform(X, y)

# Per-feature mutual information. Taken from the fitted selector instead of a
# second, independent mutual_info_classif call, so these are exactly the
# scores the selection above was ranked on (and the work is done only once).
mi = selector.scores_

# Positional indices of the selected columns.
indices = selector.get_support(indices=True)

# Print the names of the selected features.
print("Отобранные признаки:")
for index in indices:
    print(feature_names[index])

Отобранные признаки:
NumberOfRounds
RedStance
BlueStance
Age_diff
Height_diff
Red_Blue_wins_loss_diff
AvgSigStrLanded_diff
AvgSigStrPct_diff
TDAvgLanded_diff
AvgTDPct_diff
AvgSubAtt_diff
Red_Blue_curr_streak_diff


Корреляционный анализ

In [23]:
# Pearson correlation coefficient between every feature column and the target.
# Only the coefficient (first element of the pearsonr result) is kept.
corrs = [pearsonr(X.iloc[:, i], y)[0] for i in range(X.shape[1])]

# Select the k features with the largest absolute correlation.
k = 12
abs_corrs = np.abs(np.asarray(corrs, dtype=float))

# A constant column yields a NaN coefficient, and np.argsort places NaN at the
# END of the ordering -- i.e. exactly in the "top-k" slice. Rank NaNs as -inf
# and never keep more columns than there are valid coefficients, so undefined
# correlations cannot be reported as the strongest ones.
is_valid = ~np.isnan(abs_corrs)
order = np.argsort(np.where(is_valid, abs_corrs, -np.inf))
n_keep = min(k, int(is_valid.sum()))

# Ascending by |r|: the most correlated feature is the last column.
idx = order[len(order) - n_keep:]
X_selected = X.iloc[:, idx]
X_selected

Unnamed: 0,Red_wins_loss_diff,Height_diff,Red_Blue_wins_loss_diff,Reach_diff,AvgSubAtt_diff,Red_Blue_Wins_diff,AvgTDPct_diff,AvgSigStrPct_diff,TDAvgLanded_diff,Red_Blue_curr_streak_diff,Red_Blue_Loss_diff,Age_diff
1,6,-5.08,-2,7.62,1.5000,-2,-0.260,0.060,0.7200,-2,0,3
2,7,-7.62,-1,2.54,0.3000,-3,-0.420,0.030,0.1300,-3,-2,-2
3,5,2.54,6,0.00,1.1000,6,0.160,0.140,2.9800,1,0,-6
4,2,-2.54,1,5.08,0.2000,1,0.040,-0.070,0.2500,0,0,3
5,3,0.00,0,2.54,-0.3000,-8,0.020,0.010,-0.1100,2,-8,-2
...,...,...,...,...,...,...,...,...,...,...,...,...
6508,7,2.54,4,-2.54,1.2667,8,0.184,0.083,0.4667,-2,4,-5
6512,3,0.00,3,12.70,1.4444,5,0.476,0.134,1.3333,0,2,-4
6518,4,5.08,0,2.54,-0.7000,-3,-0.572,-0.010,-1.2000,3,-3,-4
6521,0,-7.62,0,-2.54,-0.6500,3,-0.281,0.061,2.1500,-1,3,0
