# Кодирование категориальных признаков и их полезность



In [46]:
# Mount Google Drive under /content/drive so that the CSV files stored there
# can be opened with ordinary file paths in the cells below.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [47]:
import pandas as pd
import seaborn as sns
from sklearn import tree
from sklearn import preprocessing

In [None]:
# Read students.csv and keep only the rows that have no missing values.
students_csv = '/content/drive/MyDrive/Colab Notebooks/students.csv'
df = pd.read_csv(students_csv, sep=',').dropna()
df.info()

In [49]:
# A single LabelEncoder instance; the loop in the next cell re-fits it on
# every categorical column in turn.
coder = preprocessing.LabelEncoder()

In [50]:
# Replace the labels of every categorical column with the integer codes
# produced by LabelEncoder. The same encoder object is re-fitted for each
# column, so after the loop it only remembers the classes of the last one.
categorical_columns = [
    'Sex', 'Coin', 'Animal', 'Army', 'Glasses', 'Your rating in university',
    'Fastfood', 'Hostel', 'Chocolate', 'Brother-sister', 'Plane seat',
    'Problems in last semester', 'Rock paper scissors', 'Strange people',
    'Your insitute',
]
for column in categorical_columns:
    df[column] = coder.fit_transform(df[column])

In [None]:
# Show the column summary again, now that the categorical columns were re-encoded.
df.info()

## Узнаем важность каждого признака в задаче предсказания пола человека

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
selector=ExtraTreesClassifier()
result=selector.fit(df[df.columns], df['Sex'])
result.feature_importances_

In [None]:
features_table= pd.DataFrame(result.feature_importances_, index =df.columns,
                                              columns =['importance'])
print(features_table)

In [None]:
# Most important features first.
features_table.sort_values('importance', ascending=False)

Далее нужно оставить лишь несколько самых значимых признаков и обучать модель только на них.

Допустим, мы хотим оставить 7 признаков:

Army	0.092844

Shoe size	0.090263

Hair length	0.063735

Growth	0.063282

Coin	0.034178

Computer science rating	0.022314

Weight 0.012210

Давайте заново загрузим данные, так как мы меняли значения в категориальных столбцах

In [55]:
# Reload the raw CSV: the encoding loop above overwrote the categorical
# columns in df with integer codes, and the original labels are needed again.
df=pd.read_csv('/content/drive/MyDrive/Colab Notebooks/students.csv', delimiter=',')

In [None]:
# Keep the seven selected features plus the target, then drop incomplete rows.
selected_columns = ['Army', 'Shoe size', 'Hair length', 'Growth', 'Coin',
                    'Computer science rating', 'Weight', 'Sex']
df_cut = df[selected_columns].dropna()
df_cut.info()

In [None]:
# One-hot encode the categorical columns. With drop_first=True the first
# level of each column is dropped, so k levels produce k-1 indicator columns.
df_cut = pd.get_dummies(data=df_cut, drop_first=True)
df_cut.head(5)

Гипотеза: угадать пол человека можно за небольшое количество вопросов. И модель эти вопросы найдет автоматически.

In [None]:
# Train a shallow decision tree (at most three levels of questions) that
# predicts the 'Sex_мужской' indicator from the selected features.
tree_features = ['Army_не призовут (по разным причинам)', 'Shoe size', 'Hair length', 'Growth',
                 'Coin_Решка', 'Computer science rating', 'Weight']
# DecisionTreeClassifier permutes the candidate features at every split, so
# without a fixed seed equally good splits may be resolved differently from
# run to run; random_state makes the fitted tree reproducible.
model = tree.DecisionTreeClassifier(max_depth=3, random_state=42)
# Plain arrays are passed on purpose: the prediction cell also feeds .values,
# and fitting on a DataFrame would make predict() warn about missing feature
# names. Selecting seven columns already yields an (n, 7) array, so the
# former reshape(-1, 7) was a no-op and is omitted.
model.fit(df_cut[tree_features].values, y=df_cut['Sex_мужской'].values)

In [None]:
import graphviz

# Draw the fitted tree. The feature names are listed in the same order as the
# columns that were passed to model.fit.
tree_feature_names = [
    'Army_не призовут (по разным причинам)',
    'Shoe size',
    'Hair length',
    'Growth',
    'Coin_Решка',
    'Computer science rating',
    'Weight',
]
dot_data = tree.export_graphviz(
    model,
    out_file=None,
    feature_names=tree_feature_names,
    class_names=['f', 'm'],
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = graphviz.Source(dot_data)
graph

Загружаем тестовую выборку

In [60]:
# Read the separate test file; the fitted model is evaluated on these rows.
df_test=pd.read_csv('/content/drive/MyDrive/Colab Notebooks/students_test.csv', delimiter=',')

In [61]:
# Same column subset as for the training data, restricted to complete rows.
test_columns = ['Army', 'Shoe size', 'Hair length', 'Growth', 'Coin',
                'Computer science rating', 'Weight', 'Sex']
df_cut_test = df_test[test_columns].dropna()

In [None]:
# One-hot encode the test rows the same way as the training rows.
# NOTE(review): the encoding is derived from the test data alone, so the
# resulting column names match the training ones only if both files contain
# the same category values - confirm.
df_cut_test = pd.get_dummies(data=df_cut_test, drop_first=True)
df_cut_test.head(5)

Внимание: через функцию predict прогоняем объекты **тестовой** выборки

In [63]:
# Predict on the test rows, passing the features in the order used for training.
test_feature_columns = ['Army_не призовут (по разным причинам)', 'Shoe size', 'Hair length', 'Growth',
                        'Coin_Решка', 'Computer science rating', 'Weight']
test_inputs = df_cut_test[test_feature_columns].values.reshape(-1, len(test_feature_columns))
df_cut_test['Predicted'] = model.predict(test_inputs)

Метрики качества задачи классификации

In [64]:
# Confusion table: rows are the predicted class, columns are the actual one.
pd.crosstab(index=df_cut_test['Predicted'], columns=df_cut_test['Sex_мужской'])

Sex_мужской,False,True
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
False,46,1
True,1,29


In [None]:
# Start over from the raw file for the second task and keep only complete rows.
students_csv = '/content/drive/MyDrive/Colab Notebooks/students.csv'
df = pd.read_csv(students_csv, sep=',').dropna()
df.info()

In [70]:
# Fresh LabelEncoder; the loop in the next cell re-fits it on each categorical column.
coder = preprocessing.LabelEncoder()

In [71]:
# Replace the labels of every categorical column with the integer codes
# produced by LabelEncoder. The encoder is re-fitted per column, so after the
# loop it only holds the classes of the last column in the list.
categorical_columns = [
    'Sex', 'Coin', 'Animal', 'Army', 'Glasses', 'Your rating in university',
    'Fastfood', 'Hostel', 'Chocolate', 'Brother-sister', 'Plane seat',
    'Problems in last semester', 'Rock paper scissors', 'Strange people',
    'Your insitute',
]
for column in categorical_columns:
    df[column] = coder.fit_transform(df[column])

In [72]:
# Show the column summary after the categorical columns were re-encoded.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 118 entries, 0 to 185
Data columns (total 48 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Age                          118 non-null    int64  
 1   Growth                       118 non-null    int64  
 2   Shoe size                    118 non-null    int64  
 3   Course number                118 non-null    int64  
 4   Year of birth                118 non-null    int64  
 5   Friend number                118 non-null    int64  
 6   Russian rating               118 non-null    int64  
 7   Maths rating                 118 non-null    int64  
 8   Physics rating               118 non-null    int64  
 9   Computer science rating      118 non-null    int64  
 10  Chemistry rating             118 non-null    int64  
 11  Literature rating            118 non-null    int64  
 12  History rating               118 non-null    int64  
 13  Geography rating         

In [73]:
from sklearn.ensemble import ExtraTreesClassifier

# Rank the features by how useful they are for predicting 'Your insitute'.
# ExtraTrees is a randomised ensemble; a fixed seed keeps the importances
# reproducible between runs.
selector = ExtraTreesClassifier(random_state=42)
# NOTE(review): df[df.columns] still contains the target column
# 'Your insitute' itself, so the model can read the answer from its inputs
# and the ranking is distorted. The inputs are left unchanged here because the
# importance table built afterwards is indexed by df.columns; both places
# have to be changed together.
result = selector.fit(df[df.columns], df['Your insitute'])
result.feature_importances_

array([0.01422454, 0.01857968, 0.01418577, 0.0094621 , 0.01032204,
       0.01827852, 0.014046  , 0.05228611, 0.01013361, 0.08916741,
       0.00862925, 0.02927065, 0.05001606, 0.0138034 , 0.01422417,
       0.02152517, 0.07253462, 0.01151127, 0.01628185, 0.01578865,
       0.01157825, 0.01505602, 0.00921913, 0.01468082, 0.00654427,
       0.01244722, 0.01141509, 0.00704038, 0.013302  , 0.00766445,
       0.00911548, 0.01448382, 0.01624931, 0.0146191 , 0.00666042,
       0.00529587, 0.01561786, 0.0112851 , 0.01054304, 0.01930296,
       0.0128938 , 0.01382474, 0.18068188, 0.00904174, 0.01135008,
       0.01176672, 0.01337885, 0.01067076])

In [74]:
# Attach the column names to the raw importance values: one row per column of df.
importance_values = result.feature_importances_
features_table = pd.DataFrame({'importance': importance_values}, index=df.columns)
print(features_table)

                             importance
Age                            0.014225
Growth                         0.018580
Shoe size                      0.014186
Course number                  0.009462
Year of birth                  0.010322
Friend number                  0.018279
Russian rating                 0.014046
Maths rating                   0.052286
Physics rating                 0.010134
Computer science rating        0.089167
Chemistry rating               0.008629
Literature rating              0.029271
History rating                 0.050016
Geography rating               0.013803
Biology rating                 0.014224
Foreign language rating        0.021525
Social science rating          0.072535
Distance to home km            0.011511
Minutes to first class         0.016282
Children number                0.015789
Removed teeth                  0.011578
Weight                         0.015056
Glasses                        0.009219
Sex                            0.014681


In [None]:
# Most important features first.
features_table.sort_values('importance', ascending=False)