In [44]:
import sklearn.feature_selection as feature_selection
import pandas as pd
import numpy as np
import re

In [45]:
# Read the raw records and leave out the `hospital_number` column.
df = pd.read_csv('cow.csv').drop(columns=['hospital_number'])
df

Unnamed: 0,surgery,age,temperature,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,pain,...,nasogastric_reflux_ph,rectal_exam_feces,abdomen,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,outcome Class,surgical_lesion,lesion
0,no,adult,38.5,66.0,28.0,cool,reduced,,more_3_sec,extreme_pain,...,,decreased,distend_large,45.0,8.4,,,died,no,11300
1,yes,adult,39.2,88.0,20.0,,,pale_cyanotic,less_3_sec,mild_pain,...,,absent,other,50.0,85.0,cloudy,2.0,euthanized,no,2208
2,no,adult,38.3,40.0,24.0,normal,normal,pale_pink,less_3_sec,mild_pain,...,,normal,normal,33.0,6.7,,,lived,no,0
3,yes,young,39.1,164.0,84.0,cold,normal,dark_cyanotic,more_3_sec,depressed,...,5.0,decreased,,48.0,7.2,serosanguious,5.3,died,yes,2208
4,no,adult,37.3,104.0,35.0,,,dark_cyanotic,more_3_sec,,...,,,,74.0,7.4,,,died,no,4300
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,yes,adult,,120.0,70.0,cold,,pale_cyanotic,more_3_sec,depressed,...,,,distend_large,55.0,65.0,,,euthanized,no,3205
295,no,adult,37.2,72.0,24.0,cool,increased,pale_cyanotic,more_3_sec,severe_pain,...,,absent,distend_small,44.0,,serosanguious,3.3,euthanized,yes,2208
296,yes,adult,37.5,72.0,30.0,cold,reduced,pale_cyanotic,less_3_sec,severe_pain,...,,decreased,distend_large,60.0,6.8,,,died,yes,3205
297,yes,adult,36.5,100.0,24.0,cool,reduced,pale_pink,less_3_sec,mild_pain,...,,absent,distend_small,50.0,6.0,serosanguious,3.4,lived,yes,2208


In [46]:
def extract_lesion_column(lesion):
    """Split a packed lesion value into its four component fields.

    The value is converted to a string and read left to right as site, type,
    subtype and specific code, using the digit ranges encoded in the pattern
    below. Anything after the fourth field is ignored.

    Parameters
    ----------
    lesion : int or str
        Packed lesion value. ``0`` is treated as "nothing to decode".

    Returns
    -------
    pandas.Series
        Categorical series indexed by ``'site'``, ``'type'``, ``'subtype'``
        and ``'code'``. Every field is the string ``'None'`` when ``lesion``
        is 0 or does not match the pattern.
    """
    site, type_l, subtype, code = 'None', 'None', 'None', 'None'
    if lesion != 0:
        # In the last group `10` must be tried before `[1-9]`: alternation
        # takes the first branch that matches and nothing after the group
        # forces a retry, so with `[1-9]` first a specific code of "10" was
        # silently cut down to "1".
        types = re.findall('^([1-9]|11|00)([1-4])([0-2])(10|[1-9]|0)', str(lesion))
        if types:
            site, type_l, subtype, code = types[0]
    return pd.Series({'site': site, 'type': type_l, 'subtype': subtype, 'code': code}, dtype='category')

# Decode every lesion value into its four parts, store them as new categorical
# columns, then remove the packed source column.
lesion_parts = df['lesion'].apply(extract_lesion_column).astype('category')
df.loc[:, ['site', 'type', 'subtype', 'code']] = lesion_parts
df.drop(columns=['lesion'], inplace=True)

In [47]:
# Convert every column with fewer than 10 distinct values (NaN counts as one)
# to categorical dtype, turning missing entries into their own '[NAN]' level.
# Columns that are already categorical are left alone.
categorical_columns = []
for col in df:
    if str(df[col].dtype) != 'category' and df[col].unique().size < 10:
        categorical_columns.append(col)
        # Assign the filled column back instead of calling
        # fillna(inplace=True) on the `df[col]` temporary: that chained
        # in-place call is deprecated in pandas 2.x and leaves `df` unchanged
        # once Copy-on-Write is active.
        df[col] = df[col].fillna('[NAN]').astype('category')

In [48]:
# Show the names of the columns that were converted to categorical dtype.
categorical_columns

['surgery',
 'age',
 'temp_of_extremities',
 'peripheral_pulse',
 'mucous_membrane',
 'capillary_refill_time',
 'pain',
 'peristalsis',
 'abdominal_distention',
 'nasogastric_tube',
 'nasogastric_reflux',
 'rectal_exam_feces',
 'abdomen',
 'abdomo_appearance',
 'outcome Class',
 'surgical_lesion']

In [49]:
# Encode the frame for feature selection:
#   * 'outcome Class' becomes the target vector Y (integer category codes);
#   * categorical columns contribute their integer category codes;
#   * all other columns contribute their values with NaN replaced by the mean.
# X ends up with one row per feature, so later cells pass X.T to scikit-learn.
Y = None
X_labels = []
X = []
for col in df:
    series = df[col]
    if col == 'outcome Class':
        Y = series.cat.codes.values
        continue
    if str(series.dtype) == 'category':
        encoded = series.cat.codes.values
    else:
        encoded = series.fillna(series.mean()).values
    X.append(encoded)
    X_labels.append(col)
X = np.array(X)

In [50]:
# Keep the top 50% of features by chi-squared score against the outcome.
# NOTE(review): chi2 is documented for non-negative, count/frequency-like
# features; X here also holds raw measurements and integer category codes, so
# the scores depend on each column's scale - confirm this is intended.
selector = feature_selection.SelectPercentile(feature_selection.chi2,percentile=50).fit(X.T, Y)
# Ask the selector which columns it kept instead of reversing an argsort over
# all scores: NaN scores sort last, so reversing would list them first and the
# result could disagree with what the selector actually selected.
selected = selector.get_support(indices=True)
top_k = len(selected)
# Selected features, best score first.
[X_labels[i] for i in selected[selector.scores_[selected].argsort()[::-1]]]

['total_protein',
 'pulse',
 'subtype',
 'packed_cell_volume',
 'site',
 'code',
 'abdomen',
 'abdomo_appearance',
 'respiratory_rate',
 'rectal_exam_feces',
 'type',
 'temp_of_extremities',
 'surgical_lesion']

In [51]:
# Keep the top 50% of features by ANOVA F-score against the outcome.
selector = feature_selection.SelectPercentile(feature_selection.f_classif,percentile=50).fit(X.T, Y)
# Ask the selector which columns it kept instead of reversing an argsort over
# all scores: NaN scores sort last, so reversing would list them first and the
# result could disagree with what the selector actually selected.
selected = selector.get_support(indices=True)
top_k = len(selected)
# Selected features, best score first.
[X_labels[i] for i in selected[selector.scores_[selected].argsort()[::-1]]]

['subtype',
 'packed_cell_volume',
 'total_protein',
 'site',
 'pulse',
 'surgical_lesion',
 'capillary_refill_time',
 'temp_of_extremities',
 'peristalsis',
 'abdomo_protein',
 'code',
 'surgery',
 'abdomo_appearance']

In [52]:
# Count how often each level of `temp_of_extremities` occurs, including the
# '[NAN]' level that stands in for missing values.
df.temp_of_extremities.value_counts()

cool      108
normal     78
[NAN]      56
warm       30
cold       27
Name: temp_of_extremities, dtype: int64