In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

## Reading DF

In [40]:
# The data file has no header row, so the column labels are supplied here:
# the target `y` first, followed by the nine attributes.
column_names = ['y', 'age', 'menopause', 'tumor-size', 'inv-nodes', 'node-caps',
                'deg-malig', 'side', 'quad', 'irradiat']
df = pd.read_csv('breast-cancer.data', header=None, names=column_names)
df

Unnamed: 0,y,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,side,quad,irradiat
0,no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,no
1,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,no
2,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low,no
3,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up,no
4,no-recurrence-events,40-49,premeno,0-4,0-2,no,2,right,right_low,no
5,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,left,left_low,no
6,no-recurrence-events,50-59,premeno,25-29,0-2,no,2,left,left_low,no
7,no-recurrence-events,60-69,ge40,20-24,0-2,no,1,left,left_low,no
8,no-recurrence-events,40-49,premeno,50-54,0-2,no,2,left,left_low,no
9,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,left_up,no


## Fixing DF

### Remove NaN

In [21]:
# Row count of the frame as loaded, before any rows are dropped.
df.shape[0]

286

In [41]:
# The raw file marks missing values with '?'. Turn them into NaN with the
# vectorized `replace` (rather than a per-cell lambda through `applymap`,
# which is deprecated in recent pandas) and drop every row that contains
# at least one missing value.
df = df.replace('?', np.nan).dropna()
len(df)

277

9 rows containing missing values (`?`) were dropped (286 → 277), which matches the data description.

### Encode categorical values

In [42]:
# Renumber the rows from 0 (rows were dropped earlier) and expand the
# string-valued columns into indicator columns named '<column>_<value>'.
encoded = pd.get_dummies(df.reset_index(drop=True), prefix_sep='_')

# For the target, node-caps and irradiat only the positive indicator is kept;
# the matching negative indicator columns are removed.
redundant = ['y_no-recurrence-events', 'node-caps_no', 'irradiat_no']
df = encoded.loc[:, ~encoded.columns.isin(redundant)]
df

Unnamed: 0,deg-malig,y_recurrence-events,age_20-29,age_30-39,age_40-49,age_50-59,age_60-69,age_70-79,menopause_ge40,menopause_lt40,...,inv-nodes_9-11,node-caps_yes,side_left,side_right,quad_central,quad_left_low,quad_left_up,quad_right_low,quad_right_up,irradiat_yes
0,3,0,0,1,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
1,2,0,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
2,2,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
3,2,0,0,0,0,0,1,0,1,0,...,0,0,0,1,0,0,1,0,0,0
4,2,0,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
5,2,0,0,0,0,0,1,0,1,0,...,0,0,1,0,0,1,0,0,0,0
6,2,0,0,0,0,1,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
7,1,0,0,0,0,0,1,0,1,0,...,0,0,1,0,0,1,0,0,0,0
8,2,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
9,2,0,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0


## Train-Test Split

In [68]:
# Split the encoded frame into the feature matrix and a single-column target frame.
target_col = 'y_recurrence-events'
X = df.drop(columns=target_col)
y = df[[target_col]]

In [95]:
# Hold out 30% of the rows for testing. Stratifying on the target keeps the
# class proportions close to each other in both partitions, and the fixed
# seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=df['y_recurrence-events'].to_numpy(),
)

In [96]:
# Display the training features; the index holds the row labels of `X` that were selected.
X_train

Unnamed: 0,deg-malig,age_20-29,age_30-39,age_40-49,age_50-59,age_60-69,age_70-79,menopause_ge40,menopause_lt40,menopause_premeno,...,inv-nodes_9-11,node-caps_yes,side_left,side_right,quad_central,quad_left_low,quad_left_up,quad_right_low,quad_right_up,irradiat_yes
193,2,0,0,0,1,0,0,1,0,0,...,0,1,1,0,1,0,0,0,0,1
120,1,0,0,0,0,1,0,1,0,0,...,0,0,1,0,0,1,0,0,0,0
222,3,0,0,0,0,1,0,1,0,0,...,0,1,0,1,0,1,0,0,0,0
170,2,0,0,0,1,0,0,1,0,0,...,0,1,0,1,0,0,1,0,0,0
100,3,0,0,1,0,0,0,1,0,0,...,0,0,1,0,0,1,0,0,0,0
255,3,0,0,0,0,1,0,1,0,0,...,0,0,1,0,0,0,1,0,0,0
25,2,0,0,1,0,0,0,0,0,1,...,0,0,1,0,0,0,1,0,0,0
242,2,0,1,0,0,0,0,0,0,1,...,0,1,1,0,0,1,0,0,0,0
64,1,0,0,1,0,0,0,0,0,1,...,0,0,1,0,0,0,0,1,0,0
156,1,0,0,0,1,0,0,0,0,1,...,0,0,1,0,1,0,0,0,0,0


In [97]:
# Display the training target (1 = recurrence event, 0 = no recurrence event).
y_train

Unnamed: 0,y_recurrence-events
193,0
120,0
222,1
170,0
100,0
255,1
25,0
242,1
64,0
156,0


In [98]:
# Sanity check on the stratification: print the share of class 0 in the
# training split, the test split and the full dataset.
for label, part in (('0s no treino:', y_train),
                    ('0s no teste:', y_test),
                    ('0s no dataset', y)):
    n_zeros = int((part['y_recurrence-events'] == 0).sum())
    print(label, n_zeros / len(part))

0s no treino: 0.7098445595854922
0s no teste: 0.7023809523809523
0s no dataset 0.7075812274368231
