Attribute Information:

1. Age of patient at time of operation (numerical)
2. Patient's year of operation (year - 1900, numerical)
3. Number of positive axillary nodes detected (numerical)
4. Survival status (class attribute)
-- 1 = the patient survived 5 years or longer
-- 2 = the patient died within 5 year

In [230]:
from sklearn.model_selection import train_test_split

In [231]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

In [232]:
from sklearn.metrics import recall_score, precision_score, roc_auc_score, accuracy_score, f1_score

In [233]:
def evaluate_results(y_test, y_predict):
    print('Classification results:')
    f1 = f1_score(y_test, y_predict)
    print("f1: %.2f%%" % (f1 * 100.0)) 
    roc = roc_auc_score(y_test, y_predict)
    print("roc: %.2f%%" % (roc * 100.0)) 
    rec = recall_score(y_test, y_predict, average='binary')
    print("recall: %.2f%%" % (rec * 100.0)) 
    prc = precision_score(y_test, y_predict, average='binary')
    print("precision: %.2f%%" % (prc * 100.0)) 
    
    return f1, roc, rec, prc

In [234]:
results = {'approach': [],
           'f1': [],
          'roc_auc': [],
          'recall': [],
          'precision': []}

In [235]:
result_df = pd.DataFrame.from_dict(results)

In [236]:
import pandas as pd
import numpy as np
data = pd.read_csv("haberman.data", header=None)
data.head(3)

Unnamed: 0,0,1,2,3
0,30,64,1,1
1,30,62,3,1
2,30,65,0,1


In [237]:
data.describe()

Unnamed: 0,0,1,2,3
count,306.0,306.0,306.0,306.0
mean,52.457516,62.852941,4.026144,1.264706
std,10.803452,3.249405,7.189654,0.441899
min,30.0,58.0,0.0,1.0
25%,44.0,60.0,0.0,1.0
50%,52.0,63.0,1.0,1.0
75%,60.75,65.75,4.0,2.0
max,83.0,69.0,52.0,2.0


In [238]:
data.iloc[:, -1].value_counts()

1    225
2     81
Name: 3, dtype: int64

In [239]:
data.shape

(306, 4)

In [240]:
target_replace = {1: 0, 2: 1}

In [241]:
data[3].replace(target_replace).value_counts()

0    225
1     81
Name: 3, dtype: int64

In [242]:
data[3] = data[3].replace(target_replace)

In [243]:
data.describe()

Unnamed: 0,0,1,2,3
count,306.0,306.0,306.0,306.0
mean,52.457516,62.852941,4.026144,0.264706
std,10.803452,3.249405,7.189654,0.441899
min,30.0,58.0,0.0,0.0
25%,44.0,60.0,0.0,0.0
50%,52.0,63.0,1.0,0.0
75%,60.75,65.75,4.0,1.0
max,83.0,69.0,52.0,1.0


In [244]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306 entries, 0 to 305
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   0       306 non-null    int64
 1   1       306 non-null    int64
 2   2       306 non-null    int64
 3   3       306 non-null    int64
dtypes: int64(4)
memory usage: 9.7 KB


In [245]:
X_data = data.iloc[:,:-1]
y_data = data.iloc[:,-1]

X_train, X_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.3, random_state=42)

#### обучить любой классификатор (какой вам нравится)

In [246]:
rfc = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_predict = rfc.predict(X_test)
f1, roc, rec, prc = evaluate_results(y_test, y_predict)

Classification results:
f1: 35.90%
roc: 58.92%
recall: 26.92%
precision: 53.85%


In [247]:
result_df = result_df.append({'approach' : 'RandomForest',
                          'f1' : f1,
                          'roc_auc' : roc,
                        'recall' : rec,
                          'precision' : prc}, ignore_index=True)

### PU learning

In [248]:
mod_data = data.copy()
#get the indices of the positives samples
pos_ind = np.where(mod_data.iloc[:,-1].values == 1)[0]
#shuffle them
np.random.shuffle(pos_ind)
# leave just 25% of the positives marked
pos_sample_len = int(np.ceil(0.25 * len(pos_ind)))
print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
pos_sample = pos_ind[:pos_sample_len]

Using 21/81 as positives and unlabeling the rest


Создаем столбец для новой целевой переменной, где у нас два класса - P (1) и U (-1)

In [249]:
mod_data['class_test'] = -1
mod_data.loc[pos_sample,'class_test'] = 1
print('target variable:\n', mod_data.iloc[:,-1].value_counts())

target variable:
 -1    285
 1     21
Name: class_test, dtype: int64


In [250]:
mod_data.head(3)

Unnamed: 0,0,1,2,3,class_test
0,30,64,1,0,-1
1,30,62,3,0,-1
2,30,65,0,0,-1


In [251]:
x_data = mod_data.iloc[:,:-2].values # just the X 
y_labeled = mod_data.iloc[:,-1].values # new class (just the P & U)
y_positive = mod_data.iloc[:,-2].values # original class

#### применить random negative sampling для построения классификатора в новых условиях

In [252]:
mod_data = mod_data.sample(frac=1)
neg_sample = mod_data[mod_data['class_test']==-1][:len(mod_data[mod_data['class_test']==1])]
sample_test = mod_data[mod_data['class_test']==-1][len(mod_data[mod_data['class_test']==1]):]
pos_sample = mod_data[mod_data['class_test']==1]
print(neg_sample.shape, pos_sample.shape)
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)

(21, 5) (21, 5)


In [253]:
model = RandomForestClassifier(random_state=42)

model.fit(sample_train.iloc[:,:-2].values, 
          sample_train.iloc[:,-2].values)
y_predict = model.predict(sample_test.iloc[:,:-2].values)
f1, roc, rec, prc = evaluate_results(sample_test.iloc[:,-2].values, y_predict)

Classification results:
f1: 43.14%
roc: 63.72%
recall: 77.19%
precision: 29.93%


In [254]:
result_df = result_df.append({'approach' : 'PU_0.2_RandomForest',
                          'f1' : f1,
                          'roc_auc' : roc,
                        'recall' : rec,
                          'precision' : prc}, ignore_index=True)

#### сравнить качество с решением из пункта 4 (построить отчет - таблицу метрик)

In [255]:
result_df

Unnamed: 0,approach,f1,roc_auc,recall,precision
0,RandomForest,0.358974,0.589161,0.269231,0.538462
1,PU_0.2_RandomForest,0.431373,0.637173,0.77193,0.29932


### поэкспериментировать с долей P на шаге 5 (как будет меняться качество модели при уменьшении/увеличении размера P)

In [256]:
mod_data = data.copy()
#get the indices of the positives samples
pos_ind = np.where(mod_data.iloc[:,-1].values == 1)[0]
#shuffle them
np.random.shuffle(pos_ind)
# leave just 25% of the positives marked
pos_sample_len = int(np.ceil(0.5 * len(pos_ind)))
print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
pos_sample = pos_ind[:pos_sample_len]

Using 41/81 as positives and unlabeling the rest


In [257]:
mod_data['class_test'] = -1
mod_data.loc[pos_sample,'class_test'] = 1
print('target variable:\n', mod_data.iloc[:,-1].value_counts())

target variable:
 -1    265
 1     41
Name: class_test, dtype: int64


In [258]:
x_data = mod_data.iloc[:,:-2].values # just the X 
y_labeled = mod_data.iloc[:,-1].values # new class (just the P & U)
y_positive = mod_data.iloc[:,-2].values # original class

In [259]:
mod_data = mod_data.sample(frac=1)
neg_sample = mod_data[mod_data['class_test']==-1][:len(mod_data[mod_data['class_test']==1])]
sample_test = mod_data[mod_data['class_test']==-1][len(mod_data[mod_data['class_test']==1]):]
pos_sample = mod_data[mod_data['class_test']==1]
print(neg_sample.shape, pos_sample.shape)
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)

(41, 5) (41, 5)


In [260]:
model = RandomForestClassifier(random_state=42)

model.fit(sample_train.iloc[:,:-2].values, 
          sample_train.iloc[:,-2].values)
y_predict = model.predict(sample_test.iloc[:,:-2].values)
f1, roc, rec, prc = evaluate_results(sample_test.iloc[:,-2].values, y_predict)

Classification results:
f1: 33.75%
roc: 63.65%
recall: 79.41%
precision: 21.43%


In [261]:
result_df = result_df.append({'approach' : 'PU_0.5_RandomForest',
                          'f1' : f1,
                          'roc_auc' : roc,
                        'recall' : rec,
                          'precision' : prc}, ignore_index=True)

In [262]:
mod_data = data.copy()
#get the indices of the positives samples
pos_ind = np.where(mod_data.iloc[:,-1].values == 1)[0]
#shuffle them
np.random.shuffle(pos_ind)
# leave just 25% of the positives marked
pos_sample_len = int(np.ceil(0.1 * len(pos_ind)))
pos_sample = pos_ind[:pos_sample_len]
mod_data['class_test'] = -1
mod_data.loc[pos_sample,'class_test'] = 1
x_data = mod_data.iloc[:,:-2].values # just the X 
y_labeled = mod_data.iloc[:,-1].values # new class (just the P & U)
y_positive = mod_data.iloc[:,-2].values # original class
mod_data = mod_data.sample(frac=1)
neg_sample = mod_data[mod_data['class_test']==-1][:len(mod_data[mod_data['class_test']==1])]
sample_test = mod_data[mod_data['class_test']==-1][len(mod_data[mod_data['class_test']==1]):]
pos_sample = mod_data[mod_data['class_test']==1]
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)
model = RandomForestClassifier(random_state=42)
model.fit(sample_train.iloc[:,:-2].values, 
          sample_train.iloc[:,-2].values)
y_predict = model.predict(sample_test.iloc[:,:-2].values)
f1, roc, rec, prc = evaluate_results(sample_test.iloc[:,-2].values, y_predict)
result_df = result_df.append({'approach' : 'PU_0.1_RandomForest',
                          'f1' : f1,
                          'roc_auc' : roc,
                        'recall' : rec,
                          'precision' : prc}, ignore_index=True)

Classification results:
f1: 36.36%
roc: 51.44%
recall: 65.71%
precision: 25.14%


In [263]:
result_df

Unnamed: 0,approach,f1,roc_auc,recall,precision
0,RandomForest,0.358974,0.589161,0.269231,0.538462
1,PU_0.2_RandomForest,0.431373,0.637173,0.77193,0.29932
2,PU_0.5_RandomForest,0.3375,0.636533,0.794118,0.214286
3,PU_0.1_RandomForest,0.363636,0.514351,0.657143,0.251366
