In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from tabulate import tabulate
from collections import Counter

import warnings
# NOTE(review): with no category given, this filter ignores every warning
# raised afterwards (deprecation warnings included) - consider narrowing it.
warnings.filterwarnings(action="ignore")

In [37]:
# Load the "old" survey snapshot; the first CSV column becomes the index.
# A forward slash is used because the backslash form relied on the invalid
# escape sequence "\d" and only resolved as a path separator on Windows.
data_old = pd.read_csv('Data/data_decode_old.csv', index_col=0)
data_old.head()

Unnamed: 0_level_0,gender,age,bmi,ao,activity,smoking,alcohol,male_heredity,stage
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,1,1,0,1,0,1,1,0,1
2,1,0,1,1,0,0,0,1,1
3,0,1,0,1,0,1,0,0,1
4,0,1,0,0,0,0,0,1,0
5,1,1,1,1,1,1,1,1,1


In [38]:
# Load the "new" survey snapshot; the first CSV column becomes the index.
# A forward slash is used because the backslash form relied on the invalid
# escape sequence "\d" and only resolved as a path separator on Windows.
data_new = pd.read_csv('Data/data_decode_new.csv', index_col=0)
data_new.head()

Unnamed: 0_level_0,gender,age,bmi,ao,activity,smoking,alcohol,male_heredity,stage
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,1,1,0,0,1,1,1,0,1
2,1,0,0,1,0,0,0,1,1
3,0,1,1,1,1,0,0,0,1
4,0,1,0,1,1,0,0,1,0
5,1,1,0,1,1,1,1,1,1


In [41]:
# Put old and new values side by side: every data_new column gets a "_new"
# suffix, and the right join keeps exactly the index values present in data_new.
data_combine = data_old.join(
    data_new.rename(columns=lambda name: name + '_new'),
    how='right',
)

In [42]:
# Preview the first rows of the joined old/new table.
data_combine.head()

Unnamed: 0_level_0,gender,age,bmi,ao,activity,smoking,alcohol,male_heredity,stage,gender_new,age_new,bmi_new,ao_new,activity_new,smoking_new,alcohol_new,male_heredity_new,stage_new
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,1,1,0,1,0,1,1,0,1,1,1,0,0,1,1,1,0,1
2,1,0,1,1,0,0,0,1,1,1,0,0,1,0,0,0,1,1
3,0,1,0,1,0,1,0,0,1,0,1,1,1,1,0,0,0,1
4,0,1,0,0,0,0,0,1,0,0,1,0,1,1,0,0,1,0
5,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1


# Check rates

In [13]:
def get_rate_old_new(data, col_name):
    """Return the old and new rates for one column.

    A rate is the column sum divided by the number of rows in ``data``, i.e.
    the share of ones when the column is 0/1-coded.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both ``col_name`` and ``col_name + '_new'``.
    col_name : str
        Name of the "old" column; the "new" one is looked up with the
        ``'_new'`` suffix.

    Returns
    -------
    tuple
        ``(rate_old, rate_new)``. Both are NaN when ``data`` has no rows,
        instead of raising ``ZeroDivisionError``.

    Raises
    ------
    KeyError
        If either column is missing from ``data``.
    """
    # Look the columns up first so a missing column is reported even when
    # the frame is empty.
    old_values = data[col_name]
    new_values = data[col_name + '_new']

    n_rows = len(data)
    if n_rows == 0:
        return float('nan'), float('nan')

    # skipna=False keeps a missing value visible as a NaN rate rather than
    # silently dropping it from the numerator.
    rate_old = old_values.sum(skipna=False) / n_rows
    rate_new = new_values.sum(skipna=False) / n_rows
    return rate_old, rate_new

In [14]:
def get_info_rate(data, col_names):
    """Print a grid with the old and new rate of every column in ``col_names``.

    Each row holds the column name followed by the two values returned by
    ``get_rate_old_new`` for that column. Nothing is returned.
    """
    rows = [[name, *get_rate_old_new(data, name)] for name in col_names]
    print(tabulate(rows, headers=['', 'Old', 'New'], tablefmt='fancy_grid'))

In [16]:
# columns[:-1] leaves out the last column of data_old (stage in the preview
# above), so rates are reported for every other column.
get_info_rate(data_combine, data_old.columns[:-1])

╒═══════════════╤══════════╤══════════╕
│               │      Old │      New │
╞═══════════════╪══════════╪══════════╡
│ gender        │ 0.494737 │ 0.491228 │
├───────────────┼──────────┼──────────┤
│ age           │ 0.610526 │ 0.77193  │
├───────────────┼──────────┼──────────┤
│ bmi           │ 0.284211 │ 0.326316 │
├───────────────┼──────────┼──────────┤
│ ao            │ 0.470175 │ 0.540351 │
├───────────────┼──────────┼──────────┤
│ activity      │ 0.585965 │ 0.729825 │
├───────────────┼──────────┼──────────┤
│ smoking       │ 0.368421 │ 0.361404 │
├───────────────┼──────────┼──────────┤
│ alcohol       │ 0.266667 │ 0.277193 │
├───────────────┼──────────┼──────────┤
│ male_heredity │ 0.364912 │ 0.350877 │
╘═══════════════╧══════════╧══════════╛


# Check transitions
##### (yes->yes, no->no, yes->no, no->yes)

Number of transitions in disease status (`stage`), before vs. after:

In [18]:
# Contingency table of stage before (rows) against stage after (columns).
pd.crosstab(
    index=data_combine['stage'],
    columns=data_combine['stage_new'],
    rownames=['Before'],
    colnames=['After'],
)

After,0,1
Before,Unnamed: 1_level_1,Unnamed: 2_level_1
0,95,31
1,0,159


In [19]:
def get_transmission(data, col_name):
    """Count old -> new transitions for one column.

    The old values come from ``col_name`` and the new ones from
    ``col_name + '_new'``; they are combined with bitwise operators, so the
    counts are meaningful for 0/1-coded columns.

    Returns a dict keyed ``'1-1'``, ``'1-0'``, ``'0-0'`` and ``'0-1'``
    (old value first, new value second).
    """
    before = data[col_name]
    after = data[col_name + '_new']
    # 1 where the value differs between the two snapshots.
    changed = before ^ after

    counts = {
        '1-1': sum(before & after),
        '1-0': sum(changed & before),
        # Rows that are 1 in neither snapshot.
        '0-0': len(data) - sum(before | after),
        '0-1': sum(changed & after),
    }
    return counts

In [20]:
def get_info_transition(data, col_names):
    """Print the row count of ``data`` and a grid of transition counts.

    One grid row per column in ``col_names``, with the counts returned by
    ``get_transmission`` laid out as 1-1, 1-0, 0-1, 0-0. Nothing is returned.
    """
    column_order = ['1-1', '1-0', '0-1', '0-0']
    rows = []
    for name in col_names:
        counts = get_transmission(data, name)
        rows.append([name] + [counts[key] for key in column_order])
    print(f'Total number: {len(data)}')
    print(tabulate(rows, headers=[''] + column_order, tablefmt='fancy_grid'))

### 1. Stay healthy

In [21]:
# Rows with stage 0 in both snapshots. columns[:-2] leaves out the last two
# columns of data_old (male_heredity and stage in the preview above).
get_info_transition(data_combine.query('stage==0 and stage_new==0'), data_old.columns[:-2])

Total number: 95
╒══════════╤═══════╤═══════╤═══════╤═══════╕
│          │   1-1 │   1-0 │   0-1 │   0-0 │
╞══════════╪═══════╪═══════╪═══════╪═══════╡
│ gender   │    37 │     2 │     1 │    55 │
├──────────┼───────┼───────┼───────┼───────┤
│ age      │    42 │     0 │    19 │    34 │
├──────────┼───────┼───────┼───────┼───────┤
│ bmi      │     7 │     1 │     6 │    81 │
├──────────┼───────┼───────┼───────┼───────┤
│ ao       │    15 │     4 │    14 │    62 │
├──────────┼───────┼───────┼───────┼───────┤
│ activity │    45 │    14 │    28 │     8 │
├──────────┼───────┼───────┼───────┼───────┤
│ smoking  │    28 │     6 │     3 │    58 │
├──────────┼───────┼───────┼───────┼───────┤
│ alcohol  │    11 │     7 │    15 │    62 │
╘══════════╧═══════╧═══════╧═══════╧═══════╛


### 2. Stay sick

In [23]:
# Rows with stage 1 in both snapshots. columns[:-2] leaves out the last two
# columns of data_old (male_heredity and stage in the preview above).
get_info_transition(data_combine.query('stage==1 and stage_new==1'), data_old.columns[:-2])

Total number: 159
╒══════════╤═══════╤═══════╤═══════╤═══════╕
│          │   1-1 │   1-0 │   0-1 │   0-0 │
╞══════════╪═══════╪═══════╪═══════╪═══════╡
│ gender   │    83 │     0 │     0 │    76 │
├──────────┼───────┼───────┼───────┼───────┤
│ age      │   111 │     0 │    25 │    23 │
├──────────┼───────┼───────┼───────┼───────┤
│ bmi      │    57 │     7 │    12 │    83 │
├──────────┼───────┼───────┼───────┼───────┤
│ ao       │    85 │    13 │    21 │    40 │
├──────────┼───────┼───────┼───────┼───────┤
│ activity │    69 │    18 │    45 │    27 │
├──────────┼───────┼───────┼───────┼───────┤
│ smoking  │    56 │     5 │     5 │    93 │
├──────────┼───────┼───────┼───────┼───────┤
│ alcohol  │    33 │    15 │    13 │    98 │
╘══════════╧═══════╧═══════╧═══════╧═══════╛


### 3. Was healthy, now sick

In [25]:
# Rows that moved from stage 0 to stage 1. columns[:-2] leaves out the last
# two columns of data_old (male_heredity and stage in the preview above).
get_info_transition(data_combine.query('stage==0 and stage_new==1'), data_old.columns[:-2])

Total number: 31
╒══════════╤═══════╤═══════╤═══════╤═══════╕
│          │   1-1 │   1-0 │   0-1 │   0-0 │
╞══════════╪═══════╪═══════╪═══════╪═══════╡
│ gender   │    19 │     0 │     0 │    12 │
├──────────┼───────┼───────┼───────┼───────┤
│ age      │    20 │     1 │     3 │     7 │
├──────────┼───────┼───────┼───────┼───────┤
│ bmi      │     8 │     1 │     3 │    19 │
├──────────┼───────┼───────┼───────┼───────┤
│ ao       │    16 │     1 │     3 │    11 │
├──────────┼───────┼───────┼───────┼───────┤
│ activity │    14 │     7 │     7 │     3 │
├──────────┼───────┼───────┼───────┼───────┤
│ smoking  │     7 │     3 │     4 │    17 │
├──────────┼───────┼───────┼───────┼───────┤
│ alcohol  │     7 │     3 │     0 │    21 │
╘══════════╧═══════╧═══════╧═══════╧═══════╛


### 4. Was sick, now healthy

In [26]:
# Rows that moved from stage 1 to stage 0 (the crosstab above shows none).
# columns[:-2] leaves out the last two columns of data_old
# (male_heredity and stage in the preview above).
get_info_transition(data_combine.query('stage==1 and stage_new==0'), data_old.columns[:-2])

Total number: 0
╒══════════╤═══════╤═══════╤═══════╤═══════╕
│          │   1-1 │   1-0 │   0-1 │   0-0 │
╞══════════╪═══════╪═══════╪═══════╪═══════╡
│ gender   │     0 │     0 │     0 │     0 │
├──────────┼───────┼───────┼───────┼───────┤
│ age      │     0 │     0 │     0 │     0 │
├──────────┼───────┼───────┼───────┼───────┤
│ bmi      │     0 │     0 │     0 │     0 │
├──────────┼───────┼───────┼───────┼───────┤
│ ao       │     0 │     0 │     0 │     0 │
├──────────┼───────┼───────┼───────┼───────┤
│ activity │     0 │     0 │     0 │     0 │
├──────────┼───────┼───────┼───────┼───────┤
│ smoking  │     0 │     0 │     0 │     0 │
├──────────┼───────┼───────┼───────┼───────┤
│ alcohol  │     0 │     0 │     0 │     0 │
╘══════════╧═══════╧═══════╧═══════╧═══════╛


# Most frequent combinations

In [27]:
def get_freq_table(data):
    """Count how often each distinct row of ``data`` occurs.

    Returns a new DataFrame with one row per distinct combination, in order
    of first appearance: a ``freq`` column with the count, followed by the
    original columns of ``data`` holding the combination's values.
    """
    # Rows are turned into tuples so they can be used as Counter keys.
    combos = Counter(map(tuple, data.to_numpy()))
    counts = pd.DataFrame(list(combos.items()), columns=['t', 'freq'])
    # Expand each stored tuple back into one column per original field.
    values = pd.DataFrame([list(combo) for combo in counts['t']], columns=data.columns)
    return counts.join(values).drop(columns=['t'])

### Most frequent new

In [45]:
# Count identical rows of data_new, then show the 10 most frequent combinations.
freq_new = get_freq_table(data_new)
freq_new.sort_values('freq', ascending=False).head(10)

Unnamed: 0,freq,gender,age,bmi,ao,activity,smoking,alcohol,male_heredity,stage
6,12,0,1,0,1,1,0,0,0,1
2,11,0,1,1,1,1,0,0,0,1
36,9,0,1,0,0,1,0,0,0,0
10,9,0,1,0,0,1,0,0,0,1
49,8,0,1,0,1,1,0,0,1,1
47,7,0,1,1,1,1,0,0,1,1
41,5,1,1,0,0,1,0,0,0,1
19,5,1,1,0,0,1,0,0,0,0
12,5,1,1,1,1,1,1,0,0,1
0,4,1,1,0,0,1,1,1,0,1


### Most frequent old

In [29]:
# Count identical rows of data_old, then show the 10 most frequent combinations.
freq_old = get_freq_table(data_old)
freq_old.sort_values('freq', ascending=False).head(10)

Unnamed: 0,freq,gender,age,bmi,ao,activity,smoking,alcohol,male_heredity,stage
38,10,0,1,0,0,1,0,0,0,0
49,8,0,1,1,1,0,0,0,1,1
14,8,0,1,1,1,1,0,0,0,1
10,7,0,1,0,0,1,0,0,0,1
60,6,0,1,0,1,1,0,0,0,1
8,6,0,0,0,0,1,0,0,0,0
86,5,0,0,0,0,0,0,0,0,0
118,5,1,1,0,0,1,0,0,0,0
0,4,1,1,0,1,0,1,1,0,1
27,4,1,0,0,0,1,0,0,0,0


### Most frequent overall

In [44]:
# Stack the rows of both snapshots so each combination is counted across
# old and new together, then show the 10 most frequent combinations.
freq = get_freq_table(pd.concat([data_old, data_new]))
freq.sort_values('freq', ascending=False).head(10)

Unnamed: 0,freq,gender,age,bmi,ao,activity,smoking,alcohol,male_heredity,stage
38,19,0,1,0,0,1,0,0,0,0
14,19,0,1,1,1,1,0,0,0,1
60,18,0,1,0,1,1,0,0,0,1
10,16,0,1,0,0,1,0,0,0,1
49,12,0,1,1,1,0,0,0,1,1
118,10,1,1,0,0,1,0,0,0,0
43,10,0,1,1,1,1,0,0,1,1
8,10,0,0,0,0,1,0,0,0,0
74,9,0,1,0,1,1,0,0,1,1
13,8,0,1,0,0,1,0,0,1,1
