In [1]:
import pandas as pd 
import numpy as np
from loguru import logger as log

In [6]:
# Load the evaluation results; the code below only relies on the 'THRE'
# (threshold) and 'F1' columns of this file.
dataframe = pd.read_csv('protein_c_deficiency_0.25.csv')
dataframe_filter = dataframe['THRE'].unique()

# Mean F1 per threshold value. sort=False keeps thresholds in order of first
# appearance, so an exact tie resolves to the threshold seen first in the file.
# Thresholds whose mean is NaN (no usable F1 values) are dropped.
mean_f1_by_thresh = dataframe.groupby('THRE', sort=False)['F1'].mean().dropna()

# Values reported when no threshold reaches a positive mean F1.
max_mean_f1 = 0
max_thresh = 0
if not mean_f1_by_thresh.empty and mean_f1_by_thresh.max() > 0:
    # Pick the winner on the unrounded means: rounding before comparing would
    # make thresholds within 0.005 of each other tie and let row order decide.
    max_thresh = mean_f1_by_thresh.idxmax()
    # Round to two decimals only for reporting.
    max_mean_f1 = float(f'{mean_f1_by_thresh.max():.2f}')

log.info(f'MODEL Thresh {max_thresh} with F1: {max_mean_f1}')


[32m2023-05-10 14:35:11.659[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m15[0m - [1mMODEL Thresh 0.4999999999999996 with F1: 0.7[0m


In [17]:
from collections import Counter
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder

In [15]:
def load_protein_c_v2(path='../../data/protein_c_deficiency_v2.csv'):
    """Load the protein C deficiency table and split it into features and target.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the tab-separated input file. Defaults to the path that
        was previously hard-coded, so existing no-argument calls are unchanged.

    Returns
    -------
    x : pandas.DataFrame
        All columns except 'proc_deficiency' and 'node', with 'res' replaced
        by integer codes from ``LabelEncoder``.
    y : pandas.Series
        int64 target: 0 for 'No deficiency', 1 for 'PROC deficiency'.
    df : pandas.DataFrame
        The full table as read from disk, with 'res' already encoded.

    Raises
    ------
    ValueError
        If 'proc_deficiency' contains a value other than the two known labels
        (including missing values), instead of silently passing it through.
    """
    df = pd.read_csv(path, sep='\t')

    # Encode the 'res' column as integer codes; this overwrites the column in df.
    df['res'] = LabelEncoder().fit_transform(df['res'])

    # map() instead of replace(): replace() relied on a silent object->int
    # downcast (deprecated in pandas) and left unmapped labels as strings.
    label_codes = {'No deficiency': 0, 'PROC deficiency': 1}
    y = df['proc_deficiency'].map(label_codes)

    unknown = df.loc[y.isna(), 'proc_deficiency'].unique()
    if len(unknown):
        raise ValueError(
            f"Unexpected values in 'proc_deficiency': {list(unknown)}"
        )
    y = y.astype('int64')

    x = df.drop(['proc_deficiency', 'node'], axis=1)

    return x, y, df

In [64]:
def test_balacing(X, Y, percentage, rs, at='target', sm=False):
    
    X[at] = Y

    size_minority = min(Counter(X[at]).values())
    
    p = np.ceil(size_minority * percentage).astype('int')
    train = []
    test = []
    for classe in X[at].unique():
        
        df_class = X[X[at] == classe]
        
        test.append(df_class.iloc[:p])
        train.append(df_class.iloc[p:])
        
    df_train = pd.concat(train)
    df_test = pd.concat(test)

    # surffle
    df_train = shuffle(df_train, random_state=rs)
    df_test = shuffle(df_test, random_state=rs)
    
    y_train = df_train[at]
    y_test = df_test[at]
        
    x_train = df_train.drop([at], axis=1)
    x_test = df_test.drop([at], axis=1)   

    if sm:
        x_train, y_train = SMOTE().fit_resample(x_train, y_train)
    
    return x_train, y_train, x_test, y_test

In [52]:
def test_balacing_random(X, Y, percentage, rs, at='target', sm=False):
    
    # surffle
    X = shuffle(X, random_state=rs)
    Y = shuffle(Y, random_state=rs)

    X[at] = Y
    size_minority = min(Counter(X[at]).values())
    
    p = np.ceil(size_minority * percentage).astype('int')
    train = []
    test = []
    for classe in X[at].unique():
        
        df_class = X[X[at] == classe]
        
        test.append(df_class.iloc[:p])
        train.append(df_class.iloc[p:])
        
    df_train = pd.concat(train)
    df_test = pd.concat(test)
      
    y_train = df_train[at]
    y_test = df_test[at]
        
    x_train = df_train.drop([at], axis=1)
    x_test = df_test.drop([at], axis=1)   

    if sm:
        x_train, y_train = SMOTE().fit_resample(x_train, y_train)
    
    return x_train, y_train, x_test, y_test

In [68]:
def undersampling(X, Y, percentage, rs, at='target', increase=1, sm=True):
    """Split into train/test with a balanced test set and an undersampled training set.

    After shuffling, every class contributes its first
    ``p = ceil(minority_size * percentage)`` rows to the test set. The
    training set then takes, from the rows that follow:

    * all remaining ``minority_size - p`` rows of the minority class;
    * up to ``(minority_size - p) * increase`` rows of every other class.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table. It is not modified.
    Y : pandas.Series or array-like
        Class labels for the rows of ``X``.
    percentage : float
        Fraction of the minority-class size used as the per-class test size.
    rs : int
        Seed for the shuffle and, when ``sm`` is true, for SMOTE.
    at : hashable, optional
        Name of the temporary label column used while splitting.
    increase : int, optional
        How many times larger than the minority training size each
        non-minority class may be in the training set.
    sm : bool, optional
        When true (the default), oversample the training split with SMOTE.

    Returns
    -------
    x_train, y_train, x_test, y_test
    """
    # Work on a copy so the caller's frame does not gain the label column.
    # Labels are attached before shuffling so array-like Y stays aligned.
    data = X.copy()
    data[at] = Y
    data = shuffle(data, random_state=rs)

    proportions = Counter(data[at])
    class_minority = min(proportions, key=proportions.get)
    size_minority = proportions[class_minority]

    n_test = int(np.ceil(size_minority * percentage))
    n_train = size_minority - n_test  # minority rows left for training

    train, test = [], []
    for classe in data[at].unique():
        df_class = data[data[at] == classe]

        test.append(df_class.iloc[:n_test])

        # n_train is a row COUNT, so the slice must end at n_test + count.
        # Ending the slice at n_train itself would drop n_test rows per class.
        n_keep = n_train if classe == class_minority else n_train * increase
        train.append(df_class.iloc[n_test:n_test + n_keep])

    df_train = pd.concat(train)
    df_test = pd.concat(test)

    y_train = df_train[at]
    y_test = df_test[at]

    x_train = df_train.drop([at], axis=1)
    x_test = df_test.drop([at], axis=1)

    if sm:
        # Seed SMOTE as well, so the whole split is reproducible for a given rs.
        x_train, y_train = SMOTE(random_state=rs).fit_resample(x_train, y_train)

    return x_train, y_train, x_test, y_test


In [72]:
# Check how many test rows consecutive random splits have in common.
iterations = 20
x, y, _ = load_protein_c_v2()
p=0.1

# Test-row indices of the previous iteration; empty before the first split.
# An explicit dtype avoids the deprecated dtype-less empty Series.
y_last_iter = pd.Series(dtype=object)
for i in range(iterations):
    # sm must be passed by keyword: the fifth positional parameter is `at`,
    # so a positional False would rename the label column instead.
    x_train_raw, y_train_raw, x_test_raw, y_test_raw = test_balacing_random(x, y, p, i, sm=False)

    y_index_iter = pd.Series(y_test_raw.index)
    # Test rows shared with the previous iteration's test set.
    print(set(y_index_iter) & set(y_last_iter))
    y_last_iter = y_index_iter




set()
{286}
{286}
{84}
{314, 195, 284, 84}
{253, 47}
set()
{294, 74, 141, 30, 318}
{66, 298, 148, 30, 94}
{282, 196}
{282, 98, 127}
{88, 154, 98}
{12, 311}
{220, 67, 12, 276}
{112, 197}
{225, 44}
{299, 211}
{299, 308}
{16}
{8, 307}


In [71]:
# Check how many test rows consecutive deterministic splits have in common.
iterations = 20
x, y, _ = load_protein_c_v2()
p=0.1

# Test-row indices of the previous iteration; empty before the first split.
# An explicit dtype avoids the deprecated dtype-less empty Series.
y_last_iter = pd.Series(dtype=object)
for i in range(iterations):
    # sm must be passed by keyword: the fifth positional parameter is `at`,
    # so a positional False would rename the label column instead.
    x_train_raw, y_train_raw, x_test_raw, y_test_raw = test_balacing(x, y, p, i, sm=False)

    y_index_iter = pd.Series(y_test_raw.index)
    # Test rows shared with the previous iteration's test set.
    print(set(y_index_iter) & set(y_last_iter))
    y_last_iter = y_index_iter


set()
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 24, 25, 26, 27, 28, 29}
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 24, 25, 26, 27, 28, 29}
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 24, 25, 26, 27, 28, 29}
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 24, 25, 26, 27, 28, 29}
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 24, 25, 26, 27, 28, 29}
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 24, 25, 26, 27, 28, 29}
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 24, 25, 26, 27, 28, 29}
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 24, 25, 26, 27, 28, 29}
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 24, 25, 26, 27, 28, 29}
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 

In [70]:
# Check how many test rows consecutive undersampled splits have in common.
iterations = 20
x, y, _ = load_protein_c_v2()
p=0.1

# Test-row indices of the previous iteration; empty before the first split.
# An explicit dtype avoids the deprecated dtype-less empty Series.
y_last_iter = pd.Series(dtype=object)
for i in range(iterations):
    # sm must be passed by keyword: the fifth positional parameter is `at`.
    # A positional False leaves sm at its default (True), so SMOTE would
    # still run on the training split.
    x_train_raw, y_train_raw, x_test_raw, y_test_raw = undersampling(x, y, p, i, sm=False)

    y_index_iter = pd.Series(y_test_raw.index)
    # Test rows shared with the previous iteration's test set.
    print(set(y_index_iter) & set(y_last_iter))
    y_last_iter = y_index_iter


set()
{286}
{286}
{84}
{314, 195, 284, 84}
{253, 47}
set()
{294, 74, 141, 30, 318}
{66, 298, 148, 30, 94}
{282, 196}
{282, 98, 127}
{88, 154, 98}
{12, 311}
{220, 67, 12, 276}
{112, 197}
{225, 44}
{299, 211}
{299, 308}
{16}
{8, 307}
