***Pré-Processamento***

***Imports***

In [21]:
import pandas as pd
import pickle
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

***Get File***

In [22]:
# Load CD4DB.csv. The file is parsed with ';' as the field separator and ','
# as the decimal mark; no column is used as the index.
dataFrame = pd.read_csv(
    '../DataBases/CD4DB.csv',
    sep=';',
    decimal=',',
    index_col=None,
)

***Handle missing values***



***Get mean of ART_ADHER***

In [23]:
# Average ART_ADHER over all rows that share an ID, then write that per-ID mean
# back onto every row, so all rows of one ID carry the same adherence value.
adherenceMeanById = dataFrame['ART_ADHER'].groupby(dataFrame['ID']).mean()
dataFrame['ART_ADHER'] = dataFrame['ID'].map(adherenceMeanById)

***Get last instance of each id***

In [24]:
# Collapse the data to one row per ID (ID becomes the index), taking values from
# the end of each group after ordering by CD4_DT2.
# NOTE(review): GroupBy.last() returns, per column, the last non-null value in
# the group, so a result row can combine values from different source rows when
# the most recent row has missing entries - confirm this is intended.
# NOTE(review): CD4_DT2 is sorted with whatever dtype read_csv inferred (no date
# parsing is requested when the file is loaded) - confirm the order is chronological.
dbUniqueIds = (
    dataFrame
    .sort_values(by='CD4_DT2')
    .groupby('ID')
    .last()
)

***Drop rows whose VL_STATUS is missing (coded as 2) for the VL prediction***

In [25]:
# Keep only the rows whose VL_STATUS is not 2 (the section heading describes
# those rows as having a missing VL_STATUS); dbUniqueIds itself is left untouched.
hasVlStatus = dbUniqueIds['VL_STATUS'] != 2
dbUniqueIdsForVl = dbUniqueIds.loc[hasVlStatus].copy()

***Isolate core attributes***


In [26]:
# Both prediction targets use the same six feature columns. The existing index
# is discarded so each feature frame is numbered 0..n-1.
coreAttributeNames = ['GENDER', 'AGE', 'CD4_BASE', 'VL_BASE', 'ART_ADHER', 'TP_FOLLOW']
dbCd4CoreAttributes = dbUniqueIds[coreAttributeNames].reset_index(drop=True)
dbVlCoreAttributes = dbUniqueIdsForVl[coreAttributeNames].reset_index(drop=True)

***Get attribute and class values***

In [27]:
# Feature matrices as plain NumPy arrays (one row per entry of the source frame).
attributesCd4 = dbCd4CoreAttributes.to_numpy()
attributesVl = dbVlCoreAttributes.to_numpy()

# Class labels are kept as pandas Series taken from the same frames the feature
# matrices were built from, so they line up with the matrices row by row.
Cd4StatusClassResults = dbUniqueIds.loc[:, 'CD4_STATUS']
VlStatusClassResults = dbUniqueIdsForVl.loc[:, 'VL_STATUS']

***Using SMOTE (oversampling) to balance the classes***

In [29]:
# Oversample with SMOTE: only the minority class is resampled
# (sampling_strategy='minority'), using 5 nearest neighbours and a fixed seed.
# NOTE(review): the undersampling cell that follows assigns to these same
# *Balanced names from the un-resampled inputs, so on a top-to-bottom run the
# SMOTE output is replaced before anything is saved - confirm which is intended.
sm = SMOTE(
    sampling_strategy='minority',
    k_neighbors=5,
    random_state=42,
)
(attributesCd4Balanced,
 Cd4StatusClassResultsBalanced) = sm.fit_resample(attributesCd4, Cd4StatusClassResults)
(attributesVlBalanced,
 VlStatusClassResultsBalanced) = sm.fit_resample(attributesVl, VlStatusClassResults)

***Using RandomUnderSampler (undersampling) to balance the classes***

In [37]:
# Undersample with RandomUnderSampler (fixed seed) on the original,
# un-resampled feature matrices and labels.
# NOTE(review): the results are bound to the same *Balanced names the SMOTE cell
# assigns, so they replace the SMOTE output; these are the values the save cell
# writes to the *Balanced.pkl files - confirm that is the intended balancing.
rus = RandomUnderSampler(
    random_state=42,
)
(attributesCd4Balanced,
 Cd4StatusClassResultsBalanced) = rus.fit_resample(attributesCd4, Cd4StatusClassResults)
(attributesVlBalanced,
 VlStatusClassResultsBalanced) = rus.fit_resample(attributesVl, VlStatusClassResults)


***Save bases***

In [39]:
# Each pickle holds a three-item list: [feature column names, feature matrix,
# class labels]. The plain and the balanced variants are written for both targets.
basesToSave = [
    ('../DataBases/Cd4EncodedBase.pkl',
     [dbCd4CoreAttributes.columns, attributesCd4, Cd4StatusClassResults]),
    ('../DataBases/VlEncodedBase.pkl',
     [dbVlCoreAttributes.columns, attributesVl, VlStatusClassResults]),
    ('../DataBases/Cd4EncodedBaseBalanced.pkl',
     [dbCd4CoreAttributes.columns, attributesCd4Balanced, Cd4StatusClassResultsBalanced]),
    ('../DataBases/VlEncodedBaseBalanced.pkl',
     [dbVlCoreAttributes.columns, attributesVlBalanced, VlStatusClassResultsBalanced]),
]
for basePath, baseContents in basesToSave:
    with open(basePath, 'wb') as f:
        pickle.dump(baseContents, f)