***Pré-Processamento***

***Imports***

In [61]:
import pandas as pd
import pickle
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

***Get File***

In [62]:
dataFrame = pd.read_csv('../DataBases/CD4DB.csv', sep=';',index_col=None,decimal=',')

***Handle missing values***



***Get mean of ART_ADHER***

In [63]:
adherenceMeanById = dataFrame.groupby('ID')['ART_ADHER'].mean()
dataFrame['ART_ADHER'] = dataFrame['ID'].map(adherenceMeanById)

***Get last instance of each id***

In [64]:
dbUniqueIds = dataFrame.sort_values('CD4_DT2').groupby('ID').last()

***Drop values with missing VL_STATUS for the VlPrediction***

In [65]:
dbUniqueIdsForVl = dbUniqueIds.drop(dbUniqueIds[dbUniqueIds['VL_STATUS'] == 2].index)

***Isolate core attributes***


In [66]:
dbCd4CoreAttributes = dbUniqueIds.loc[:, ['GENDER', 'AGE', 'CD4_BASE', 'VL_BASE', 'ART_ADHER', 'TP_FOLLOW']]
dbCd4CoreAttributes.reset_index(drop=True, inplace=True)
dbVlCoreAttributes = dbUniqueIdsForVl.loc[:, ['GENDER', 'AGE', 'CD4_BASE', 'VL_BASE', 'ART_ADHER', 'TP_FOLLOW']]
dbVlCoreAttributes.reset_index(drop=True, inplace=True)

***Outliers***

In [67]:
for col in dbCd4CoreAttributes.columns:
    threshold = dbCd4CoreAttributes[col].quantile(0.99)
    dbCd4CoreAttributes[col].clip(upper=threshold ,inplace=True)
for col in dbVlCoreAttributes.columns:
    threshold = dbVlCoreAttributes[col].quantile(0.99)
    dbVlCoreAttributes[col].clip(upper=threshold ,inplace=True)

***Get attribute and class values***

In [68]:
attributesCd4 = dbCd4CoreAttributes.values
attributesVl = dbVlCoreAttributes.values
Cd4StatusClassResults = dbUniqueIds['CD4_STATUS']
VlStatusClassResults = dbUniqueIdsForVl['VL_STATUS']

***Training split***

In [69]:
from sklearn.model_selection import train_test_split


Cd4AttributesTraining, Cd4AttributesTest, Cd4ClassResultsTraining, Cd4ClassResultsTest = train_test_split(attributesCd4, Cd4StatusClassResults, test_size=0.2, random_state=42, stratify=Cd4StatusClassResults, shuffle=True)
VlAttributesTraining, Vl4AttributesTest, VlClassResultsTraining, VlClassResultsTest = train_test_split(attributesVl, VlStatusClassResults, test_size=0.2, random_state=42, stratify=VlStatusClassResults, shuffle=True)

***Using NCR + SMOTE to balance the classes and remove noisy***

In [70]:
from imblearn.under_sampling import NeighbourhoodCleaningRule

ncr = NeighbourhoodCleaningRule()
Cd4AttributesTrainingBalanced, Cd4ClassResultsTrainingBalanced = ncr.fit_resample(Cd4AttributesTraining, Cd4ClassResultsTraining)
VlAttributesTrainingBalanced, VlClassResultsTrainingBalanced = ncr.fit_resample(VlAttributesTraining, VlClassResultsTraining)

smote = SMOTE(random_state=42)
Cd4AttributesTrainingBalanced, Cd4ClassResultsTrainingBalanced = smote.fit_resample(Cd4AttributesTraining, Cd4ClassResultsTraining)
VlAttributesTrainingBalanced, VlClassResultsTrainingBalanced = smote.fit_resample(VlAttributesTraining, VlClassResultsTraining)
Cd4ClassResultsTrainingBalanced.value_counts()

1    3742
0    3742
Name: CD4_STATUS, dtype: int64

***Save bases***

In [71]:
with open('../DataBases/Cd4EncodedBase.pkl', 'wb') as f:
    pickle.dump([dbCd4CoreAttributes.columns, Cd4AttributesTrainingBalanced, Cd4ClassResultsTrainingBalanced, Cd4AttributesTest, Cd4ClassResultsTest], f)
with open('../DataBases/VlEncodedBase.pkl', 'wb') as f:
    pickle.dump([dbVlCoreAttributes.columns, VlAttributesTrainingBalanced, VlClassResultsTrainingBalanced, Vl4AttributesTest, VlClassResultsTest], f)
