# Preprocessing the data

"In any Machine Learning process, Data Preprocessing is that step in which the data gets transformed, or Encoded, to bring it to such a state that now the machine can easily parse it. In other words, the features of the data can now be easily interpreted by the algorithm." - https://towardsdatascience.com/data-preprocessing-concepts-fa946d11c825

## Cleaning the Dataset

##### Imports

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import os
import re # Regular expressions
from sklearn import preprocessing # LabelEncoder

###############################StopWords#####################################

from nltk.corpus import stopwords

# The 'stopwords' corpus has to be downloaded the first time this runs
import nltk
nltk.download('stopwords')

# Load spaCy and its small English pipeline, used for lemmatization
import spacy
nlp = spacy.load("en_core_web_sm")

# Import the stemming classes and initialise the Snowball stemmer
from nltk import SnowballStemmer
snowball = SnowballStemmer('english')

from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer

# Porter and Lancaster stemmers, selectable by name in stem()
porter = PorterStemmer()
lancaster = LancasterStemmer()

###############################StopWords#####################################

[nltk_data] Downloading package stopwords to C:\Users\Juan
[nltk_data]     José\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Load the labeled eligibility sample. header=None means the file has no
# header row, so the columns are addressed by position (0, 1, ...).
data = pd.read_table("../../Dataset/labeledEligibilitySample1M.csv", header = None)
# Module-level counters initialised to zero; nothing in the visible code
# updates them.
uniqueInterventions = 0
uniqueDiagnoses = 0

In [4]:
def takeRandomSamples(n):
    """Draw a class-balanced random sample from the global ``data`` table.

    Selects ``n`` random rows whose column 0 equals ``"__label__0"`` and
    ``n`` random rows whose column 0 equals ``"__label__1"``.

    Returns:
        A DataFrame with the ``__label__0`` rows first, followed by the
        ``__label__1`` rows (2 * n rows in total).
    """
    label_column = data[0]
    per_class = [
        data[label_column == tag].sample(n)
        for tag in ("__label__0", "__label__1")
    ]
    return pd.concat(per_class)

In [5]:
def createDataFrame(table, n, specialCharacters=True):
    """Build a cleaned DataFrame from raw ``(label, description)`` rows.

    Each description is split at its first period into an intervention part
    (before the period) and a diagnosis part (after it). Both parts are
    lower-cased and have accented vowels replaced by their plain vowel.

    Args:
        table: Array-like holding ``2 * n`` rows of ``(label, description)``.
        n: Number of rows per class; the input is reshaped to
            ``(n * 2, 2)``, so a size mismatch raises ``ValueError``.
        specialCharacters: When False, every character other than ASCII
            letters, ``#``, ``+`` and the space is removed from both text
            columns.

    Returns:
        A DataFrame with the columns ``Interventions``, ``Diagnoses`` and
        ``Eligible`` (the first digit found in the label, as a string).
    """
    raw = pd.DataFrame(
        np.array(table).reshape(n * 2, 2),
        columns=['Label', 'Description'],
    )

    # Split on the first period only. expand=True yields one column per part;
    # reindex guarantees column 1 exists (as NaN) even when no description
    # contains a period.
    parts = (
        raw['Description']
        .str.split('.', n=1, expand=True)
        .reindex(columns=[0, 1])
    )

    df = pd.DataFrame({
        'Interventions': parts[0],
        'Diagnoses': parts[1],
        'Eligible': raw['Label'].str.extract(r'(\d)', expand=False),
    })

    # Drop the boilerplate prefix. regex=True is required so that "\s" is
    # treated as a whitespace class rather than literal text.
    df['Interventions'] = df['Interventions'].str.replace(
        r"study interventions are\s", "", regex=True
    )

    # Map every accented vowel to its own base vowel (not all to "a").
    accent_table = str.maketrans(
        'áäâàéêèëíïìîóôòöúûùü',
        'aaaaeeeeiiiioooouuuu',
    )
    for column in ('Interventions', 'Diagnoses'):
        df[column] = df[column].str.lower().str.translate(accent_table)

    # The diagnosis part starts right after the period, so strip the single
    # leading whitespace character left behind by the split.
    df['Diagnoses'] = df['Diagnoses'].str.replace(r'^\s', '', regex=True)

    if not specialCharacters:
        for column in ('Interventions', 'Diagnoses'):
            df[column] = df[column].str.replace(
                r'[^a-zA-Z# +]', '', regex=True
            )

    return df

In [6]:
def getUniqueInterventions(df):
    """Return the number of distinct values in the 'Interventions' column."""
    distinct_values = np.unique(df['Interventions'])
    return len(distinct_values)

In [7]:
def getUniqueDiagnoses(df):
    """Return the number of distinct values in the 'Diagnoses' column.

    This function was previously also named ``getUniqueInterventions``, which
    silently replaced the real intervention counter with one that counted
    diagnoses. It now carries the name matching the column it reads.
    """
    return len(np.unique(df['Diagnoses']))

In [8]:
def getInterventionsCount(df):
    """Return how often each value occurs in the 'Interventions' column.

    The result is the Series produced by ``value_counts()``: values as the
    index, occurrence counts as the data.
    """
    interventions = df['Interventions']
    return interventions.value_counts()

In [9]:
def getDiagnosesCount(df):
    """Return how often each value occurs in the 'Diagnoses' column.

    The result is the Series produced by ``value_counts()``: values as the
    index, occurrence counts as the data.
    """
    diagnoses = df['Diagnoses']
    return diagnoses.value_counts()

In [10]:
def transformInterventions(df):
    """Replace the 'Interventions' strings with integer codes, in place.

    A ``LabelEncoder`` is fitted on the distinct intervention values and the
    column is overwritten with the encoded result. Nothing is returned.
    """
    encoder = preprocessing.LabelEncoder()
    encoder.fit(np.unique(df['Interventions']))
    df['Interventions'] = encoder.transform(df['Interventions'])

In [11]:
def joinColumns(df):
    """Merge the intervention text into the diagnosis text.

    ``df['Diagnoses']`` is overwritten in place with the concatenation
    ``Interventions + Diagnoses`` (no separator is inserted), exactly as
    before, so callers that ignore the return value see no change.

    Previously the frame without 'Interventions' and 'Eligible' was assigned
    to a local name and thrown away. It is now returned so the caller can
    actually use it.

    Returns:
        A new DataFrame without the 'Interventions' and 'Eligible' columns.
        The frame passed in keeps all of its columns.
    """
    df['Diagnoses'] = df['Interventions'] + df['Diagnoses']
    return df.drop(columns=["Interventions", "Eligible"])

# Lemmatization

In [12]:
def lemmatize(df, column):
    """Return the texts of ``df[column]`` with every token lemmatized.

    Each text is run through the module-level spaCy pipeline ``nlp`` and
    rebuilt from the token lemmas, keeping the whitespace that followed each
    token in the original text.

    The earlier version computed each lemma but wrote the untouched text
    back, so nothing was lemmatized. It also wrote into the column using a
    positional counter as a label, which is only correct on a default
    RangeIndex.

    Args:
        df: DataFrame holding the text column.
        column: Name of the column to lemmatize.

    Returns:
        A new Series with the same index as ``df[column]``. Non-string
        entries (e.g. NaN) are passed through unchanged. ``df`` itself is
        not modified.
    """
    def _lemmatize_text(text):
        # Leave missing values alone instead of crashing inside spaCy.
        if not isinstance(text, str):
            return text
        return ''.join(
            token.lemma_ + token.whitespace_ for token in nlp(text)
        )

    return df[column].apply(_lemmatize_text)

# Stemming

Reference: https://www.datacamp.com/community/tutorials/stemming-lemmatization-python

In [13]:
def stem(df, column, stemmer='porter'):
    """Return the texts of ``df[column]`` with every word stemmed.

    NLTK stemmers work on a single word. The earlier version passed the whole
    text to ``stem()``, so the sentence was treated as one long word. Each
    text is now split on whitespace, stemmed word by word and re-joined with
    single spaces.

    Args:
        df: DataFrame holding the text column.
        column: Name of the column to stem.
        stemmer: ``'snowball'`` or ``'lancaster'``; any other value selects
            the Porter stemmer.

    Returns:
        A list with one stemmed string per row, in row order. Non-string
        entries (e.g. NaN) are passed through unchanged.
    """
    if stemmer == 'snowball':
        chosen = snowball
    elif stemmer == 'lancaster':
        chosen = lancaster
    else:
        chosen = porter

    def _stem_text(text):
        # Leave missing values alone instead of failing on .split().
        if not isinstance(text, str):
            return text
        return ' '.join(chosen.stem(word) for word in text.split())

    return [_stem_text(sample) for sample in df[column]]

# Tokenization

In [14]:
def tokenize(df, column):
    """Split every text in ``df[column]`` into whitespace-separated tokens.

    Missing values are treated as empty strings and therefore produce an
    empty token list. Returns a Series of lists.
    """
    def _split_on_whitespace(text):
        return text.split()

    filled = df[column].fillna('')
    return filled.apply(_split_on_whitespace)

In [15]:
def detokenize(tokens):
    """Join an iterable of string tokens into one space-separated string."""
    separator = ' '
    return separator.join(tokens)

# Stop Words

In [16]:
def removeStopWords(df, column):
    """Return the texts of ``df[column]`` with English stop words removed.

    Each text is tokenized on whitespace, tokens found in NLTK's English
    stop-word list are dropped, and the rest is re-joined with single spaces.

    The earlier version wrote results back with a positional counter used as
    an index label, which is only correct on a default RangeIndex. The result
    is now built with ``apply`` so it lines up with any index.

    Args:
        df: DataFrame holding the text column.
        column: Name of the column to filter.

    Returns:
        A Series of strings with the same index as ``df[column]``. Missing
        values become empty strings.
    """
    # A set gives constant-time membership tests for every token.
    stop_words = set(stopwords.words('english'))

    def _drop_stop_words(words):
        return detokenize([word for word in words if word not in stop_words])

    return tokenize(df, column).apply(_drop_stop_words)

In [17]:
def saveToCSV(df, name):
    """Write ``df`` to ``../../Dataset/<name>.csv`` without the index column.

    Args:
        df: DataFrame to persist.
        name: File name without the ``.csv`` extension.
    """
    destination = '../../Dataset/' + name + '.csv'
    df.to_csv(destination, index=False)

### Execution

In [18]:
# Draw 5000 random rows per label (10000 rows in total) and build the cleaned
# frame with special characters removed.
table = takeRandomSamples(5000)
df = createDataFrame(table, 5000, specialCharacters = False)

In [19]:
df

Unnamed: 0,Interventions,Diagnoses,Eligible
0,adjuvant therapy,breast cancer diagnosis and ecog performance s...,0
1,dacarbazine,childhood central nervous system mixed germ ce...,0
2,bortezomib,unspecified adult solid tumor protocol specifi...,0
3,antibodies monoclonal,unspecified adult solid tumor protocol specifi...,0
4,isophosphamide mustard,fibrohistiocytic neoplasm diagnosis and inelig...,0
...,...,...,...
9995,cyclosporine,recurrent childhood lymphoblastic lymphoma dia...,1
9996,albuminbound paclitaxel,stage iv adenoid cystic carcinoma of the oral ...,1
9997,calcium dietary,adenomas diagnosis and use of phenytoin in the...,1
9998,bevacizumab,patients may have received previous adjuvant c...,1


In [20]:
# Prepend each row's intervention text to its diagnosis text; this updates
# df['Diagnoses'] in place.
joinColumns(df)

In [22]:
# Text normalisation of the combined column: lemmatize, then stem, then
# remove stop words.
# NOTE(review): stop words are removed after stemming, so stemmed forms may no
# longer match the stop-word list - confirm this order is intended.
df['Diagnoses'] = lemmatize(df, 'Diagnoses')

df['Diagnoses'] = stem(df, 'Diagnoses')

df['Diagnoses'] = removeStopWords(df, 'Diagnoses')

# Persist the processed frame as ../../Dataset/10k_1Col_NoCarEsp_LSA.csv
saveToCSV(df, '10k_1Col_NoCarEsp_LSA')