## Algoritmo: Gradient Tree Boosting

Implementado por: Juan Diego González Gómez

### 1. Instalación e importación de librerías.


In [1]:
# Librería para manejar las contracciones que se presentan en el inglés.
!pip install contractions



In [2]:
# librería para manejar las flexiones gramaticales en el idioma inglés.
!pip install inflect
!pip install pandas-profiling==2.7.1



In [3]:
# Natural Language Toolkit (NLTK), the library used to work with text.
import nltk
# Punkt makes it possible to split a text into sentences.
nltk.download('punkt')

[nltk_data] Downloading package punkt to C:\Users\Juan
[nltk_data]     Diego\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# Download the stop-word lists, i.e. the words that add nothing to the meaning of a text.
# Which words are those stop words?

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Juan
[nltk_data]     Diego\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
# Download the WordNet data used by WordNetLemmatizer to find the lemma of each word.
# What is the lemma of a word? How hard would it be to obtain it if you had to write that function yourself?
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to C:\Users\Juan
[nltk_data]     Diego\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
# Import the libraries used throughout the notebook
import pandas as pd
import numpy as np
import sys
from pandas_profiling import ProfileReport

import re, string, unicodedata
import contractions
import inflect
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer

from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import classification_report, confusion_matrix, plot_precision_recall_curve
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import GridSearchCV

import matplotlib.pyplot as plt

### 2. Perfilamiento y entendimiento de los datos



#### Lectura de los datos.

In [7]:
# Read the dataset from the CSV file with pandas.
data=pd.read_csv('clinical_trials_on_cancer_data_clasificacion.csv', sep=',', encoding = 'utf-8')
# Work on an independent copy: a plain `data_t = data` would only alias the same
# DataFrame, so the column assignments made on `data_t` in the following cells
# would also modify `data`. With the copy, `data` keeps the contents read from the file.
data_t = data.copy()

### 3. Preparación de datos

In [8]:
def remove_non_ascii(words):
    """Strip the non-ASCII characters from every token.

    Each token is decomposed with Unicode NFKD normalisation and then encoded
    to ASCII ignoring whatever cannot be represented, so an accented letter
    keeps its base letter while characters without an ASCII decomposition
    disappear. Tokens are never dropped: one made only of such characters
    comes back as an empty string.

    Args:
        words: iterable of string tokens.

    Returns:
        A new list with one cleaned token per input token, in the same order.
    """
    def _ascii_only(token):
        decomposed = unicodedata.normalize('NFKD', token)
        return decomposed.encode('ascii', 'ignore').decode('utf-8', 'ignore')

    return [_ascii_only(token) for token in words]

def to_lowercase(words):
    """Lower-case every token.

    Args:
        words: iterable of string tokens.

    Returns:
        A new list with each token converted through ``str.lower``.
    """
    return [token.lower() for token in words]

def remove_punctuation(words):
    """Delete punctuation from every token and discard tokens left empty.

    Any character that is neither a word character (``\\w``, which includes
    the underscore) nor whitespace is removed.

    Args:
        words: iterable of string tokens.

    Returns:
        A new list with the cleaned, non-empty tokens in their original order.
    """
    cleaned = (re.sub(r'[^\w\s]', '', token) for token in words)
    return [token for token in cleaned if token != '']

def replace_numbers(words):
    """Spell out the tokens that consist only of digits.

    A token is converted with inflect's ``number_to_words`` when
    ``str.isdigit`` is true for it; every other token (including signed or
    decimal numbers such as ``'-3'`` or ``'2.5'``) is passed through untouched.

    Args:
        words: iterable of string tokens.

    Returns:
        A new list with one entry per input token, in the same order.
    """
    engine = inflect.engine()
    return [
        engine.number_to_words(token) if token.isdigit() else token
        for token in words
    ]

def remove_stopwords(words):
    """Remove the English stop words from a list of tokens.

    The comparison is exact and case-sensitive, so tokens are expected to be
    lower-cased beforehand if capitalised stop words must be removed too.

    Args:
        words: iterable of string tokens.

    Returns:
        A new list with the tokens that are not in NLTK's English stop-word
        list, in their original order.
    """
    # Fetch the list once and turn it into a set: the stop-word list does not
    # change between tokens, so requesting it again and scanning it linearly
    # for every single token is wasted work.
    english_stopwords = set(stopwords.words('english'))
    return [word for word in words if word not in english_stopwords]

def preprocessing(words):
    """Run the noise-removal steps over a list of tokens.

    The steps run in this fixed order: lower-casing, digits to words,
    punctuation removal, non-ASCII removal and stop-word removal. Each step
    receives the output of the previous one.

    Args:
        words: list of string tokens.

    Returns:
        The list of tokens produced by the last step.
    """
    pipeline = (
        to_lowercase,
        replace_numbers,
        remove_punctuation,
        remove_non_ascii,
        remove_stopwords,
    )
    for step in pipeline:
        words = step(words)
    return words

In [9]:
# Expand the contractions of every text with contractions.fix; the column is overwritten with the result.
data_t['study_and_condition'] = data_t['study_and_condition'].apply(contractions.fix) # Applies the contraction fix
# Preview the first rows.
data_t.head()

Unnamed: 0,label,study_and_condition
0,__label__0,study interventions are Saracatinib . recurren...
1,__label__1,study interventions are Stem cell transplantat...
2,__label__0,study interventions are Lenograstim . recurren...
3,__label__0,study interventions are Doxorubicin . stage ii...
4,__label__1,study interventions are Poly I-C . prostate ca...


In [10]:
# Tokenize every text with NLTK's word_tokenize and clean the tokens with preprocessing();
# the resulting token lists are stored in a new 'words' column.
data_t['words'] = data_t['study_and_condition'].apply(word_tokenize).apply(preprocessing) # Applies the noise removal
# Preview the first rows.
data_t.head()

Unnamed: 0,label,study_and_condition,words
0,__label__0,study interventions are Saracatinib . recurren...,"[study, interventions, saracatinib, recurrent,..."
1,__label__1,study interventions are Stem cell transplantat...,"[study, interventions, stem, cell, transplanta..."
2,__label__0,study interventions are Lenograstim . recurren...,"[study, interventions, lenograstim, recurrent,..."
3,__label__0,study interventions are Doxorubicin . stage ii...,"[study, interventions, doxorubicin, stage, iii..."
4,__label__1,study interventions are Poly I-C . prostate ca...,"[study, interventions, poly, ic, prostate, can..."


In [11]:
def stem_words(words):
    """Reduce every token to its stem with NLTK's Lancaster stemmer.

    Args:
        words: iterable of string tokens.

    Returns:
        A new list holding the stem of each token, in the same order.
    """
    lancaster = LancasterStemmer()
    return [lancaster.stem(token) for token in words]

def lemmatize_verbs(words):
    """Lemmatize every token as a verb with NLTK's WordNet lemmatizer.

    Args:
        words: iterable of string tokens.

    Returns:
        A new list holding the lemma of each token (looked up with
        ``pos='v'``), in the same order.
    """
    wordnet = WordNetLemmatizer()
    return [wordnet.lemmatize(token, pos='v') for token in words]

def stem_and_lemmatize(words):
    """Return the stems of the tokens followed by their verb lemmas.

    Both representations are kept: the result is a single list made of the
    output of stem_words() and then the output of lemmatize_verbs(), both
    computed from the same input tokens.

    Args:
        words: list of string tokens.

    Returns:
        A new list: all stems first, then all lemmas.
    """
    return [*stem_words(words), *lemmatize_verbs(words)]

# Replace each token list by its stems followed by its verb lemmas.
# The 'words' column is overwritten, so running this cell again would stem and
# lemmatize its own previous output.
data_t['words'] = data_t['words'].apply(stem_and_lemmatize) # Applies lemmatization and prefix/suffix removal.
# Preview the first rows.
data_t.head()

Unnamed: 0,label,study_and_condition,words
0,__label__0,study interventions are Saracatinib . recurren...,"[study, interv, saracatinib, recur, verruc, ca..."
1,__label__1,study interventions are Stem cell transplantat...,"[study, interv, stem, cel, transpl, hodgkin, l..."
2,__label__0,study interventions are Lenograstim . recurren...,"[study, interv, lenograstim, recur, adult, dif..."
3,__label__0,study interventions are Doxorubicin . stage ii...,"[study, interv, doxorubicin, stag, ii, diffus,..."
4,__label__1,study interventions are Poly I-C . prostate ca...,"[study, interv, poly, ic, prost, cant, diagnos..."


In [12]:
# Join every token list into a single space-separated string, the input format
# taken by the vectorizers used below. The 'words' column is overwritten.
data_t['words'] = data_t['words'].apply(lambda x: ' '.join(map(str, x)))
# Display the resulting DataFrame.
data_t

Unnamed: 0,label,study_and_condition,words
0,__label__0,study interventions are Saracatinib . recurren...,study interv saracatinib recur verruc carcinom...
1,__label__1,study interventions are Stem cell transplantat...,study interv stem cel transpl hodgkin lymphom ...
2,__label__0,study interventions are Lenograstim . recurren...,study interv lenograstim recur adult diffus mi...
3,__label__0,study interventions are Doxorubicin . stage ii...,study interv doxorubicin stag ii diffus larg c...
4,__label__1,study interventions are Poly I-C . prostate ca...,study interv poly ic prost cant diagnos unreso...
...,...,...,...
11995,__label__0,study interventions are Prednisolone hemisucci...,study interv prednisolon hemisuccin recur chil...
11996,__label__0,study interventions are Bevacizumab . recurren...,study interv bevacizumab recur rect cant diagn...
11997,__label__1,"study interventions are Antibodies, Monoclonal...",study interv antibody monoclon recur lymphobla...
11998,__label__0,study interventions are Vorinostat . colorecta...,study interv vorinost colorect cant diagnos pa...


In [13]:
# Features are the cleaned texts; the target is the 'label' column.
X_data, y_data = data_t['words'],data_t['label']
# Encode the target as integers: 1 for '__label__1', 0 for any other label.
y_data = (y_data == '__label__1').astype(int)
y_data

0        0
1        1
2        0
3        0
4        1
        ..
11995    0
11996    0
11997    1
11998    0
11999    0
Name: label, Length: 12000, dtype: int32

In [14]:
# Bag-of-words representation: one column per vocabulary term, one row per text.
count = CountVectorizer()
X_count = count.fit_transform(X_data)
print(X_count)
# Show the counts of the first text only. The row is sliced from the sparse
# matrix before densifying, instead of converting the whole document-term
# matrix to a dense array just to read its first row.
X_count[0].toarray()[0]

  (0, 9354)	6
  (0, 5213)	1
  (0, 8787)	2
  (0, 8433)	1
  (0, 10565)	1
  (0, 1474)	1
  (0, 5550)	2
  (0, 2676)	1
  (0, 7393)	1
  (0, 6509)	2
  (0, 349)	1
  (0, 10426)	1
  (0, 255)	1
  (0, 1239)	1
  (0, 2137)	2
  (0, 2931)	1
  (0, 7366)	1
  (0, 5581)	2
  (0, 3061)	2
  (0, 10701)	1
  (0, 2787)	1
  (0, 2911)	2
  (0, 5219)	1
  (0, 8435)	1
  (0, 10566)	1
  :	:
  (11998, 10649)	1
  (11998, 10650)	1
  (11999, 9354)	4
  (11999, 5213)	1
  (11999, 2676)	1
  (11999, 7366)	1
  (11999, 10701)	1
  (11999, 5219)	1
  (11999, 2678)	1
  (11999, 7370)	1
  (11999, 10703)	1
  (11999, 7961)	1
  (11999, 7981)	1
  (11999, 1443)	1
  (11999, 1436)	1
  (11999, 7214)	1
  (11999, 7215)	1
  (11999, 9049)	1
  (11999, 5292)	1
  (11999, 9045)	1
  (11999, 5294)	1
  (11999, 3919)	2
  (11999, 273)	1
  (11999, 274)	1
  (11999, 3950)	2


array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [15]:
# Inspect the cleaned, space-joined texts that are passed to the vectorizers.
X_data

0        study interv saracatinib recur verruc carcinom...
1        study interv stem cel transpl hodgkin lymphom ...
2        study interv lenograstim recur adult diffus mi...
3        study interv doxorubicin stag ii diffus larg c...
4        study interv poly ic prost cant diagnos unreso...
                               ...                        
11995    study interv prednisolon hemisuccin recur chil...
11996    study interv bevacizumab recur rect cant diagn...
11997    study interv antibody monoclon recur lymphobla...
11998    study interv vorinost colorect cant diagnos pa...
11999    study interv freund adjuv ov cant diagnos four...
Name: words, Length: 12000, dtype: object

In [16]:
# Se realiza el ajuste tf-idf
vectorizer = TfidfVectorizer()

vectors = vectorizer.fit_transform(X_data)
feature_names = vectorizer.get_feature_names()
dense = vectors.todense()
denselist = dense.tolist()
X_tfidf = pd.DataFrame(denselist, columns=feature_names)

print(X_tfidf.shape)
print(X_tfidf)



(12000, 10842)
        01  01910na  025   05   09  0three_two9  0two_two009  \
0      0.0      0.0  0.0  0.0  0.0          0.0          0.0   
1      0.0      0.0  0.0  0.0  0.0          0.0          0.0   
2      0.0      0.0  0.0  0.0  0.0          0.0          0.0   
3      0.0      0.0  0.0  0.0  0.0          0.0          0.0   
4      0.0      0.0  0.0  0.0  0.0          0.0          0.0   
...    ...      ...  ...  ...  ...          ...          ...   
11995  0.0      0.0  0.0  0.0  0.0          0.0          0.0   
11996  0.0      0.0  0.0  0.0  0.0          0.0          0.0   
11997  0.0      0.0  0.0  0.0  0.0          0.0          0.0   
11998  0.0      0.0  0.0  0.0  0.0          0.0          0.0   
11999  0.0      0.0  0.0  0.0  0.0          0.0          0.0   

       10deazaaminopterin   11   12  ...  zoledron  zoledronate  zoledronic  \
0                     0.0  0.0  0.0  ...       0.0          0.0         0.0   
1                     0.0  0.0  0.0  ...       0.0        

In [17]:
# Show the TF-IDF weights of the first text: every term whose weight is non-zero.
# `DataFrame.items()` replaces `iteritems()`, which was removed in pandas 2.0,
# and `.iloc[0]` reads the first row by position regardless of the index labels.
for name, values in X_tfidf.items():
    if values.iloc[0] > 0:
        print('{name}: {value}'.format(name=name, value=values.iloc[0]))

adequ: 0.12965976002169025
adequate: 0.1372633866399548
agr: 0.14964924026862034
agree: 0.14664915068691026
bir: 0.16200504049011225
birth: 0.16200504049011225
carcinom: 0.08174948872193756
carcinoma: 0.08167389244291497
control: 0.26720891062703844
diagnos: 0.027498366908471985
diagnosis: 0.027546640207763172
discontinu: 0.15186321825308002
discontinuation: 0.18452351066678774
drug: 0.22959917033401986
dur: 0.15661068964598013
duration: 0.15661068964598013
eight: 0.3168202751152618
interv: 0.025721615422542074
interventions: 0.025721615422542074
larynx: 0.3160800719729442
least: 0.22174740199709933
must: 0.17866549979123003
particip: 0.11649613064860295
participation: 0.14058127234804868
patients: 0.07191379958336269
paty: 0.06593768834261601
recur: 0.07878519408194237
recurrent: 0.07934790350329833
saracatinib: 0.41128091339302836
study: 0.15432969253525244
us: 0.10558431701114494
use: 0.10626251565141444
verruc: 0.1549289394569564
verrucous: 0.1549289394569564
week: 0.10185258562527

### 4. Creación y aplicación del modelo
Una vez se tiene la representación vectorial de cada uno de los textos, se lleva a cabo la tarea de clasificación, que en este caso será binaria debido al rango de la variable objetivo.

In [18]:
# Split the data into training (80%) and test (20%) sets.
# NOTE(review): the TF-IDF vectorizer was fitted on all of X_data before this
# split, so the vocabulary and IDF weights also reflect the test texts — confirm
# whether that is acceptable for the reported metrics.
X_train, X_test, Y_train, Y_test = train_test_split(X_tfidf, y_data, test_size=0.2, random_state=0)  # Seed: 0

In [19]:
# Search space that was explored originally (left disabled):
#   n_estimators: 80, 100, 120
#   max_depth:    2, 3, 4

# Only the best parameters are kept here, because running the search takes a
# long time.
parameters = dict(
    n_estimators=[100],
    max_depth=[3],
    learning_rate=[0.1],
)

In [20]:
# Build the gradient-boosting model inside a 5-fold cross-validated grid search
# over `parameters` and fit it on the training data. Predictions on the test
# data are made in a later cell.
results = GridSearchCV(GradientBoostingClassifier(random_state=0),parameters,cv=5)
results.fit(X_train, Y_train)
print(f'Best parameters are: {results.best_params_}')

Best parameters are: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}


In [21]:
# Get the best model found by the grid search.
modelo_final = results.best_estimator_
# Now evaluate this model on the test set.
y_pred_test = modelo_final.predict(X_test)
# Per-class precision, recall and F1 on the test set.
print(classification_report(Y_test, y_pred_test))

              precision    recall  f1-score   support

           0       0.78      0.75      0.77      1228
           1       0.75      0.78      0.77      1172

    accuracy                           0.77      2400
   macro avg       0.77      0.77      0.77      2400
weighted avg       0.77      0.77      0.77      2400

