# **1. Carga de datos**

In [3]:
import pandas as pd
import matplotlib.pyplot as plt

In [4]:
df_train = pd.read_csv('./respuestas_cleaned.csv')

In [5]:
df_train.head()

Unnamed: 0,Sexo,Edad,Fumador,dedos amarillos,Ansiedad,presion de grupo,enfermedad cronica,fatiga,Alergia,Sibilancias,Consumo Alcohol,Tos,Dificultad respirar,Dificultad tragar,Dolor en pecho,Cancer de pulmon
0,1,57,1,1,2,2,1,1,2,1,2,2,2,2,2,1
1,1,82,2,2,2,2,1,1,2,2,2,1,1,2,1,1
2,1,44,1,1,2,2,2,1,2,1,1,1,1,2,2,1
3,1,29,1,1,1,2,1,1,2,2,2,2,1,1,1,0
4,1,41,1,1,2,1,2,1,2,1,1,1,2,1,1,1


# **2. Asignación de variables para los modelos**

Debido a que en la exploración de datos no se encontró ningún patrón entre las variables, los modelos se harán con todas las variables en general para predecir el comportamiento del target que es la variable **Cancer de pulmon**.

In [6]:
# Variable target
y = df_train['Cancer de pulmon']

# Variables independientes
# X = df_train[['Ansiedad', 'Sibilancias', 'Fumador', 'presion de grupo']]
X = df_train.drop('Cancer de pulmon', axis = 1)
print(y.shape, X.shape)

(10000,) (10000, 15)


In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test, = train_test_split(X, y, test_size = 0.10, random_state = 42)

# **3. Modelos**

Para la prueba de modelos se escogieron los siguientes:
1. Logistic Regression
2. Decision Tree Classifier
3. Random Forest
4. XGBOOST
5. LightGBM

Debido a que al usar estos modelos con paramétros por default, se buscará la mejor hiperparámetrización por Cross Validation.

In [8]:
from sklearn.model_selection import cross_val_score, RandomizedSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
import warnings
warnings.filterwarnings('ignore')

### **1. Logistic Regression**

In [9]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression()
# logreg.fit(X_train, y_train)

In [10]:
logreg_params = {
    'penalty': ['l1', 'l2', 'elasticnet'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'],
    'max_iter': [100, 200, 300, 400, 500],
    'class_weight': [None, 'balanced'],
    'random_state': [42]
}

In [11]:
search_obj = RandomizedSearchCV(logreg, param_distributions = logreg_params, cv = 5, scoring = 'f1_macro')
fit_obj = search_obj.fit(X_train, y_train)
print(fit_obj.cv_results_['mean_test_score'])
best_model_logreg = fit_obj.best_estimator_

[0.44799354 0.49495381        nan 0.50368124 0.50368384 0.49495381
 0.50342946        nan 0.33696772        nan]


In [12]:
best_model_logreg.fit(X_train, y_train)
print("Train: ", best_model_logreg.score(X_train, y_train))
print("Test: ", best_model_logreg.score(X_test, y_test))

Train:  0.5134444444444445
Test:  0.499


In [13]:
y_pred_logreg = best_model_logreg.predict(X_test)

In [14]:
accuracy = accuracy_score(y_test, y_pred_logreg)
conf_matrix = confusion_matrix(y_test, y_pred_logreg)
f1 = f1_score(y_test, y_pred_logreg, average='weighted')

In [15]:
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.499
F1 Score: 0.49900450936525853
Confusion Matrix:
 [[254 251]
 [250 245]]


### **2. Decision Tree Classifier**

In [16]:
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier()
# tree.fit(X_train, y_train)logreg_params

In [17]:
tree_params = {
    'max_depth': [None, 1, 2, 3, 4, 5],
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2', None],
    'class_weight': [None, 'balanced'],
    'random_state': [42]
}

In [18]:
search_obj = RandomizedSearchCV(tree, param_distributions = tree_params, cv = 5, scoring = 'f1_macro')
fit_obj = search_obj.fit(X_train, y_train)
print(fit_obj.cv_results_['mean_test_score'])
best_model_tree = fit_obj.best_estimator_

[0.49505905 0.49832614 0.44742862 0.50574798 0.36792371 0.49811329
 0.50626981 0.49505905 0.50066591 0.49991688]


In [19]:
best_model_tree.fit(X_train, y_train)
print("Train: ", best_model_tree.score(X_train, y_train))
print("Test: ", best_model_tree.score(X_test, y_test))

Train:  0.5102222222222222
Test:  0.507


In [20]:
y_pred_tree = best_model_tree.predict(X_test)

In [21]:
accuracy = accuracy_score(y_test, y_pred_tree)
conf_matrix = confusion_matrix(y_test, y_pred_tree)
f1 = f1_score(y_test, y_pred_tree, average='weighted')

In [22]:
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.507
F1 Score: 0.5070103535073218
Confusion Matrix:
 [[257 248]
 [245 250]]


### **3. Random Forest**

In [23]:
from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier()

In [24]:
random_forest_params = {
    'n_estimators': [50, 100, 200, 300],
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2'],
    'bootstrap': [True, False],
    'class_weight': [None, 'balanced'],
    'random_state': [42]
}

In [25]:
search_obj = RandomizedSearchCV(random_forest, param_distributions = random_forest_params, cv = 5, scoring = 'f1_macro')
fit_obj = search_obj.fit(X_train, y_train)
print(fit_obj.cv_results_['mean_test_score'])
best_model_random_forest = fit_obj.best_estimator_

[0.49854646 0.50027737 0.5077912  0.5085482  0.50296596 0.50241644
 0.50193668 0.50027737 0.49917658 0.49509403]


In [26]:
best_model_random_forest.fit(X_train, y_train)
print("Train: ", best_model_random_forest.score(X_train, y_train))
print("Test: ", best_model_random_forest.score(X_test, y_test))

Train:  0.9916666666666667
Test:  0.486


In [27]:
y_pred_random_forest = best_model_random_forest.predict(X_test)

In [28]:
accuracy = accuracy_score(y_test, y_pred_random_forest)
conf_matrix = confusion_matrix(y_test, y_pred_random_forest)
f1 = f1_score(y_test, y_pred_random_forest, average='weighted')

In [29]:
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.486
F1 Score: 0.4856379687768881
Confusion Matrix:
 [[232 273]
 [241 254]]


### **4. XGBOOST**

In [203]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [30]:
from xgboost import XGBClassifier

xgb_classifier = XGBClassifier()
# xgb_classifier.fit(X_train, y_train)

In [31]:
xgboost_params = {
    'max_depth': [3, 4, 5, 6, 7],
    'learning_rate': [0.01, 0.05, 0.1, 0.3],
    'n_estimators': [50, 100, 200, 300],
    'min_child_weight': [1, 3, 5, 7],
    'gamma': [0, 0.1, 0.2, 0.3, 0.4],
    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
    'reg_alpha': [0, 0.1, 0.5, 1, 10],
    'reg_lambda': [0, 0.1, 0.5, 1, 10],
    'scale_pos_weight': [1, 2, 3, 4, 5],
    'random_state': [42]
}

In [32]:
search_obj = RandomizedSearchCV(xgb_classifier, param_distributions = xgboost_params, cv = 5, scoring = 'f1_macro')
fit_obj = search_obj.fit(X_train, y_train)
print(fit_obj.cv_results_['mean_test_score'])
best_model_xgb = fit_obj.best_estimator_

[0.35243004 0.33691886 0.50141518 0.33696772 0.33696772 0.33696772
 0.33831848 0.33816939 0.33696772 0.33696772]


In [33]:
best_model_xgb.fit(X_train, y_train)
print("Train: ", best_model_xgb.score(X_train, y_train))
print("Test: ", best_model_xgb.score(X_test, y_test))

Train:  0.7314444444444445
Test:  0.5


In [34]:
y_pred_xgb = best_model_xgb.predict(X_test)

In [35]:
accuracy = accuracy_score(y_test, y_pred_xgb)
conf_matrix = confusion_matrix(y_test, y_pred_xgb)
f1 = f1_score(y_test, y_pred_xgb, average='weighted')

In [36]:
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.5
F1 Score: 0.4991709255194733
Confusion Matrix:
 [[232 273]
 [227 268]]


### **5. LightGBM**

In [37]:
from sklearn.ensemble import HistGradientBoostingClassifier

hist_gradient_boosting = HistGradientBoostingClassifier()
# hist_gradient_boosting.fit(X_train, y_train)

In [38]:
lightgbm_params = {
    'boosting_type': ['gbdt', 'dart', 'goss'],
    'num_leaves': [20, 30, 40, 50],
    'max_depth': [-1, 3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [50, 100, 200, 300],
    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
    'reg_alpha': [0, 0.1, 0.5, 1, 10],
    'reg_lambda': [0, 0.1, 0.5, 1, 10],
    'min_child_samples': [5, 10, 15, 20],
    'random_state': [42]
}

In [39]:
search_obj = RandomizedSearchCV(xgb_classifier, param_distributions = xgboost_params, cv = 5, scoring = 'f1_macro')
fit_obj = search_obj.fit(X_train, y_train)
print(fit_obj.cv_results_['mean_test_score'])
best_model_light_gbm = fit_obj.best_estimator_

[0.44020934 0.50466904 0.33696772 0.33696772 0.33777138 0.36093817
 0.50690919 0.34480125 0.33696772 0.33696772]


In [40]:
best_model_light_gbm.fit(X_train, y_train)
print("Train: ", best_model_light_gbm.score(X_train, y_train))
print("Test: ", best_model_light_gbm.score(X_test, y_test))

Train:  0.5877777777777777
Test:  0.486


In [41]:
y_pred_light_gbm = best_model_light_gbm.predict(X_test)

In [42]:
accuracy = accuracy_score(y_test, y_pred_light_gbm)
conf_matrix = confusion_matrix(y_test, y_pred_light_gbm)
f1 = f1_score(y_test, y_pred_light_gbm, average='weighted')

In [43]:
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.486
F1 Score: 0.48278738371321356
Confusion Matrix:
 [[206 299]
 [215 280]]


A pesar de que en algunos de los modelos sale con un score bastante bueno para el train, no da buenos resultados para el test, por lo que el mejor modelo será el que tiene mejor score en test.

Ahora, se hará la misma limpieza y conversión a los datos que con el dataset de *respuestas.csv*.
Después de ello, se harán las predicciones con el modelo elegido.

**Decision Tree** fue el que mejor evaluación tuvo con el test.
Con un score de **0.512 en el train** y **0.507 en el test**.

# **4. Limpieza de test_data**

In [44]:
import pandas as pd

In [45]:
df_test = pd.read_csv('./datasets/test_data.csv')

In [46]:
df_test.head()

Unnamed: 0,Sexo,Fumador,dedos amarillos,Ansiedad,presion de grupo,enfermedad cronica,fatiga,Alergia,Sibilancias,Consumo Alcohol,Tos,Dificultad respirar,Dificultad tragar,Dolor en pecho,Edad
0,F,2,2,2,2,2,2,1.0,1,2,1,2.0,2.0,1,83.0
1,M,1,2,2,2,1,1,1.0,2,2,2,2.0,2.0,1,71.0
2,F,2,2,2,1,2,1,1.0,1,1,2,1.0,1.0,2,65.0
3,F,1,1,1,1,2,1,2.0,1,1,2,2.0,1.0,1,69.0
4,F,1,1,1,1,1,1,1.0,1,1,1,2.0,2.0,1,55.0


In [47]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Sexo                 1000 non-null   object 
 1   Fumador              1000 non-null   object 
 2   dedos amarillos      1000 non-null   int64  
 3   Ansiedad             1000 non-null   object 
 4   presion de grupo     1000 non-null   int64  
 5   enfermedad cronica   999 non-null    object 
 6   fatiga               1000 non-null   object 
 7   Alergia              999 non-null    float64
 8   Sibilancias          999 non-null    object 
 9   Consumo Alcohol      999 non-null    object 
 10  Tos                  999 non-null    object 
 11  Dificultad respirar  999 non-null    float64
 12  Dificultad tragar    999 non-null    float64
 13  Dolor en pecho       999 non-null    object 
 14  Edad                 999 non-null    float64
dtypes: float64(4), int64(2), object(9)
memo

In [48]:
df_test['Edad'] = df_test['Edad'].replace('[^0-9]', '29', regex = True)
df_test['Edad'].fillna('29', inplace = True)
df_test['Edad'] = df_test['Edad'].astype(int)

In [49]:
df_test['Fumador'] = df_test['Fumador'].replace('[^12]', '2', regex = True)
df_test['Fumador'] = df_test['Fumador'].astype(int)

In [50]:
df_test['Ansiedad'] = df_test['Ansiedad'].replace('[^12]', '2', regex = True)
df_test['Ansiedad'] = df_test['Ansiedad'].astype(int)

In [51]:
df_test['enfermedad cronica'] = df_test['enfermedad cronica'].replace('[^12]', '1', regex = True)
df_test['enfermedad cronica'].fillna('1', inplace = True)
df_test['enfermedad cronica'] = df_test['enfermedad cronica'].astype(int)

In [52]:
df_test['fatiga'] = df_test['fatiga'].replace('[^12]', '1', regex = True)
df_test['fatiga'] = df_test['fatiga'].astype(int)

In [53]:
df_test['Alergia'] = df_test['Alergia'].replace('[^12]', '1', regex=True)
df_test['Alergia'].fillna('1', inplace = True)
df_test['Alergia'] = df_test['Alergia'].astype(int)

In [54]:
df_test['Sibilancias'] = df_test['Sibilancias'].replace('[^12]', '2', regex=True)
df_test['Sibilancias'].fillna('2', inplace = True)
df_test['Sibilancias'] = df_test['Sibilancias'].astype(int)

In [55]:
df_test['Consumo Alcohol'] = df_test['Consumo Alcohol'].replace('[^12]', '1', regex=True)
df_test['Consumo Alcohol'].fillna(1, inplace = True)
df_test['Consumo Alcohol'] = df_test['Consumo Alcohol'].astype(int)

In [56]:
df_test['Tos'] = df_test['Tos'].replace('[^12]', '1', regex=True)
df_test['Tos'].fillna('1', inplace = True)
df_test['Tos'] = df_test['Tos'].astype(int)

In [57]:
df_test['Dificultad respirar'].fillna(1, inplace = True)
df_test['Dificultad respirar'] = df_test['Dificultad respirar'].astype(int)

In [58]:
df_test['Dificultad tragar'].fillna(1, inplace = True)
df_test['Dificultad tragar'] = df_test['Dificultad tragar'].astype(int)

In [59]:
df_test['Dolor en pecho'] = df_test['Dolor en pecho'].replace('[^12]', '1', regex=True)
df_test['Dolor en pecho'].fillna('1', inplace = True)
df_test['Dolor en pecho'] = df_test['Dolor en pecho'].astype(int)

In [60]:
df_test['Sexo'] = df_test['Sexo'].map({'M': 1, 'F': 0}).astype(int)

In [61]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype
---  ------               --------------  -----
 0   Sexo                 1000 non-null   int32
 1   Fumador              1000 non-null   int32
 2   dedos amarillos      1000 non-null   int64
 3   Ansiedad             1000 non-null   int32
 4   presion de grupo     1000 non-null   int64
 5   enfermedad cronica   1000 non-null   int32
 6   fatiga               1000 non-null   int32
 7   Alergia              1000 non-null   int32
 8   Sibilancias          1000 non-null   int32
 9   Consumo Alcohol      1000 non-null   int32
 10  Tos                  1000 non-null   int32
 11  Dificultad respirar  1000 non-null   int32
 12  Dificultad tragar    1000 non-null   int32
 13  Dolor en pecho       1000 non-null   int32
 14  Edad                 1000 non-null   int32
dtypes: int32(13), int64(2)
memory usage: 66.5 KB


In [62]:
X = df_test[['Sexo', 'Edad', 'Fumador', 'dedos amarillos', 'Ansiedad', 
    'presion de grupo', 'enfermedad cronica', 'fatiga', 'Alergia', 
    'Sibilancias', 'Consumo Alcohol', 'Tos', 'Dificultad respirar', 
    'Dificultad tragar', 'Dolor en pecho']]
y_pred = best_model_tree.predict(X)

In [63]:
print(y_pred[0:20])

[0 0 0 1 1 0 0 1 1 1 0 0 1 0 1 0 0 1 1 1]


In [417]:
output = pd.Series(y_pred, name = 'Cancer de pulmon')
output = output.map({1: 'YES', 0: 'NO'}).astype(object)
output.to_csv('test_data_pred.csv', index = False)

In [89]:
from joblib import dump, load

In [93]:
dump(best_model_tree, 'best_model_tree.joblib')

['best_model_tree.joblib']