# Sprint 1 

##Preparación de datos

In [3]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc

import matplotlib.pyplot as plt
import seaborn as sns


In [4]:
df = pd.read_csv('penguins.csv')
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female


In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
# Esta instrucción ayuda a comprobar si un DataFrame contiene valores nulos en cualquier atributo
df.isnull().values.any()

True

Eliminación de **valores nulos**:

In [8]:
df = pd.read_csv('penguins.csv').dropna()
df

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,male
...,...,...,...,...,...,...,...
339,Chinstrap,Dream,55.8,19.8,207.0,4000.0,male
340,Chinstrap,Dream,43.5,18.1,202.0,3400.0,female
341,Chinstrap,Dream,49.6,18.2,193.0,3775.0,male
342,Chinstrap,Dream,50.8,19.0,210.0,4100.0,male


In [9]:
# Esta instrucción ayuda a comprobar si un DataFrame contiene valores nulos en cualquier atributo
df.isnull().values.any()

False

In [10]:
from sklearn.model_selection import train_test_split

X = df.drop('species',axis=1)
y = df['species']

X_train, X_test, y_train, y_test = train_test_split(
                        X, y, test_size=.2, random_state=42)

Imputación de **valores perdidos** en atributos numéricos:

In [11]:
X_train_num = X_train.drop(["island", "sex"], axis=1) # Solo atributos numéricos para rellenar valores perdidos

from sklearn.impute import SimpleImputer

num_imputer = SimpleImputer(strategy="median") # Rellenar valores perdidos de atributos numéricos con la mediana de éste

X_train_num_array = num_imputer.fit_transform(X_train_num)
X_train_num = pd.DataFrame(X_train_num_array, columns=X_train_num.columns, index=X_train_num.index) # Al aplicar una imputación se pierde la estructura de DataFrame, pero se puede volver a crear
X_train_num.head()

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
232,49.1,14.5,212.0,4625.0
84,37.3,17.8,191.0,3350.0
306,40.9,16.6,187.0,3200.0
22,35.9,19.2,189.0,3800.0
29,40.5,18.9,180.0,3950.0


Manejo del atributo categóricos *island* mediante **OneHotEncoder**:

In [12]:
X_train[["island", 'sex']].head()

Unnamed: 0,island,sex
232,Biscoe,female
84,Dream,female
306,Dream,female
22,Biscoe,female
29,Biscoe,male


In [13]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder()

encoder.fit(X_train[["island", 'sex']])
X_train_encoded = encoder.transform(X_train[["island",'sex']]).toarray()

X_train_encoded_df = pd.DataFrame(X_train_encoded, columns=encoder.get_feature_names(X_train[["island",'sex']].columns))

X_train_encoded_df.head()



Unnamed: 0,island_Biscoe,island_Dream,island_Torgersen,sex_female,sex_male
0,1.0,0.0,0.0,1.0,0.0
1,0.0,1.0,0.0,1.0,0.0
2,0.0,1.0,0.0,1.0,0.0
3,1.0,0.0,0.0,1.0,0.0
4,1.0,0.0,0.0,0.0,1.0


Una vez hemos realizado el tratamiento de **atributos numéricos** y **atributo categórico** de forma separada, los unificamos en una nueva versión de datos de entrenamiento. Como *OneHotEncoder* origina nuevos índices asociados a las instancias hay que **resetear los índices de ambas partes** a fusionar:

In [14]:
X_train_num.reset_index(drop=True, inplace=True) # Resetear índices de las instancias para asegurar coherencia al fusionar los datos

X_train_encoded_df.reset_index(drop=True, inplace=True) # Resetear índices de las instancias para asegurar coherencia al fusionar los datos

X_train_prepared = pd.concat([X_train_num, X_train_encoded_df], axis=1) # Se fusionan todos los atributos necesarios

X_train_prepared

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,island_Biscoe,island_Dream,island_Torgersen,sex_female,sex_male
0,49.1,14.5,212.0,4625.0,1.0,0.0,0.0,1.0,0.0
1,37.3,17.8,191.0,3350.0,0.0,1.0,0.0,1.0,0.0
2,40.9,16.6,187.0,3200.0,0.0,1.0,0.0,1.0,0.0
3,35.9,19.2,189.0,3800.0,1.0,0.0,0.0,1.0,0.0
4,40.5,18.9,180.0,3950.0,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...
261,49.6,15.0,216.0,4750.0,1.0,0.0,0.0,0.0,1.0
262,37.2,19.4,184.0,3900.0,0.0,0.0,1.0,0.0,1.0
263,39.7,17.7,193.0,3200.0,1.0,0.0,0.0,1.0,0.0
264,45.2,17.8,198.0,3950.0,0.0,1.0,0.0,1.0,0.0


Para realizar predicciones sobre los **datos de test** necesitaremos aplicar la misma secuencia de transformaciones que originalmente aplicamos sobre los datos de entrenamiento.

En los datos de **test** usamos el mismo transformador *OneHotEncoder* definido previamente para los datos de entrenamiento, invocando directamente el método *transform()*: no se debe llamar el método *fit()* de nuevo, ya que el modo de ***transformar los datos*** debe ser ***según lo aprendido de los datos de entrenamiento***.

In [15]:
X_test_encoded = encoder.transform(X_test[["island",'sex']]).toarray() # Aplicar transformador OneHotEncoder

X_test_encoded_df = pd.DataFrame(X_test_encoded,
                                 columns=encoder.get_feature_names(X_test[["island",'sex']].columns))

X_test_num = X_test.drop(["island","sex"], axis=1)

X_test_num_array = num_imputer.transform(X_test_num) # Aplicar transformador para imputar valores numéricos perdidos
X_test_num = pd.DataFrame(X_test_num_array, columns=X_test_num.columns, index=X_test_num.index) 
# Al aplicar una imputación se pierde la estructura de DataFrame, pero se puede volver a crear

# Resetear indices en atributos numéricos y en los binarios (derivados del atributo categórico), antes de fusionarlos.
X_test_num.reset_index(drop=True, inplace=True)
X_test_encoded_df.reset_index(drop=True, inplace=True)

X_test_prepared = pd.concat([X_test_num,X_test_encoded_df], axis=1) # Se fusionan todos los atributos necesarios
X_test_prepared



Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,island_Biscoe,island_Dream,island_Torgersen,sex_female,sex_male
0,39.5,16.7,178.0,3250.0,0.0,1.0,0.0,1.0,0.0
1,50.9,17.9,196.0,3675.0,0.0,1.0,0.0,1.0,0.0
2,42.1,19.1,195.0,4000.0,0.0,0.0,1.0,0.0,1.0
3,46.6,14.2,210.0,4850.0,1.0,0.0,0.0,1.0,0.0
4,41.1,18.2,192.0,4050.0,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...
62,50.5,19.6,201.0,4050.0,0.0,1.0,0.0,0.0,1.0
63,36.7,19.3,193.0,3450.0,0.0,0.0,1.0,1.0,0.0
64,35.1,19.4,193.0,4200.0,0.0,0.0,1.0,0.0,1.0
65,50.1,17.9,190.0,3400.0,0.0,1.0,0.0,1.0,0.0


## SOFTMAX

En *scikit-learn*, un **modelo de regresión softmax** es entrenado con la misma clase de regresión logística, *LogisticRegression*, con dos ajustes sencillos:

1.   Fijando el hiperparámetro *multi_class="multinomial"*:
2.   Usando un enfoque de optimización subyacente (*solver*) compatible con regresión softmax, por ejemplo "*lbfgs*" (por defecto).



In [16]:
from sklearn.linear_model import LogisticRegression

softmax_reg = LogisticRegression(multi_class="multinomial", solver="lbfgs", max_iter=1000)
softmax_reg.fit(X_train_prepared, y_train)

LogisticRegression(max_iter=1000, multi_class='multinomial')

In [17]:
X_test_prepared.describe()

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,island_Biscoe,island_Dream,island_Torgersen,sex_female,sex_male
count,67.0,67.0,67.0,67.0,67.0,67.0,67.0,67.0,67.0
mean,43.649254,17.531343,199.208955,4105.597015,0.41791,0.373134,0.208955,0.537313,0.462687
std,5.685958,1.941197,12.911749,779.056112,0.496938,0.487288,0.409631,0.502369,0.502369
min,34.6,13.2,178.0,2900.0,0.0,0.0,0.0,0.0,0.0
25%,38.4,16.25,190.0,3475.0,0.0,0.0,0.0,0.0,0.0
50%,42.9,17.9,196.0,3900.0,0.0,0.0,0.0,1.0,0.0
75%,49.05,18.95,210.0,4475.0,1.0,1.0,0.0,1.0,1.0
max,54.2,21.1,230.0,6300.0,1.0,1.0,1.0,1.0,1.0


In [18]:
mean = {'bill_length_mm': [43.649254],
        'bill_depth_mm': [17.531343],
        'flipper_length_mm': [199.208955],
        'body_mass_g': [4105.597015],
        'island_Biscoe': [0.417910],
        'island_Dream': [0.373134],
        'island_Torgersen': [0.208955],
        'sex_female':[0.537313],
        'sex_male':[0.462687]}

mean = pd.DataFrame(mean)

softmax_reg.predict(mean)


array(['Adelie'], dtype=object)

In [19]:
softmax_reg.predict_proba(mean)


array([[0.88941569, 0.03811566, 0.07246865]])

In [20]:
score = softmax_reg.score(X_train_prepared, y_train)
score

0.9962406015037594

## SVM (Support Vector Classification)

In [21]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC # SVC: Support Vector Classification
from sklearn.metrics import accuracy_score

In [22]:
svm_classifier = Pipeline([
                           ("scaler", StandardScaler()),
                           ("linear_svc", LinearSVC(C=0.1, loss="hinge", max_iter=10000)),
])
# Definir una secuencia de acciones (pipeline) sencilla, consistente en entrenar un modelo SVM precedido de un escalado de los atributos.

svm_classifier.fit(X_train_prepared, y_train)

Pipeline(steps=[('scaler', StandardScaler()),
                ('linear_svc', LinearSVC(C=0.1, loss='hinge', max_iter=10000))])

In [23]:
y_pred = svm_classifier.predict(X_test_prepared)
accuracy_score(y_test, y_pred)

1.0

Vemos que tanto SVM como Softmax nos dan resultados excelentes. Tenemos que resaltar aquí la importancia del tratamiento de los datos, gracias a:
* Eliminar valores nulos
* Imputar valores perdidos
* Tratar vatiables categóricas con un approach OneHoteEncoder
* Resetear los índices
* Unificar los datos con pandas

Nuestros modelos muestran una puntuación y precisión notables. Si tuviéramos que elegir, nos quedaríamos con SVM ya que tiene una precisión del 100%.


# Sprint 2

In [24]:
from sklearn.model_selection import train_test_split

X = df.select_dtypes(exclude=['object'])
y = df.species

X_train, X_test, y_train, y_test = train_test_split(
                        X, y, test_size=.2, random_state=42)

In [25]:
from sklearn.tree import DecisionTreeClassifier
tree_1_noHip = DecisionTreeClassifier(random_state=42)
tree_1_noHip.fit(X_train, y_train)
tree_2_split = DecisionTreeClassifier(min_samples_split=12, random_state=42)
tree_2_split.fit(X_train, y_train)
tree_3_leaf = DecisionTreeClassifier(min_samples_leaf=6, random_state=42)
tree_3_leaf.fit(X_train, y_train)
tree_4_features = DecisionTreeClassifier(max_features=2, random_state=42)
tree_4_features.fit(X_train, y_train)
tree_5_depth = DecisionTreeClassifier(max_depth=3, random_state=42)
tree_5_depth.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=3, random_state=42)

## Decision Tree 1 

In [26]:
from sklearn.metrics import accuracy_score
y_pred1_tr = tree_1_noHip.predict(X_train)
accuracy_score(y_pred1_tr, y_train)
y_pred1 = tree_1_noHip.predict(X_test)
accuracy_score(y_pred1, y_test)

1.0

## Decision Tree 2

In [27]:
y_pred2_tr = tree_2_split.predict(X_train)
accuracy_score(y_pred2_tr, y_train)
y_pred2 = tree_2_split.predict(X_test)
accuracy_score(y_pred2, y_test)

0.9701492537313433

## Decision Tree 3

In [28]:
y_pred3_tr = tree_3_leaf.predict(X_train)
accuracy_score(y_pred3_tr, y_train)
y_pred3 = tree_3_leaf.predict(X_test)
accuracy_score(y_pred3, y_test)


0.9701492537313433

## Decision Tree 4

In [29]:
y_pred4_tr = tree_4_features.predict(X_train)
accuracy_score(y_pred4_tr, y_train)
y_pred4 = tree_4_features.predict(X_test)
accuracy_score(y_pred4, y_test)

0.9402985074626866

## Decision Tree 5

In [30]:
y_pred5_tr = tree_5_depth.predict(X_train)
accuracy_score(y_pred5_tr, y_train)
y_pred5 = tree_5_depth.predict(X_test)
accuracy_score(y_pred5, y_test)

0.9701492537313433

Vemos que los cambios realizados sobre depth, leaf y split tuvieron los mismos resultados, lo cual quiere decir que llegaron a una "child" de clasificación similar. El Decision Tree con mejor accuacy es el que no ha tenido modificación en sus hiperparámetros.

# Sprint 3

Ensamble scikit-lear (elegir 2)

* BaggingClassifier
* AdaBoostClassifier
* GradientBoostingClassifier
* RandomForestClassifier

##BaggingClasifier

In [55]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bagging_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=500,
    max_samples=100, bootstrap=True, n_jobs=-1)

bagging_clf.fit(X_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_samples=100,
                  n_estimators=500, n_jobs=-1)

In [56]:
y_pred10 = bagging_clf.predict(X_test[:10])
y_pred10

array(['Adelie', 'Chinstrap', 'Adelie', 'Gentoo', 'Adelie', 'Chinstrap',
       'Chinstrap', 'Gentoo', 'Gentoo', 'Gentoo'], dtype=object)

In [33]:
y_test[:10]

30        Adelie
320    Chinstrap
79        Adelie
202       Gentoo
63        Adelie
307    Chinstrap
292    Chinstrap
187       Gentoo
219       Gentoo
204       Gentoo
Name: species, dtype: object

In [34]:
from sklearn.metrics import confusion_matrix

y_pred = bagging_clf.predict(X_test)

confusion_matrix(y_pred, y_test)

array([[31,  2,  0],
       [ 0, 16,  0],
       [ 0,  0, 18]])

In [35]:
from sklearn.metrics import accuracy_score, classification_report

accuracy_score(y_pred, y_test)

0.9701492537313433

## XClasifier

In [41]:
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,male


In [49]:
y_train.head()

232       Gentoo
84        Adelie
306    Chinstrap
22        Adelie
29        Adelie
Name: species, dtype: object

In [52]:
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier(n_estimators=50, n_jobs=-1)
rf_clf.fit(X_train_prepared,y_train)

RandomForestClassifier(n_estimators=50, n_jobs=-1)

In [60]:
from sklearn.metrics import accuracy_score, confusion_matrix

predictions_tr = rf_clf.predict(X_test_prepared)
accuracy_score(y_test, predictions_tr)

1.0

Hemos tenido buenas puntuaciones con varios modelos de aprendizaje a lo largo del ejercicio. Pero los mejores y que han llegado que se han acomodado a la perfección han sido:
- Decision Tree 1
- RandomForest
- SVM

In [61]:
%%shell
jupyter nbconvert --to html /////content/Proyecto_AS2-GDCA.ipynb

[NbConvertApp] Converting notebook /////content/Proyecto_AS2-GDCA.ipynb to html
[NbConvertApp] Writing 376785 bytes to /////content/Proyecto_AS2-GDCA.html


