In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
# Librerías para arboles de decision 
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier, plot_tree
# Librerias para regresión logística
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [2]:
# Titanic dataset, read as CSV straight from GitHub.
url = 'https://raw.githubusercontent.com/xlisouski/DataCoder/main/Titanic.csv'
df = pd.read_csv(url)  # comma is already the default separator

In [None]:
# Preview the first rows of the loaded data.
df.head()

In [None]:
# Heads-up: Age has missing values. Count missing (True) vs. present (False).
df['Age'].isna().value_counts()

In [None]:
# Before training, fill the missing Age values with the column median.
# The variable is named "mediana" (median), so the statistic is computed with
# .median() to match; using .mean() here would contradict the name.
edad_mediana = df['Age'].median()
# fillna returns a new Series, so the original Age column is left untouched.
df['Age_Completa'] = df['Age'].fillna(edad_mediana)
# Sanity check: count how many missing values remain in the completed column.
df['Age_Completa'].isnull().value_counts()

# Árbol de decisión

In [6]:
# The algorithm used in this section does not accept categorical values,
# so the predictors are derived as True/False indicator columns.

# Build the predictors
df['Es_Mujer'] = df['Sex'].eq('female')
df['Es_Clase1'] = df['Pclass'].eq(1)
df['Es_Clase3'] = df['Pclass'].eq(3)

# Names of the columns fed to the models
lista_predictores = ['Es_Mujer', 'Es_Clase1', 'Es_Clase3', 'Age_Completa']

# Feature matrix as a NumPy array
X = df[lista_predictores].to_numpy()


In [7]:
# Target vector: the Survived column as a NumPy array
y = df['Survived'].to_numpy()

In [8]:
# Decision-tree model: hyperparameters collected in one place, then unpacked.
parametros_arbol = dict(
    random_state=1234,     # random seed
    criterion='gini',      # function used to measure split quality
    splitter='best',       # strategy for choosing the split at each node
    max_depth=4,           # maximum depth of the tree
    min_samples_leaf=20,   # minimum number of records per leaf node
)
clf = DecisionTreeClassifier(**parametros_arbol)
# Plain attribute set by this notebook to keep the predictor names at hand.
clf.feature_names = lista_predictores
# Train on the feature matrix and target built above.
model = clf.fit(X, y)

In [None]:
# Draw the fitted tree on an explicitly created 12x8 figure.
fig_arbol, ax_arbol = plt.subplots(figsize=(12, 8))
plot_tree(
    clf,
    feature_names=lista_predictores,
    class_names=['No Sobrevive', 'Sobrevive'],
    node_ids=True,       # print the node id in each box
    proportion=True,     # show proportions instead of raw counts
    impurity=False,      # hide the impurity value
    filled=True,
    rounded=True,
    ax=ax_arbol,
)

In [None]:
# Generate predictions.
# The tree was fitted on a plain NumPy array (X), so the predictors are passed
# in the same form here. Passing a DataFrame to an estimator fitted without
# feature names makes scikit-learn emit a
# "X has feature names, but ... was fitted without feature names" warning.
X_pred = df[lista_predictores].to_numpy()
df['Predict_Arbol_Clase'] = clf.predict(X_pred)  # predicted class per row
df['Predict_Arbol_Nodo'] = clf.apply(X_pred)     # leaf index each row falls into

In [None]:
# Number of rows assigned to each predicted class.
df['Predict_Arbol_Clase'].value_counts()

In [None]:
# Number of rows that landed in each leaf node of the tree.
df['Predict_Arbol_Nodo'].value_counts()

In [None]:
# Actual survival summarised per predicted class:
# row count, number of survivors and survival rate.
resumen_clase = {
    'Cant': ('Predict_Arbol_Clase', 'count'),
    'Cant_Sup': ('Survived', 'sum'),
    'Tasa_Sup': ('Survived', 'mean'),
}
resultados = df.groupby('Predict_Arbol_Clase').agg(**resumen_clase).reset_index()
resultados

In [None]:
# Actual survival summarised per leaf node. The observed survival rate of each
# node is named Predict_Prob_Arbol directly in the aggregation.
resultados_nodo = (
    df.groupby('Predict_Arbol_Nodo')
      .agg(
          Cant=('Predict_Arbol_Nodo', 'count'),
          Cant_Sup=('Survived', 'sum'),
          Predict_Prob_Arbol=('Survived', 'mean'),
      )
      .reset_index()
)
resultados_nodo

In [None]:
# Attach each row's node-level survival rate as its predicted probability.
# Any Predict_Prob_Arbol column left over from a previous run of this cell is
# dropped first; otherwise the merge would produce Predict_Prob_Arbol_x / _y
# columns and the plots that read Predict_Prob_Arbol would fail on re-run.
df = (
    df.drop(columns='Predict_Prob_Arbol', errors='ignore')
      .merge(resultados_nodo[['Predict_Arbol_Nodo', 'Predict_Prob_Arbol']],
             on='Predict_Arbol_Nodo',
             how='left')
)
df.head()

In [None]:
# Visual check: tree-predicted probability per passenger, coloured by the
# actual Survived value.
g1 = plt.figure()
sns.scatterplot(x='PassengerId', y='Predict_Prob_Arbol', hue='Survived',
                alpha=0.75, data=df)

In [None]:
# Histogram of the tree-predicted probability, split by the actual Survived value.
g1 = plt.figure()
sns.histplot(x='Predict_Prob_Arbol', hue='Survived',
             stat='probability', bins=4, alpha=0.5,
             data=df)

In [None]:
# Stacked barplot per leaf node: survival rate plus its complement.
# Tasa_NoSup is the complement of the node survival rate, so both bars sum to 1.
resultados_nodo['Tasa_NoSup'] = (1 - resultados_nodo['Predict_Prob_Arbol'])
# Sort nodes by survival rate; the sorted node ids are reused below as the bar order.
resultados_nodo = resultados_nodo.sort_values('Predict_Prob_Arbol')

# First layer: survival rate per node, in sorted order
sns.barplot(data=resultados_nodo,
            x='Predict_Arbol_Nodo',
            y='Predict_Prob_Arbol', 
            label='Porcentaje Supervivencia',
            color='#48E074',
            order=resultados_nodo['Predict_Arbol_Nodo'])

# Second layer: non-survival rate drawn with the survival rate as `bottom`.
# NOTE(review): presumably `bottom` is forwarded to matplotlib's bar call so the
# red bars stack on top of the green ones; confirm with the seaborn version in use.
sns.barplot(data=resultados_nodo, 
            x='Predict_Arbol_Nodo', 
            y='Tasa_NoSup', 
            bottom=resultados_nodo['Predict_Prob_Arbol'], 
            label='Porcentaje No Supervivencia',
            color='#E04848',
            order=resultados_nodo['Predict_Arbol_Nodo'])

# Regresión logística

In [None]:
# Logistic regression on the same four predictors used for the tree.
formula = "Survived ~ Es_Mujer + Es_Clase1 + Es_Clase3 + Age_Completa"
modelo_logit = smf.logit(formula, data=df)
reg_log = modelo_logit.fit()

# Show the results summary
print(reg_log.summary())

In [None]:
# Fitted parameters of the logistic regression, shown rounded to 4 decimals.
coeficientes = reg_log.params
coeficientes.round(4)

In [None]:
# p-values of the fitted parameters, shown rounded to 6 decimals.
pvalores = reg_log.pvalues
pvalores.round(6)

In [144]:
# Predictions: the fitted model's output for every row of df,
# stored in Predict_Prob_RegLog.
df['Predict_Prob_RegLog'] = reg_log.predict(df)

In [None]:
# Visual check: logistic-regression prediction per passenger, coloured by the
# actual Survived value.
g1 = plt.figure()
sns.scatterplot(x='PassengerId', y='Predict_Prob_RegLog', hue='Survived',
                alpha=0.75, data=df)

In [None]:
# Histogram of the logistic-regression prediction, split by the actual Survived value.
g1 = plt.figure()
sns.histplot(x='Predict_Prob_RegLog', hue='Survived',
             stat='probability', bins=15, alpha=0.5,
             data=df)

In [None]:
# Predicted class: True when the predicted probability is above 0.5.
df['Predict_RegLog_Clase'] = df['Predict_Prob_RegLog'].gt(0.5)

# Actual survival summarised per predicted class:
# row count, number of survivors and survival rate.
resumen_reglog = {
    'Cant': ('Predict_RegLog_Clase', 'count'),
    'Cant_Sup': ('Survived', 'sum'),
    'Tasa_Sup': ('Survived', 'mean'),
}
resultados = df.groupby('Predict_RegLog_Clase').agg(**resumen_reglog).reset_index()
resultados

In [148]:
# Bin the predicted probability into ten fixed-width intervals of 0.1,
# labelled 1 to 10. Note: these are equal-width bins, not true deciles
# (quantile-based bins would hold roughly equal numbers of rows).
bin_edges = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
bin_labels = range(1, 11)
# include_lowest=True makes the first interval include its left edge (0.0).
df['Predict_RegLog_Bin'] = pd.cut(df['Predict_Prob_RegLog'], 
                                  bins=bin_edges, 
                                  labels=bin_labels, 
                                  include_lowest=True)


In [None]:
# Actual survival summarised per probability bin.
# Predict_RegLog_Bin is categorical (built with pd.cut); observed=False is
# passed explicitly so every bin is kept in the result, including empty ones,
# and pandas does not warn about the changing default of `observed`.
resultados_nodo = df.groupby('Predict_RegLog_Bin', observed=False).agg(
    Cant=('Predict_RegLog_Bin', 'count'),
    Cant_Sup=('Survived', 'sum'),
    Tasa_Sup=('Survived', 'mean'),
).reset_index()
resultados_nodo

In [None]:
# Stacked barplot per probability bin: survival rate plus its complement.
# Tasa_NoSup is the complement of the bin survival rate, so both bars sum to 1.
resultados_nodo['Tasa_NoSup'] = (1 - resultados_nodo['Tasa_Sup'])
# First layer: survival rate per bin.
sns.barplot(data = resultados_nodo,
            x = 'Predict_RegLog_Bin',
            y = 'Tasa_Sup', 
            label = 'Porcentaje Supervivencia',
            color = '#48E074')
# Second layer: non-survival rate drawn with the survival rate as `bottom`.
# NOTE(review): presumably `bottom` is forwarded to matplotlib's bar call so the
# red bars stack on top of the green ones; confirm with the seaborn version in use.
sns.barplot(data=resultados_nodo, 
            x = 'Predict_RegLog_Bin', 
            y = 'Tasa_NoSup', 
            bottom=resultados_nodo['Tasa_Sup'], 
            label = 'Porcentaje No Supervivencia',
            color = '#E04848')

# Ahora les toca a ustedes.
Se van a juntar en salas de a 4 personas y van a generar dos modelos adicionales: 

1) Regresión logística con variable Fare
2) Regresión logística con variable logaritmo natural de Fare

¿Mejoraron las predicciones? ¿Cómo sacan esa conclusión?

In [None]:
# NOTE(review): this cell is unrelated to the Titanic analysis above and imports
# mid-notebook; consider moving it to a separate maintenance notebook.
import nltk

# Print the list of directories where NLTK will look for data
print(nltk.data.path)


In [None]:
import os
import shutil
import nltk

# List every directory on NLTK's data search path.
print("NLTK will look for data in these directories:")
for data_dir in nltk.data.path:
    print(data_dir)

# DESTRUCTIVE: everything below removes those directories recursively.
# Nothing is deleted unless the user types exactly "yes" (case-insensitive).
confirmation = input("Do you really want to delete all these directories and their contents? (yes/no): ")

if confirmation.lower() != 'yes':
    print("Deletion cancelled.")
else:
    for data_dir in nltk.data.path:
        # Search-path entries that are not present on disk are only reported.
        if not os.path.exists(data_dir):
            print(f"Directory does not exist: {data_dir}")
            continue
        try:
            # Remove the directory together with all of its contents.
            shutil.rmtree(data_dir)
            print(f"Successfully deleted {data_dir}")
        except Exception as err:
            # Best effort: report the failure and continue with the next path.
            print(f"Error deleting {data_dir}: {err}")


In [None]:
pip uninstall nltk

In [1]:
# Re-list NLTK's data search path. os and shutil are imported here but not
# used in this cell.
import os
import shutil
import nltk

# Print the list of directories where NLTK will look for data
print("NLTK will look for data in these directories:")
for path in nltk.data.path:
    print(path)

NLTK will look for data in these directories:
C:\Users\QY539GE/nltk_data
C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\nltk_data
C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\share\nltk_data
C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\lib\nltk_data
C:\Users\QY539GE\AppData\Roaming\nltk_data
C:\nltk_data
D:\nltk_data
E:\nltk_data


In [1]:
pip install nltk


