# Regresión logística (introducción)

In [None]:
!wget https://raw.githubusercontent.com/jordipereiragude/dataforcourses/refs/heads/main/Default.csv

In [None]:
import pandas as pd
# Read the dataset from the local CSV file.
df = pd.read_csv("Default.csv")
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() appended a stray "None" line.
df.info()
# First rows rendered as a markdown table.
print(df.head().to_markdown())

In [None]:
# Distinct labels present in the 'default' column.
valores_default = df['default'].unique()
print("default:", valores_default)

In [None]:
# Summary statistics of the 'default' column, as returned by describe().
resumen_default = df['default'].describe()
print(resumen_default)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Balance against income, with points coloured by the 'default' label.
plt.figure(figsize=(10, 6))
ax = sns.scatterplot(data=df, x='balance', y='income', hue='default')
ax.set_title('Scatter Plot')
ax.set_xlabel('Balance')
ax.set_ylabel('Income')
ax.legend(title='Default')
plt.show()

In [None]:
# Distribution of 'balance' for each 'default' label.
plt.figure(figsize=(10, 6))
# The current axes are passed explicitly: with `by=` and no `ax`,
# DataFrame.boxplot opens a figure of its own, which would leave the figure
# created above empty and drop the requested figsize.
df.boxplot(column='balance', by='default', ax=plt.gca())
plt.title('Boxplot por balance')
plt.xlabel('Default')
plt.ylabel('Balance')
plt.show()

In [None]:
# Distribution of 'income' for each 'default' label.
plt.figure(figsize=(10, 6))
# The current axes are passed explicitly: with `by=` and no `ax`,
# DataFrame.boxplot opens a figure of its own, which would leave the figure
# created above empty and drop the requested figsize.
df.boxplot(column='income', by='default', ax=plt.gca())
plt.title('Boxplot por income')
plt.xlabel('Default')
plt.ylabel('Income')
plt.show()

In [None]:
# Number of rows per 'student' value, split by the 'default' label.
plt.figure(figsize=(8, 6))
ax = sns.countplot(x='student', hue='default', data=df)
ax.set_title('Count Plot of Default by Student')
ax.set_xlabel('Student')
ax.set_ylabel('Count')

# Write each bar's count slightly above its top; zero-height patches are skipped.
for barra in ax.patches:
    altura = barra.get_height()
    if altura > 0:
        centro_x = barra.get_x() + barra.get_width() / 2.
        ax.annotate(f'{int(altura)}',
                    (centro_x, altura),
                    ha='center', va='center', fontsize=10, color='black',
                    xytext=(0, 5), textcoords='offset points')
plt.show()

In [None]:
# Logistic regression needs a numeric (0/1) response,
# so 'Yes' is encoded as 1 and 'No' as 0 in a new column 'Y'.
codigos_default = {'Yes': 1, 'No': 0}
df['Y'] = df['default'].map(codigos_default)

In [None]:
# Plain linear regression of Y on balance.
plt.figure(figsize=(10, 6))
ax = sns.regplot(data=df, x='balance', y='Y')
ax.set(title='Regresión',
       xlabel='Balance',
       ylabel='Probabilidad de Default')
plt.show()

In [None]:
# Same plot, this time with a logistic regression curve.
plt.figure(figsize=(10, 6))
ax = sns.regplot(data=df, x='balance', y='Y', logistic=True)
ax.set(title='Regresión',
       xlabel='Balance',
       ylabel='Probabilidad de Default')
plt.show()

# Regresión logística (construcción del modelo)

In [None]:
import statsmodels.formula.api as smf
import statsmodels.api as sm

# Logistic regression of Y on balance alone.
formula_balance = 'Y ~ balance'
model = smf.logit(formula=formula_balance, data=df)
result = model.fit()
# Fitted-model report.
resumen = result.summary()
print(resumen)

Recordemos que el modelo consiste en:

$$\ln\left(\frac{p}{1-p}\right)=\beta_0+\beta_1 x_1$$

Hay que pensar que:

* El modelo considera error la diferencia entre el "log" de las odds y el resultado.
* El valor de una "odd" puede tomar valores entre 0 (cuando $p$ tiende a 0) e $\infty$ ($p$ tiende a 1).
* El modelo es una "regresión" sobre el logaritmo de las odds: $y\sim \beta_0+\beta_1 x_1$, donde $y=\ln\left(\frac{p}{1-p}\right)$
* El significado de los coeficientes debe plantearse desde el punto de vista del cambio de las "odds"

In [None]:
# Predict with the fitted model for three example balances.
saldos_ejemplo = [500, 1000, 2000]
predict_df = pd.DataFrame({'balance': saldos_ejemplo})
predictions = result.predict(predict_df)

# Show the predictions.
print(predictions)

In [None]:
# Logistic regression of Y on 'student', treated as a categorical predictor.
formula_student = 'Y ~ C(student)'
model = smf.logit(formula=formula_student, data=df)
result = model.fit()
resumen = result.summary()
print(resumen)

In [None]:
# Predict with the fitted model for each value of 'student'.
categorias_student = ['Yes', 'No']
predict_df = pd.DataFrame({'student': categorias_student})
predictions = result.predict(predict_df)

# Show the predictions.
print(predictions)

In [None]:
# Logistic regression of Y on balance plus the categorical 'student' predictor.
formula_balance_student = 'Y ~ balance + C(student)'
model = smf.logit(formula=formula_balance_student, data=df)
result = model.fit()
resumen = result.summary()
print(resumen)

# Regresión logística (selección de modelo)

In [None]:
# Logistic regression of Y on balance, student and income.
formula_completa = 'Y ~ balance + C(student) + income'
model = smf.logit(formula=formula_completa, data=df)
result = model.fit()

# Print the fitted-model report.
resumen = result.summary()
print(resumen)

In [None]:
# Refit the model with balance and student only.
formula_seleccionada = 'Y ~ balance + C(student)'
model = smf.logit(formula=formula_seleccionada, data=df)
result = model.fit()

# Print the report, then store each row's predicted probability in df.
print(result.summary())
probabilidades = result.predict(df)
df['predicted_probability'] = probabilidades

In [None]:
threshold = 0.5  # probability cut-off used to assign the positive class
supera_corte = df['predicted_probability'] >= threshold
df['prediccion'] = supera_corte.astype(int)

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Confusion matrix (mind the layout of the printed array):
#   true negatives (TN),  false positives (FP)
#   false negatives (FN), true positives (TP)
cm = confusion_matrix(df['Y'], df['prediccion'])
print(cm)

# Quality indicators derived from the true/false positive/negative counts,
# returned as a nested dict keyed by class label.
report = classification_report(df['Y'], df['prediccion'], output_dict=True)
clase_positiva = report['1']
clase_negativa = report['0']

# Main metrics
print("precision:", clase_positiva['precision'])  # precision of the positive class (1)
print("accuracy:", report['accuracy'])
print("sensitivity:", clase_positiva['recall'])  # sensitivity = recall of the positive class (1)
print("specificity:", clase_negativa['recall'])  # specificity = recall of the negative class (0)

# Other metrics
print("f1_score", clase_positiva['f1-score'])  # F1-score of the positive class (1)
print("soporte:", clase_positiva['support'])  # support of the positive class (1)
print("soporte:", clase_negativa['support'])  # support of the negative class (0)

# Elección del valor de corte

In [None]:
def checkF1(threshold,d):
  """Return the F1-score of class 1 when `d` is classified at `threshold`.

  Rows with d['predicted_probability'] >= threshold are predicted as 1, the
  rest as 0. The prediction is written to d['prediccion'] (so `d` is
  modified) and scored against d['Y']; zero_division=0 is passed to
  classification_report.
  """
  supera_corte = d['predicted_probability'] >= threshold
  d['prediccion'] = supera_corte.astype(int)
  metricas = classification_report(d['Y'], d['prediccion'], output_dict=True, zero_division=0)
  clase_positiva = metricas['1']
  return clase_positiva['f1-score']

In [None]:
# Try thresholds from 0 to 1 in steps of 0.01 and record the F1-score of each.
ejeX = [paso / 100 for paso in range(101)]
ejeY = [checkF1(corte, df) for corte in ejeX]

# Keep the first threshold reaching the highest F1-score (strict '>' comparison,
# starting from best=0 / bestVal=0).
best, bestVal = 0, 0
for threshold, val in zip(ejeX, ejeY):
  if val > bestVal:
    best, bestVal = threshold, val
print("mejorthreshold",best,"con F1-score",bestVal)

# F1-score as a function of the threshold.
plt.plot(ejeX,ejeY)
plt.show()

# Re-classify with the selected threshold and show the confusion matrix.
supera_mejor_corte = df['predicted_probability'] >= best
df['prediccion'] = supera_mejor_corte.astype(int)
cm = confusion_matrix(df['Y'], df['prediccion'])
print(cm)

# Validación y prueba

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing.
# random_state pins the shuffle so the same split is produced on every run.
train, test = train_test_split(df, test_size=0.2, random_state=42)
# Work on independent copies: columns are added to train/test in later cells,
# and assigning to a frame derived from df can raise SettingWithCopyWarning.
train = train.copy()
test = test.copy()
print(train.shape[0])
print(test.shape[0])

In [None]:
# Fit on the training rows only and store their predicted probabilities.
modelTrain = smf.logit(formula='Y ~ balance + C(student)', data=train)
result = modelTrain.fit()
probabilidades_train = result.predict(train)
train['predicted_probability'] = probabilidades_train

# Pick the threshold (0 to 1, step 0.01) with the highest F1-score on train;
# ties keep the earliest threshold because the comparison is strict.
best, bestVal = 0, 0
for paso in range(101):
  threshold = paso / 100
  val = checkF1(threshold, train)
  if not val > bestVal:
    continue
  best, bestVal = threshold, val
print("threshold escogido:",best,"con f1 score",bestVal)

In [None]:
# Evaluate on the held-out test set.
# Score the test rows with the model fitted on the training data.
test['probabilidad_prediccion'] = result.predict(test)

# Classify with `best`, the threshold selected on the training set.
# The freshly computed column is the one thresholded, not the
# 'predicted_probability' column that test inherited from df, and the cut-off
# is `best` rather than `bestVal` (which holds the F1-score, not a threshold).
test['prediccion'] = (test['probabilidad_prediccion'] >= best).astype(int)
report = classification_report(test['Y'], test['prediccion'], output_dict=True)

# Show the metrics
print("precision:",report['1']['precision'])  # precision of the positive class (1)
print("accuracy:",report['accuracy'])
print("sensitivity:",report['1']['recall'])  # sensitivity = recall of the positive class (1)
print("specificity:",report['0']['recall'])  # specificity = recall of the negative class (0)
print("f1:",report['1']['f1-score'])

In [None]:
# Number of actual positives in the test set, then its confusion matrix.
positivos_test = test[test['Y']==1]
print("número de casos verdaderos:", positivos_test.shape[0])
cm = confusion_matrix(test['Y'], test['prediccion'])
print(cm)

# Undersampling

In [None]:
# Class balance of the response: number of rows with Y == 0 and with Y == 1.
filas_clase_0 = df[df['Y'] == 0]
filas_clase_1 = df[df['Y'] == 1]
count_resultado_0 = len(filas_clase_0)
count_resultado_1 = len(filas_clase_1)
print("resultados con 0:", count_resultado_0)
print("resultados con 1:", count_resultado_1)

In [None]:
def undersampleTest(n_negativos=333, threshold=0.3, random_state=None):
  """Fit and evaluate the logistic model on an undersampled subset of `df`.

  Keeps every row of the module-level `df` with Y == 1, draws `n_negativos`
  rows with Y == 0 at random, fits 'Y ~ balance + C(student)' on the combined
  set, and prints the model summary, the classification metrics and the
  confusion matrix for that same set (an in-sample evaluation).

  Parameters
  ----------
  n_negativos : int, default 333
      Number of Y == 0 rows to sample.
  threshold : float, default 0.3
      Probability cut-off at or above which a row is predicted as 1.
  random_state : optional
      Forwarded to DataFrame.sample; None draws a different sample each call.
  """
  negativos = df[df['Y'] == 0].sample(n=n_negativos, random_state=random_state)
  nuevo_set = df[df['Y'] == 1].copy()
  nuevo_set = pd.concat([nuevo_set, negativos], ignore_index=True)
  model = smf.logit(formula='Y ~ balance + C(student)', data=nuevo_set)
  result = model.fit(disp=0)
  print(result.summary())
  # Score the rows of nuevo_set itself: predicting on another frame and
  # assigning the result here would pair probabilities with the wrong rows.
  nuevo_set['probabilidad_prediccion'] = result.predict(nuevo_set)
  # Threshold the column just computed, not the 'predicted_probability'
  # column that the rows carried over from df.
  nuevo_set['prediccion'] = (nuevo_set['probabilidad_prediccion'] >= threshold).astype(int)
  report = classification_report(nuevo_set['Y'], nuevo_set['prediccion'], output_dict=True)
  # Show the metrics
  print("precision:",report['1']['precision'])  # precision of the positive class (1)
  print("accuracy:",report['accuracy'])
  print("sensitivity:",report['1']['recall'])  # sensitivity = recall of the positive class (1)
  print("specificity:",report['0']['recall'])  # specificity = recall of the negative class (0)
  print("f1:",report['1']['f1-score'])
  print("número de casos verdaderos:",nuevo_set[nuevo_set['Y']==1].shape[0])
  cm = confusion_matrix(nuevo_set['Y'], nuevo_set['prediccion'])
  print(cm)

In [None]:
# Run the undersampling experiment; it prints the model summary, metrics and confusion matrix.
undersampleTest()