# Capítulo 4 ISLP

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from ydata_profiling import ProfileReport
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
weekly = pd.read_csv("../db/Weekly.csv")
weekly.head(3)

Unnamed: 0,Year,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today,Direction
0,1990,0.816,1.572,-3.936,-0.229,-3.484,0.154976,-0.27,Down
1,1990,-0.27,0.816,1.572,-3.936,-0.229,0.148574,-2.576,Down
2,1990,-2.576,-0.27,0.816,1.572,-3.936,0.159837,3.514,Up


In [4]:
eda = ProfileReport(weekly, title="EDA")
eda.to_file("../db/eda_weekly.html")

Summarize dataset: 100%|██████████| 82/82 [00:10<00:00,  7.80it/s, Completed]                 
Generate report structure: 100%|██████████| 1/1 [00:02<00:00,  2.74s/it]
Render HTML: 100%|██████████| 1/1 [00:01<00:00,  1.94s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 74.78it/s]


In [5]:
# Use the full data set to perform a logistic regression
X = weekly[["Lag1", "Lag2", "Lag3", "Lag4", "Lag5", "Volume"]]
y = weekly["Direction"]

model = LogisticRegression()
model.fit(X, y)
print(f"Los coeficientes son {model.coef_}")
print(f"El intercepto es {model.intercept_}")

Los coeficientes son [[-0.04123352  0.05839182 -0.01604123 -0.02777908 -0.01445306 -0.02273943]]
El intercepto es [0.26685095]


In [6]:
# Ver significancia de los coeficientes
matriz_cov = model.coef_.T @model.coef_
errores_estandar = np.sqrt(np.diag(matriz_cov))
z_score = 1.96
c_inferior = model.coef_ - z_score * errores_estandar
c_superior = model.coef_ + z_score * errores_estandar

df_intervalos = pd.DataFrame({"Coeficiente": model.coef_.flatten(), 
                              "Límite inferior": c_inferior.flatten(),
                              "Límite superior": c_superior.flatten()},
                              index=X.columns)

df_intervalos

Unnamed: 0,Coeficiente,Límite inferior,Límite superior
Lag1,-0.041234,-0.122051,0.039584
Lag2,0.058392,-0.056056,0.17284
Lag3,-0.016041,-0.047482,0.0154
Lag4,-0.027779,-0.082226,0.026668
Lag5,-0.014453,-0.042781,0.013875
Volume,-0.022739,-0.067309,0.02183


In [7]:
# Predecir valores para evaluar el funcionamiento del modelo
y_pred = model.predict(X)

# calcular errores
exactitud = accuracy_score(y, y_pred, normalize=True)
exactitud

0.5610651974288338

In [11]:
matriz_confusion = confusion_matrix(y, y_pred)
plt.figure(figsize=(5,4))
sns.heatmap(matriz_confusion, annot=True, fmt="d", cmap="Blues", xticklabels=["Down", "Up"], yticklabels=["Down", "Up"])
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()
print(matriz_confusion)

[[ 54 430]
 [ 48 557]]


  plt.show()


Observamos que el modelo presenta un sesgo hacia clasificador "todo" como un positivo. Dicho de otra manera, tenemos principalmente error del tipo II.