# Regresión logística múltiple

## Librerías

In [1]:
# Tratamiento de datos
# ==============================================================================
import pandas as pd
import numpy as np

# Gráficos
# ==============================================================================
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns

# Preprocesado y modelado
# ==============================================================================
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2
# (ConfusionMatrixDisplay replaces it); this import fails on newer versions.
from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Matplotlib configuration
# ==============================================================================
plt.rcParams['image.cmap'] = "bwr"
plt.rcParams['savefig.bbox'] = "tight"
# style.use() returns None, so the previous `style.use(...) or plt.style.use(...)`
# expression always ran both calls and applied the same style twice.
plt.style.use('ggplot')

# Warnings configuration
# ==============================================================================
# NOTE(review): this silences every warning for the rest of the notebook,
# including scikit-learn data-conversion and deprecation warnings.
import warnings
warnings.filterwarnings('ignore')

## Datos

In [2]:
# Data: four numeric predictors plus the class label, entered by hand
# ==============================================================================
make = [0,0.21,0.06,0,0,0,0,0,0.15,0.06,0,0,0,0,0,0,0,0,0,0,
        0,0.05,0,0,0,0.05,0,0,0,0,1.17,0,0,0,0,0,0,0,0,0,
        0.3,0,0,0,0,0.15,0.18,0.49,0.46,0.46,0,0,0,0,0,0,0,0,0,0,
        0,0.9,0,0.08,0,0,0,0,0,0,0,0,0.18,0,0,0,0,0,0,0,0,0,0,0,
        0.08,0,0,0,0,0,0,0,0,0.29,0.26,0,0,0,0,0]
address = [0.64,0.28,0,0,0,0,0,0,0,0.12,0,0,0.69,0,0,0.42,0,
           0,0,0.63,0,0.07,0,0,0,0.07,0,0,0,0,0,0,0,0,0.68,0,
           0,0,0.48,0.41,0,0,0,0,0,0.45,0,0,0.3,0.46,0,0,0,0,
           0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.63,0,0.06,0,0,0,
           0,0,0,0,0,0,0,0,0.16,0,0,0,0,0,0,0,1.28,0,0,0,0,0,0,0]
capitalAve = [3.756,5.114,9.821,3.537,3.537,3,1.671,2.45,9.744,1.729,1.312,
              1.243,3.728,2.083,1.971,5.659,4.652,35.461,1.32,3.509,3.833,
              2.569,4.857,1.131,5.466,2.565,5.466,2.611,4,2.687,1.966,3.909,
              1.39,1.7,3.826,2.777,4.142,5.431,3.1,3.851,2.132,4,4,1.283,2.145,
              5.301,1.733,1.468,5.891,3.887,3.482,1.04,1,1.625,4.411,2.5,3.571,
              1,1.75,2.285,10.012,2.766,2,2.324,1.8,2.38,2,3.388,1.75,1.444,
              2.215,1.932,1.873,1,2.647,1.142,1.909,1,2.322,1.689,1.538,1.745,
              1.437,2.51,2.747,3.125,2.848,1.538,2.813,2,1,3.195,1.952,2.847,
              1.837,1.942,5.5,1.571,1,1]
capitalLong = [61,101,485,40,40,15,4,11,445,43,6,11,61,7,24,55,31,
               95,4,91,9,66,12,5,22,66,22,12,11,66,10,11,11,5,30,
               6,12,78,61,121,30,12,12,4,38,130,12,8,193,40,5,2,1,
               7,28,11,28,1,7,7,251,12,11,18,5,8,4,28,5,5,22,11,29,
               1,16,2,5,1,11,10,4,12,3,12,86,17,26,4,121,7,1,21,10,
               60,11,8,10,3,1,1]
# The first 50 observations are labelled 'spam', the remaining 50 'nonspam'
tipo = ['spam'] * 50 + ['nonspam'] * 50

# Assemble the frame in one constructor call; the dict order sets the column order
df = pd.DataFrame({
    'make': make,
    'address': address,
    'capitalAve': capitalAve,
    'capitalLong': capitalLong,
    'tipo': tipo,
})

df.head(3)

Unnamed: 0,make,address,capitalAve,capitalLong,tipo
0,0.0,0.64,3.756,61,spam
1,0.21,0.28,5.114,101,spam
2,0.06,0.0,9.821,485,spam


## Descripción de datos

In [3]:
# Summarise column dtypes, non-null counts and memory usage of the frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   make         100 non-null    float64
 1   address      100 non-null    float64
 2   capitalAve   100 non-null    float64
 3   capitalLong  100 non-null    int64  
 4   tipo         100 non-null    object 
dtypes: float64(3), int64(1), object(1)
memory usage: 4.0+ KB


In [4]:
# Count how many observations carry each label in the response column
df["tipo"].value_counts()

nonspam    50
spam       50
Name: tipo, dtype: int64

In [5]:
# Recode the response as 1 = spam, 0 = anything else.
# The dtype guard keeps this cell idempotent: once the column is numeric, an
# unguarded `== 'spam'` comparison on a re-run would set every label to 0.
if not pd.api.types.is_numeric_dtype(df['tipo']):
    df['tipo'] = np.where(df['tipo'] == 'spam', 1, 0)

# Absolute class counts
print("Número de observaciones por clase")
print(df['tipo'].value_counts())
print("")

# Class shares expressed as percentages
print("Porcentaje de observaciones por clase")
print(100 * df['tipo'].value_counts(normalize=True))

Número de observaciones por clase
0    50
1    50
Name: tipo, dtype: int64

Porcentaje de observaciones por clase
0    50.0
1    50.0
Name: tipo, dtype: float64


In [6]:
# Binary indicator: 1 when 'address' is strictly above 0.15, otherwise 0
df['dira'] = (df['address'] > 0.15).astype(int)

In [7]:
# Count how many observations fall on each side of the 'dira' indicator
df["dira"].value_counts()

0    86
1    14
Name: dira, dtype: int64

In [8]:
# Summary statistics (count, mean, std, quartiles, extremes) for the numeric columns
df.describe()

Unnamed: 0,make,address,capitalAve,capitalLong,tipo,dira
count,100.0,100.0,100.0,100.0,100.0,100.0
mean,0.0558,0.0783,3.18816,37.15,0.5,0.14
std,0.172663,0.210291,3.708639,73.553303,0.502519,0.348735
min,0.0,0.0,1.0,1.0,0.0,0.0
25%,0.0,0.0,1.72175,6.0,0.0,0.0
50%,0.0,0.0,2.415,12.0,0.5,0.0
75%,0.0,0.0,3.735,32.75,1.0,0.0
max,1.17,1.28,35.461,485.0,1.0,1.0


Se codifica la variable respuesta como 1 si es spam y 0 si no lo es, y se identifica cuántas observaciones hay de cada clase.

## Preparación de datos

In [9]:
# Numeric preprocessing: fill missing values with the column median, then
# standardise each column. The sklearn classes used here are imported in the
# notebook's first cell.
num_pipeline = Pipeline([
        ("imputador", SimpleImputer(strategy="median")),
        ("std_scaler", StandardScaler()),
    ])
# Preprocess predictor columns only: 'tipo' is the class label, and
# standardising it would turn the 0/1 response into arbitrary floats.
num_attribs = [col for col in df.columns if col != 'tipo']
# Apply the pipeline directly to the predictors
df_tr = num_pipeline.fit_transform(df[num_attribs])
# Same preprocessing wrapped in a ColumnTransformer; columns not listed in
# num_attribs (the response) are dropped from its output.
full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs)
    ])
df_prepared = full_pipeline.fit_transform(df)
# NOTE(review): the model further down is fitted on the unscaled X_train, so
# df_tr / df_prepared are not consumed by later cells. If they are ever used
# for modelling, fit the pipeline on the training split only to avoid leakage.

In [10]:
# Separate the response from the predictors
# ==============================================================================
# Response: the 0/1 spam label
y = df['tipo']
# Predictors: every remaining column except the raw 'address' value
# (presumably dropped because 'dira' is derived from it; confirm with author)
X = df.drop(['address', 'tipo'], axis="columns")

In [11]:
# Shuffle with a fixed seed and keep 80% of the rows for training.
# The response is handed over as an (n, 1) column array.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y.to_numpy().reshape(-1, 1),
    train_size=0.8,
    shuffle=True,
    random_state=1234,
)

## Ajuste del modelo

In [12]:
# Build the model with the scikit-learn matrix interface
# ==============================================================================
# Fit with default hyper-parameters. y_train comes out of the split as an
# (n, 1) column; scikit-learn expects a 1-D target and would otherwise emit a
# DataConversionWarning, so it is flattened here. The fitted model is unchanged.
logistic_reg_m = LogisticRegression()
logistic_reg_m.fit(X_train, np.ravel(y_train))

LogisticRegression()

In [13]:
# Intercept of the fitted model (printed as a one-element array)
print("El intercepto es", logistic_reg_m.intercept_)

El intercepto es [-1.26632752]


In [14]:
# Regression coefficients, one per predictor
# Predictor names in the column order the model was fitted with
variables = list(X_train.columns)
# coef_ has shape (1, n_features) for this binary model; take its single row
coefs = logistic_reg_m.coef_[0].tolist()
# Table pairing each predictor name ('valor') with its coefficient
coeficientes = pd.DataFrame({'valor': variables, 'coeficiente': coefs})
coeficientes

Unnamed: 0,valor,coeficiente
0,make,0.775664
1,capitalAve,0.495223
2,capitalLong,-0.002368
3,dira,0.457268


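Ecuación del modelo ajustado (en escala log-odds, con $p = \Pr(\text{tipo}=1 \mid X)$):

$\log\left(\frac{p}{1-p}\right) = -1.26632752 + 0.775664 \cdot \text{make} + 0.495223 \cdot \text{capitalAve} - 0.00236801 \cdot \text{capitalLong} + 0.457268 \cdot \text{dira}$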

### Predicciones

In [15]:
# Predicted class labels for the training observations
# (hard 0/1 predictions; no confidence interval is computed in this cell)
# ==============================================================================
clase_predicha = logistic_reg_m.predict(X_train)
clase_predicha

array([0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0,
       1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0])

In [16]:
# Class probabilities for every training row (column 0: tipo=0, column 1: tipo=1)
clase_predicha_prob = logistic_reg_m.predict_proba(X_train)
# Only the first training observation is reported
p_clase0, p_clase1 = clase_predicha_prob[0]
print("Pr(tipo=0|X_train)= %.4f" % p_clase0, ",",
      "\nPr(tipo=1|X_train)= %.4f" % p_clase1, ".")

Pr(tipo=0|X_train)= 0.6143 , 
Pr(tipo=1|X_train)= 0.3857 .


In [17]:
# Build one new observation to score. Values follow the order of `variables`
# (the X_train columns: make, capitalAve, capitalLong, dira).
datas = pd.DataFrame(np.array([[0.01, 1.000, 32, 0]]), columns = variables)

In [18]:
# Predict the class label (0 or 1) of the new observation
clase_predicha1=logistic_reg_m.predict(datas)
print(clase_predicha1)

[0]


In [19]:
# Class probabilities for the new observation (column 0: tipo=0, column 1: tipo=1)
clase_predicha_prob1 = logistic_reg_m.predict_proba(datas)
# `datas` holds a single row, so row 0 is the observation of interest
p_clase0, p_clase1 = clase_predicha_prob1[0]
print("Pr(tipo=0|datas)= %.4f" % p_clase0, ",",
      "\nPr(tipo=1|datas)= %.4f" % p_clase1, ".")

Pr(tipo=0|datas)= 0.6983 , 
Pr(tipo=1|datas)= 0.3017 .


Elaborado por Jairo Rojas