# Laboratorio 6: Regresión Logística #

### Importación de librerías ###

In [2]:
import pandas as pd 
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

### Funciones Predeterminadas ###

In [102]:
def hipo(X, theta):
    z = np.dot(theta, X.T)
    return 1/(1+np.exp(-(z))) - 0.0000001


def costo(X, y, theta):
    y1 = hipo(X, theta)
    return -(1/len(X)) * np.sum(y*np.log(y1) + (1-y)*np.log(1-y1))


def gradiente(X, y, theta, learning_rate, max_iter):
    m =len(X)
    J = [costo(X, y, theta)] 
    for i in range(0, max_iter):
        h = hipo(X, theta)
        for i in range(0, len(X)):
            theta = theta - (learning_rate * costo(X, y, theta))
        J.append(costo(X, y, theta))
    return J, theta

### Descarga de la data y preparación ###

Se carga el dataset, se le modifica los nombres a las columnas para que estén los correctos y se observa cómo se distribuye inicialmente la data.

In [4]:
nombres = ['tipos', 'Alcohol', 'Malic Acid', 'Ash', 'Alcalinity of Ash', 'Magnesium',
           'Total Phenols', 'Flavanoids', 'Nonflavanoid Phenols', 'Proanthocyanins',
           'Color Intensity', 'Hue', 'OD280/OD315 of diluted Wines', 'Proline']
dataset = pd.read_csv('wine_data.csv', names = nombres)
dataset['unos'] = 1

In [5]:
dataset.head(5)

Unnamed: 0,tipos,Alcohol,Malic Acid,Ash,Alcalinity of Ash,Magnesium,Total Phenols,Flavanoids,Nonflavanoid Phenols,Proanthocyanins,Color Intensity,Hue,OD280/OD315 of diluted Wines,Proline,unos
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065,1
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050,1
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185,1
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480,1
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735,1


In [6]:
dataset.describe()

Unnamed: 0,tipos,Alcohol,Malic Acid,Ash,Alcalinity of Ash,Magnesium,Total Phenols,Flavanoids,Nonflavanoid Phenols,Proanthocyanins,Color Intensity,Hue,OD280/OD315 of diluted Wines,Proline,unos
count,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0
mean,1.938202,13.000618,2.336348,2.366517,19.494944,99.741573,2.295112,2.02927,0.361854,1.590899,5.05809,0.957449,2.611685,746.893258,1.0
std,0.775035,0.811827,1.117146,0.274344,3.339564,14.282484,0.625851,0.998859,0.124453,0.572359,2.318286,0.228572,0.70999,314.907474,0.0
min,1.0,11.03,0.74,1.36,10.6,70.0,0.98,0.34,0.13,0.41,1.28,0.48,1.27,278.0,1.0
25%,1.0,12.3625,1.6025,2.21,17.2,88.0,1.7425,1.205,0.27,1.25,3.22,0.7825,1.9375,500.5,1.0
50%,2.0,13.05,1.865,2.36,19.5,98.0,2.355,2.135,0.34,1.555,4.69,0.965,2.78,673.5,1.0
75%,3.0,13.6775,3.0825,2.5575,21.5,107.0,2.8,2.875,0.4375,1.95,6.2,1.12,3.17,985.0,1.0
max,3.0,14.83,5.8,3.23,30.0,162.0,3.88,5.08,0.66,3.58,13.0,1.71,4.0,1680.0,1.0


In [7]:
dataset.dtypes

tipos                             int64
Alcohol                         float64
Malic Acid                      float64
Ash                             float64
Alcalinity of Ash               float64
Magnesium                         int64
Total Phenols                   float64
Flavanoids                      float64
Nonflavanoid Phenols            float64
Proanthocyanins                 float64
Color Intensity                 float64
Hue                             float64
OD280/OD315 of diluted Wines    float64
Proline                           int64
unos                              int64
dtype: object

In [8]:
dataset['tipos'].unique()

array([1, 2, 3], dtype=int64)

In [24]:
X = dataset

In [25]:
print(X.shape)

(178, 15)


In [26]:
X.dtypes

tipos                             int64
Alcohol                         float64
Malic Acid                      float64
Ash                             float64
Alcalinity of Ash               float64
Magnesium                         int64
Total Phenols                   float64
Flavanoids                      float64
Nonflavanoid Phenols            float64
Proanthocyanins                 float64
Color Intensity                 float64
Hue                             float64
OD280/OD315 of diluted Wines    float64
Proline                           int64
unos                              int64
dtype: object

In [27]:
X1 = X[X['tipos'] == 1]
X2 = X[X['tipos'] == 2]
X3 = X[X['tipos'] == 3]

In [28]:
print(X1['tipos'].unique())
print(X2['tipos'].unique())
print(X3['tipos'].unique())

[1]
[2]
[3]


In [29]:
y1 = X1['tipos']
y2 = X2['tipos']
y3 = X3['tipos']

In [30]:
print(X1.shape, X2.shape, X3.shape, y1.shape, y2.shape, y3.shape)

(59, 15) (71, 15) (48, 15) (59,) (71,) (48,)


In [31]:
X1 = X1.drop('tipos', axis = 1)
X2 = X2.drop('tipos', axis = 1)
X3 = X3.drop('tipos', axis = 1)

In [32]:
X1.dtypes

Alcohol                         float64
Malic Acid                      float64
Ash                             float64
Alcalinity of Ash               float64
Magnesium                         int64
Total Phenols                   float64
Flavanoids                      float64
Nonflavanoid Phenols            float64
Proanthocyanins                 float64
Color Intensity                 float64
Hue                             float64
OD280/OD315 of diluted Wines    float64
Proline                           int64
unos                              int64
dtype: object

In [33]:
print(X1.shape, X2.shape, X3.shape, y1.shape, y2.shape, y3.shape)

(59, 14) (71, 14) (48, 14) (59,) (71,) (48,)


In [35]:
print(y1.unique())
print(y2.unique())
print(y3.unique())

[1]
[2]
[3]


### Estandarización, preparación y sparación de Train y Test

In [36]:
stdscl = StandardScaler()
X1_trans = stdscl.fit_transform(X1)
X2_trans = stdscl.fit_transform(X2)
X3_trans = stdscl.fit_transform(X3)

In [68]:
y1_arr = np.array(y1)
X1_arr = np.array(X1_trans)
y2_arr = np.array(y2)
X2_arr = np.array(X2_trans)
y3_arr = np.array(y3)
X3_arr = np.array(X3_trans)

### Pruebas ###

1

In [55]:
X1_train, X1_test = train_test_split(X1_arr, test_size=0.30)
y1_train, y1_test = train_test_split(y1_arr, test_size=0.30)

In [56]:
m1, n1 = X1_trans.shape
theta_01 = np.random.rand(n1, 1).T

In [49]:
costo1, theta1 = gradiente(X1_train, y1_train, theta_01, learning_rate = 0.00001, max_iter = 1500)
print(costo1)

ValueError: shapes (14,1) and (14,41) not aligned: 1 (dim 1) != 14 (dim 0)

In [51]:
print(X1_train.shape, y1_train.shape)

(41, 14) (41,)


In [57]:
y1_train = y1_train.reshape(41,1)

In [58]:
costo1, theta1 = gradiente(X1_train, y1_train, theta_01, learning_rate = 0.00001, max_iter = 1500)
print(costo1)

  


[61.51431863361212, 59.523663881474, 57.62798348412205, 55.82406370018652, 54.108841812729096, 52.47940350787352, 50.932979897918294, 49.4669437599312, 48.07880456148968, 46.7662018844905, 45.52689694220259, 44.35876201769535, 43.25976783256121, 42.22796907555301, 41.26148856424544, 40.358500752057495, 39.517215493018604, 38.73586310013759, 38.01268174845468, 37.34590816367446, 36.73377230565959, 36.17449642971717, 35.666298532545845, 35.207399817827856, 34.79603549997113, 34.43046804178733, 34.10900181223276, 33.82999815291508, 33.59188993933926, 33.39319488679647, 33.2325270498199, 33.10860616915075, 33.02026470857511, 32.9664525814277, 32.94623968661268, 32.958816456698806, 33.003492670333515, 33.07969480440416, 33.18696220504487, 33.324942347064756, 33.49338543370155, 33.69213856638023, 33.92113968965091, 34.180411490923916, 34.47005540854916, 34.790245875377416, 35.141224898299, 35.52329704760841, 35.93682490386495, 36.38222498493781, 36.859964153068624, 37.37055648206685, 37.9145

In [59]:
print(min(costo1))

32.94623968661268


2

In [69]:
X2_train, X2_test = train_test_split(X2_arr, test_size=0.30)
y2_train, y2_test = train_test_split(y2_arr, test_size=0.30)

In [70]:
m2, n2 = X2_trans.shape
theta_02 = np.random.rand(n2, 1).T

In [71]:
print(y2_train.shape, X2_train.shape)

(49,) (49, 14)


In [72]:
y2_train = y2_train.reshape(49,1)

In [73]:
costo2, theta2 = gradiente(X2_train, y2_train, theta_02, learning_rate = 0.00001, max_iter = 1500)
print(costo2)

  


[65.78704973182951, 63.74644156821288, 61.80665161916881, 59.96345983901156, 58.213118173179794, 56.552221932506576, 54.97764683452859, 53.48652263560468, 52.07622905952464, 50.74440736769237, 49.48898497791848, 48.30821274954734, 47.20071569692728, 46.165558239957505, 45.20232459853776, 44.31121332054475, 43.493141827067156, 42.74985198187248, 42.08400119830459, 41.49921651579427, 41.00008356379129, 40.59204120838451, 40.28115806419699, 40.073779100952606, 39.97604857423584, 39.99334145304037, 40.12967369440183, 40.387207127255266, 40.765989693192495, 41.264037855418415, 41.87776741659706, 42.60266006417995, 43.43398605907976, 44.36742006891016, 45.39946049348993, 46.527642027396915, 47.7505825209501, 49.067922597884305, 50.480210990979494, 51.988773723306366, 53.59559012211313, 55.30318708745577, 57.11455568185924, 59.03309024100044, 61.06254891579233, 63.2070352980526, 65.47100369059486, 67.85929689928516, 70.3772384916665, 73.03082919050422, 75.82715807301913, 78.77527881160847, 81

In [83]:
print(min(costo2))

39.97604857423584


3

In [81]:
X3_train, X3_test = train_test_split(X3_arr, test_size=0.30)
y3_train, y3_test = train_test_split(y3_arr, test_size=0.30)

In [84]:
m3, n3 = X3_trans.shape
theta_03 = np.random.rand(n3, 1).T

In [85]:
print(y3_train.shape, X3_train.shape)

(33,) (33, 14)


In [86]:
y3_train = y3_train.reshape(33,1)

In [87]:
costo3, theta3 = gradiente(X3_train, y3_train, theta_03, learning_rate = 0.00001, max_iter = 1500)
print(costo3)

  


[45.53458127171248, 44.61055833610388, 43.71504043088534, 42.84734296434601, 42.00679984281543, 41.192762746744435, 40.404600457441376, 39.64169823167301, 38.903457221027125, 38.189293932778895, 37.49863972902418, 36.830940361030514, 36.18565553608384, 35.56225851453734, 34.96023573524857, 34.37908646807696, 33.81832249255554, 33.27746780221322, 32.756058334277704, 32.253641724615825, 31.769777087764453, 31.304034821775435, 30.85599643735838, 30.425254410474984, 30.011412057148455, 29.61408342882414, 29.232893226185194, 28.8674767289116, 28.517479738496117, 28.182558530911937, 27.862379815677706, 27.556620697692058, 27.264968638118063, 26.98712141058548, 26.72278704904698, 26.47168378376789, 26.233539962146807, 26.008093951350293, 25.79509402009986, 25.5942981973672, 25.405474106215163, 25.2283987715629, 25.06285840125124, 24.908648140432845, 24.765571800002963, 24.63344156050959, 24.512077653720024, 24.401308024755846, 24.300967978416672, 24.2108998139667, 24.130952453231203, 24.06098

In [88]:
print(min(costo3))

23.839491381079487


Original

In [89]:
y = X['tipos']
X = X.drop('tipos', axis = 1)

In [91]:
print(X.shape, y.shape)

(178, 14) (178,)


In [92]:
X_trans = stdscl.fit_transform(X)
y_arr = np.array(y)
X_arr = np.array(X_trans)

In [93]:
X_train, X_test = train_test_split(X_arr, test_size=0.30)
y_train, y_test = train_test_split(y_arr, test_size=0.30)

In [94]:
m, n = X_trans.shape
theta_0 = np.random.rand(n, 1).T

In [95]:
print(y_train.shape, X_train.shape)

(124,) (124, 14)


In [96]:
y_train = y_train.reshape(124,1)

In [97]:
costo, theta = gradiente(X_train, y_train, theta_0, learning_rate = 0.00001, max_iter = 1500)
print(costo)

  


[111.65616484791892, 100.01859105428983, 97.20069059813065, 103.75744256906246, 121.09392134573496, 151.47007097045827, 198.74773833329527, 268.764579462671, 371.2592169297386, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan

In [98]:
print(min(costo))

97.20069059813065


### Resultados ###

Se puede observar que el valor más pequeño lo obtenemos de modelar la categoría 1. 