## HOJA DE TRABAJO 8 - REDES NEURONALES

In [154]:
# Importando librerias necesarias

import matplotlib.pyplot as plt
import numpy as np
import random
import statsmodels.api as sm
import statsmodels.stats.diagnostic as diag
import pandas as pd
import seaborn as sns
import scipy.stats as stats
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier

#Metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score, f1_score
from sklearn.metrics import make_scorer, accuracy_score, precision_score

### Preparación de datos

In [155]:
# Cargando dataset con pandas
data = pd.read_csv('train.csv')
data = pd.DataFrame(data)


# Limpieza de datos
columns_used = []

data['neighborhood'] = 0
NEIGHBORHOOD = list(set(nh for nh in data['Neighborhood']))
for index, neighborhood_name in enumerate(NEIGHBORHOOD):
    data.loc[data['Neighborhood'] == neighborhood_name, 'neighborhood'] = index + 1
columns_used.append('neighborhood')

data['houseStyle'] = 0
HOUSE_STYLE = list(set(hs for hs in data['HouseStyle']))
for index, houseStyle_name in enumerate(HOUSE_STYLE):
    data.loc[data['HouseStyle'] == houseStyle_name, 'houseStyle'] = index + 1
columns_used.append('houseStyle')

data['houseZone'] = 0
MS_ZONING = list(set(msz for msz in data['MSZoning']))
for index, MSZoning_name in enumerate(MS_ZONING):
    data.loc[data['MSZoning'] == MSZoning_name, 'houseZone'] = index + 1
columns_used.append('houseZone')

data['houseUtilities'] = 0
UTILITIES = list(set(u for u in data['Utilities']))
for index, utils in enumerate(UTILITIES):
    data.loc[data['Utilities'] == utils, 'houseUtilities'] = index + 1
columns_used.append('houseUtilities')

data['roadAccess'] = 0
CONDITION_1 = list(set(c1 for c1 in data['Condition1']))
for index, c1 in enumerate(CONDITION_1):
    data.loc[data['Condition1'] == c1, 'roadAccess'] = index + 1
columns_used.append('roadAccess')

data['remodelated'] = 0
data.loc[data['YearBuilt'] != data['YearRemodAdd'], 'remodelated'] = 1
columns_used.append('remodelated')

data['roofStyle'] = 0
ROOF_STYLE = list(set(rs for rs in data['RoofStyle']))
for index, rs in enumerate(ROOF_STYLE):
    data.loc[data['RoofStyle'] == rs, 'roofStyle'] = index + 1
columns_used.append('roofStyle')

data['roofMaterial'] = 0
ROOF_MATL = list(set(rm for rm in data['RoofMatl']))
for index, rm in enumerate(ROOF_MATL):
    data.loc[data['RoofMatl'] == rm, 'roofMaterial'] = index + 1
columns_used.append('roofMaterial')

data['exteriorCondition'] = 0
EXTER_COND = list(set(ec for ec in data['ExterCond']))
for index, ec in enumerate(EXTER_COND):
    data.loc[data['ExterCond'] == ec, 'exteriorCondition'] = index + 1
columns_used.append('exteriorCondition')

data['foundationMaterial'] = 0
FOUNDATION = list(set(f for f in data['Foundation']))
for index, f in enumerate(FOUNDATION):
    data.loc[data['Foundation'] == f, 'foundationMaterial'] = index + 1
columns_used.append('foundationMaterial')

data['basement'] = 0
data.loc[~data['BsmtQual'].isna(), 'basement'] = 1
columns_used.append('basement')

data['basementCondition'] = 1
data.loc[data['BsmtCond'] == "Ex", 'basementCondition'] = 3
data.loc[data['BsmtCond'] == "Gd", 'basementCondition'] = 2
data.loc[data['BsmtCond'].isna(), 'basementCondition'] = 0
columns_used.append('basementCondition')

data['fireplace'] = 0
data.loc[~data['FireplaceQu'].isna(), 'fireplace'] = 1
columns_used.append('fireplace')

data['pool'] = 0
data.loc[~data['PoolQC'].isna(), 'pool'] = 1
columns_used.append('pool')

data['additionalFeature'] = 0
data.loc[~data['MiscFeature'].isna(), 'additionalFeature'] = 1
columns_used.append('additionalFeature')

data['saleType'] = 0
SALE_TYPE = list(set(st for st in data['SaleType']))
for index, st in enumerate(SALE_TYPE):
    data.loc[data['SaleType'] == st, 'saleType'] = index + 1
columns_used.append('saleType')

data['overallQuality'] = data['OverallQual']
columns_used.append('overallQuality')

data['overallCondition'] = data['OverallCond']
columns_used.append('overallCondition')

data['livingArea'] = data['GrLivArea']
columns_used.append('livingArea')

data['yearBuilt'] = data['YearBuilt']
columns_used.append('yearBuilt')

data['salePrice'] = data['SalePrice']
columns_used.append('salePrice')

columns_not_used = [x for x in data.columns if x not in columns_used]
data = data.drop(columns_not_used, axis=1)

data.head()


Unnamed: 0,neighborhood,houseStyle,houseZone,houseUtilities,roadAccess,remodelated,roofStyle,roofMaterial,exteriorCondition,foundationMaterial,basement,basementCondition,fireplace,pool,additionalFeature,saleType,overallQuality,overallCondition,livingArea,yearBuilt,salePrice
0,5,4,5,2,3,0,6,2,3,3,1,1,0,0,0,6,7,5,1710,2003,208500
1,19,1,5,2,9,0,6,2,3,1,1,1,1,0,0,6,6,8,1262,1976,181500
2,5,4,5,2,3,1,6,2,3,3,1,1,1,0,0,6,7,5,1786,2001,223500
3,13,4,5,2,3,1,6,2,3,4,1,2,1,0,0,6,7,5,1717,1915,140000
4,18,4,5,2,3,0,6,2,3,3,1,1,1,0,0,6,8,5,2198,2000,250000


### 2. Variable respuesta - Categorización por precios

In [156]:
cheap_cut = data['salePrice'].quantile(0.25)
expensive_cut = data['salePrice'].quantile(0.75)

data['economy'] = 0
data.loc[data['salePrice'] < cheap_cut, 'economy'] = 'barata'
data.loc[data['salePrice'] > expensive_cut, 'economy'] = 'cara'
data.loc[(data['salePrice'] >= cheap_cut) & (data['salePrice'] <= expensive_cut), 'economy'] = 'media'

data.head()

Unnamed: 0,neighborhood,houseStyle,houseZone,houseUtilities,roadAccess,remodelated,roofStyle,roofMaterial,exteriorCondition,foundationMaterial,basement,basementCondition,fireplace,pool,additionalFeature,saleType,overallQuality,overallCondition,livingArea,yearBuilt,salePrice,economy
0,5,4,5,2,3,0,6,2,3,3,1,1,0,0,0,6,7,5,1710,2003,208500,media
1,19,1,5,2,9,0,6,2,3,1,1,1,1,0,0,6,6,8,1262,1976,181500,media
2,5,4,5,2,3,1,6,2,3,3,1,1,1,0,0,6,7,5,1786,2001,223500,cara
3,13,4,5,2,3,1,6,2,3,4,1,2,1,0,0,6,7,5,1717,1915,140000,media
4,18,4,5,2,3,0,6,2,3,3,1,1,1,0,0,6,8,5,2198,2000,250000,cara


### 1. Partición en conjuntos de entrenamiento y prueba

In [157]:
## Dividir el dataset en train y test

# Dividir el dataset en X e y
X = data.drop(['salePrice', 'economy'], axis=1)
y = data['economy']

# 30% de los datos para test y 70% para train
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, train_size=0.7, random_state=123)

### 3. Generación de modelos para clasificar usando variable respuesta
Genere dos modelos de redes neuronales que sean capaz de clasificar usando la variable respuesta que categoriza las casas en baratas, medias y caras. Estos modelos deben tener diferentes topologías y funciones de activación.

#### Normalización de los datos

In [158]:
scaler = StandardScaler()
scaler.fit(X)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

#### Creación de modelo 1

In [165]:
# Modelo 1
mlp1 = MLPClassifier(hidden_layer_sizes=(10, 10), max_iter=3000, activation='relu', random_state=123)
mlp1.fit(X_train, y_train)

#### Creación del modelo 2

In [166]:
# Modelo 2
mlp2 = MLPClassifier(hidden_layer_sizes=(20, 10, 5), max_iter=3000, activation='tanh', random_state=123)
mlp2.fit(X_train, y_train)

### 4. Predicción de la variable respuesta

In [167]:
# Predicciones
y_pred1 = mlp1.predict(X_test)
y_pred2 = mlp2.predict(X_test)

### 5. Matrices de confusión

In [174]:
cm1 = confusion_matrix(y_test, y_pred1)
cm1_accuracy = accuracy_score(y_test, y_pred1)
cm1_precision = precision_score(y_test, y_pred1, average='macro')
cm1_recall = recall_score(y_test, y_pred1, average='macro')

cm2 = confusion_matrix(y_test, y_pred2)
cm2_accuracy = accuracy_score(y_test, y_pred2)
cm2_precision = precision_score(y_test, y_pred2, average='macro')
cm2_recall = recall_score(y_test, y_pred2, average='macro')

print(f"Matriz de confusión del modelo 1\n{cm1}\nAccuracy: {cm1_accuracy}\nPrecision: {cm1_precision}\nRecall: {cm1_recall}\n")
print(f"Matriz de confusión del modelo 2\n{cm2}\nAccuracy: {cm2_accuracy}\nPrecision: {cm2_precision}\nRecall: {cm2_recall}\n")


Matriz de confusión del modelo 1
[[ 80   1  17]
 [  0  94  25]
 [ 17  18 186]]
Accuracy: 0.821917808219178
Precision: 0.8241300496016978
Recall: 0.8159571520916059

Matriz de confusión del modelo 2
[[ 77   1  20]
 [  0  91  28]
 [ 30  19 172]]
Accuracy: 0.776255707762557
Precision: 0.7737547232874334
Recall: 0.7762335703512173

