In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
import statsmodels.stats.diagnostic as diag
from sklearn.metrics import silhouette_samples, silhouette_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn import tree
from sklearn import metrics
from sklearn import datasets
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import GridSearchCV
import sklearn.preprocessing
import random
import graphviz
from sklearn.svm import SVC
from sklearn.svm import SVR
## import pyclustertend 
import matplotlib.cm as cm
np.warnings.filterwarnings('ignore', category=np.VisibleDeprecationWarning) 

Análisis de los datos

In [None]:
# Load the training set plus the two lists of column names (one name per
# line) that tell the preprocessing which columns are quantitative and which
# are qualitative.
data = pd.read_csv('./train.csv', encoding = "latin1")

# Context managers close the file handles; the original opened both files
# and never closed them.
with open('cuantitativas.txt', 'r') as file:
    quant = file.read().splitlines()
with open('cualitativas.txt', 'r') as file:
    quali = file.read().splitlines()

In [None]:
# Variables of interest: shape statistics of the sale price, followed by its
# summary statistics as the cell output.
# '\033[36m' is the ANSI escape sequence that switches the text colour to cyan.
sale_price = data['SalePrice']
kurtosis_line = 'Kurtosis: %f' % sale_price.kurt()
print('\033[36m' + kurtosis_line)
skewness_line = 'Asimetría: %f' % sale_price.skew()
print('\033[36m' + skewness_line)
sale_price.describe()

In [None]:
# Normality tests on the sale price (rows with a missing price are dropped).
sale_price_values = data[["SalePrice"]].dropna()

# `stats.shapiro` runs the Shapiro-Wilk test; the original message labelled
# this result as Kolmogorov-Smirnov, which is a different test.
stat, p = stats.shapiro(sale_price_values)
print('Prueba de Shapiro-Wilk:\np=%f\n' % p)

# Lilliefors test: returns the test statistic and its p-value.
ks_statistic, p_value = diag.lilliefors(sale_price_values)
print('Prueba de Lilliefors:\nks=%f\np=%f' % (ks_statistic, p_value))

In [None]:
# Histogram of the sale price with a kernel density estimate overlaid.
sale_price = data['SalePrice']
sns.displot(sale_price, kde=True)

In [None]:
# Label every house with one of three equal-width price bands.

# `fillna` returns a new frame, so the result must be assigned back; the
# original call discarded it and left the missing values untouched.
data = data.fillna(0)

minPrice = data['SalePrice'].min()
maxPrice = data['SalePrice'].max()
divs = (maxPrice - minPrice) / 3  # width of each price band

# Write the labels with `.loc` on the frame itself. The original seeded the
# column with 'LotArea' and assigned through chained indexing
# (data['priceRange'][mask] = ...), which raises SettingWithCopyWarning and is
# not guaranteed to write through to `data`.
data['priceRange'] = 0.0                                                  # cheap
data.loc[data['SalePrice'] >= minPrice + divs, 'priceRange'] = 1.0        # mid-priced
data.loc[data['SalePrice'] >= minPrice + divs * 2, 'priceRange'] = 2.0    # expensive

In [None]:
# Separate the class label from the remaining columns and hold out 30 % of
# the rows for testing.
Y = data['priceRange']
# NOTE(review): X still contains 'SalePrice', the column 'priceRange' is
# derived from; any model fitted on X directly would see the target.
X = data.drop(['priceRange'], axis=1)

# A fixed seed makes the split reproducible across kernel restarts; the
# original call produced a different partition on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3, train_size=0.7, random_state=42
)

Preprocesamiento

Procesamiento de variables cuantitativas y cualitativas

In [None]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Numeric columns: fill missing values with the column median, then standardise.
numeric_preprocessor = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
)

# Categorical columns: one-hot encode; categories not seen during fit are
# ignored instead of raising.
categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")

# The class label 'priceRange' is computed from 'SalePrice', so letting
# 'SalePrice' reach the classifier leaks the target and inflates every score.
# It is removed from the numeric feature list and dropped explicitly so that
# `remainder="passthrough"` cannot forward it either.
numeric_features = [col for col in quant if col != 'SalePrice']

preprocesador = ColumnTransformer([
    ('one_hot_encoder', categorical_preprocessor, quali),
    ('numerico', numeric_preprocessor, numeric_features),
    ('objetivo', 'drop', ['SalePrice']),
], remainder="passthrough")

In [None]:
# Work on a zero-filled copy of the frame, split the class label off it and
# rebind `data` to the resulting label-free frame.
# `fillna` already returns a new DataFrame, so the original stays untouched.
copieddata = data.fillna(0)

target = copieddata.pop('priceRange')
data = copieddata

In [None]:
# Cast every object-dtype column to str in a single `astype` call
# (presumably so the zeros written by the earlier fillna(0) become strings
# like the other values in those columns; confirm against the encoder).
object_columns = [col for col in data if data[col].dtype == 'object']
data = data.astype({col: str for col in object_columns})

# Reproducible 70/30 split of features and class label.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.3,
    train_size=0.7,
    random_state=42,
)

Preprocesamiento + modelo en un paso

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn import set_config

# Show estimators as diagrams when they are the cell output.
set_config(display='diagram')

# Preprocessing and a linear-kernel support vector classifier chained into a
# single estimator; the bare name on the last line renders it.
linear_svc = SVC(kernel="linear")
modelo = make_pipeline(preprocesador, linear_svc)
modelo

In [None]:
# Training inputs handed to the pipeline.
#
# The original ran pd.to_numeric(errors='coerce') over every column of
# X_train. That turned each categorical (string) column into NaN and then 0,
# so the model was trained without any categorical information while X_test
# kept its original values. The pipeline already imputes, scales and encodes,
# so the training frame is passed through unchanged apart from the NaN guard.
Xtrain = X_train.fillna(0)

# The label is numeric already; the coercion is kept only as a guard.
yTrain = pd.to_numeric(y_train, errors='coerce').fillna(0)

Entrenamiento de modelo

In [None]:
# Fit preprocessing and classifier together on the training data.
modelo.fit(X=Xtrain, y=yTrain)

In [None]:
# Score of the fitted pipeline on the same data it was trained on.
modelo.score(X=Xtrain, y=yTrain)

In [None]:
# Predict the held-out rows and report accuracy plus weighted precision and
# recall, one metric at a time.
target_pred = modelo.predict(X_test)
print(target_pred)

test_accuracy = metrics.accuracy_score(y_test, target_pred)
print ("Accuracy:", test_accuracy)

test_precision = metrics.precision_score(y_test, target_pred, average='weighted')
print ("Precision:", test_precision)

test_recall = metrics.recall_score(y_test, target_pred, average='weighted')
print ("Recall: ", test_recall)


In [None]:
from sklearn.metrics import  confusion_matrix
# confusion_matrix expects (y_true, y_pred): rows are the true classes and
# columns the predicted ones. The original passed them in the opposite order,
# which silently transposes the matrix.
confusion_matrix(y_test, target_pred)

Ajuste de validación cruzada

In [None]:
from sklearn.model_selection import cross_validate, cross_val_predict
# 8-fold cross-validation of the pipeline over the whole data set.
cv = 8
cv_results = pd.DataFrame(cross_validate(modelo, data, Y, cv=cv))

# Mean of the per-fold test scores.
mean_accuracy = cv_results['test_score'].mean()
print("accuracy: "+str(mean_accuracy))

In [None]:
# Out-of-fold prediction for every row, then the confusion matrix against the
# true labels (rows: true class, columns: predicted class).
target_pred = cross_val_predict(estimator=modelo, X=data, y=Y, cv=cv)
confusion_matrix(y_true=Y, y_pred=target_pred)

Con la validación cruzada se obtiene una estimación más fiable del rendimiento del modelo, lo que permite detectar el sobreajuste y elegir hiperparámetros que lo reduzcan.

In [None]:
# Names of every tunable parameter of the pipeline; these are the keys that
# a parameter grid for it has to use.
pipeline_params = modelo.get_params()
pipeline_params.keys()

In [None]:
# Grid search over C and the polynomial degree for a polynomial-kernel SVC.
modelo = make_pipeline(preprocesador, SVC(kernel="poly"))

# The original fitted `modelo` once here before the search. GridSearchCV
# clones and fits the estimator itself, so that extra fit only cost time and
# its result was never used.
param_grid = {
    'svc__C': (0.01, 0.1, 1, 5,16,32),
    'svc__degree':(2,3,5,7)
    }
model_grid_search = GridSearchCV(modelo, param_grid=param_grid,
                                 n_jobs=2, cv=5)
model_grid_search.fit(X_train, y_train)

# Score of the refitted best estimator on the held-out rows.
accuracy = model_grid_search.score(X_test, y_test)
print("Accuracy: ",accuracy)
model_grid_search.best_params_

In [None]:
# Grid search over C and gamma for an RBF-kernel SVC.
modelo = make_pipeline(preprocesador, SVC(kernel="rbf"))

# The original fitted `modelo` once here before the search. GridSearchCV
# clones and fits the estimator itself, so that extra fit only cost time and
# its result was never used.
param_grid = {
    'svc__C': (0.01, 0.1, 1, 5,16,32),
    'svc__gamma':(0.0000000002,0.00002,0.01,0.1,20,200)
    }
model_grid_search = GridSearchCV(modelo, param_grid=param_grid,
                                 n_jobs=2, cv=5)
model_grid_search.fit(X_train, y_train)

# Score of the refitted best estimator on the held-out rows.
accuracy = model_grid_search.score(X_test, y_test)
print("Accuracy: ",accuracy)
model_grid_search.best_params_

In [None]:
# Every parameter combination tried by the last grid search, best mean
# cross-validated score first.
cv_results = (
    pd.DataFrame(model_grid_search.cv_results_)
    .sort_values(by="mean_test_score", ascending=False)
)
cv_results

In [None]:
# Regression data set: the quantitative columns only, with incomplete rows
# removed. No transformation is applied to the target here.
train = data.loc[:, quant].dropna().copy()

# 'SalePrice' is the regression target; popping it leaves only predictors.
y = train.pop("SalePrice")
X = train  # every remaining column

In [None]:
# Reproducible 70/30 split of the regression data.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    train_size=0.7,
    random_state=42,
)

In [None]:
# Support vector regressor with a linear kernel; every other hyper-parameter
# keeps its library default.
modelosvr = SVR(kernel='linear')

In [None]:
# Fit the regressor on the training portion of the quantitative data.
modelosvr.fit(X=X_train, y=y_train)

In [None]:
# Evaluate the epsilon-support vector regression.
y_pred = modelosvr.predict(X_test)

# The original reported only the score on the training rows, which says
# nothing about generalisation. The held-out score is reported as well;
# `score` keeps its original meaning (training score) and stays the output.
score = modelosvr.score(X_train, y_train)
test_score = modelosvr.score(X_test, y_test)
print("R2 (test):", test_score)
score

# Comparación 

In [None]:
# Column-wise preprocessing shared by the three models compared below.

# Numeric columns: fill missing values with the column median, then standardise.
numeric_preprocessor = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
)

# Categorical columns: one-hot encode; categories not seen during fit are
# ignored instead of raising.
categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")

# The class label 'priceRange' is computed from 'SalePrice', so letting
# 'SalePrice' reach the classifier leaks the target and inflates every score.
# It is removed from the numeric feature list and dropped explicitly so that
# `remainder="passthrough"` cannot forward it either.
numeric_features = [col for col in quant if col != 'SalePrice']

preprocesador = ColumnTransformer([
    ('one_hot_encoder', categorical_preprocessor, quali),
    ('numerico', numeric_preprocessor, numeric_features),
    ('objetivo', 'drop', ['SalePrice']),
], remainder="passthrough")

In [None]:
# Make sure every object-dtype column holds plain strings, then rebuild the
# reproducible 70/30 classification split.
str_casts = {name: str for name in data if data[name].dtype == 'object'}
data = data.astype(str_casts)

X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size=0.3, train_size=0.7, random_state=42
)

In [None]:
# Linear-kernel SVC evaluated on the held-out rows.
modelo = make_pipeline(preprocesador, SVC(kernel="linear"))

# Fit on the split created just above, like the other two kernels. The
# original used the older `Xtrain`/`yTrain` variables, so this model was
# trained on differently prepared data than the ones it is compared with.
modelo.fit(X_train, y_train)
print(modelo.score(X_test, y_test))

X_pred = modelo.predict(X_test)  # predicted class labels for the test rows
print ("Accuracy:",metrics.accuracy_score(y_test, X_pred))
print ("Precision:", metrics.precision_score(y_test, X_pred,average='weighted') )
print ("Recall: ", metrics.recall_score(y_test, X_pred,average='weighted'))

# confusion_matrix expects (y_true, y_pred); the original order transposed it.
print(confusion_matrix(y_test, X_pred))

In [None]:
# RBF-kernel SVC evaluated on the held-out rows.
# 2e-05 is one of the gamma values explored by the RBF grid search. The
# original passed it as `degree`, which the RBF kernel ignores (and which
# must be an integer), so the tuned gamma was never actually applied.
modelo = make_pipeline(preprocesador, SVC(kernel="rbf", C=32, gamma=2e-05))
modelo.fit(X_train,y_train)
print(modelo.score(X_test, y_test))

X_pred = modelo.predict(X_test)  # predicted class labels for the test rows
print ("Accuracy:",metrics.accuracy_score(y_test, X_pred))
print ("Precision:", metrics.precision_score(y_test, X_pred,average='weighted') )
print ("Recall: ", metrics.recall_score(y_test, X_pred,average='weighted'))

# confusion_matrix expects (y_true, y_pred); the original order transposed it.
print(confusion_matrix(y_test, X_pred))

In [None]:
# Polynomial-kernel SVC (degree 4) evaluated on the held-out rows.
modelo = make_pipeline(preprocesador, SVC(kernel="poly", C=32, degree=4))
modelo.fit(X_train,y_train)
print(modelo.score(X_test, y_test))

X_pred = modelo.predict(X_test)  # predicted class labels for the test rows
print ("Accuracy:",metrics.accuracy_score(y_test, X_pred))
print ("Precision:", metrics.precision_score(y_test, X_pred,average='weighted') )
print ("Recall: ", metrics.recall_score(y_test, X_pred,average='weighted'))

# confusion_matrix expects (y_true, y_pred); the original order transposed it.
print(confusion_matrix(y_test, X_pred))

Como se puede observar, el recall obtenido por estos modelos está en un rango de 0.961-0.971. El mejor resultado corresponde al kernel lineal, con un recall de 0.971, una exactitud de 0.972 y una precisión de 0.971, y el modelo menos eficaz fue el modelo rbf. Asimismo, el modelo lineal es el que menos se demora en procesar, debido a que los datos no se proyectan en dimensiones más altas cuando se usa este núcleo.