In [6]:
from time import time

import numpy as np
import pandas as pd
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    log_loss,
    mean_absolute_error,
    mean_squared_error,
    r2_score,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import MinMaxScaler

from utils.pipeline import create_pipeline
from utils.data_cleaning import load_and_clean

In [2]:
# Load the dataset through the project's cleaning helper (utils.data_cleaning).
df = load_and_clean()

In [3]:
# Feature matrix: every column except the target.
x = df.drop(columns='price')

# Target vector: the column the model is trained to predict.
y = df['price']

In [7]:
# Build the project pipeline around a k-nearest-neighbours regressor
# (default hyperparameters). The steps create_pipeline adds are defined in
# utils.pipeline and are not visible from this notebook.
knn_regressor = KNeighborsRegressor()
pipeline = create_pipeline(df, model=knn_regressor)

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# reproducible across kernel restarts. The training features must be bound to
# `x_train`, the name every later cell reads.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=24)

In [9]:
# Debug: report the size of each split, then the total number of missing
# cells in the feature frames.
for caption, split in (
    ("x_train shape:", x_train),
    ("x_test shape:", x_test),
    ("y_train shape:", y_train),
):
    print(caption, split.shape)

for caption, features in (
    ("Missing values in x_train:", x_train),
    ("Missing values in x_test:", x_test),
):
    print(caption, features.isna().sum().sum())

x_train shape: (49683, 41)
x_test shape: (12421, 41)
y_train shape: (49683,)
Missing values in x_train: 108556
Missing values in x_test: 26005


In [10]:
# Fit the pipeline on the training split and predict on the held-out split.
# `execution_time` is the wall-clock duration in seconds of fit + predict
# together (both calls sit between the two time() readings).
#
# NOTE(review): as captured in this cell's output, fit() raises
# "ValueError: Input X contains NaN" — the feature frames still contain missing
# values and KNeighborsRegressor rejects them. Presumably the pipeline built by
# create_pipeline has no imputation step covering these columns; confirm in
# utils.pipeline / utils.data_cleaning and impute or drop before fitting.
start = time()
pipeline.fit(x_train, y_train)
y_pred = pipeline.predict(x_test)
execution_time = time() - start

ValueError: Input X contains NaN.
KNeighborsRegressor does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
# Evaluation
# `price` is a continuous target, so the model is scored with regression
# metrics only. Classification metrics (accuracy, F1, ROC AUC, log loss) are
# not defined for continuous predictions and raise ValueError.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # root of MSE, expressed in the same units as `price`
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# 5-fold cross-validation over the full dataset. scikit-learn reports the
# *negated* MSE so that larger is always better; flip the sign before taking
# the root to get a per-fold RMSE.
cv_scores = cross_val_score(pipeline, x, y, cv=5, scoring='neg_mean_squared_error')
cv_rmse = np.sqrt(-cv_scores)

# Display the hold-out and cross-validated scores as the cell's output.
pd.Series(
    {
        "MSE": mse,
        "RMSE": rmse,
        "MAE": mae,
        "R2": r2,
        "CV RMSE (mean)": cv_rmse.mean(),
        "CV RMSE (std)": cv_rmse.std(),
    },
    name="KNeighborsRegressor",
)