In [6]:
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, accuracy_score, f1_score, roc_auc_score, log_loss
from sklearn.preprocessing import MinMaxScaler
from time import time
import numpy as np

from utils.pipeline import create_pipeline
from utils.data_cleaning import load_and_clean

In [2]:
df = load_and_clean() #Load and clean dataset

In [3]:
x = df.drop('price', axis=1) #Features

y = df['price'] #Target

In [7]:
pipeline = create_pipeline(df,model=KNeighborsRegressor()) #Create pipeline for linear regression

In [None]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=24) #Split data into training and testing sets

In [None]:
#debug
print("x_train shape:", X_train.shape)
print("x_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("Missing values in x_train:", X_train.isnull().sum().sum())
print("Missing values in x_test:", X_test.isnull().sum().sum())

x_train shape: (49683, 41)
x_test shape: (12421, 41)
y_train shape: (49683,)
Missing values in x_train: 108556
Missing values in x_test: 26005


In [12]:
# Fit the pipeline and predict
start = time()
pipeline.fit(X_train, y_train)
X_test = pipeline.fit_transform(X_test)
y_pred = pipeline.predict(X_test)
execution_time = time() - start

AttributeError: This 'Pipeline' has no attribute 'fit_transform'

In [None]:
#Evaluation
mse = mean_squared_error(y_test, y_pred)

accuracy = accuracy_score(y_test, y_pred)

f1 = f1_score(y_test, y_pred)

roc_auc = roc_auc_score(y_test, y_pred)

log_loss = log_loss(y_test, y_pred)

cv_scores = cross_val_score(pipeline, x, y, cv=5, scoring='neg_mean_squared_error')