In [10]:
# STEP 1: LOADING THE DATA FRAME

# importing relevent libraries
import pandas as pd
import seaborn as sns
import numpy as np
import statistics as stats
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sn
import statsmodels.discrete.discrete_model as sm
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from  sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

# reading our data frame
data = pd.read_csv("House-Price.csv", header = 0)

# Transforming variables
sums = [data.dist1, data.dist2, data.dist3, data.dist4]
data['avg_dis'] = sum(sums) / len(sums)
del data['dist1']
del data['dist2']
del data['dist3']
del data['dist4']

# Removing missing values
data.n_hos_beds = data.n_hos_beds.fillna(data.n_hos_beds.mean())

# Adding Dummy variables
data = pd.get_dummies(data)
del data["airport_NO"]
del data["waterbody_None"]

In [3]:
# independent variable
x = data[['price']]
x.head()


# dependent variable
y = data['Sold']

# Training our Data

In [5]:
# Splitting our data set
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.2, random_state = 0)

# Creating a classifying object (in form of logistic regression model) to train our training data set
clf_TrainingModel = LogisticRegression()

# Fitting our variables into our classifying object
clf_TrainingModel.fit(x_train, y_train)

# predicting/testing the values of dependent variable based on our model
y_test_pred = clf_TrainingModel.predict(x_test)

# assessing the accuracy/performance of the model using a confusion matrix
confusion_matrix(y_test, y_test_pred)

#accuracy_score(y_test, y_test_pred)
#roc_auc_score(y_test, y_test_pred)

array([[44, 14],
       [30, 14]], dtype=int64)

# K-Nearest Neighbors Classifying Approach for SINGLE values of K

In [6]:
# standarizing our independent variables for training set
scalerTrain = preprocessing.StandardScaler().fit(x_train)
x_train_standard = scalerTrain.transform(x_train)

# standarizing our independent variables for training set
scalerTest = preprocessing.StandardScaler().fit(x_test)
x_test_standard = scalerTest.transform(x_test)

In [7]:
# creating our classifying object using the KNN approach
        # NOTE: "n_neighbors = 1" refers to k-value
knnObject1 = KNeighborsClassifier(n_neighbors = 1)

# fitting our variables into our classifying object
knnObject1.fit(x_train_standard, y_train)

# predicting/testing the values of dependent variable based on our model
knnObject1_pred = knnObject1.predict(x_test_standard)

# assessing the accuracy/performance of the model using a confusion matrix
confusion_matrix(y_test, knnObject1_pred)
accuracy_score(y_test, knnObject1_pred)

#roc_auc_score(y_test, y_test_pred)

0.5294117647058824

## Creating a single KNN classifying object for MULTIPLE values of K

In [9]:


valuesOfK = {'n_neighbors': [i for i in range(1,11)]}

# creating our classifying objects based on KNN approach
grid_search_cv = GridSearchCV(KNeighborsClassifier(), valuesOfK)

# fitting our training data variables into all of our models
grid_search_cv.fit(x_train_standard, y_train)

# finding the best value of k from the list of k-values assigned to model
grid_search_cv.best_params_

# chossing the best clasifying object model
optimized_KNN = grid_search_cv.best_estimator_

# predicting the values/classes of the dependent variable based on optimim classifying object model created
y_test_pred = optimized_KNN.predict(x_test_standard)


# analysing the performance of our classifying object mode;
confusion_matrix(y_test, y_test_pred)
accuracy_score(y_test, y_test_pred)

0.5294117647058824