In [55]:
# STEP 1: LOADING THE DATA FRAME

# importing relevent libraries
import pandas as pd
import seaborn as sns
import numpy as np
import statistics as stats
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sn
import statsmodels.discrete.discrete_model as sm
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import roc_auc_score

# reading our data frame
data = pd.read_csv("House-Price.csv", header = 0)

# Transforming variables
sums = [data.dist1, data.dist2, data.dist3, data.dist4]
data['avg_dis'] = sum(sums) / len(sums)
del data['dist1']
del data['dist2']
del data['dist3']
del data['dist4']

# Removing outliers
data.n_hos_beds = data.n_hos_beds.fillna(data.n_hos_beds.mean())

# Adding Dummy variables
data = pd.get_dummies(data)
del data["airport_NO"]
del data["waterbody_None"]

In [20]:
# independent variable
x = data[['price']]
x.head()


# dependent variable
y = data['Sold']

# Logictic Regression with SINGLE predictor

In [44]:
# Training a simple logistic model through sktlearn

# First, create classification object
clf_lrSingleVariable = LogisticRegression()

# Second, fit that object using our independent and dependent variables
clf_lrSingleVariable.fit(x,y)
print(clf_lrSingleVariable.coef_, clf_lrSingleVariable.intercept_)

# Third, predict our dependent variable using that trained classification model
        # This is based on formula on slides
clf_lrSingleVariable.predict_proba(x)

[[-0.24829337 -0.013714   -0.14838597  0.82242496 -0.00547886  0.23873918
  -0.21745025  0.15973497 -0.07733501 -0.00693305 -0.00704927 -0.32478988
  -0.12461451 -0.06244828 -0.01464801  0.17000533  0.0040346 ]] [0.00573129]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


array([[1.23489690e-01, 8.76510310e-01],
       [3.79394369e-01, 6.20605631e-01],
       [9.99548346e-01, 4.51654128e-04],
       ...,
       [2.73547825e-01, 7.26452175e-01],
       [2.72884831e-01, 7.27115169e-01],
       [1.71142583e-01, 8.28857417e-01]])

In [22]:
# Training a simple logistic model through stats library

# by default, no constant is not included. So we must include it MANUALLY

# adding the constant (B0) manually
x_constant = sn.add_constant(x)
x_constant.head()

# creating the classification object(while also including the constant (B0)) and fitting our variables
logit = sm.Logit(y,x_constant).fit()

# Here, we are viewing a summary of our logistic model (not possible through sktlearn model)
logit.summary()

Optimization terminated successfully.
         Current function value: 0.676690
         Iterations 5


0,1,2,3
Dep. Variable:,Sold,No. Observations:,506.0
Model:,Logit,Df Residuals:,504.0
Method:,MLE,Df Model:,1.0
Date:,"Sat, 07 May 2022",Pseudo R-squ.:,0.01788
Time:,10:56:29,Log-Likelihood:,-342.41
converged:,True,LL-Null:,-348.64
Covariance Type:,nonrobust,LLR p-value:,0.0004142

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0.6149,0.248,2.484,0.013,0.130,1.100
price,-0.0357,0.010,-3.417,0.001,-0.056,-0.015


# Logistic Regression with MULTIPLE predictors

In [42]:
# USING SKTLEARN

# STEP 1: CREATING INDEPENDENT AND DEPENDENT VARIABLES

# independent variables(s)
x = data.loc[:, data.columns != 'Sold']
# dependent variable
y = data['Sold']

# STEP 2: CREATING OUR LOGISTIC REGRESSION MODEL AND FITTING OUR VARIABLES
clf_lrMULTIVARIABLE = LogisticRegression()
clf_lrMULTIVARIABLE.fit(x,y)

print(clf_lrMULTIVARIABLE.coef_, clf_lrMULTIVARIABLE.
      intercept_)
# STEP 3: PREDICTING DEPENDENT VARIABLE VALUES BASED ON EQUATION FROM SLIDES
clf_lrMULTIVARIABLE.predict_proba(x)

[[-0.24829337 -0.013714   -0.14838597  0.82242496 -0.00547886  0.23873918
  -0.21745025  0.15973497 -0.07733501 -0.00693305 -0.00704927 -0.32478988
  -0.12461451 -0.06244828 -0.01464801  0.17000533  0.0040346 ]] [0.00573129]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


array([[1.23489690e-01, 8.76510310e-01],
       [3.79394369e-01, 6.20605631e-01],
       [9.99548346e-01, 4.51654128e-04],
       ...,
       [2.73547825e-01, 7.26452175e-01],
       [2.72884831e-01, 7.27115169e-01],
       [1.71142583e-01, 8.28857417e-01]])

In [40]:
# USING STATSMODEL LIBRARY

# STEP 1: CREATING INDEPENDENT AND DEPENDENT VARIABLES

# independent variables(s)
x = data.loc[:, data.columns != 'Sold']
# dependent variable
y = data['Sold']


# STEP 2: ADDING B0 TO OUR LOGISTIC REGRESSION MODEL
x_constant = sn.add_constant(x)


# STEP 3: CREATING OUR LOGISTIC REGRESSION MODEL AND FITTING OUR VARIABLES
logit = sm.Logit(y, x_constant).fit()

logit.summary()

Optimization terminated successfully.
         Current function value: 0.556191
         Iterations 7


0,1,2,3
Dep. Variable:,Sold,No. Observations:,506.0
Model:,Logit,Df Residuals:,489.0
Method:,MLE,Df Model:,16.0
Date:,"Sat, 07 May 2022",Pseudo R-squ.:,0.1928
Time:,13:00:58,Log-Likelihood:,-281.43
converged:,True,LL-Null:,-348.64
Covariance Type:,nonrobust,LLR p-value:,8.894e-21

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
price,-0.2739,0.033,-8.315,0.000,-0.338,-0.209
resid_area,0.0197,0.027,0.738,0.461,-0.033,0.072
air_qual,-7.4281,2.691,-2.761,0.006,-12.702,-2.154
room_num,1.1092,0.277,4.010,0.000,0.567,1.651
age,-0.0020,0.007,-0.302,0.763,-0.015,0.011
teachers,0.3152,0.064,4.941,0.000,0.190,0.440
poor_prop,-0.2073,0.034,-6.140,0.000,-0.273,-0.141
n_hos_beds,0.1759,0.071,2.467,0.014,0.036,0.316
n_hot_rooms,-0.0644,0.055,-1.169,0.242,-0.173,0.044


# Confusion Matrix and evaluating performance

In [54]:
# predicting dependent values for multivariable regression
        # NOTE: this gives the probability of each class being assigned to each observation
predictProbMany = clf_lrMULTIVARIABLE.predict_proba(x)

# if the probability of "True" class being assigned is greater than 0.5, than that class will be assigned  
predictMany = clf_lrMULTIVARIABLE.predict(x)

# assigning boolean values to each entry of the array defined by "predictMany"
y_pred05 = (predictProbMany[:,1]) >= 0.5
y_pred05

# creating the confusion matrix
    # NOTE: the first argument represents acutal values of dependent variable
    # NOTE: the second argument represent predicted values of dependent variable above the boundary threshold
confusion_matrix(y, y_pred05)

# Getting the precision score
precision = precision_score(y, y_pred05)
# Getting the recall score
recall = recall_score(y, y_pred05)
# Getting the ROC area score
rocArea = roc_auc_score(y, y_pred05)

print(rocArea)

0.6833333333333333
