In [4]:
#  --- Classification Models ---

#  Using
#     sklearn
#     statsmodel


In [5]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression

# in sklearn model
# There are 3 steps :
#     * Classification object creation
#     * Fit object with x,y values
#     * Predict using trained data classification model

In [6]:
# classification object creation

clfs_lrs = LogisticRegression()

In [7]:
df = pd.read_csv('House_Price.csv')
df.head()

Unnamed: 0,price,resid_area,air_qual,room_num,age,dist1,dist2,dist3,dist4,teachers,poor_prop,airport,n_hos_beds,n_hot_rooms,waterbody,rainfall,bus_ter,parks,Sold
0,24.0,32.31,0.538,6.575,65.2,4.35,3.81,4.18,4.01,24.7,4.98,YES,5.48,11.192,River,23,YES,0.049347,0
1,21.6,37.07,0.469,6.421,78.9,4.99,4.7,5.12,5.06,22.2,9.14,NO,7.332,12.1728,Lake,42,YES,0.046146,1
2,34.7,37.07,0.469,7.185,61.1,5.03,4.86,5.01,4.97,22.2,4.03,NO,7.394,101.12,,38,YES,0.045764,0
3,33.4,32.18,0.458,6.998,45.8,6.21,5.93,6.16,5.96,21.3,2.94,YES,9.268,11.2672,Lake,45,YES,0.047151,0
4,36.2,32.18,0.458,7.147,54.2,6.16,5.86,6.37,5.86,21.3,5.33,NO,8.824,11.2896,Lake,55,YES,0.039474,0


In [8]:
#  ----------------------------------------------------------
#  x, y are independent and dependant variables respectively
#  and independent variables will always be 2 dimensional
#  ----------------------------------------------------------

In [9]:
x = df[['price']]
y = df['Sold']

In [10]:
#  fit x,y to clfs_lrs object which is trained using LogisticRegression()

clfs_lrs.fit(x,y)

In [11]:
#  now we need to find beta1 and beta0 (probaility)

In [12]:
# for beta0

clfs_lrs.coef_

array([[-0.03571865]])

In [13]:
# for beta1

clfs_lrs.intercept_

array([0.61477516])

In [14]:
# --------------------------------------------------
#  multiple predictor using sklearn

In [15]:
# Regression model only use numerical values for training
# before that we need to solve outliers and missing values

In [16]:
df.describe()

Unnamed: 0,price,resid_area,air_qual,room_num,age,dist1,dist2,dist3,dist4,teachers,poor_prop,n_hos_beds,n_hot_rooms,rainfall,parks,Sold
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,498.0,506.0,506.0,506.0,506.0
mean,22.528854,41.136779,0.554695,6.284634,68.574901,3.971996,3.628775,3.960672,3.618972,21.544466,12.653063,7.899767,13.041605,39.181818,0.054454,0.454545
std,9.182176,6.860353,0.115878,0.702617,28.148861,2.108532,2.10858,2.119797,2.099203,2.164946,7.141062,1.476683,5.238957,12.513697,0.010632,0.498422
min,5.0,30.46,0.385,3.561,2.9,1.13,0.92,1.15,0.73,18.0,1.73,5.268,10.0576,3.0,0.033292,0.0
25%,17.025,35.19,0.449,5.8855,45.025,2.27,1.94,2.2325,1.94,19.8,6.95,6.6345,11.1898,28.0,0.046464,0.0
50%,21.2,39.69,0.538,6.2085,77.5,3.385,3.01,3.375,3.07,20.95,11.36,7.999,12.72,39.0,0.053507,0.0
75%,25.0,48.1,0.624,6.6235,94.075,5.3675,4.9925,5.4075,4.985,22.6,16.955,9.088,14.1708,50.0,0.061397,1.0
max,50.0,57.74,0.871,8.78,100.0,12.32,11.93,12.32,11.94,27.4,37.97,10.876,101.12,60.0,0.086711,1.0


In [17]:
# outlier in n_hot_rooms

np.percentile(df.n_hot_rooms,[99][0])

15.39952

In [18]:
upper_v = np.percentile(df.n_hot_rooms,[99][0])

In [19]:
df.n_hot_rooms[(df.n_hot_rooms > 3 * upper_v)] = 3 * upper_v

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.n_hot_rooms[(df.n_hot_rooms > 3 * upper_v)] = 3 * upper_v


In [20]:
# outlier in rainfall

np.percentile(df.rainfall,[1][0])

20.0

In [21]:
lower_v = np.percentile(df.rainfall,[1][0])

In [22]:
df.rainfall[(df.rainfall < 0.3 * lower_v)] = 0.3 * lower_v

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rainfall[(df.rainfall < 0.3 * lower_v)] = 0.3 * lower_v


In [23]:
# missing values in n_hos_beds

df.n_hos_beds = df.n_hos_beds.fillna(df.n_hos_beds.mean())

In [24]:
df['Avg_dist'] = (df.dist1 + df.dist2 + df.dist3 + df.dist4)/4

In [25]:
del df['dist1']
del df['dist2']
del df['dist3']
del df['dist4']

In [26]:
# use dummy variables to replace non-numerical values

df = pd.get_dummies(df)

In [27]:
df.head()

Unnamed: 0,price,resid_area,air_qual,room_num,age,teachers,poor_prop,n_hos_beds,n_hot_rooms,rainfall,parks,Sold,Avg_dist,airport_NO,airport_YES,waterbody_Lake,waterbody_Lake and River,waterbody_None,waterbody_River,bus_ter_YES
0,24.0,32.31,0.538,6.575,65.2,24.7,4.98,5.48,11.192,23,0.049347,0,4.0875,0,1,0,0,0,1,1
1,21.6,37.07,0.469,6.421,78.9,22.2,9.14,7.332,12.1728,42,0.046146,1,4.9675,1,0,1,0,0,0,1
2,34.7,37.07,0.469,7.185,61.1,22.2,4.03,7.394,46.19856,38,0.045764,0,4.9675,1,0,0,0,1,0,1
3,33.4,32.18,0.458,6.998,45.8,21.3,2.94,9.268,11.2672,45,0.047151,0,6.065,0,1,1,0,0,0,1
4,36.2,32.18,0.458,7.147,54.2,21.3,5.33,8.824,11.2896,55,0.039474,0,6.0625,1,0,1,0,0,0,1


In [28]:
#  assign all the variable to x using loc

x = df.loc[:, df.columns != 'Sold']

In [29]:
y = df.Sold

In [30]:
clfs_lrs.fit(x,y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [31]:
clfs_lrs.coef_

array([[-0.25066419, -0.01773053, -0.12495443,  0.86208104, -0.00597609,
         0.24347021, -0.21564559,  0.15987085, -0.09100668, -0.00629276,
        -0.00573183, -0.33914928,  0.06321962, -0.05160104, -0.10032377,
        -0.02874492, -0.03869854,  0.17938581,  0.01161858]])

In [32]:
clfs_lrs.intercept_

array([0.01294574])

In [33]:
# ----------------------------------------------------
# ----------------- Confusion Matrix -----------------

#  there are two type error in Confusion matrix

#         Type I error - false positive
#         Type II error - false negative


In [34]:
#  generally 0.5 is used for prediction normally
    
#     > 0.5 = 1 (True)
#     < 0.5 = 0 (False)
    
clfs_lrs.predict_proba(x)

array([[0.1149744 , 0.8850256 ],
       [0.38455267, 0.61544733],
       [0.98104109, 0.01895891],
       ...,
       [0.27713903, 0.72286097],
       [0.28129244, 0.71870756],
       [0.17005803, 0.82994197]])

In [35]:
y_predict = clfs_lrs.predict(x)
y_predict

array([1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0,
       1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1,

In [36]:
# if we want to use 0.3 for prediction we have to assign it

y_predict_03 = (clfs_lrs.predict_proba(x)[:1] >= 0.3).astype(bool)

In [37]:
# for creating confusion matrix 
from sklearn.metrics import confusion_matrix

In [38]:
confusion_matrix(y, y_predict)

array([[197,  79],
       [ 77, 153]])

In [64]:

# ------------------------------------------------------

In [40]:
# ------------ Linear Discriminant Analysis ------------ 

#  Naturally used for supervised classification problems
#  LDA/ Logical Regression are used in pre-processing steps
#  LDA is better than Logical Regression
#  Can be used to seperate when there are more than one classes
#  We can check it from higher dimension to lower dimenstion

#  Based on
#     Bayes Classifier:
#         Assign condition probability to all classes and assign the class with highest probability



In [41]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [42]:
clfs_lda = LinearDiscriminantAnalysis()

In [43]:
clfs_lda.fit(x,y)

In [44]:
y_predict_lda = clfs_lda.predict(x)
y_predict_lda

array([1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0,
       1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1,

In [45]:
# Confusion matrix of Linear Discriminant Analysis

confusion_matrix(y,y_predict_lda)

array([[192,  84],
       [ 79, 151]])

In [63]:

# ----------------------------------------------------

In [47]:
# ----------------- Test-Train Split ----------------- 

# There are 2 type dataset 
#     - Training set
#     - Test set

# Training error :
#     Performance on the model on the previously seen data

# Test error :
#     Performance of model on the unseen data


# ------------ Test-Train Split Technique ------------

# 1. Validation Set Approach

#     * Random division of data to 2 parts
#     * Usual split is 80:20 (Training:Test)
#     * When to use - In case of large number of observations
    
    
# 2. Leave one out cross validation
    
#     * Leaving one observation everytime from training set

    
# 3. K-Fold validation

#     * Divide the data to K sets
#     * We will keep one testing and K-1 for training

In [49]:
from sklearn.model_selection import train_test_split

In [50]:
#  by using test_size = 0.2 it will be a ratio 80:20

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2, random_state=0)

In [51]:
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(404, 19) (102, 19) (404,) (102,)


In [52]:
# object creation for the model

clf_LR = LogisticRegression()

In [53]:
clf_LR.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [54]:
y_test_predict = clf_LR.predict(x_test)

In [55]:
from sklearn.metrics import accuracy_score, confusion_matrix

In [56]:
confusion_matrix(y_test,y_test_predict)

array([[38, 20],
       [16, 28]])

In [57]:
accuracy_score(y_test, y_test_predict)

0.6470588235294118

In [61]:

# -----------------------------------------------------------------------------------


In [60]:
# ------------------------------ K-Nearest Neighbour -------------------------------- 

#  K-NN is one of the most simplest ML algorithms based on Supervised Learning technique.
#  K-NN assumes that similarity b/w the new cases/data and available cases and put the new 
#  case into the category that is most similar to the available categories

#  K-NN Working Algorithm :
# 
#     Step 1 : Select number K of the neighbours.
#     Step 2 : Calculate Euclidean distance of K number of neighbors.
#     Step 3 : Take the K nearest neighbors as per the calculated Euclidean distance.
#     Step 4 : Among these K neighbors, count the number of the data points in each category 
#     Step 5 : Assign the new data points to that category for which the number of the neighbor is max
#     Step 6 : Our model is ready


In [65]:
from sklearn import preprocessing

In [66]:
scaler = preprocessing.StandardScaler().fit(x_train)

In [71]:
x_train_s = scaler.transform(x_train)
x_train_s

array([[ 0.49166434, -0.79090621, -0.36300899, ..., -0.66077206,
         1.32542701,  0.        ],
       [-0.0656791 , -0.0601719 , -1.15694367, ..., -0.66077206,
        -0.75447384,  0.        ],
       [ 0.52510495, -0.99221479, -1.03025196, ...,  1.51338117,
        -0.75447384,  0.        ],
       ...,
       [-0.34435082, -0.18698046, -0.02516445, ..., -0.66077206,
        -0.75447384,  0.        ],
       [-0.63416941, -0.67836362, -0.86132969, ..., -0.66077206,
         1.32542701,  0.        ],
       [ 0.09037706, -1.13170421, -0.33767065, ..., -0.66077206,
         1.32542701,  0.        ]])

In [68]:
scaler = preprocessing.StandardScaler().fit(x_test)

In [72]:
x_test_s = scaler.transform(x_test)
x_test_s

array([[ 3.46427198e-02, -1.26009787e+00, -7.59976330e-01, ...,
         1.51338117e+00, -7.54473836e-01,  0.00000000e+00],
       [ 3.08888478e+00,  1.09537107e+00,  6.84309089e-01, ...,
         1.51338117e+00, -7.54473836e-01,  0.00000000e+00],
       [ 7.92301952e-02,  4.28041039e-01,  1.73890573e-04, ...,
         1.51338117e+00, -7.54473836e-01,  0.00000000e+00],
       ...,
       [-4.00085165e-01, -9.60512654e-01, -8.19099125e-01, ...,
        -6.60772062e-01,  1.32542701e+00,  0.00000000e+00],
       [ 6.47720506e-01, -9.50442563e-02, -5.15039037e-01, ...,
         1.51338117e+00, -7.54473836e-01,  0.00000000e+00],
       [-2.77469608e-01, -4.16820968e-01, -2.53209516e-01, ...,
        -6.60772062e-01, -7.54473836e-01,  0.00000000e+00]])

In [73]:
from sklearn.neighbors import KNeighborsClassifier

In [74]:
clf_knn_1 = KNeighborsClassifier(n_neighbors=1)

In [75]:
clf_knn_1.fit(x_train_s, y_train)

In [76]:
confusion_matrix(y_test, clf_knn_1.predict(x_test_s))

array([[32, 26],
       [19, 25]])

In [77]:
accuracy_score(y_test, clf_knn_1.predict(x_test_s))

0.5588235294117647

In [78]:
# when neighbor is 3 
clf_knn_3 = KNeighborsClassifier(n_neighbors=3)

In [79]:
clf_knn_3.fit(x_train_s, y_train)

In [80]:
confusion_matrix(y_test, clf_knn_3.predict(x_test_s))

array([[29, 29],
       [23, 21]])

In [81]:
accuracy_score(y_test, clf_knn_3.predict(x_test_s))

0.49019607843137253

In [82]:
# ----- When there are more neighbors -----

In [83]:
from sklearn.model_selection import GridSearchCV

In [84]:
params = {'n_neighbors':[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}

In [85]:
grid_search_cv = GridSearchCV(KNeighborsClassifier(),params)

In [86]:
grid_search_cv.fit(x_train, y_train)

In [87]:
# find best data point/ parameter

grid_search_cv.best_params_

{'n_neighbors': 9}

In [89]:
optimised_KNN = grid_search_cv.best_estimator_