### IMPORT 

In [1]:
# Load libraries
import pandas
import os
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

### DATASET PATH

In [2]:
DATASET_PATH_IRIS      = os.path.join(os.path.abspath("Iris"),'iris.data')
DATASET_PATH_OD_TRAIN  = os.path.join(os.path.abspath("Occupency_Detection"),'datatraining.txt')
DATASET_PATH_OD_TEST_1 = os.path.join(os.path.abspath("Occupency_Detection"),'datatest.txt')
DATASET_PATH_OD_TEST_2 = os.path.join(os.path.abspath("Occupency_Detection"),'datatest2.txt')

### Q1 (ALL PARTS)

In [3]:
# a
print('\nA -> Load data set using pandas library\n')
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class']
df = pandas.read_csv(DATASET_PATH_IRIS, names=names)
print(df.head())

# b
print('\nB -> Print the size of data set\n')
print(df.shape)

# c
print('\nC -> Display the class distribution\n')
print(df.groupby('class').size())

#d
print('\nD -> Now, divide your data using hold out approach (80% for training and 20% for testing # train / test dataset\n')
array = df.values 
X = array[:,0:4] 
Y = array[:,4] 
t_size = 0.20 
seed = 7 
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=t_size,  
random_state=seed) 
print('DONE')

#e
print('\nE-> Apply knn classifier. See the documentation below. You need to import necessary classes\n')
knn = KNeighborsClassifier() 
knn.fit(X_train, Y_train) 
predictions = knn.predict(X_test) 
print(accuracy_score(Y_test, predictions)) 
print(confusion_matrix(Y_test, predictions)) 
print(classification_report(Y_test, predictions)) 

#f
print('\nF -> Repeat (e) by changing the value of k (k=1, 2, 3,…., 10). Print only accuracy\n')
for i in range(1,11):
    knn = KNeighborsClassifier(n_neighbors=i) 
    knn.fit(X_train, Y_train)
    predictions = knn.predict(X_test) 
    print('k='+str(i)+', Accuracy -> '+str(accuracy_score(Y_test, predictions)))


#g
print('\nG -> Repeat (e) by changing the value of seed (seed = 1, 2, 3, …. , 10). Print only accuracy\n')
for i in range(1,11):
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=t_size,  
    random_state=i) 
    knn = KNeighborsClassifier(n_neighbors=i) 
    knn.fit(X_train, Y_train)
    predictions = knn.predict(X_test) 
    print('k='+str(i)+', Accuracy -> '+str(accuracy_score(Y_test, predictions)))


A -> Load data set using pandas library

   sepal-length  sepal-width  petal-length  petal-width        class
0           5.1          3.5           1.4          0.2  Iris-setosa
1           4.9          3.0           1.4          0.2  Iris-setosa
2           4.7          3.2           1.3          0.2  Iris-setosa
3           4.6          3.1           1.5          0.2  Iris-setosa
4           5.0          3.6           1.4          0.2  Iris-setosa

B -> Print the size of data set

(150, 5)

C -> Display the class distribution

class
Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
dtype: int64

D -> Now, divide your data using hold out approach (80% for training and 20% for testing # train / test dataset

DONE

E-> Apply knn classifier. See the documentation below. You need to import necessary classes

0.9
[[ 7  0  0]
 [ 0 11  1]
 [ 0  2  9]]
                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00         7
Iris-vers

### Q2 (PART A , B ,C)

In [4]:
#a
print('\nA -> Load data set using pandas library\n')
df1_train = pandas.read_csv(DATASET_PATH_OD_TRAIN)
df1_train = df1_train.drop(['date'], axis=1)

df1_test = pandas.read_csv(DATASET_PATH_OD_TEST_1)
df1_test = df1_test.drop(['date'], axis=1)

df1_test2 = pandas.read_csv(DATASET_PATH_OD_TEST_2)
df1_test2 = df1_test2.drop(['date'], axis=1)

print(df1_train.head(),end='\n\n')
print(df1_test.head(),end='\n\n')
print(df1_test2.head(),end='\n\n')
#b
print('\nB -> Print the size of data set\n')
print(df1_train.shape)

#c
print('\nC -> Display the class distribution\n')
print(df1_train.groupby('Occupancy').size())



A -> Load data set using pandas library

   Temperature  Humidity  Light     CO2  HumidityRatio  Occupancy
1        23.18   27.2720  426.0  721.25       0.004793          1
2        23.15   27.2675  429.5  714.00       0.004783          1
3        23.15   27.2450  426.0  713.50       0.004779          1
4        23.15   27.2000  426.0  708.25       0.004772          1
5        23.10   27.2000  426.0  704.50       0.004757          1

     Temperature  Humidity       Light         CO2  HumidityRatio  Occupancy
140      23.7000    26.272  585.200000  749.200000       0.004764          1
141      23.7180    26.290  578.400000  760.400000       0.004773          1
142      23.7300    26.230  572.666667  769.666667       0.004765          1
143      23.7225    26.125  493.750000  774.750000       0.004744          1
144      23.7540    26.200  488.600000  779.000000       0.004767          1

   Temperature   Humidity       Light          CO2  HumidityRatio  Occupancy
1      21.7600  31.13

### Q2 PART (E , F) USING df1_test

In [5]:
#d
Y_train = df1_train['Occupancy'] 
X_train = df1_train.drop(['Occupancy'], axis=1)

Y_test = df1_test['Occupancy'] 
X_test = df1_test.drop(['Occupancy'], axis=1)

#e
print('\nE-> Apply knn classifier. SeQ2 PART (E , F) USING df1_test2e the documentation below. You need to import necessary classes\n')
knn = KNeighborsClassifier() 
knn.fit(X_train, Y_train) 
predictions = knn.predict(X_test) 
print(accuracy_score(Y_test, predictions)) 
print(confusion_matrix(Y_test, predictions)) 
print(classification_report(Y_test, predictions))

#f
print('\nF -> Repeat (e) by changing the value of k (k=1, 2, 3,…., 10). Print only accuracy\n')
for i in range(1,11):
    knn = KNeighborsClassifier(n_neighbors=i) 
    knn.fit(X_train, Y_train)
    predictions = knn.predict(X_test) 
    print('k='+str(i)+', Accuracy -> '+str(accuracy_score(Y_test, predictions)))


E-> Apply knn classifier. SeQ2 PART (E , F) USING df1_test2e the documentation below. You need to import necessary classes

0.9425891181988743
[[1645   48]
 [ 105  867]]
              precision    recall  f1-score   support

           0       0.94      0.97      0.96      1693
           1       0.95      0.89      0.92       972

    accuracy                           0.94      2665
   macro avg       0.94      0.93      0.94      2665
weighted avg       0.94      0.94      0.94      2665


F -> Repeat (e) by changing the value of k (k=1, 2, 3,…., 10). Print only accuracy

k=1, Accuracy -> 0.9365853658536586
k=2, Accuracy -> 0.9230769230769231
k=3, Accuracy -> 0.9350844277673546
k=4, Accuracy -> 0.9275797373358349
k=5, Accuracy -> 0.9425891181988743
k=6, Accuracy -> 0.9324577861163227
k=7, Accuracy -> 0.9609756097560975
k=8, Accuracy -> 0.9549718574108818
k=9, Accuracy -> 0.9617260787992495
k=10, Accuracy -> 0.9598499061913696


### Q2 PART (E , F) USING df1_test2

In [6]:
#d
Y_train = df1_train['Occupancy'] 
X_train = df1_train.drop(['Occupancy'], axis=1)

Y_test = df1_test2['Occupancy'] 
X_test = df1_test2.drop(['Occupancy'], axis=1)

#e
print('\nE-> Apply knn classifier. See the documentation below. You need to import necessary classes\n')
knn = KNeighborsClassifier() 
knn.fit(X_train, Y_train) 
predictions = knn.predict(X_test) 
print(accuracy_score(Y_test, predictions)) 
print(confusion_matrix(Y_test, predictions)) 
print(classification_report(Y_test, predictions))

#f
print('\nF -> Repeat (e) by changing the value of k (k=1, 2, 3,…., 10). Print only accuracy\n')

for i in range(1,11):
    knn = KNeighborsClassifier(n_neighbors=i) 
    knn.fit(X_train, Y_train)
    predictions = knn.predict(X_test) 
    print('k='+str(i)+', Accuracy -> '+str(accuracy_score(Y_test, predictions)))


E-> Apply knn classifier. See the documentation below. You need to import necessary classes

0.9621616078753076
[[7385  318]
 [  51 1998]]
              precision    recall  f1-score   support

           0       0.99      0.96      0.98      7703
           1       0.86      0.98      0.92      2049

    accuracy                           0.96      9752
   macro avg       0.93      0.97      0.95      9752
weighted avg       0.97      0.96      0.96      9752


F -> Repeat (e) by changing the value of k (k=1, 2, 3,…., 10). Print only accuracy

k=1, Accuracy -> 0.9502666119770303
k=2, Accuracy -> 0.9525225594749795
k=3, Accuracy -> 0.9579573420836751
k=4, Accuracy -> 0.9539581624282198
k=5, Accuracy -> 0.9621616078753076
k=6, Accuracy -> 0.9628794093519278
k=7, Accuracy -> 0.9649302707136997
k=8, Accuracy -> 0.9658531583264971
k=9, Accuracy -> 0.9656480721903199
k=10, Accuracy -> 0.9656480721903199


### Q3 

In [7]:
def ChiSquaredDistance(x,y):
     return sum(((x-y)**2)/(x+y))

### Q3 AS Q1 (USING Q1 DATASET)

In [8]:
# a
print('\nA -> Load data set using pandas library\n')
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class']
df = pandas.read_csv(DATASET_PATH_IRIS, names=names)
print(df.head())

# b
print('\nB -> Print the size of data set\n')
print(df.shape)

# c
print('\nC -> Display the class distribution\n')
print(df.groupby('class').size())

#d
print('\nD -> Now, divide your data using hold out approach (80% for training and 20% for testing # train / test dataset\n')
array = df.values 
X = array[:,0:4] 
Y = array[:,4] 
t_size = 0.20 
seed = 7 
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=t_size,  
random_state=seed) 
print('DONE')

#e
print('\nE-> Apply knn classifier. See the documentation below. You need to import necessary classes\n')
knn = KNeighborsClassifier(metric=ChiSquaredDistance) 
knn.fit(X_train, Y_train) 
predictions = knn.predict(X_test) 
print(accuracy_score(Y_test, predictions)) 
print(confusion_matrix(Y_test, predictions)) 
print(classification_report(Y_test, predictions)) 

#f
print('\nF -> Repeat (e) by changing the value of k (k=1, 2, 3,…., 10). Print only accuracy\n')
for i in range(1,11):
    knn = KNeighborsClassifier(metric=ChiSquaredDistance,n_neighbors=i) 
    knn.fit(X_train, Y_train)
    predictions = knn.predict(X_test) 
    print('k='+str(i)+', Accuracy -> '+str(accuracy_score(Y_test, predictions)))


#g
print('\nG -> Repeat (e) by changing the value of seed (seed = 1, 2, 3, …. , 10). Print only accuracy\n')
for i in range(1,11):
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=t_size,  
    random_state=i) 
    knn = KNeighborsClassifier(metric=ChiSquaredDistance,n_neighbors=i) 
    knn.fit(X_train, Y_train)
    predictions = knn.predict(X_test) 
    print('k='+str(i)+', Accuracy -> '+str(accuracy_score(Y_test, predictions)))


A -> Load data set using pandas library

   sepal-length  sepal-width  petal-length  petal-width        class
0           5.1          3.5           1.4          0.2  Iris-setosa
1           4.9          3.0           1.4          0.2  Iris-setosa
2           4.7          3.2           1.3          0.2  Iris-setosa
3           4.6          3.1           1.5          0.2  Iris-setosa
4           5.0          3.6           1.4          0.2  Iris-setosa

B -> Print the size of data set

(150, 5)

C -> Display the class distribution

class
Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
dtype: int64

D -> Now, divide your data using hold out approach (80% for training and 20% for testing # train / test dataset

DONE

E-> Apply knn classifier. See the documentation below. You need to import necessary classes

0.9
[[ 7  0  0]
 [ 0 10  2]
 [ 0  1 10]]
                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00         7
Iris-vers

### Q3 AS Q2 PART (E , F) (USING Q2 df1_test)

In [9]:
#d
Y_train = df1_train['Occupancy'] 
X_train = df1_train.drop(['Occupancy'], axis=1)

Y_test = df1_test['Occupancy'] 
X_test = df1_test.drop(['Occupancy'], axis=1)

#e
print('\nE-> Apply knn classifier. SeQ2 PART (E , F) USING df1_test2e the documentation below. You need to import necessary classes\n')
knn = KNeighborsClassifier(metric=ChiSquaredDistance) 
knn.fit(X_train, Y_train) 
predictions = knn.predict(X_test) 
print(accuracy_score(Y_test, predictions)) 
print(confusion_matrix(Y_test, predictions)) 
print(classification_report(Y_test, predictions))

#f
print('\nF -> Repeat (e) by changing the value of k (k=1, 2, 3,…., 10). Print only accuracy\n')
for i in range(1,11):
    knn = KNeighborsClassifier(metric=ChiSquaredDistance,n_neighbors=i) 
    knn.fit(X_train, Y_train)
    predictions = knn.predict(X_test) 
    print('k='+str(i)+', Accuracy -> '+str(accuracy_score(Y_test, predictions)))


E-> Apply knn classifier. SeQ2 PART (E , F) USING df1_test2e the documentation below. You need to import necessary classes



  
  


0.9335834896810506
[[1645   48]
 [ 129  843]]
              precision    recall  f1-score   support

           0       0.93      0.97      0.95      1693
           1       0.95      0.87      0.90       972

    accuracy                           0.93      2665
   macro avg       0.94      0.92      0.93      2665
weighted avg       0.93      0.93      0.93      2665


F -> Repeat (e) by changing the value of k (k=1, 2, 3,…., 10). Print only accuracy



  
  


k=1, Accuracy -> 0.9227016885553471


  
  


k=2, Accuracy -> 0.9050656660412758


  
  


k=3, Accuracy -> 0.924202626641651


  
  


k=4, Accuracy -> 0.9144465290806755


  
  


k=5, Accuracy -> 0.9335834896810506


  
  


k=6, Accuracy -> 0.9230769230769231


  
  


k=7, Accuracy -> 0.9530956848030019


  
  


k=8, Accuracy -> 0.9467166979362102


  
  


k=9, Accuracy -> 0.9654784240150094


  
  


k=10, Accuracy -> 0.9609756097560975


### Q3 AS Q2 PART (E , F) (USING Q2 df1_test1)

In [10]:
#d
Y_train = df1_train['Occupancy'] 
X_train = df1_train.drop(['Occupancy'], axis=1)

Y_test = df1_test2['Occupancy'] 
X_test = df1_test2.drop(['Occupancy'], axis=1)

#e
print('\nE-> Apply knn classifier. See the documentation below. You need to import necessary classes\n')
knn = KNeighborsClassifier(metric=ChiSquaredDistance) 
knn.fit(X_train, Y_train) 
predictions = knn.predict(X_test) 
print(accuracy_score(Y_test, predictions)) 
print(confusion_matrix(Y_test, predictions)) 
print(classification_report(Y_test, predictions))

#f
print('\nF -> Repeat (e) by changing the value of k (k=1, 2, 3,…., 10). Print only accuracy\n')

for i in range(1,11):
    knn = KNeighborsClassifier(metric=ChiSquaredDistance,n_neighbors=i) 
    knn.fit(X_train, Y_train)
    predictions = knn.predict(X_test) 
    print('k='+str(i)+', Accuracy -> '+str(accuracy_score(Y_test, predictions)))


E-> Apply knn classifier. See the documentation below. You need to import necessary classes



  
  


0.9607260049220673
[[7410  293]
 [  90 1959]]
              precision    recall  f1-score   support

           0       0.99      0.96      0.97      7703
           1       0.87      0.96      0.91      2049

    accuracy                           0.96      9752
   macro avg       0.93      0.96      0.94      9752
weighted avg       0.96      0.96      0.96      9752


F -> Repeat (e) by changing the value of k (k=1, 2, 3,…., 10). Print only accuracy



  
  


k=1, Accuracy -> 0.9511894995898277


  
  


k=2, Accuracy -> 0.9521123872026251


  
  


k=3, Accuracy -> 0.9558039376538146


  
  


k=4, Accuracy -> 0.9563166529942576


  
  


k=5, Accuracy -> 0.9607260049220673


  
  


k=6, Accuracy -> 0.9597005742411813


  
  


k=7, Accuracy -> 0.9609310910582445


  
  


k=8, Accuracy -> 0.9606234618539786


  
  


k=9, Accuracy -> 0.963084495488105


  
  


k=10, Accuracy -> 0.963084495488105
