# Q5. Use the Naive Bayes, K-nearest neighbors, and Decision Tree classification algorithms to build classifiers. Divide the data set into training and test sets. Compare the accuracy of the different classifiers under the following situations:
- ## 5.1 a) Training set = 75% Test set = 25% b) Training set = 66.6% (2/3rd of total), Test set = 33.3% 
- ## 5.2 Training set is chosen by i) hold out method ii) Random subsampling iii) Cross-Validation. Compare the accuracy of the classifiers obtained.
- ## 5.3 Data is scaled to standard format.

In [None]:
# importing the required libraries
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split , cross_val_score,KFold,StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the Iris data set that ships with scikit-learn.
iris = load_iris()
X = iris.data  # features
y = iris.target  # labels

# Split the data into training set = 75% and test set = 25%.
# random_state pins the shuffle so the split is repeatable.
X_train1, X_test1, y_train1, y_test1 = train_test_split(
    X, y, test_size=0.25, random_state=0
)

# Split the data into training set = 66.6% (2/3 of total) and test set = 33.3%.
# Pass the exact fraction 1/3 instead of the rounded 0.33 so the requested
# proportion matches the stated one.
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X, y, test_size=1 / 3, random_state=0
)


In [None]:
# Initialize the three classifiers that every experiment below re-fits.
gnb = GaussianNB()
knn = KNeighborsClassifier(n_neighbors=5)
# Seed the tree: scikit-learn permutes the candidate features at every split,
# so without a fixed random_state equally good splits can be chosen
# differently from run to run, making the reported accuracies unrepeatable.
dt = DecisionTreeClassifier(max_depth=3, random_state=0)

###  5.1 a) Training set = 75% Test set = 25% b) Training set = 66.6% (2/3rd of total), Test set = 33.3%

In [None]:
# 5.1 a) Training set = 75%, Test set = 25%
# Fit each classifier on the 75% split and score it on the remaining 25%.
# fit() returns the estimator itself, so it is chained with predict().

y_pred1 = gnb.fit(X_train1, y_train1).predict(X_test1)
print(
    "Naive Bayes accuracy for 75% training set: ",
    accuracy_score(y_true=y_test1, y_pred=y_pred1),
)

y_pred2 = knn.fit(X_train1, y_train1).predict(X_test1)
print(
    "KNN accuracy for 75% training set: ",
    accuracy_score(y_true=y_test1, y_pred=y_pred2),
)

y_pred3 = dt.fit(X_train1, y_train1).predict(X_test1)
print(
    "Decision Tree accuracy for 75% training set: ",
    accuracy_score(y_true=y_test1, y_pred=y_pred3),
)

Naive Bayes accuracy for 75% training set:  1.0
KNN accuracy for 75% training set:  0.9736842105263158
Decision Tree accuracy for 75% training set:  0.9736842105263158


In [None]:
# 5.1 b) Training set = 66.6% (2/3 of total), Test set = 33.3%
# Re-fit the same three classifiers on the second split and score each one
# on that split's test portion.

y_pred4 = gnb.fit(X_train2, y_train2).predict(X_test2)
print(
    "Naive Bayes accuracy for 66.6% training set: ",
    accuracy_score(y_true=y_test2, y_pred=y_pred4),
)

y_pred5 = knn.fit(X_train2, y_train2).predict(X_test2)
print(
    "KNN accuracy for 66.6% training set: ",
    accuracy_score(y_true=y_test2, y_pred=y_pred5),
)

y_pred6 = dt.fit(X_train2, y_train2).predict(X_test2)
print(
    "Decision Tree accuracy for 66.6% training set: ",
    accuracy_score(y_true=y_test2, y_pred=y_pred6),
)

Naive Bayes accuracy for 66.6% training set:  0.96
KNN accuracy for 66.6% training set:  0.98
Decision Tree accuracy for 66.6% training set:  0.94


### 5.2  Training set is chosen by i) hold out method ii) Random subsampling iii) Cross-Validation. Compare the accuracy of the classifiers obtained.

In [None]:
# Hold-out method: a single 80/20 split, trained once and scored once.
X_train3, X_test3, y_train3, y_test3 = train_test_split(
    X, y, test_size=0.20, random_state=42
)

y_pred7 = gnb.fit(X_train3, y_train3).predict(X_test3)
print(
    "Naive Bayes accuracy for hold out method: ",
    accuracy_score(y_true=y_test3, y_pred=y_pred7),
)

y_pred8 = knn.fit(X_train3, y_train3).predict(X_test3)
print(
    "KNN accuracy for hold out method: ",
    accuracy_score(y_true=y_test3, y_pred=y_pred8),
)

y_pred9 = dt.fit(X_train3, y_train3).predict(X_test3)
print(
    "Decision Tree accuracy for hold out method: ",
    accuracy_score(y_true=y_test3, y_pred=y_pred9),
)

Naive Bayes accuracy for hold out method:  1.0
KNN accuracy for hold out method:  1.0
Decision Tree accuracy for hold out method:  1.0


In [None]:
# Random subsampling method: repeat the hold-out experiment on several
# different random 75/25 splits and report the mean accuracy per classifier.

# Number of random splits. Defined once so the loop bound and the divisor
# used for the average can never drift apart.
n_rounds = 10

# Display label -> estimator, in the order the results are printed.
classifiers = (("Naive Bayes", gnb), ("KNN", knn), ("Decision Tree", dt))

# Running sum of the per-round accuracies for each classifier.
accuracy_totals = {label: 0 for label, _ in classifiers}

for seed in range(n_rounds):
    # The round index doubles as random_state, so each round draws a
    # different but repeatable split.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=seed
    )
    for label, clf in classifiers:
        clf.fit(X_train, y_train)
        accuracy_totals[label] += accuracy_score(y_test, clf.predict(X_test))

for label, _ in classifiers:
    print(
        f"{label} accuracy for random subsampling method: ",
        accuracy_totals[label] / n_rounds,
    )

Naive Bayes accuracy for random subsampling method:  0.9473684210526315
KNN accuracy for random subsampling method:  0.9631578947368421
Decision Tree accuracy for random subsampling method:  0.9526315789473683


In [None]:
# Cross-validation
# For a classifier with multiclass labels, an integer cv is resolved by
# scikit-learn to an unshuffled StratifiedKFold with that many splits; the
# splitter is spelled out here once and shared by the three models.
folds = StratifiedKFold(n_splits=10)

gnb_scores = cross_val_score(gnb, X, y, cv=folds)
print("Naive Bayes accuracy for cross-validation: ", gnb_scores.mean())

knn_scores = cross_val_score(knn, X, y, cv=folds)
print("KNN accuracy for cross-validation: ", knn_scores.mean())

dt_scores = cross_val_score(dt, X, y, cv=folds)
print("Decision Tree accuracy for cross-validation: ", dt_scores.mean())

Naive Bayes accuracy for cross-validation:  0.9533333333333334
KNN accuracy for cross-validation:  0.9666666666666668
Decision Tree accuracy for cross-validation:  0.96


### 5.3 Data is scaled to standard format.

In [None]:
# Evaluate the classifiers on standardized features.
#
# Split BEFORE scaling: fitting the scaler on the full data set would let the
# test rows influence the mean/std used to transform the training rows
# (data leakage). The split uses the same test_size and random_state as the
# hold-out cell in 5.2, so the scaled and unscaled results are comparable.
x_train_raw, x_test_raw, y_train4, y_test4 = train_test_split(
    X, y, test_size=0.20, random_state=42
)

# Learn the scaling statistics from the training rows only, then apply the
# same transform to the held-out rows.
scaler = StandardScaler()
x_train4 = scaler.fit_transform(x_train_raw)
x_test4 = scaler.transform(x_test_raw)

# Full data set in the training-set scale; kept so the name X_scaled stays
# defined. It is not used for the evaluation below.
X_scaled = scaler.transform(X)

y_pred10 = gnb.fit(x_train4, y_train4).predict(x_test4)
print("Naive Bayes accuracy for scaled data: ", accuracy_score(y_test4, y_pred10))

y_pred11 = knn.fit(x_train4, y_train4).predict(x_test4)
print("KNN accuracy for scaled data: ", accuracy_score(y_test4, y_pred11))

y_pred12 = dt.fit(x_train4, y_train4).predict(x_test4)
print("Decision Tree accuracy for scaled data: ", accuracy_score(y_test4, y_pred12))

Naive Bayes accuracy for scaled data:  1.0
KNN accuracy for scaled data:  1.0
Decision Tree accuracy for scaled data:  1.0
