#### Q2-A

In [14]:
import random
import warnings
import pandas as pd
from scipy.stats import mode
from sklearn.base import clone
from sklearn import preprocessing
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, auc, roc_curve

LogisticRegression_case = LogisticRegression()
dataset = load_breast_cancer()
x_dim,y_dim= dataset.data,dataset.target
# preprocessing added to increase accuracy
x_dim = (preprocessing.StandardScaler().fit(x_dim)).transform(x_dim)
# split ratio: 2:8
Xdim_for_train, Xdim_for_test, ydim_for_train, ydim_for_test = train_test_split(x_dim, y_dim, 
                                                                                shuffle=True,test_size=0.2)
warnings.filterwarnings('ignore')
hypothesis = dict(solver=['liblinear','lbfgs', 'newton-cg' ],penalty=['l1','l2'],
                      C=[40, 30, 20, 5, 4, 3, 2,1,0.01,0.2,0.3,0.4,0.5,0.6,0.8,0.9],
                      max_iter=[10,20,30,40,50,60,70,80,90,100,150,200,300,500,600,800,1000,1200,1400,1600,1800,2000])
GridSearchCV_report = GridSearchCV(estimator=LogisticRegression_case, param_grid=hypothesis, n_jobs=-1, 
                              scoring='accuracy',error_score=0).fit(Xdim_for_train, ydim_for_train)
print("Report:")
# print("Best Acc is :",GridSearchCV_report.best_score_)
print("max_iter: ",GridSearchCV_report.best_params_["max_iter"])
print("penalty: ",GridSearchCV_report.best_params_["penalty"])
print("solver: ",GridSearchCV_report.best_params_["solver"])
best_params_var_smoothing = 0.23101297
print("C: ",GridSearchCV_report.best_params_["C"])
print("_________________________________________________ \n")
LogisticRegression_model = LogisticRegression(max_iter=GridSearchCV_report.best_params_["max_iter"], 
                        penalty=GridSearchCV_report.best_params_["penalty"],
                        C=GridSearchCV_report.best_params_["C"], 
                        solver= GridSearchCV_report.best_params_["solver"]).fit(Xdim_for_train, ydim_for_train)
compare_value_confu = LogisticRegression_model.predict(Xdim_for_test)
print("Accuracy_score: ", accuracy_score(ydim_for_test, compare_value_confu))
print("confusion_matrix:")
print(confusion_matrix(ydim_for_test, compare_value_confu))

Report:
max_iter:  10
penalty:  l2
solver:  liblinear
C:  1
_________________________________________________ 

Accuracy_score:  0.9912280701754386
confusion_matrix:
[[37  1]
 [ 0 76]]


#### Q2-B

In [15]:
import sklearn.metrics as metrics
from sklearn.naive_bayes import GaussianNB

gaussian_naive_bayes = GaussianNB()
gaussian_naive_bayes.fit(Xdim_for_train, ydim_for_train)
compare_value = gaussian_naive_bayes.predict(Xdim_for_test)
confusion_matrix = metrics.confusion_matrix(ydim_for_test, compare_value)
accuracy_score,roc_auc_score = metrics.accuracy_score(ydim_for_test, compare_value),metrics.roc_auc_score(ydim_for_test, compare_value)
print("accuracy_score: ", accuracy_score, "/ roc_auc_score: ",roc_auc_score)
print("confusion_matrix: ")
print(confusion_matrix)

accuracy_score:  0.956140350877193 / roc_auc_score:  0.9342105263157895
confusion_matrix: 
[[33  5]
 [ 0 76]]


### Q2-C

In [16]:

from sklearn.metrics import accuracy_score, confusion_matrix, auc, roc_curve
import numpy as np

area_under_curve_array,models_array = [],[]
for i in range(0, 30):
    f_vector = Xdim_for_train[:, i]
    models_array.append(LogisticRegression())
    models_array[i].fit(f_vector.reshape(-1, 1), ydim_for_train)
    compare_value = models_array[i].predict(Xdim_for_test[:, i].reshape(-1, 1))
    false_psitive_count, true_psitive_count, extra = roc_curve(ydim_for_test, compare_value)
    area_under_curve_array.append([i, auc(false_psitive_count, true_psitive_count)])
area_under_curve_array = sorted(area_under_curve_array,key=lambda x: x[1], reverse=True)
# print(len(area_under_curve_array))
for i in area_under_curve_array:
    print(i)
area_under_curve_array = np.array(area_under_curve_array)
# print(area_under_curve_array.shape)
array_cut = area_under_curve_array[:, 0][:20]
# print(array_cut.shape)
important_features = Xdim_for_train[:,array_cut.astype(int)]
# print("?????????",important_features.shape)


[27, 0.9407894736842106]
[20, 0.9210526315789475]
[22, 0.9210526315789475]
[23, 0.9144736842105263]
[7, 0.9013157894736842]
[3, 0.8947368421052633]
[2, 0.888157894736842]
[0, 0.8815789473684209]
[13, 0.8552631578947368]
[6, 0.8552631578947367]
[10, 0.8289473684210527]
[26, 0.8223684210526316]
[5, 0.8026315789473684]
[12, 0.7960526315789475]
[25, 0.7894736842105262]
[24, 0.6644736842105263]
[17, 0.6578947368421053]
[28, 0.6578947368421053]
[8, 0.6447368421052633]
[4, 0.625]
[21, 0.625]
[1, 0.611842105263158]
[29, 0.5789473684210527]
[16, 0.5394736842105263]
[15, 0.5131578947368421]
[9, 0.5]
[11, 0.5]
[14, 0.5]
[18, 0.5]
[19, 0.4934210526315789]


### LogisticRegression_model

In [17]:
LogisticRegression_model = LogisticRegression(max_iter=GridSearchCV_report.best_params_["max_iter"], 
                        penalty=GridSearchCV_report.best_params_["penalty"],
                        C=GridSearchCV_report.best_params_["C"], 
                        solver= GridSearchCV_report.best_params_["solver"]).fit(important_features, ydim_for_train)

y_pred = LogisticRegression_model.predict(Xdim_for_test[:, array_cut.astype(int)])
print("LogisticRegression model: ")
print("shape: ",ydim_for_test.shape,y_pred.shape)
print(accuracy_score(ydim_for_test, y_pred))
print(confusion_matrix(ydim_for_test, y_pred))

LogisticRegression model: 
shape:  (114,) (114,)
0.9649122807017544
[[35  3]
 [ 1 75]]


### Gaussian_naive_bayes_model

In [18]:
gaussian_naive_bayes = GaussianNB(var_smoothing = best_params_var_smoothing).fit(
    important_features, ydim_for_train)
compare_val = gaussian_naive_bayes.predict(Xdim_for_test[:, array_cut.astype(int)])
confusion_matrix = confusion_matrix(ydim_for_test, compare_val)
print("accuracy_score: ", accuracy_score(ydim_for_test, compare_val))
print("confusion_matrix: ")
print(confusion_matrix)


accuracy_score:  0.9473684210526315
confusion_matrix: 
[[32  6]
 [ 0 76]]


### Q2-D

In [19]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, auc, roc_curve
import numpy as np
sampling_Xdim_for_train,sampling_Xdim_for_test = [],[]
prob_X_train,prob_X_test = [],[]
for i, model_present in enumerate(models_array):
    prophecy_prediction = model_present.predict_proba(Xdim_for_train[:, i].reshape(-1, 1))
    prophecy_prediction_testdim = model_present.predict_proba(Xdim_for_test[:, i].reshape(-1, 1))
    sampling_Xdim_for_train.append(prophecy_prediction[:, 0])
    sampling_Xdim_for_train.append(prophecy_prediction[:, 1])
    sampling_Xdim_for_test.append(prophecy_prediction_testdim[:, 0])
    sampling_Xdim_for_test.append(prophecy_prediction_testdim[:, 1])
train,test = np.array(sampling_Xdim_for_train).T,np.array(sampling_Xdim_for_test).T
# new dataset to train Gaussian and Multinomial Na√Øve Bayes
model_GaussianNB = GaussianNB()
current_model_Gaussuan = model_GaussianNB.fit(train, ydim_for_train)
compare_val = current_model_Gaussuan.predict(test)
matrix = confusion_matrix(ydim_for_test, compare_val)
print("GaussianNB \n", matrix)
print(f"accuracy:{accuracy_score(ydim_for_test, compare_val)}")
false_psitive_count, true_psitive_count, extra = roc_curve(ydim_for_test, compare_val)
area_under_curve = auc(false_psitive_count, true_psitive_count)
print("Area_Under_Curve:", area_under_curve)
print("_____________________________________________________________ \n ")
model_MultinomialNB = MultinomialNB()
current_model_Gaussuan = model_MultinomialNB.fit(train, ydim_for_train)
compare_val = current_model_Gaussuan.predict(test)
matrix = confusion_matrix(ydim_for_test, compare_val)
print("MultinomialNB \n", matrix)
accuracy = accuracy_score(ydim_for_test, compare_val)
print("ACC: ", accuracy)
false_psitive_count, true_psitive_count, extra = roc_curve(ydim_for_test, compare_val)
area_under_curve = auc(false_psitive_count, true_psitive_count)
print("Area_Under_Curve:", area_under_curve)

GaussianNB 
 [[36  2]
 [ 2 74]]
accuracy:0.9649122807017544
Area_Under_Curve: 0.9605263157894739
_____________________________________________________________ 
 
MultinomialNB 
 [[34  4]
 [ 1 75]]
ACC:  0.956140350877193
Area_Under_Curve: 0.9407894736842106
