In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import sklearn
import seaborn as sns

##### Loading the Dataset

In [None]:
# Read the dataset from a CSV file in the working directory and preview its first rows.
supervised_df=pd.read_csv('supervised_learning_dataset.csv')
supervised_df.head()

In [None]:
# Number of rows in the dataset.
len(supervised_df)

In [None]:
# Keep the column index so a later cell can iterate over every column.
supervised_df_columns=supervised_df.columns

##### Data Exploration

In [None]:
# Per-column dtype and non-null summary of the dataset.
supervised_df.info(verbose=True)

In [None]:
def missing_check(i,data):
    """Print and return the number of missing values in one column.

    Parameters
    ----------
    i : hashable
        Label of the column to inspect.
    data : pandas.DataFrame
        Frame that contains column ``i``.

    Returns
    -------
    int
        Count of NaN/None entries in ``data[i]``.
    """
    n_missing = int(data[i].isna().sum())
    # Prefix the count with the column label; a bare number cannot be
    # matched back to its column when this is called in a loop.
    print(f"{i}: {n_missing}")
    return n_missing
# Run the missing-value check on every column of the dataset.
for i in supervised_df_columns:
    missing_check(i,supervised_df)

#### Visualisation of Dataset to get insights into data

In [None]:
# Bar chart of how many rows fall into each 'pca_clusters' class.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x=supervised_df['pca_clusters'], palette='rocket', ax=ax)
# Rotate the tick labels on the x axis by 270 degrees.
plt.setp(ax.get_xticklabels(), rotation=270);
#plt.savefig('Fig_1.png',bbox_inches='tight')

`The above visualization shows that it is an imbalanced dataset`

In [None]:
# Missing-value count per column, displayed as a Series.
missing_values=supervised_df.isna().sum()
missing_values

In [None]:
import warnings

# Suppress every warning from this point on, whichever library raises it.
warnings.filterwarnings('ignore')

###### Defining Features and Labels

In [None]:
# Fix the global NumPy seed so results are reproducible.
np.random.seed(40)
# Features: every column except the target.
x = supervised_df.drop(columns='pca_clusters')
# Label: the target column on its own.
y = supervised_df['pca_clusters']

In [None]:
# Shapes of the feature matrix and the label.
print(x.shape,y.shape)

In [None]:
# Keep the label as a single-column DataFrame named 'pca_clusters'.
# Selecting it from the source frame (instead of y.to_frame(...)) makes the
# cell safe to re-run: a DataFrame has no to_frame(), so a second execution
# of the old form raised AttributeError.
y=supervised_df[['pca_clusters']]
y

In [None]:
# Number of rows per label value.
y.value_counts()

In [None]:
# Shapes of features and label.
print(x.shape)
print(y.shape)

In [None]:
# Seed the global NumPy RNG right before splitting; train_test_split is
# called without random_state, so the shuffle draws from that global state.
np.random.seed(40)
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as the test set.
x_train,x_test, y_train, y_test= train_test_split (x,y, test_size=0.2)

In [None]:
# Shapes of the train/test features, then of the train/test labels.
print(x_train.shape)
print(x_test.shape)
print('*********')
print(y_train.shape)
print(y_test.shape)

##### Plotting the labels of the train and test splits to see whether the classes are balanced in each set

In [None]:
# Side-by-side class counts for the training labels (left) and test labels (right).
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))

sns.countplot(x='pca_clusters', data=y_train, ax=ax[0], palette='rocket')
ax[0].set_title('Training_Labels')

sns.countplot(x='pca_clusters', data=y_test, ax=ax[1], palette='rocket')
# Drop the y-axis label on the right-hand panel.
ax[1].set(ylabel=None)
ax[1].set_title('Test_Labels');
#plt.savefig('Label_values.png')

### PreProcessing

##### Data Balancing, Duplicate Rows Removal and Feature Selection

##### Checking and  Removing Duplicate Rows

Duplicate samples introduce bias into the dataset, so I am checking for duplicate rows in x_train and dropping any that are found.

In [None]:
# Row count and shape of the training features.
print(len(x_train))
print(x_train.shape)

In [None]:
# Boolean mask: True for each row of x_train that repeats an earlier row.
duplicated_check=x_train.duplicated()
# Index with the mask itself to list the duplicated rows. Comparing the
# boolean mask to the string 'True' never matches, so the previous form
# returned an empty result regardless of the data.
duplicated_check[duplicated_check]

The check above reports no duplicate rows in the training set.

###### Data Balancing

In [None]:
# Class counts of the training labels. Printed explicitly because a bare
# expression that is not the last line of a cell is evaluated and discarded.
print(y_train.value_counts())
# Number of rows in the training features and labels.
print(len(x_train))
print(len(y_train))

In [None]:
# Share of each class in the training labels, in percent.
(y_train.value_counts()*100)/len(y_train)

In [None]:
# Class counts of the training labels before oversampling.
plt.figure(figsize=(8,6))
# Pass the column name as x=...: recent seaborn releases accept plot
# variables only as keywords, and this matches the other countplot calls
# in this notebook that already use x=.
sns.countplot(x='pca_clusters',data=y_train,palette="rocket")
plt.title('Training set before Balancing');
#plt.savefig('Unbalanced_1.png')

In [None]:
# Seed the global NumPy RNG; SMOTE is created without a random_state.
np.random.seed(40)
#label_up_sample={1:15182 ,1:14245,0:7920,3:982}
from imblearn.over_sampling import SMOTE

# Oversampler with default settings.
SMOTEE = SMOTE()

# Resample the training split only and overwrite x_train / y_train with the result.
x_train, y_train = SMOTEE.fit_resample(x_train, y_train)

`SMOTE` upsamples the minority classes to match the majority class. In my case it brings every class to 23847 examples because I am using the default sampling strategy. SMOTE does not replicate existing samples; instead it generates new synthetic samples based on the values of existing ones.

In [None]:
#### After Balancing
# Shapes of the resampled training data and the per-class counts.
print(x_train.shape,y_train.shape)
print(y_train.value_counts())

In [None]:
# Class counts of the training labels after oversampling.
plt.figure(figsize=(8,6))
# Pass the column name as x=...: recent seaborn releases accept plot
# variables only as keywords.
sns.countplot(x='pca_clusters',data=y_train,palette="rocket")
plt.title('Training set after Balancing');
#plt.savefig('balanced_1.png')

###### Model Selection

In [None]:
#from sklearn.tree import DecisionTreeClassifier
#from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

In [None]:
# Candidate classifiers, all with default hyper-parameters, keyed by the
# display name used when their scores are printed.
models={
    'Support_vector_machine':SVC(),
    'KN_Classifeir':KNeighborsClassifier(),
    'Naive_B':GaussianNB(),
    'Logistic_Regre':LogisticRegression(),
}

In [None]:
# random_state values tried for LogisticRegression in a later cell.
random_States=[10,45,80,200,600]

In [None]:
np.random.seed(40)
# Fit every candidate on the balanced training set and report its accuracy
# (in percent) on the training and test sets.
for name, model in models.items():
    model.fit(x_train, y_train)
    train_pct = model.score(x_train, y_train) * 100
    print(f"{name} accuracy on train set is: {train_pct}%")
    test_pct = model.score(x_test, y_test) * 100
    print(f"{name} accuracy on test set is: {test_pct}%")
    print('**************************')

In [None]:
# Collects one test-set accuracy per random_state tried below.
test_set_accuracy=[]

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Start from an empty list on every run so that re-executing this cell
# does not append a second set of scores to the previous ones.
test_set_accuracy=[]
# Fit one LogisticRegression per random_state and record its test accuracy.
# NOTE(review): per the scikit-learn docs, random_state only affects the
# 'sag', 'saga' and 'liblinear' solvers; with the default solver the scores
# may be identical across seeds - confirm this comparison is meaningful.
for i in random_States:
    clf=LogisticRegression(random_state=i)
    clf.fit(x_train,y_train)
    predics=clf.predict(x_test)
    accuracyy=accuracy_score(y_test,predics)
    test_set_accuracy.append(accuracyy)

In [None]:
# Test accuracies collected for the different random_state values.
test_set_accuracy

In [None]:
# Test-set predictions from whichever model `clf` currently refers to
# (the last one fitted in the loop above when cells are run in order).
y_test_preds=clf.predict(x_test)

##### Base Line Model Evaluation

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score,roc_auc_score, roc_curve,plot_confusion_matrix,f1_score,recall_score

`Model Accuracy`

In [None]:
# Baseline model: logistic regression fitted on the balanced training set,
# then used to predict the held-out test set.
clf = LogisticRegression(random_state=42).fit(x_train, y_train)
y_test_preds = clf.predict(x_test)

`Confusion Matrix`

A confusion matrix is a quick way to compare the labels a model predicts and the actual labels it was supposed to predict. In essence, giving me an idea of where the model is getting confused.

In [None]:
# Confusion matrix of true test labels against the baseline predictions.
confusion=confusion_matrix(y_test, y_test_preds)
confusion

In [None]:
def visualisation_conf_mat_by_seaborn(conf_mat, title='Confusion Matrix Before Tunning Model (Test Set)'):
    """Draw a confusion matrix as an annotated seaborn heatmap.

    Parameters
    ----------
    conf_mat : array-like
        2-D matrix of counts to display; each cell is annotated with its
        value formatted without decimals.
    title : str, optional
        Figure title. Defaults to the title this function has always used,
        so existing one-argument calls render exactly as before.

    The x axis is labelled 'Predicted label' and the y axis 'True label'.
    A new 4x4-inch figure is created on every call; nothing is returned.
    """
    plt.figure(figsize=(4,4))
    sns.heatmap(conf_mat,
                annot=True,
                cbar=False,
                fmt='.0f')
    plt.title(title)
    plt.xlabel('Predicted label')
    plt.ylabel('True label');
    #plt.savefig('New_Confusion_matrix_1.png')


In [None]:
# Render the baseline confusion matrix as a heatmap.
visualisation_conf_mat_by_seaborn(confusion)

In [None]:
# Text report of per-class metrics for the baseline predictions on the test set.
classi=classification_report(y_test,y_test_preds)
print(classi)

    Precision - Indicates the proportion of positive identifications (model predicted class 1) which were actually correct. A model which produces no false positives has a precision of 1.0.
    Recall - Indicates the proportion of actual positives which were correctly classified. A model which produces no false negatives has a recall of 1.0.
    F1 score - A combination of precision and recall. A perfect model achieves an F1 score of 1.0.
    Support - The number of samples each metric was calculated on.

###### Area Under Receiver Operating Characteristic (ROC) Curve

ROC curves are a comparison of true positive rate (tpr) versus false positive rate (fpr).

I am using the default decision threshold of Logistic Regression, which is `0.5`, because in my dataset it is equally important to identify both failure and good. So I am not shifting the threshold to give more weight to either side.

In [None]:
# Class probabilities for the test set; preview the first ten rows.
y_test_proba_1=clf.predict_proba(x_test)
y_test_proba_1[0:10]

In [None]:
import scikitplot as skplt
# NOTE(review): sklearn's plot_roc_curve is imported but not used in this
# cell (the scikit-plot function below is called instead), and it was removed
# in scikit-learn 1.2 - confirm whether this import is still needed.
from sklearn.metrics import plot_roc_curve
# One ROC curve per class. `curves` is meant to be a tuple of curve names;
# ('each_class') without a trailing comma is just the string 'each_class',
# which only worked through substring matching.
skplt.metrics.plot_roc_curve(y_test, y_test_proba_1,title='ROC Curves (Test set)',curves=('each_class',));
#plt.savefig('roc_1_new.png')
#plt.savefig('roc_1_new.png')

### Supervised Learning Accuracy Evaluation

###### 70_30_split

In [None]:
# Seed the global NumPy RNG right before splitting (no random_state is passed).
np.random.seed(40)
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows as the test set.
x_train_70,x_test_30, y_train_70, y_test_30= train_test_split (x,y, test_size=0.3)

In [None]:
# Seed the global NumPy RNG; SMOTE is created without a random_state.
np.random.seed(40)
#label_up_sample={1:15182 ,1:14245,0:7920,3:982}
from imblearn.over_sampling import SMOTE

# Oversampler with default settings.
SMOTEE = SMOTE()

# Resample the 70% training split only and overwrite it with the result.
x_train_70, y_train_70 = SMOTEE.fit_resample(x_train_70, y_train_70)

In [None]:
#### After Balancing
# Shapes of the resampled 70% training data and the per-class counts.
print(x_train_70.shape,y_train_70.shape)
print(y_train_70.value_counts())

In [None]:
# Class counts of the 70% training labels after oversampling.
plt.figure(figsize=(8,6))
# Pass the column name as x=...: recent seaborn releases accept plot
# variables only as keywords.
sns.countplot(x='pca_clusters',data=y_train_70,palette="rocket")
plt.title('Training set after Balancing');
#plt.savefig('balanced_1.png')

In [None]:
# Logistic regression trained on the balanced 70% split and scored on the
# untouched 30% hold-out.
clf = LogisticRegression(random_state=35).fit(x_train_70, y_train_70)
predics_30 = clf.predict(x_test_30)
accuracyy_30 = accuracy_score(y_test_30, predics_30)

In [None]:
# Test accuracy for the 70/30 split.
accuracyy_30

##### 50_50_SPLIT

In [None]:
# Seed the global NumPy RNG right before splitting (no random_state is passed).
np.random.seed(12)
from sklearn.model_selection import train_test_split
# Hold out 50% of the rows as the test set.
x_train_50,x_test_50, y_train_50, y_test_50= train_test_split (x,y, test_size=0.5)

In [None]:
# Seed the global NumPy RNG; SMOTE is created without a random_state.
np.random.seed(12)
#label_up_sample={1:15182 ,1:14245,0:7920,3:982}
from imblearn.over_sampling import SMOTE

# Oversampler with default settings.
SMOTEE = SMOTE()

# Resample the 50% training split only and overwrite it with the result.
x_train_50, y_train_50 = SMOTEE.fit_resample(x_train_50, y_train_50)

In [None]:
#### After Balancing
# Shapes of the resampled 50% training data and the per-class counts.
print(x_train_50.shape,y_train_50.shape)
print(y_train_50.value_counts())

In [None]:
# Logistic regression trained on the balanced 50% split and scored on the
# untouched 50% hold-out.
clf = LogisticRegression(random_state=12).fit(x_train_50, y_train_50)
predics_50 = clf.predict(x_test_50)
accuracyy_50 = accuracy_score(y_test_50, predics_50)

In [None]:
# Test accuracy for the 50/50 split.
accuracyy_50

###### 40_60_Split

In [None]:
# Seed the global NumPy RNG right before splitting (no random_state is passed).
np.random.seed(2)
from sklearn.model_selection import train_test_split
# Hold out 60% of the rows as the test set.
x_train_40,x_test_60, y_train_40, y_test_60= train_test_split (x,y, test_size=0.6)

In [None]:
# Seed the global NumPy RNG; SMOTE is created without a random_state.
# NOTE(review): this seed (12) differs from the one used for the 40/60
# split in the preceding cell (2) - confirm that is intentional.
np.random.seed(12)
#label_up_sample={1:15182 ,1:14245,0:7920,3:982}
from imblearn.over_sampling import SMOTE

# Oversampler with default settings.
SMOTEE = SMOTE()

# Resample the 40% training split only and overwrite it with the result.
x_train_40, y_train_40 = SMOTEE.fit_resample(x_train_40, y_train_40)

In [None]:
# Logistic regression trained on the balanced 40% split and scored on the
# untouched 60% hold-out.
clf = LogisticRegression(random_state=12).fit(x_train_40, y_train_40)
predics_60 = clf.predict(x_test_60)
accuracyy_60 = accuracy_score(y_test_60, predics_60)

In [None]:
# Test accuracy for the 40/60 split.
accuracyy_60