# Importing all necessary libraries

In [None]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
from pylab import rcParams
rcParams['figure.figsize'] = 10, 5
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [None]:
# Read the CSV from the notebook's directory and keep the loaded frame (`data`)
# untouched; all later cleaning is done on the working copy `df`.
data = pd.read_csv('./Data_cardiovascular_risk.csv')
df = data.copy()

# Data Exploration

In [None]:
# Dimensions of the working copy as (rows, columns)
df.shape

In [None]:
# Column names of the working copy as a plain Python list
df.columns.to_list()

In [None]:
# First rows of the working copy
df.head()

In [None]:
# Last rows of the working copy
df.tail()

In [None]:
# Column dtypes and non-null counts before any cleaning
df.info()

In [None]:
# Summary statistics for every column, not only the numeric ones
df.describe(include='all')

# Data Preprocessing

> We will drop the `education` and `id` columns because they have nothing to do with heart disease

In [None]:
# Remove the identifier and education columns; rebinding `df` avoids in-place mutation
df = df.drop(columns=['id', 'education'])

In [None]:
# Confirm that `id` and `education` are gone
df.head()

## We have to convert all the string values into int.

In [None]:
# Encode the two string columns as 0/1 flags: 'M' -> 1 and 'YES' -> 1,
# every other value (including missing) -> 0.
df['sex'] = (df['sex'] == 'M').astype('int64')
df['is_smoking'] = (df['is_smoking'] == 'YES').astype('int64')

# Checking the distributions

In [None]:
# Number of rows per encoded sex value (1 = 'M', 0 = anything else)
df['sex'].value_counts()

In [None]:
# Number of rows per encoded smoking flag (1 = 'YES', 0 = anything else)
df['is_smoking'].value_counts()

In [None]:
# Re-check dtypes and non-null counts after the string columns were encoded
df.info()

# Taking care of Missing Values

In [None]:
# Number of missing values in each column
df.isnull().sum()

In [None]:
# Visualize missing values with the missingno library
import missingno as msno

# Matrix view of the frame, used here to see where values are missing
msno.matrix(df,figsize=(12, 5))

In [None]:
# Heatmap of how missingness in one column relates to missingness in the others
# (presumably missingno's nullity correlation -- confirm against its documentation)
msno.heatmap(df,figsize=(8, 5))

# % of Missing data in each feature

In [None]:
# Per-column count of missing values, computed once and reused below
null_counts = df.isnull().sum()

# Absolute count and share (in percent of all rows), each sorted from most to least missing
total = null_counts.sort_values(ascending=False)
percent_total = (null_counts / len(df)).sort_values(ascending=False) * 100

# Side-by-side table, restricted to the columns that actually have gaps
missing = pd.concat([total, percent_total], axis=1, keys=["Total", "Percentage"])
missingdf = missing.loc[missing['Total'] > 0]
print(missingdf)

# Visualize the % of Missing data in each feature

In [None]:
# Bar chart with one bar per column that has missing values; bar height is the
# percentage of rows missing that column (taken from `missingdf`).
plt.figure(figsize=(8,4))
sns.barplot(x=missingdf.index, y=missingdf['Percentage'], data = missingdf)
plt.title('Percentage of missing data by feature')
plt.xlabel('Features', fontsize=14)
plt.ylabel('Percentage', fontsize=14)
plt.show()

# Let's Count the rows which have missing data and get % out of it

In [None]:
# A row counts as incomplete when at least one of its cells is null
count = int(df.isnull().any(axis=1).sum())
print('Total number of rows with missing values is ', count)

# Share of incomplete rows, rounded to a whole percent
print("% of rows which have missing data: ",round((count/len(df.index))*100), "%")

> we can drop the missing data

In [None]:
# Keep only the fully populated rows (rebinding instead of mutating in place)
df = df.dropna(axis=0)

# Every per-column null count should now be zero
df.isnull().sum()

> Checking for Any duplicates

In [None]:
# Number of rows that exactly repeat an earlier row
int(df.duplicated().sum())

> Statistics of dataset after pre-processing

In [None]:
# Summary statistics of the numeric columns after cleaning
df.describe()

# Visualizing Data Distribution

In [None]:
# One histogram per column, all laid out by pandas inside a single large figure
fig, ax = plt.subplots(figsize=(15, 10))
df.hist(ax=ax)
plt.show()

> Plotting pie chart for TenYearCHD

In [None]:
# Target distribution shown twice: as shares (pie, left) and as raw counts (bars, right)
f, ax = plt.subplots(1, 2, figsize=(18, 8))
pie_ax, bar_ax = ax[0], ax[1]

chd_counts = df['TenYearCHD'].value_counts()
chd_counts.plot.pie(ax=pie_ax, autopct='%1.1f%%', shadow=True)
pie_ax.set_title('TenYearCHD')
pie_ax.set_ylabel('')  # drop the default y-label pandas puts on pie charts

sns.countplot(data=df, x='TenYearCHD', ax=bar_ax)
bar_ax.set_title('TenYearCHD')
plt.show()

> Let's Visualize the target and age variable

In [None]:
# Count of people at each age, split by the TenYearCHD outcome
plt.figure(figsize=(12, 6))
plt.title('No. of people with and without cardiovascular disease')

# Bar outlines use the first colour of seaborn's 'dark' palette
outline_color = sns.color_palette('dark', n_colors=1)[0]
sns.countplot(
    data=df,
    x='age',
    hue='TenYearCHD',
    palette='colorblind',
    edgecolor=outline_color,
)

* People with `Highest risk` of developing heart disease are between `51 - 63`

# Categorical Variable Comparisons with Target Variable - `TenYearCHD`

We will use **Stacked Bar Chart**

In [None]:
from operator import add
def stacked_barchart(data, title = None, ylabel = None, xlabel = None):
    """Draw a 100%-stacked bar chart of `data` on the current matplotlib axes.

    Each row of `data` becomes one bar and each column one stacked segment.
    Segment heights are the cell's share of its row total, in percent, and
    every segment is annotated with its raw value and that percentage.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw values; rows map to bars, columns to stacked segments.
    title : str, optional
        Chart title.
    ylabel : str, optional
        Y-axis label; falls back to ``data.columns.name``.
    xlabel : str, optional
        X-axis label; falls back to ``data.index.name``.

    Returns
    -------
    None
        The chart is drawn on ``plt.gca()``.
    """
    default_colors = ['#006400', '#FF0000', '#228B22']

    # From raw value to percentage of each row's total
    totals = data.sum(axis=1)
    bars = ((data.T / totals) * 100).T
    r = list(range(data.index.size))

    barWidth = 0.85
    names = data.index.tolist()
    bottom = [0] * bars.shape[0]

    # Create one layer of segments per column, stacking on the running `bottom`
    plots = []
    for position, bar in enumerate(bars.columns):
        # Wrap around the palette so any number of columns can be drawn
        # (the previous index update ran past the end of the list on a 4th column).
        segment_color = default_colors[position % len(default_colors)]
        plots.append(plt.bar(r, bars[bar], bottom=bottom, color=segment_color, edgecolor='white', width=barWidth))
        bottom = [base + height for base, height in zip(bottom, bars[bar])]

    # Custom x axis
    plt.title(title)
    plt.xticks(r, names)
    plt.xlabel(data.index.name if xlabel is None else xlabel)
    plt.ylabel(data.columns.name if ylabel is None else ylabel)
    ax = plt.gca()

    # Show the y tick values as percentages
    y_labels = ax.get_yticks()
    ax.set_yticklabels([str(y) + '%' for y in y_labels])

    # Annotate each segment; patches are paired with the raw values read column by column,
    # which assumes the axes hold only the bars drawn above.
    flat_list = [item for sublist in data.T.values for item in sublist]
    for i, d in zip(ax.patches, flat_list):
        data_label = str(d) + " (" + str(round(i.get_height(), 2)) + "%)"
        ax.text(i.get_x() + 0.45, i.get_y() + 5 , data_label, ha='center', va='bottom', fontdict=dict(color='black', size=20))

    # Font sizes chosen for the large multi-panel figure this is used in
    ax.title.set_fontsize(27)

    for item in ([ax.xaxis.label, ax.yaxis.label] + ax.get_xticklabels() + ax.get_yticklabels()):
        item.set_fontsize(24)

    legend = ax.legend(plots, bars.columns.tolist(), ncol=2, fancybox=True)
    plt.setp(legend.get_texts(), fontsize='20')

> Let's visualize each category with respect to the target variable - TenYearCHD

In [None]:
# Visualizing each binary category with respect to the target variable
fig = plt.gcf()
fig.set_size_inches(27, 35)
grid_rows = 3
grid_cols = 2

# One panel per entry: (column, label for value 0, label for value 1, chart title)
category_panels = [
    ('sex', 'Female', 'Male', 'Cardiovascular heart disease vs Sex'),
    ('is_smoking', 'Not a Smoker', 'Smoker', 'Cardiovascular heart disease vs Smoking'),
    ('diabetes', 'Not Diabetic', 'Diabetic', 'Cardiovascular heart disease vs Diabetes'),
    ('BPMeds', 'Not on medication', 'On Medication', 'Cardiovascular heart disease vs BP meds'),
    ('prevalentHyp', 'Not Hypertensive', 'Hypertensive', 'Cardiovascular heart disease vs Hypertension'),
]

for panel_number, (column, label_zero, label_one, panel_title) in enumerate(category_panels, start=1):
    plt.subplot(grid_rows, grid_cols, panel_number)
    # Cross-tabulate the category against the outcome: rows = category, columns = outcome
    temp = df[[column, 'TenYearCHD']].groupby([column, 'TenYearCHD']).size().unstack('TenYearCHD')
    temp.rename(index={0: label_zero, 1: label_one}, columns={0: 'No Disease', 1: 'Has Disease'}, inplace=True)
    stacked_barchart(temp, title=panel_title, ylabel='Population')

From the above categorical variables comparison plot we can conclude that,

*   Slightly more males are suffering from Cardiovascular heart disease than females.
*   The proportion of people who have Cardiovascular heart disease is almost equal between smokers and non-smokers.
*   The percentage of people who have Cardiovascular heart disease is higher among the diabetic patients, and patients with prevalent hypertension have a higher risk of Cardiovascular heart disease compared to those who don't have hypertension.
*   People who are on blood pressure medication have a higher risk of Cardiovascular heart disease compared to those who are not on medication.

# Let's see the Correlation between the all features using heatmap

In [None]:
# Pairwise correlations of all columns; the heatmap shows their magnitude only (sign dropped)
correlation = df.corr()

plt.figure(figsize=(20, 8))
sns.heatmap(correlation.abs(), cmap='YlGnBu', annot=True)
plt.title('Correlation between the all features')

**From the above correlation plot we can conclude that**,

*   There are no features with more than 0.2 correlation with the ten-year risk of developing CHD, which shows that the features are poor predictors. However, the features with the highest correlations are age, prevalent hypertension (prevalentHyp) and systolic blood pressure (sysBP).

*   Also there are a couple of features that are highly correlated with each other and it makes no sense to use both of them in building a machine learning model. 

**These includes:** 

*  Blood glucose and diabetes;
*  systolic and diastolic blood pressures;
*  cigarette smoking and the number of cigarettes smoked per day. 

Therefore we need to carry out feature selection to pick the best features. 

# Feature Selection

**Tree-based: SelectFromModel**

**SelectFromModel** is an Embedded method. Embedded methods use algorithms that have built-in feature selection methods.

Here,

We have used RandomForest() to select features based on feature importance.
We calculate feature importance using node impurities in each decision tree. 

In Random forest, the final feature importance is the average of all decision tree feature importance.

In [None]:
# Features are every column except the last; the last column is the target
x, y = df.iloc[:, :-1], df.iloc[:, -1]

In [None]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

# Select features by random-forest importance (at most 14 are kept).
# The forest is seeded so that the selected feature list is the same on every
# run; the rest of the notebook builds on that list.
embeded_rf_selector = SelectFromModel(
    RandomForestClassifier(n_estimators=100, random_state=42),
    max_features=14,
)
embeded_rf_selector.fit(x, y)

# Boolean mask over the columns of `x`, then the names of the columns that were kept
embeded_rf_support = embeded_rf_selector.get_support()
embeded_rf_feature = x.loc[:,embeded_rf_support].columns.tolist()
print(str(len(embeded_rf_feature)), 'selected features')

In [None]:
# Names of the features kept by the random-forest selector
embeded_rf_feature

> Statistics on Top features

In [None]:
import statsmodels.api as sm

# Split the data into the independent variables (the selected features)
# and the dependent variable (the target column)
top_features = df[embeded_rf_feature]
y = df['TenYearCHD']

# Fit a logistic regression on the selected features and print its summary table.
# NOTE(review): no constant column is added to `top_features`, so the model is
# presumably fit without an intercept -- confirm that this is intended.
result = sm.Logit(y, top_features).fit()
print(result.summary())

> Checking the `odds ratio` of `top features`

In [None]:
# Odds ratios with their confidence interval for each selected feature.
# Coefficients and interval bounds are on the log-odds scale, so they are
# exponentiated before printing.
params = result.params
conf = result.conf_int()
conf['Odds Ratio'] = params
# conf_int() is called with its default significance level, i.e. a 95% interval,
# whose bounds are the 2.5% and 97.5% points (not 5% / 95%).
conf.columns = ['2.5%', '97.5%', 'Odds Ratio']
print(np.exp(conf))

> The odds of `Cardiovascular Heart disease` increase by about `2%` for every one-unit increase in `age` and `sysBP`

# Pair Plots

In [None]:
# Pairwise scatter plots of the selected features, coloured by the TenYearCHD outcome
sns.pairplot(df, hue = 'TenYearCHD', markers=["o", "s"], vars = embeded_rf_feature, palette='bright')

# Modelling and Predicting with ML models

**SMOTE** algorithm works in 4 simple steps:

* Choose a minority class as the input vector
* Find its k nearest neighbors (k_neighbors is specified as an argument in the SMOTE() function)
* Choose one of these neighbors and place a synthetic point anywhere on the line joining the point under consideration and its chosen neighbor
* Repeat the steps until data is balanced

In [None]:
from imblearn.over_sampling import SMOTE

# Seeded so the synthetic minority samples are identical on every run
smote = SMOTE(random_state=42)

# Selected features and the target (last column of the frame)
X = df[embeded_rf_feature]
y = df.iloc[:,-1]
# Oversample so that both classes end up with the same number of rows
x_smote, y_smote = smote.fit_resample(X, y)

print('Original dataset shape', len(df))
print('Resampled dataset shape', len(y_smote))

In [None]:
from collections import Counter
labels = ["Negative Cases","Positive Cases"]

# Look the counts up by class value (0 = negative, 1 = positive) so each bar is
# guaranteed to sit under the right label; relying on Counter's insertion order
# would swap the bars whenever the first row happens to be a positive case.
counts_before = Counter(y)
counts_after = Counter(y_smote)

plt.figure(figsize=(10,6))
plt.subplot(1,2,1)
sns.barplot(x=labels, y=[counts_before[0], counts_before[1]])
plt.title("Numbers Before Balancing")
plt.subplot(1,2,2)
sns.barplot(x=labels, y=[counts_after[0], counts_after[1]])
plt.title("Numbers After Balancing")
plt.show()

# Splitting the data to Training and Testing sets


In [None]:
# Combine the resampled features and target into one frame
df_new = pd.concat([pd.DataFrame(x_smote), pd.DataFrame(y_smote)], axis=1)
# Name the columns from the selected-feature list instead of a hard-coded list,
# so the labels always match whatever the feature selection actually returned.
df_new.columns = list(embeded_rf_feature) + ['TenYearCHD']
df_new.head()

In [None]:
# Split the ORIGINAL (un-resampled) rows first, so that the test set contains
# only real observations at the real class ratio. Splitting after oversampling
# leaks information: synthetic training points are interpolated from rows that
# end up in the test set, which inflates every test metric.
x_new = df[embeded_rf_feature]
y_new = df["TenYearCHD"]

X_train,X_test,Y_train,Y_test = train_test_split(
    x_new, y_new, test_size=0.2, random_state=42, stratify=y_new
)

# Balance the classes in the training portion only
X_train, Y_train = SMOTE(random_state=42).fit_resample(X_train, Y_train)

print("Training features have {0} records and Testing features have {1} records.".\
      format(X_train.shape[0], X_test.shape[0]))

# Models:

1. Logistic Regression
2. Random Forest
3. XGBoost
4. Support Vector Machine

In [None]:
# importing Required Libraries
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier 
from xgboost import XGBClassifier
from sklearn.svm import SVC


from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import recall_score,precision_score,classification_report,roc_auc_score,roc_curve

# 1. Logistic Regression

> Using gridsearch for optimum parameters

In [None]:
# Hyper-parameter grid for logistic regression.
# C spans 1e-6 .. 1e+6 in decades (the original list repeated 1e-3 where 1e+3 belongs).
params = {'penalty':['l1','l2'],
         'C' : [1e-6,1e-5,1e-4,1e-3,1e-2,1e-1,1,10,100,1e+3,1e+4,1e+5,1e+6],
         'class_weight':['balanced',None]}
# 'liblinear' supports both the l1 and l2 penalties; the default solver rejects
# l1, which made every l1 candidate fail silently (warnings are suppressed).
logistic_clf = GridSearchCV(LogisticRegression(solver='liblinear'),param_grid=params,cv=10, scoring='roc_auc')

> Training the classifier

In [None]:
# Run the cross-validated grid search on the training data
logistic_clf.fit(X_train,Y_train)

# Parameter combination with the best cross-validated score
logistic_clf.best_params_

> Making Predictions

In [None]:
# Class predictions on the held-out test set and their accuracy
logistic_predict = logistic_clf.predict(X_test)
logistic_accuracy = accuracy_score(Y_test,logistic_predict)
print(f"Using logistic regression we get an accuracy of {round(logistic_accuracy*100,2)}%")

In [None]:
# ROC-AUC must be computed from the predicted probability of the positive class;
# best_estimator_.score() is the classifier's accuracy, not ROC-AUC.
train_positive_probs = logistic_clf.predict_proba(X_train)[:, 1]
test_positive_probs = logistic_clf.predict_proba(X_test)[:, 1]
print('Train ROC-AUC score : ', roc_auc_score(Y_train, train_positive_probs))
print('Test ROC-AUC score : ', roc_auc_score(Y_test, test_positive_probs))

> Confusion Matrix for `Logistic Model`

In [None]:
# Confusion matrix of the logistic model on the test set (rows: actual, columns: predicted)
cm = confusion_matrix(Y_test, logistic_predict)
conf_matrix = pd.DataFrame(
    cm,
    index=[f'Actual:{label}' for label in (0, 1)],
    columns=[f'Predicted:{label}' for label in (0, 1)],
)
plt.figure(figsize=(8, 5))
sns.heatmap(conf_matrix, cmap="YlGnBu", annot=True, fmt='d')

In [None]:
# Per-class precision, recall and F1 of the logistic model on the test set
print(classification_report(Y_test,logistic_predict))

In [None]:
# ROC curve and AUC for the logistic model
# Probability of the positive class is the second column of predict_proba
probs = logistic_clf.predict_proba(X_test)[:, 1]
log_auc = roc_auc_score(Y_test, probs)
fpr, tpr, thresholds = roc_curve(Y_test, probs)

sns.set_style('whitegrid')
plt.figure(figsize=(10, 6))
plt.plot([0, 1], [0, 1], linestyle='--')  # diagonal reference line
plt.plot(fpr, tpr, marker='.')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title(f"AUC = {round(log_auc,3)}")
plt.show()

# 2. Random Forest Classifier

> Using gridsearch for Optimum Parameters

In [None]:
# Hyper-parameter grid for the random forest
params_rf = {
    'max_depth': [4, 6, 8],
    'min_samples_leaf': [40, 50],
    'min_samples_split': [50, 100, 150],
    'n_estimators': [50, 80, 100]
  }

# The forest is seeded so the search (and the chosen parameters) are reproducible
random_clf = GridSearchCV(RandomForestClassifier(random_state=42),param_grid=params_rf,cv=10, scoring='roc_auc')

> Training the classifiers

In [None]:
# Run the cross-validated grid search on the training data
random_clf.fit(X_train,Y_train)

# Parameter combination with the best cross-validated score
random_clf.best_params_

> Making Predictions

In [None]:

# Class predictions of the tuned random forest on the test set
random_predict = random_clf.predict(X_test)

In [None]:
# Test-set accuracy of the random forest predictions
random_accuracy = accuracy_score(Y_test,random_predict)
print(f"Using Random Forest we get an accuracy of {round(random_accuracy*100,2)}%")

> Confusion Matrix for `Random Forest Classifier`

In [None]:
# Confusion matrix of the random forest on the test set (rows: actual, columns: predicted)
cm = confusion_matrix(Y_test, random_predict)
conf_matrix = pd.DataFrame(
    cm,
    index=[f'Actual:{label}' for label in (0, 1)],
    columns=[f'Predicted:{label}' for label in (0, 1)],
)
plt.figure(figsize=(8, 5))
sns.heatmap(conf_matrix, cmap="YlGnBu", annot=True, fmt='d')

In [None]:
# Per-class precision, recall and F1 of the random forest on the test set
print(classification_report(Y_test,random_predict))

In [None]:
# ROC curve and AUC for the random forest
# Probability of the positive class is the second column of predict_proba
probs1 = random_clf.predict_proba(X_test)[:, 1]
ran_auc = roc_auc_score(Y_test, probs1)
fpr, tpr, thresholds = roc_curve(Y_test, probs1)

sns.set_style('whitegrid')
plt.figure(figsize=(10, 6))
plt.plot([0, 1], [0, 1], linestyle='--')  # diagonal reference line
plt.plot(fpr, tpr, marker='.')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title(f"AUC = {round(ran_auc,3)}")
plt.show()

# 3. XGBoost

> Using gridSearch for optimum Parameters

In [None]:
# Hyper-parameter grid for XGBoost: tree depth 2..11, 60..200 trees in steps of 20,
# and four learning rates
params_xgb = dict(
    max_depth=range(2, 12),
    n_estimators=range(60, 220, 20),
    learning_rate=[0.1, 0.05, 0.01, 0.005],
)

xgb_clf = GridSearchCV(
    estimator=XGBClassifier(),
    param_grid=params_xgb,
    scoring='roc_auc',
    cv=10,
)
                      

> Training the Classifier

In [None]:
# Run the cross-validated grid search on the training data
xgb_clf.fit(X_train,Y_train)

# Parameter combination with the best cross-validated score
xgb_clf.best_params_

> Making Predictions

In [None]:
# Class predictions of the tuned XGBoost model on the test set and their accuracy
xgb_predict = xgb_clf.predict(X_test)
xgb_accuracy = accuracy_score(Y_test,xgb_predict)
print(f"Using XG boost we get an accuracy of {round(xgb_accuracy*100,2)}%")

> Confusion Matrix for XG Boost Classifier

In [None]:
# Confusion matrix of the XGBoost model on the test set (rows: actual, columns: predicted)
cm = confusion_matrix(Y_test, xgb_predict)
conf_matrix = pd.DataFrame(
    cm,
    index=[f'Actual:{label}' for label in (0, 1)],
    columns=[f'Predicted:{label}' for label in (0, 1)],
)
plt.figure(figsize=(8, 5))
sns.heatmap(conf_matrix, cmap="YlGnBu", annot=True, fmt='d')

In [None]:
# Per-class precision, recall and F1 of the XGBoost model on the test set
print(classification_report(Y_test, xgb_predict))

In [None]:
# ROC curve and AUC for the XGBoost model
# Probability of the positive class is the second column of predict_proba
probs2 = xgb_clf.predict_proba(X_test)[:, 1]
xgb_auc = roc_auc_score(Y_test, probs2)
fpr, tpr, thresholds = roc_curve(Y_test, probs2)

sns.set_style('whitegrid')
plt.figure(figsize=(10, 6))
plt.plot([0, 1], [0, 1], linestyle='--')  # diagonal reference line
plt.plot(fpr, tpr, marker='.')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title(f"AUC = {round(xgb_auc,3)}")
plt.show()

# 4. Support Vector Machine

> Using GridSearch for Optimum Parameters

In [None]:
# Hyper-parameter grid for the RBF-kernel support vector machine
Cs = [0.001, 0.01, 0.1, 1, 10]
gammas = [0.001, 0.01, 0.1, 1]
param_grid = {'C': Cs, 'gamma' : gammas}
# Tuned on ROC-AUC like the other three models so the comparison is like-for-like
# (without `scoring` the search optimises accuracy). probability=True is needed
# for predict_proba; the seed makes the fitted model reproducible.
svm_clf = GridSearchCV(SVC(kernel='rbf', probability=True, random_state=42), param_grid, cv=10, scoring='roc_auc')

> Training the classifier

In [None]:
# Run the cross-validated grid search; fit() returns the search object itself,
# so `svm_clf1` and `svm_clf` refer to the same fitted object.
svm_clf1 = svm_clf.fit(X_train,Y_train)

# Parameter combination with the best cross-validated score
svm_clf.best_params_

> Making Predictions

In [None]:
# Class predictions of the tuned SVM on the test set and their accuracy
svm_predict = svm_clf.predict(X_test)
svm_accuracy = accuracy_score(Y_test,svm_predict)
print(f"Using Support Vector Machine we get an accuracy of {round(svm_accuracy*100,2)}%")

> Confusion Matrix

In [None]:
# Confusion matrix of the SVM on the test set (rows: actual, columns: predicted)
cm = confusion_matrix(Y_test, svm_predict)
conf_matrix = pd.DataFrame(
    cm,
    index=[f'Actual:{label}' for label in (0, 1)],
    columns=[f'Predicted:{label}' for label in (0, 1)],
)
plt.figure(figsize=(8, 5))
sns.heatmap(conf_matrix, cmap="YlGnBu", annot=True, fmt='d')

In [None]:
# Per-class precision, recall and F1 of the SVM on the test set
print(classification_report(Y_test, svm_predict))


In [None]:
# ROC curve and AUC for the SVM
# Probability of the positive class is the second column of predict_proba
probs3 = svm_clf.predict_proba(X_test)[:, 1]
svc_auc = roc_auc_score(Y_test, probs3)
fpr, tpr, thresholds = roc_curve(Y_test, probs3)

sns.set_style('whitegrid')
plt.figure(figsize=(10, 6))
plt.plot([0, 1], [0, 1], linestyle='--')  # diagonal reference line
plt.plot(fpr, tpr, marker='.')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title(f"AUC = {round(svc_auc,3)}")
plt.show()

# Comparing All the models

In [None]:
# Per model: (test accuracy, test-set predictions, test AUC), all computed in earlier cells
model_results = {
    "Logistic regression": (logistic_accuracy, logistic_predict, log_auc),
    "Random Forest": (random_accuracy, random_predict, ran_auc),
    "XG Boost": (xgb_accuracy, xgb_predict, xgb_auc),
    "Support vector machine": (svm_accuracy, svm_predict, svc_auc),
}

# One row per model, one column per metric, every value rounded to two decimals
Performance_df = pd.DataFrame({
    model_name: {
        'Test Accuracy': round(model_accuracy, 2),
        'Precision': round(precision_score(Y_test, model_predictions), 2),
        'Recall': round(recall_score(Y_test, model_predictions), 2),
        'F1 Score': round(f1_score(Y_test, model_predictions), 2),
        'AUC': round(model_auc, 2),
    }
    for model_name, (model_accuracy, model_predictions, model_auc) in model_results.items()
}).T
Performance_df

> Since the **Support Vector Machine model** gives the highest F1 score and AUC score, we will save this model to predict the disease.

> Let's plot the accuracy and AUC score of each model

In [None]:
# Test accuracies of the four models, in the same order as `algorithms` below
scores = [logistic_accuracy,random_accuracy,xgb_accuracy,svm_accuracy]
# Display names of the models, aligned position by position with `scores`
algorithms = ["Logistic Regression","Random Forest","XG Boost","Support vector machine"] 
# Bar chart of accuracy per model
sns.set(rc={'figure.figsize':(8,6)})
plt.xlabel("Algorithms")
plt.ylabel("Accuracy score")
sns.barplot(x=algorithms,y=scores)

In [None]:
# Test AUC scores of the four models, in the same order as `algorithms` below
auc_scores = [log_auc,ran_auc,xgb_auc,svc_auc]
# Display names of the models, aligned position by position with `auc_scores`
algorithms = ["Logistic Regression","Random Forest","XG Boost","Support vector machine"] 
# Bar chart of AUC per model
sns.set(rc={'figure.figsize':(8,6)})
plt.xlabel("Algorithms")
plt.ylabel("AUC score")
sns.barplot(x=algorithms,y=auc_scores)

> From both the graphs we can say that the best performing model is **Support Vector Machine** algorithm.

# **Conclusion:**
* The proportion of people who have Cardiovascular heart disease is almost equal between smokers and non-smokers.
* The top features in predicting the ten-year risk of developing Cardiovascular Heart Disease are **'age', 'totChol', 'sysBP', 'diaBP', 'BMI', 'heartRate', 'glucose'**.
* The Support vector machine with the radial kernel is the best performing model in terms of accuracy and the F1 score. Its high AUC score shows that it has a high true positive rate.
* Balancing the dataset by using the SMOTE technique helped in improving the models' sensitivity.
* With more data(especially that of the minority class) better models can be built.

> Let us save the `Support Vector Machine` model to use it further

In [None]:
import pickle

# Persist the fitted SVM grid-search object. The context manager guarantees the
# file is flushed and closed even if pickling fails part-way.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(svm_clf1, model_file)