In [None]:
#importing the basic libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import missingno as msno
import warnings
warnings.filterwarnings("ignore")

%matplotlib inline

# Apply seaborn's "darkgrid" style to the figures drawn below.
sns.set(style="darkgrid")

# Lift pandas' column display limit so wide frames are shown in full.
pd.set_option("display.max_columns", None)

In [None]:
# Read the water-potability CSV; the tokens below are parsed as missing values.
na_vals = ["NA", "Missing", "NaN", "None"]
df = pd.read_csv(
    "../input/water-potability/water_potability.csv",
    na_values=na_vals,
)

# Preview the first five rows.
df.head()

In [None]:
# Dimensions first, then per-column dtypes and non-null counts.
print(f"Shape: {df.shape} \n")
df.info()

In [None]:
# Visual check of missing values: draw missingno's matrix plot for the raw frame.
msno.matrix(df, color=" 0.1")
plt.show()

In [None]:
# Descriptive statistics of the raw dataset's columns.
df.describe()

In [None]:
# Group the column names by feature type, for use in the analysis below.

# Continuous numerical features.
num = [
    "ph", "Hardness", "Solids", "Chloramines", "Sulfate",
    "Conductivity", "Organic_carbon", "Trihalomethanes", "Turbidity",
]

# Nominal categorical features: every remaining column stored as int64.
cat_nom = [
    col for col in df.columns
    if col not in num and df[col].dtype == "int64"
]

print("Continuos numerical features:\n{} \nTot: {}".format(num, len(num)))
print("Nominal categorical features:\n{}\nTot: {}".format(cat_nom, len(cat_nom)))

In [None]:
# Fill the missing feature values with scikit-learn's IterativeImputer.
from sklearn.experimental import enable_iterative_imputer  # must be imported before IterativeImputer
from sklearn.impute import IterativeImputer

# Impute the predictor columns only. Passing the "Potability" label to the
# imputer would let the filled-in feature values depend on the target that the
# models below are asked to predict (label leakage).
feature_cols = df.columns.drop("Potability")

impute_it = IterativeImputer(random_state=42, initial_strategy="median")

# Work on a copy so the raw frame `df` stays untouched; column order is kept.
df_imputed = df.copy()
df_imputed[feature_cols] = impute_it.fit_transform(df[feature_cols])
# NOTE(review): the imputer is still fitted before the train/test split, so
# test rows influence the imputed values — consider fitting on the training
# split only.
df_imputed.head()

In [None]:
# Same missingness matrix, now drawn for the imputed frame.
msno.matrix(df_imputed, color=" 0.1")
plt.show()

In [None]:
# Class balance of the target, drawn as a donut chart.
#
# The plotting frame is built explicitly with a 'label' and a 'Potability'
# (count) column: the column name produced by wrapping `value_counts()` in a
# DataFrame differs between pandas versions ("Potability" before 2.0, "count"
# from 2.0 on), and each slice label is derived from its class value instead of
# relying on the row order of the counts.
class_counts = df_imputed['Potability'].value_counts()
class_labels = {0: 'Not Potable', 1: 'Potable'}
d = pd.DataFrame({
    'label': [class_labels[int(value)] for value in class_counts.index],
    'Potability': class_counts.to_numpy(),
})
fig = px.pie(d,values='Potability',names='label',hole=0.4,opacity=0.6,
            color_discrete_sequence=["red", "blue"],
             labels={'label':'Potability','Potability':'No. Of Samples'})

# Side note next to the chart and a caption inside the donut hole.
fig.add_annotation(text='We can resample the data<br> to get a balanced dataset',
                   x=1.2,y=0.9,showarrow=False,font_size=12,opacity=0.7,font_family='monospace')
fig.add_annotation(text='Potability',
                   x=0.5,y=0.5,showarrow=False,font_size=14,opacity=0.7,font_family='monospace')

fig.update_layout(
    font_family='monospace',
    title=dict(text='Water potability',x=0.47,y=0.98,
               font=dict(color= "black",size=20)),
    legend=dict(x=0.37,y=-0.05,orientation='h',traceorder='reversed'),
    hoverlabel=dict(bgcolor='white'))

# Show percentage and label outside each slice.
fig.update_traces(textposition='outside', textinfo='percent+label')

fig.show()

In [None]:
# Box plots of the numerical features, to look for outliers.
plt.figure(figsize=(30,15))
for position, feature in enumerate(num, start=1):
    plt.subplot(3,3,position)  # 3x3 grid, one panel per feature
    sns.boxplot(data=df_imputed, x=feature)

# Adjust the spacing once, after every panel exists (previously this ran on
# each loop iteration).
plt.tight_layout()
plt.show()

In [None]:
#import library
from sklearn.svm import OneClassSVM

# Fit the detector on the data columns only. This cell adds "scores" and
# "anomaly_Value" to df_imputed below; excluding them here keeps a re-run of
# the cell from feeding its own output back into the model.
outlier_inputs = df_imputed.drop(columns=["scores", "anomaly_Value"], errors="ignore")
# NOTE(review): the inputs are unscaled and still contain the "Potability"
# label — confirm that this is intended for the outlier model.

# identify outliers; rows labelled -1 are treated as outliers in the cells below
OcSVM = OneClassSVM(nu=0.05)
y_outliers = OcSVM.fit_predict(outlier_inputs)

#Find score and anomaly value
df_imputed['scores'] = OcSVM.decision_function(outlier_inputs)
df_imputed['anomaly_Value'] = y_outliers

# Preview the first ten rows with the two new columns.
df_imputed.head(10)

In [None]:
# Rows whose anomaly_Value is -1, i.e. the predicted outliers.
df_imputed.loc[df_imputed["anomaly_Value"].eq(-1)]

In [None]:
# Share of rows flagged as outliers (anomaly_Value == -1), as a percentage.
# The mean of the boolean mask is the flagged fraction; the former re-wrapping
# of df_imputed in pd.DataFrame was a no-op and has been removed.
outlier_pct = (df_imputed["anomaly_Value"] == -1).mean() * 100
print("Percentage of anomalies(outliers) in the dataset: {:.2f}".format(outlier_pct))

In [None]:
# Drop the outlier rows with a single boolean filter.
# This replaces a Python loop that dropped one index label at a time in place:
# that was quadratic, assumed labels equal positions, and raised KeyError when
# the cell was run a second time. The surviving rows keep their original index
# labels; .copy() detaches the result from the parent frame.
df_imputed = df_imputed.loc[df_imputed["anomaly_Value"] != -1].copy()

#checking new shape
df_imputed.shape

In [None]:
# Remove the two outlier-detection helper columns. Rebinding the name with
# errors="ignore" (instead of an in-place drop) keeps the cell re-runnable.
df_imputed = df_imputed.drop(columns=["scores", "anomaly_Value"], errors="ignore")

# Correlation matrix between all columns; `corr()` uses Pearson's coefficient
# by default. Values are rounded to two decimals for the cell annotations.
corr_matrix = df_imputed.corr().round(2)

# Diverging colour scale centred on 0, so the sign of a correlation is visible.
fig = px.imshow(corr_matrix, text_auto=True, title="Correlations between features",
                labels=dict(color="Correlation"), color_continuous_scale=px.colors.sequential.RdBu_r,
                color_continuous_midpoint=0)
fig.update_layout(autosize=False, width=800, height=800)
fig.show()

In [None]:
# One grid for all column pairs: regression fits off the diagonal (lower
# triangle only, because corner=True) and density estimates on the diagonal.
g = sns.PairGrid(df_imputed, corner=True, diag_sharey=False)
g.map_offdiag(
    sns.regplot,
    line_kws={'color': 'red'},
    scatter_kws={'alpha': 0.15},
)
g.map_diag(sns.kdeplot)
plt.show()

In [None]:
# Mean of every column, computed separately for each Potability value.
df_imputed.groupby('Potability').mean()

In [None]:
#t-test for each variable
from scipy import stats

# Boolean masks selecting the two classes.
not_pot = df_imputed.Potability == 0
pot = df_imputed.Potability == 1

# (label used in the printed line, column name in df_imputed)
ttest_features = [
    ("pH", "ph"),
    ("Hardness", "Hardness"),
    ("Solids", "Solids"),
    ("Chloramines", "Chloramines"),
    ("Sulfate", "Sulfate"),
    ("Conductivity", "Conductivity"),
    ("Organic_carbon", "Organic_carbon"),
    ("Trihalomethanes", "Trihalomethanes"),
    ("Turbidity", "Turbidity"),
]

# One loop replaces nine copy-pasted variable/test/print triples; the printed
# lines are unchanged.
for label, column in ttest_features:
    values = df_imputed[column]
    # equal_var=False: the two groups are not assumed to share a variance.
    t_value, p_value = stats.ttest_ind(values[not_pot], values[pot], axis=0, equal_var=False)
    print('%s: t statistic %0.3f p-value %0.3f' % (label, t_value, p_value))

In [None]:
# Last eight rows of the frame.
df_imputed.tail(8)

In [None]:
# Target: the Potability column.
y = df_imputed["Potability"]

# Feature matrix: every column except the target.
X = df_imputed.drop(columns="Potability")

In [None]:
# Data preparation: stratified train/test split followed by standardisation.
import warnings

from sklearn.model_selection import train_test_split as split
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings("ignore")

# Hold out 20% of the rows; stratify=y keeps the class ratio in both parts.
X_train, X_test, y_train, y_test = split(
    X, y,
    test_size=0.2,
    random_state=0,
    shuffle=True,
    stratify=y,
)
print(X_train.shape)
print(y_train.shape)

# Fit the scaler on the training part only, then apply it to both parts.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
#loading all the sklearn modules I need
from functools import partial

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import mutual_info_classif

# Mutual-information scoring, keeping all features (k='all').
# mutual_info_classif adds random noise to continuous features, so without a
# fixed seed the scores change from run to run; random_state=0 pins them.
mutual = SelectKBest(score_func=partial(mutual_info_classif, random_state=0), k='all')

#learn relationship from training data
mutual.fit(X_train, y_train)

# transform train and test input data
X_train_mut = mutual.transform(X_train)
X_test_mut = mutual.transform(X_test)

#printing scores of the features (by column position)
for i, score in enumerate(mutual.scores_):
    print('Feature %d: %f' % (i, score))

In [None]:
# Table of mutual-information scores, one row per feature, highest first.
mutual_score = pd.DataFrame(
    mutual.scores_,
    columns=["Mutual_Score"],
    index=df_imputed.drop(columns=["Potability"]).columns,
).sort_values(by="Mutual_Score", ascending=False)

# Bar chart of the ranked scores.
sns.set()
plt.figure(figsize=(10, 4))
plt.bar(x=mutual_score.index, height=mutual_score["Mutual_Score"])
plt.xticks(rotation=90)
plt.show()


In [None]:
# ANOVA F-test scoring, configured to keep all features (k='all').
an = SelectKBest(score_func=f_classif, k='all')

# learn relationship from training data
an.fit(X_train, y_train)

# transform train and test input data
X_train_an = an.transform(X_train)
X_test_an = an.transform(X_test)

#printing scores of the features (by column position)
# Fixed: this loop used to print mutual.scores_ (the mutual-information
# scores of the previous selector) instead of the ANOVA scores computed here.
for i, score in enumerate(an.scores_):
    print('Feature %d: %f' % (i, score))

In [None]:
# Table of ANOVA F-scores, one row per feature, highest first.
an_score = pd.DataFrame(
    an.scores_,
    columns=["Anova_Score"],
    index=df_imputed.drop(columns=["Potability"]).columns,
).sort_values(by="Anova_Score", ascending=False)

# Bar chart of the ranked scores.
sns.set()
plt.figure(figsize=(10, 4))
plt.bar(x=an_score.index, height=an_score["Anova_Score"])
plt.xticks(rotation=90)
plt.show()

In [None]:
# Baseline: fit four untuned classifiers and rank them by test accuracy.
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

models = [
    ("LR", LogisticRegression(max_iter=1000, random_state=0)),
    ('KNN', KNeighborsClassifier(n_neighbors=10)),
    ('RF', RandomForestClassifier(random_state=0)),
    ("SVC", SVC()),
]

# Collect (name, accuracy) pairs in fitting order.
finalResults = []
for name, model in models:
    model_results = model.fit(X_train, y_train).predict(X_test)
    score = accuracy_score(y_test, model_results)
    finalResults.append((name, score))

# Parallel lists in fitting order, taken before the ranking below.
names = [entry[0] for entry in finalResults]
results = [entry[1] for entry in finalResults]

# Best accuracy first.
finalResults = sorted(finalResults, key=lambda k: k[1], reverse=True)
finalResults

In [None]:
# Grid-search a random forest inside a SMOTETomek resampling pipeline.
from imblearn.combine import SMOTETomek
from sklearn.model_selection import StratifiedKFold  # stratified folds
from sklearn.model_selection import GridSearchCV  # hyperparameter search
from imblearn.pipeline import Pipeline as imbpipeline

model_rf = RandomForestClassifier(random_state=0)
pipeline = imbpipeline(steps=[
    ['smotetomek', SMOTETomek(random_state=0, n_jobs=-1)],
    ['rf', model_rf],
])

# 8 shuffled, stratified folds with a fixed seed.
crossval = StratifiedKFold(n_splits=8, shuffle=True, random_state=0)

# Candidate values; the "rf__" prefix targets the pipeline's 'rf' step.
rf_params = [{
    "rf__n_estimators": [10, 100, 1000],
    "rf__max_features": ['sqrt', 'log2'],
    "rf__criterion": ['gini', 'entropy'],
    "rf__max_depth": [11, 21, 51],
}]

rf_grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=rf_params,
    scoring='accuracy',
    cv=crossval,
    verbose=1,
    n_jobs=-1,
)

rf_grid_search.fit(X_train, y_train)
print(f"Best score: {rf_grid_search.best_score_}, and best hyperparameters: {rf_grid_search.best_params_}")

In [None]:
# Predictions of the best random-forest pipeline on both splits.
y_train_pred_rf = rf_grid_search.best_estimator_.predict(X_train)
y_test_pred_rf = rf_grid_search.best_estimator_.predict(X_test)

# Accuracy, rounded to two decimals.
print(f'Model accuracy on test set: {round(accuracy_score(y_test, y_test_pred_rf), 2)}')
print(f'Model accuracy on training set: {round(accuracy_score(y_train, y_train_pred_rf), 2)}')

In [None]:
from sklearn.metrics import classification_report

# Classification report for the random forest's test-set predictions.
target_names = ['Not Potable', 'Potable']
rf_report = classification_report(y_test, y_test_pred_rf, target_names=target_names)
print(rf_report)

In [None]:
# Start the model-comparison table with the random-forest row:
# [model, train accuracy, separator, test accuracy].
# The model cell holds best_estimator_ (the pipeline refitted with the winning
# hyperparameters), which is what produced the two accuracies; `.estimator` is
# only the untuned template that was handed to the grid search.
tab = [[
    rf_grid_search.best_estimator_,
    accuracy_score(y_train, y_train_pred_rf),
    " --- ",
    accuracy_score(y_test, y_test_pred_rf),
]]

#### SVC

In [None]:
# Grid-search an SVC inside a SMOTETomek resampling pipeline.
# probability=True enables predict_proba, which the curve cells further down
# call. The probability estimates involve randomness, so random_state is fixed
# to make them (and the figures built on them) reproducible between runs.
model_svc = SVC(probability=True, random_state=0)
pipeline = imbpipeline(steps = [['smotetomek', SMOTETomek(random_state=0, n_jobs=-1)],
                                ['SVC', model_svc]])

# 8 shuffled, stratified folds with a fixed seed.
crossval = StratifiedKFold(n_splits=8, shuffle=True, random_state=0)

#preparing parameter values to be validated
# One sub-grid per kernel, so each kernel is only combined with the parameters
# listed for it; the "SVC__" prefix targets the pipeline's 'SVC' step.
svc_params = [
               {"SVC__kernel": ["linear"], "SVC__C": [ 0.1, 1, 10,]},
               {"SVC__kernel": ["rbf"], "SVC__C": [0.01, 0.1, 1, 10, 100], "SVC__gamma": [0.01, 0.1, 1, 10, 100]},
               {"SVC__kernel": ["poly"], "SVC__C": [0.01, 0.1, 1, 10], "SVC__degree": np.arange(1,5,1)}
              ]

svc_grid_search = GridSearchCV(estimator=pipeline,
                           param_grid=svc_params,
                           scoring='accuracy',
                           cv=crossval, verbose=1,
                           n_jobs=-1)

svc_grid_search.fit(X_train, y_train)
print("Best score: {}, and best hyperparameters: {}".format(svc_grid_search.best_score_, svc_grid_search.best_params_ ))

In [None]:
# Predictions of the best SVC pipeline on both splits.
y_train_pred_svc = svc_grid_search.best_estimator_.predict(X_train)
y_test_pred_svc = svc_grid_search.best_estimator_.predict(X_test)

# Accuracy, rounded to two decimals.
print(f'Model accuracy on test set: {round(accuracy_score(y_test, y_test_pred_svc), 2)}')
print(f'Model accuracy on training set: {round(accuracy_score(y_train, y_train_pred_svc), 2)}')

In [None]:
# Classification report for the SVC's test-set predictions.
target_names = ['Not Potable', 'Potable']
svc_report = classification_report(y_test, y_test_pred_svc, target_names=target_names)
print(svc_report)

In [None]:
# Add the SVC row: [model, train accuracy, separator, test accuracy].
# The model cell holds best_estimator_ (the pipeline refitted with the winning
# hyperparameters), which is what produced the two accuracies; `.estimator` is
# only the untuned template that was handed to the grid search.
tab.append([
    svc_grid_search.best_estimator_,
    accuracy_score(y_train, y_train_pred_svc),
    " --- ",
    accuracy_score(y_test, y_test_pred_svc),
])

#creating dataframe in order to compare the models
models_table = pd.DataFrame(tab, columns=["model", "accuracy_train",
                                   " --- ", "accuracy_test",])
models_table

In [None]:
from sklearn.metrics import precision_recall_curve

# Predicted class probabilities on the test set (1 = RF, 2 = SVC).
pred_prob1 = rf_grid_search.predict_proba(X_test)
pred_prob2 = svc_grid_search.predict_proba(X_test)

# Precision-recall curves computed from the second probability column.
precision1, recall1, thresholds1 = precision_recall_curve(
    y_test, pred_prob1[:, 1]
)
precision2, recall2, thresholds2 = precision_recall_curve(
    y_test, pred_prob2[:, 1]
)

In [None]:
from sklearn import metrics

# Area under each precision-recall curve (recall on x, precision on y).
auc_prc1 = metrics.auc(recall1, precision1)
auc_prc2 = metrics.auc(recall2, precision2)
print(f"RF AUC PRC score:{auc_prc1}, SVC AUC PRC score:{auc_prc2}")

In [None]:
# Precision-recall curves of both tuned models on the same axes.
for rec, prec, shade, tag in (
    (recall1, precision1, 'orange', 'Random Forest'),
    (recall2, precision2, 'green', 'SVC'),
):
    plt.plot(rec, prec, linestyle='--', color=shade, label=tag)

plt.title('PRC curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend(loc='best')
plt.show()

In [None]:
from sklearn.metrics import roc_curve

# Predicted class probabilities on the test set (1 = RF, 2 = SVC).
pred_prob1 = rf_grid_search.predict_proba(X_test)
pred_prob2 = svc_grid_search.predict_proba(X_test)

# ROC curves computed from the second probability column, positive label 1.
fpr1, tpr1, thresholds1 = roc_curve(y_test, pred_prob1[:, 1], pos_label=1)
fpr2, tpr2, thresholds2 = roc_curve(y_test, pred_prob2[:, 1], pos_label=1)

# Reference curve for tpr = fpr: every sample gets the same score.
random_probs = [0] * len(y_test)
p_fpr, p_tpr, _ = roc_curve(y_test, random_probs, pos_label=1)

In [None]:
from sklearn.metrics import roc_auc_score

# Area under each ROC curve, from the second probability column.
auc_score1 = roc_auc_score(y_test, pred_prob1[:, 1])
auc_score2 = roc_auc_score(y_test, pred_prob2[:, 1])
print(f"RF AUC ROC score:{auc_score1}, SVC AUC ROC score:{auc_score2}")

In [None]:
# ROC curves of both tuned models, plus the unlabelled tpr = fpr reference.
for x_vals, y_vals, shade, tag in (
    (fpr1, tpr1, 'orange', 'Random Forest'),
    (fpr2, tpr2, 'green', 'SVC'),
):
    plt.plot(x_vals, y_vals, linestyle='--', color=shade, label=tag)
plt.plot(p_fpr, p_tpr, linestyle='--', color='blue')

plt.title('ROC curve')
plt.xlabel('False positive Rate')
plt.ylabel('True positive Rate')
plt.legend(loc='best')
plt.show()

In [None]:
# Reduced frame: four selected features plus the target.
df_red = df_imputed[
    ['Conductivity', 'Sulfate', 'Hardness', 'ph', 'Potability']
]

# Preview the first five rows.
df_red.head()

In [None]:
# Target: the Potability column of the reduced frame.
y = df_red["Potability"]

# Feature matrix: the reduced frame without the target.
X = df_red.drop(columns="Potability")

In [None]:
# Hold out 20% of the rows; stratify=y keeps the class ratio in both parts.
X_train, X_test, y_train, y_test = split(
    X, y,
    test_size=0.2,
    random_state=0,
    shuffle=True,
    stratify=y,
)
print(X_train.shape)
print(y_train.shape)

# Fit the scaler on the training part only, then apply it to both parts.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Baseline on the reduced feature set: four untuned classifiers ranked by
# test accuracy.
models = [
    ("LR", LogisticRegression(max_iter=1000, random_state=0)),
    ('KNN', KNeighborsClassifier(n_neighbors=10)),
    ('RF', RandomForestClassifier(random_state=0)),
    ("SVC", SVC()),
]

# Collect (name, accuracy) pairs in fitting order.
finalResults = []
for name, model in models:
    model_results = model.fit(X_train, y_train).predict(X_test)
    score = accuracy_score(y_test, model_results)
    finalResults.append((name, score))

# Parallel lists in fitting order, taken before the ranking below.
names = [entry[0] for entry in finalResults]
results = [entry[1] for entry in finalResults]

# Best accuracy first.
finalResults = sorted(finalResults, key=lambda k: k[1], reverse=True)
finalResults

In [None]:
# Grid-search a random forest inside a SMOTETomek resampling pipeline
# (reduced feature set).
model_rf = RandomForestClassifier(random_state=0)
pipeline = imbpipeline(steps=[
    ['smotetomek', SMOTETomek(random_state=0, n_jobs=-1)],
    ['rf', model_rf],
])

# 8 shuffled, stratified folds with a fixed seed.
crossval = StratifiedKFold(n_splits=8, shuffle=True, random_state=0)

# Candidate values; the "rf__" prefix targets the pipeline's 'rf' step.
rf_params = [{
    "rf__n_estimators": [10, 100, 1000],
    "rf__max_features": ['sqrt', 'log2'],
    "rf__criterion": ['gini', 'entropy'],
    "rf__max_depth": [11, 21, 51],
}]

rf_grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=rf_params,
    scoring='accuracy',
    cv=crossval,
    verbose=1,
    n_jobs=-1,
)

rf_grid_search.fit(X_train, y_train)
print(f"Best score: {rf_grid_search.best_score_}, and best hyperparameters: {rf_grid_search.best_params_}")

In [None]:
# Predictions of the best random-forest pipeline on both splits.
y_train_pred_rf = rf_grid_search.best_estimator_.predict(X_train)
y_test_pred_rf = rf_grid_search.best_estimator_.predict(X_test)

# Accuracy, rounded to two decimals.
print(f'Model accuracy on test set: {round(accuracy_score(y_test, y_test_pred_rf), 2)}')
print(f'Model accuracy on training set: {round(accuracy_score(y_train, y_train_pred_rf), 2)}')

In [None]:
# Classification report for the random forest's test-set predictions.
target_names = ['Not Potable', 'Potable']
rf_report = classification_report(y_test, y_test_pred_rf, target_names=target_names)
print(rf_report)

In [None]:
# Restart the model-comparison table with the random-forest row:
# [model, train accuracy, separator, test accuracy].
# The model cell holds best_estimator_ (the pipeline refitted with the winning
# hyperparameters), which is what produced the two accuracies; `.estimator` is
# only the untuned template that was handed to the grid search.
tab = [[
    rf_grid_search.best_estimator_,
    accuracy_score(y_train, y_train_pred_rf),
    " --- ",
    accuracy_score(y_test, y_test_pred_rf),
]]

In [None]:
# Grid-search an SVC inside a SMOTETomek resampling pipeline (reduced
# feature set).
# probability=True enables predict_proba, which the curve cells further down
# call. The probability estimates involve randomness, so random_state is fixed
# to make them (and the figures built on them) reproducible between runs.
model_svc = SVC(probability=True, random_state=0)
pipeline = imbpipeline(steps = [['smotetomek', SMOTETomek(random_state=0, n_jobs=-1)],
                                ['SVC', model_svc]])

# 8 shuffled, stratified folds with a fixed seed.
crossval = StratifiedKFold(n_splits=8, shuffle=True, random_state=0)

#preparing parameter values to be validated
# One sub-grid per kernel, so each kernel is only combined with the parameters
# listed for it; the "SVC__" prefix targets the pipeline's 'SVC' step.
svc_params = [
               {"SVC__kernel": ["linear"], "SVC__C": [ 0.1, 1, 10,]},
               {"SVC__kernel": ["rbf"], "SVC__C": [0.01, 0.1, 1, 10, 100], "SVC__gamma": [0.01, 0.1, 1, 10, 100]},
               {"SVC__kernel": ["poly"], "SVC__C": [0.01, 0.1, 1, 10], "SVC__degree": np.arange(1,5,1)}
              ]

svc_grid_search = GridSearchCV(estimator=pipeline,
                           param_grid=svc_params,
                           scoring='accuracy',
                           cv=crossval, verbose=1,
                           n_jobs=-1)

svc_grid_search.fit(X_train, y_train)
print("Best score: {}, and best hyperparameters: {}".format(svc_grid_search.best_score_, svc_grid_search.best_params_ ))

In [None]:
# Predictions of the best SVC pipeline on both splits.
y_train_pred_svc = svc_grid_search.best_estimator_.predict(X_train)
y_test_pred_svc = svc_grid_search.best_estimator_.predict(X_test)

# Accuracy, rounded to two decimals.
print(f'Model accuracy on test set: {round(accuracy_score(y_test, y_test_pred_svc), 2)}')
print(f'Model accuracy on training set: {round(accuracy_score(y_train, y_train_pred_svc), 2)}')

In [None]:
# Classification report for the SVC's test-set predictions.
target_names = ['Not Potable', 'Potable']
svc_report = classification_report(y_test, y_test_pred_svc, target_names=target_names)
print(svc_report)

In [None]:
# Add the SVC row: [model, train accuracy, separator, test accuracy].
# The model cell holds best_estimator_ (the pipeline refitted with the winning
# hyperparameters), which is what produced the two accuracies; `.estimator` is
# only the untuned template that was handed to the grid search.
tab.append([
    svc_grid_search.best_estimator_,
    accuracy_score(y_train, y_train_pred_svc),
    " --- ",
    accuracy_score(y_test, y_test_pred_svc),
])

#creating dataframe in order to compare the models
models_table = pd.DataFrame(tab, columns=["model", "accuracy_train",
                                   " --- ", "accuracy_test",])
models_table

In [None]:
# Predicted class probabilities on the test set (1 = RF, 2 = SVC).
pred_prob1 = rf_grid_search.predict_proba(X_test)
pred_prob2 = svc_grid_search.predict_proba(X_test)

# Precision-recall curves computed from the second probability column.
precision1, recall1, thresholds1 = precision_recall_curve(
    y_test, pred_prob1[:, 1]
)
precision2, recall2, thresholds2 = precision_recall_curve(
    y_test, pred_prob2[:, 1]
)

In [None]:
# Area under each precision-recall curve (recall on x, precision on y).
auc_prc1 = metrics.auc(recall1, precision1)
auc_prc2 = metrics.auc(recall2, precision2)
print(f"RF AUC PRC score:{auc_prc1}, SVC AUC PRC score:{auc_prc2}")

In [None]:
# Precision-recall curves of both tuned models on the same axes.
for rec, prec, shade, tag in (
    (recall1, precision1, 'orange', 'Random Forest'),
    (recall2, precision2, 'green', 'SVC'),
):
    plt.plot(rec, prec, linestyle='--', color=shade, label=tag)

plt.title('PRC curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend(loc='best')
plt.show()

In [None]:
# Predicted class probabilities on the test set (1 = RF, 2 = SVC).
pred_prob1 = rf_grid_search.predict_proba(X_test)
pred_prob2 = svc_grid_search.predict_proba(X_test)

# ROC curves computed from the second probability column, positive label 1.
fpr1, tpr1, thresholds1 = roc_curve(y_test, pred_prob1[:, 1], pos_label=1)
fpr2, tpr2, thresholds2 = roc_curve(y_test, pred_prob2[:, 1], pos_label=1)

# Reference curve for tpr = fpr: every sample gets the same score.
random_probs = [0] * len(y_test)
p_fpr, p_tpr, _ = roc_curve(y_test, random_probs, pos_label=1)

In [None]:
# Area under each ROC curve, from the second probability column.
auc_score1 = roc_auc_score(y_test, pred_prob1[:, 1])
auc_score2 = roc_auc_score(y_test, pred_prob2[:, 1])
print(f"RF AUC ROC score:{auc_score1}, SVC AUC ROC score:{auc_score2}")

In [None]:
# ROC curves of both tuned models, plus the unlabelled tpr = fpr reference.
for x_vals, y_vals, shade, tag in (
    (fpr1, tpr1, 'orange', 'Random Forest'),
    (fpr2, tpr2, 'green', 'SVC'),
):
    plt.plot(x_vals, y_vals, linestyle='--', color=shade, label=tag)
plt.plot(p_fpr, p_tpr, linestyle='--', color='blue')

plt.title('ROC curve')
plt.xlabel('False positive Rate')
plt.ylabel('True positive Rate')
plt.legend(loc='best')
plt.show()