In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
from sklearn.metrics import make_scorer,accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report

In [2]:
# Read the dataset from the working directory and report its (rows, columns) size.
df = pd.read_csv(filepath_or_buffer='final.csv')
df.shape

(3578, 15)

In [3]:
# Peek at five randomly chosen rows (no seed is set, so the rows differ per run).
df.sample(n=5)

Unnamed: 0,Age,Height,Weight,BMI,Blood Group ( is Negative?),Delivery Type,Systolic Blood Pressure,Diastolic Blood Pressure,Edema,Anemia,Jaundice,Diabetes,Risk,Para,Gravida
1690,17,1.524,58.3,25.1,No,No_delivary,120,70,Normal,Low,No,No,Yes,0,1
3045,19,1.524,48.0,20.67,No,No_delivary,100,80,Normal,Low,No,No,No,0,1
1733,24,1.6,64.0,24.99,No,No_delivary,100,60,Normal,Normal,No,No,Yes,0,1
1021,25,1.549,66.0,27.51,No,Normal,110,80,Normal,Normal,No,No,No,1,3
1954,18,1.625,64.0,24.22,No,No_delivary,90,90,Normal,Normal,No,No,Yes,0,1


In [8]:
# Work on an independent copy so the frame loaded from disk stays untouched.
df1 = df.copy(deep=True)
df1.head(n=5)

Unnamed: 0,Age,Height,Weight,BMI,Blood Group ( is Negative?),Delivery Type,Systolic Blood Pressure,Diastolic Blood Pressure,Edema,Anemia,Jaundice,Diabetes,Risk,Para,Gravida
0,26,1.346,95.0,52.42,No,Normal,120,80,Normal,Normal,No,No,Yes,1,2
1,26,1.346,95.0,52.42,No,Normal,140,60,Normal,Normal,No,No,Yes,1,2
2,18,1.244,78.0,50.4,No,Normal,120,70,Normal,Normal,No,No,Yes,1,3
3,26,1.27,77.0,47.74,No,Normal,100,60,Normal,Normal,No,No,Yes,1,2
4,26,1.27,77.0,47.74,No,Normal,110,80,Normal,Normal,No,No,Yes,1,2


# Data Analysis

In [5]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [6]:
# Column names selected for the per-feature exploratory plots.
columns = [
    'Age',
    'Height',
    'Blood Group ( is Negative?)',
    'Delivery Type',
    'Systolic Blood Pressure',
    'Diastolic Blood Pressure',
    'Edema',
    'Anemia',
    'Jaundice',
    'Diabetes',
    'Para',
    'Gravida',
]
columns

['Age',
 'Height',
 'Blood Group ( is Negative?)',
 'Delivery Type',
 'Systolic Blood Pressure',
 'Diastolic Blood Pressure',
 'Edema',
 'Anemia',
 'Jaundice',
 'Diabetes',
 'Para',
 'Gravida']

In [7]:
# Number of rows in each class of the target column.
df1.Risk.value_counts()

NameError: name 'df1' is not defined

In [None]:
# Pie chart of the target's class balance.
risk_counts = df1['Risk'].value_counts()

# value_counts() orders the classes by frequency, so each slice's label and
# colour is derived from the class value itself instead of assuming that 'No'
# is the most common class. Both the raw strings and their label-encoded 0/1
# forms are recognised, so the cell also works when re-run after encoding.
display_names = {'No': 'No Risk', 'Yes': 'Risk', 0: 'No Risk', 1: 'Risk'}
slice_colors = {'No Risk': '#c9ee82', 'Risk': '#8293ee'}
slice_labels = [display_names.get(value, str(value)) for value in risk_counts.index]

plt.figure(figsize=(10,5))
plt.pie(
    risk_counts,
    labels=slice_labels,
    autopct="%0.2f",
    # Pull out only the first (most frequent) slice, however many classes exist.
    explode=[0.1 if position == 0 else 0 for position in range(len(risk_counts))],
    colors=[slice_colors.get(label, '#cccccc') for label in slice_labels],
)
plt.legend(title="Pregnancy Risk Prediction")
plt.show()

In [None]:
# for x in columns:
#     plt.figure(figsize=(15,5))
#     plt.title(x)
#     sns.countplot(data=df1, x=x, hue="Risk")
# plt.tight_layout()

In [None]:
# Annotated heatmap of the pairwise correlations between the numeric columns.
plt.figure(figsize=(12,6))
plt.title("Co-relation between independent variable")
# The frame still contains string columns at this point; DataFrame.corr()
# raises on those in pandas >= 2.0, so restrict the input to numeric columns
# explicitly (select_dtypes works on every pandas version).
sns.heatmap(df1.select_dtypes(include='number').corr(), annot=True)
plt.show()

# Data Preprocessing

In [None]:
# Show the dtype of every column in the working frame.
df1.dtypes

In [None]:
# Count the missing values in each column before any imputation is applied.
df1.isnull().sum()

In [None]:
# df1.loc[df1.Height.isna()]

In [None]:
# Row labels whose Height is missing, captured before imputation so that BMI
# can later be recomputed for exactly these rows. The vectorised isna() replaces
# a per-element apply(np.isnan) and also copes with None / object values.
height_nan_index = df1.index[df1['Height'].isna()]
height_nan_index

In [None]:
# Replace missing heights with the mean of the observed heights.
df1['Height'] = df1['Height'].fillna(value=df1['Height'].mean())

In [None]:
# Number of Height values that are still missing after the mean fill.
df1['Height'].isnull().sum()

In [None]:
# Recompute BMI = weight / height**2 (rounded to 2 decimals) for the rows whose
# Height was imputed. The write goes through a single .loc assignment: the
# chained form `df1.BMI[i] = ...` may assign to a temporary object and is
# silently ignored when pandas copy-on-write is active.
df1.loc[height_nan_index, 'BMI'] = (
    df1.loc[height_nan_index, 'Weight'].astype(float)
    / df1.loc[height_nan_index, 'Height'] ** 2
).round(2)

In [None]:
# Re-count missing values per column after BMI was recomputed for the rows
# with imputed Height.
df1.isnull().sum()

In [None]:
# Row labels whose BMI is still missing. The vectorised isna() replaces a
# per-element apply(np.isnan) and also copes with None / object values.
bmi_nan_index = df1.index[df1['BMI'].isna()]
bmi_nan_index

In [None]:
# Fill the remaining BMI gaps with weight / height**2 (rounded to 2 decimals).
# The write goes through a single .loc assignment: the chained form
# `df1.BMI[i] = ...` may assign to a temporary object and is silently ignored
# when pandas copy-on-write is active.
df1.loc[bmi_nan_index, 'BMI'] = (
    df1.loc[bmi_nan_index, 'Weight'].astype(float)
    / df1.loc[bmi_nan_index, 'Height'] ** 2
).round(2)

In [None]:
# Re-count missing values per column after the remaining BMI gaps were filled.
df1.isnull().sum()

In [None]:
# Fill missing systolic readings with the constant 120.
# NOTE(review): 120 is presumably meant as a typical normal reading — confirm.
df1['Systolic Blood Pressure'] = df1['Systolic Blood Pressure'].fillna(value=120)

In [None]:
# Fill missing diastolic readings with the constant 80.
# NOTE(review): 80 is presumably meant as a typical normal reading — confirm.
df1['Diastolic Blood Pressure'] = df1['Diastolic Blood Pressure'].fillna(value=80)

In [None]:
# Final missing-value count per column after the blood-pressure fills.
df1.isnull().sum()

# Model Building

In [None]:
from sklearn.preprocessing import LabelEncoder

# Columns converted from labels to integer codes (they hold Yes/No values in
# the sample rows shown earlier — confirm for the full dataset).
encoder = LabelEncoder()
col = ['Blood Group ( is Negative?)','Jaundice','Diabetes','Risk']
for x in col:
    # Encode the cleaned working frame (df1) rather than the raw frame (df), so
    # the encoding always reflects the preprocessing applied above.
    # The encoder is re-fitted per column, so afterwards it only remembers the
    # classes of the last column.
    df1[x] = encoder.fit_transform(df1[x])
df1.head(5)

In [None]:
# One-hot encode the remaining categorical columns, dropping the first level
# of each to avoid a redundant indicator column.
df2 = pd.get_dummies(data=df1, drop_first=True)
df2.sample(n=2)

In [None]:
# Feature matrix: everything except the target and the raw Height / Weight
# columns (BMI stays in as a feature).
X = df2.drop(['Risk', 'Height', 'Weight'], axis=1)
# 'Unnamed: 0' is the row number written into the CSV, not a measurement;
# leaving it in lets the models learn from row order. errors='ignore' keeps
# the cell working if the CSV is ever loaded without that column.
X = X.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Target vector: the Risk column of the encoded frame.
y = df2['Risk']

In [None]:
from sklearn.svm import SVC

# Labels pooled across all folds; the scorer below appends to these lists.
originalclass, predictedclass = [], []


def classification_report_with_accuracy_score(y_true, y_pred):
    """Return the accuracy of one CV fold and record that fold's labels.

    The true and predicted labels are appended to the module-level
    ``originalclass`` / ``predictedclass`` lists so that one classification
    report covering every fold can be printed afterwards.
    """
    predictedclass.extend(y_pred)
    originalclass.extend(y_true)
    return accuracy_score(y_true, y_pred)


# 10-fold cross-validation of a default SVC, scored through the recording scorer.
nested_score = cross_val_score(
    SVC(),
    X,
    y,
    scoring=make_scorer(classification_report_with_accuracy_score),
    cv=10,
)

print('cross_val_score : ', round(nested_score.mean(), 2))
print('\nClassification Report : ')
print(classification_report(originalclass, predictedclass))

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Labels pooled across all folds; the scorer below appends to these lists.
originalclass = []
predictedclass = []

def classification_report_with_accuracy_score(y_true, y_pred):
    """Return the accuracy of one CV fold and record that fold's labels.

    The true and predicted labels are appended to the module-level
    ``originalclass`` / ``predictedclass`` lists so that one classification
    report covering every fold can be printed afterwards.
    """
    originalclass.extend(y_true)
    predictedclass.extend(y_pred)
    return accuracy_score(y_true, y_pred) # return accuracy score


# A random forest is stochastic (bootstrap samples, random feature subsets);
# the fixed random_state makes the reported scores reproducible across runs.
nested_score = cross_val_score(
    RandomForestClassifier(random_state=42),
    X,
    y,
    cv=10,
    scoring=make_scorer(classification_report_with_accuracy_score),
)

print('cross_val_score : ',round(nested_score.mean(),2))
print('\nClassification Report : ')
print(classification_report(originalclass, predictedclass))

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Labels pooled across all folds; the scorer below appends to these lists.
originalclass = []
predictedclass = []

def classification_report_with_accuracy_score(y_true, y_pred):
    """Return the accuracy of one CV fold and record that fold's labels.

    The true and predicted labels are appended to the module-level
    ``originalclass`` / ``predictedclass`` lists so that one classification
    report covering every fold can be printed afterwards.
    """
    originalclass.extend(y_true)
    predictedclass.extend(y_pred)
    return accuracy_score(y_true, y_pred) # return accuracy score


# Tie-breaking between equally good splits is randomised in a decision tree;
# the fixed random_state makes the reported scores reproducible across runs.
nested_score = cross_val_score(
    DecisionTreeClassifier(random_state=42),
    X,
    y,
    cv=10,
    scoring=make_scorer(classification_report_with_accuracy_score),
)

print('cross_val_score : ',round(nested_score.mean(),2))
print('\nClassification Report : ')
print(classification_report(originalclass, predictedclass))

In [None]:
from sklearn.linear_model import LogisticRegression

# Labels pooled across all folds; the scorer below appends to these lists.
originalclass, predictedclass = [], []


def classification_report_with_accuracy_score(y_true, y_pred):
    """Score a single fold by accuracy while collecting its labels.

    Side effect: extends the module-level ``originalclass`` and
    ``predictedclass`` lists with this fold's true and predicted labels.
    """
    predictedclass.extend(y_pred)
    originalclass.extend(y_true)
    return accuracy_score(y_true, y_pred)


# 10-fold cross-validation of a default LogisticRegression.
nested_score = cross_val_score(
    LogisticRegression(),
    X,
    y,
    scoring=make_scorer(classification_report_with_accuracy_score),
    cv=10,
)

print('cross_val_score : ', round(nested_score.mean(), 2))
print('\nClassification Report : ')
print(classification_report(originalclass, predictedclass))

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Labels pooled across all folds; the scorer below appends to these lists.
originalclass, predictedclass = [], []


def classification_report_with_accuracy_score(y_true, y_pred):
    """Score a single fold by accuracy while collecting its labels.

    Side effect: extends the module-level ``originalclass`` and
    ``predictedclass`` lists with this fold's true and predicted labels.
    """
    predictedclass.extend(y_pred)
    originalclass.extend(y_true)
    return accuracy_score(y_true, y_pred)


# 10-fold cross-validation of a default KNeighborsClassifier.
nested_score = cross_val_score(
    KNeighborsClassifier(),
    X,
    y,
    scoring=make_scorer(classification_report_with_accuracy_score),
    cv=10,
)

print('cross_val_score : ', round(nested_score.mean(), 2))
print('\nClassification Report : ')
print(classification_report(originalclass, predictedclass))

In [None]:
from sklearn.naive_bayes import GaussianNB

# Labels pooled across all folds; the scorer below appends to these lists.
originalclass, predictedclass = [], []


def classification_report_with_accuracy_score(y_true, y_pred):
    """Score a single fold by accuracy while collecting its labels.

    Side effect: extends the module-level ``originalclass`` and
    ``predictedclass`` lists with this fold's true and predicted labels.
    """
    predictedclass.extend(y_pred)
    originalclass.extend(y_true)
    return accuracy_score(y_true, y_pred)


# 10-fold cross-validation of a default GaussianNB.
nested_score = cross_val_score(
    GaussianNB(),
    X,
    y,
    scoring=make_scorer(classification_report_with_accuracy_score),
    cv=10,
)

print('cross_val_score : ', round(nested_score.mean(), 2))
print('\nClassification Report : ')
print(classification_report(originalclass, predictedclass))

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix,f1_score,recall_score
import seaborn as sns

# Labels pooled across all folds; the scorer below appends to these lists.
originalclass, predictedclass = [], []


def classification_report_with_accuracy_score(y_true, y_pred):
    """Score a single fold by accuracy while collecting its labels.

    Side effect: extends the module-level ``originalclass`` and
    ``predictedclass`` lists with this fold's true and predicted labels.
    """
    predictedclass.extend(y_pred)
    originalclass.extend(y_true)
    return accuracy_score(y_true, y_pred)


# 10-fold cross-validation of a default MultinomialNB.
nested_score = cross_val_score(
    MultinomialNB(),
    X,
    y,
    scoring=make_scorer(classification_report_with_accuracy_score),
    cv=10,
)

print('cross_val_score : ', round(nested_score.mean(), 2))
print('\nClassification Report : ')
print(classification_report(originalclass, predictedclass))