#### Importing important libraries:

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
import pickle

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


#### Loading the dataset:

In [32]:
df = pd.read_csv("/content/drive/MyDrive/C Section Independent/Caesarian Section Classification Dataset(CSV).csv")
#df = pd.read_csv('Cesarean .csv')
df.head()

Unnamed: 0,Age,Delivey No,Delivery No,Blood of Pressure,Heart Problem,Caesarian
0,22,1,Timely,High,apt,No
1,26,2,Timely,Normal,apt,Yes
2,26,2,Premature,Normal,apt,No
3,28,1,Timely,High,apt,No
4,22,2,Timely,Normal,apt,Yes


In [33]:
# To know number od rows and column
df.shape

(80, 6)

In [34]:
# To know if missing value is preset and also to know dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80 entries, 0 to 79
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Age                80 non-null     int64 
 1   Delivey No         80 non-null     int64 
 2   Delivery No        80 non-null     object
 3   Blood of Pressure  80 non-null     object
 4   Heart Problem      80 non-null     object
 5   Caesarian          80 non-null     object
dtypes: int64(2), object(4)
memory usage: 3.9+ KB


In [35]:
# Five point summary
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,80.0,27.6875,5.017927,17.0,25.0,27.0,32.0,40.0
Delivey No,80.0,1.6625,0.794662,1.0,1.0,1.0,2.0,4.0


In [36]:
#To find no. of unique values in categorical columns

for col in df.select_dtypes(include=object).columns:
    print('No. of unique values in column '+col+':')
    print(df[col].value_counts(),'\n')

No. of unique values in column Delivery No:
Timely       46
Premature    17
Latecomer    17
Name: Delivery No, dtype: int64 

No. of unique values in column Blood of Pressure:
Normal    40
High      20
Low       19
low        1
Name: Blood of Pressure, dtype: int64 

No. of unique values in column Heart Problem:
apt      50
inept    30
Name: Heart Problem, dtype: int64 

No. of unique values in column Caesarian:
Yes    42
No     34
yes     4
Name: Caesarian, dtype: int64 



- We can see that in column 'Blood of Pressure' there are 4 unique instead of 3
- In column Caesarian there should be 2 instead of 3

In [37]:
# Replacing 'low' to 'Low' and 'yes' to 'Yes'

df['Blood of Pressure'] = df['Blood of Pressure'].replace('low','Low')
df['Caesarian'] = df['Caesarian'].replace('yes','Yes')

In [38]:
# Encoding using One-Hot encoding

df_dummy = pd.get_dummies(df,drop_first=True)
df_dummy.head()

Unnamed: 0,Age,Delivey No,Delivery No_Premature,Delivery No_Timely,Blood of Pressure_Low,Blood of Pressure_Normal,Heart Problem_inept,Caesarian_Yes
0,22,1,0,1,0,0,0,0
1,26,2,0,1,0,1,0,1
2,26,2,1,0,0,1,0,0
3,28,1,0,1,0,0,0,0
4,22,2,0,1,0,1,0,1


#### Splitting into train and test:

In [39]:
#Separting df_train in independent and dependent variable
X=df_dummy.drop(['Caesarian_Yes'],axis=1)
y=df_dummy['Caesarian_Yes']

In [40]:
from collections import Counter
from imblearn.over_sampling import SMOTE
counter = Counter(y)
print('Before', counter)
smt = SMOTE()
X, y = smt.fit_resample(X, y)
counter = Counter(y)
print('After', counter)

Before Counter({1: 46, 0: 34})
After Counter({0: 46, 1: 46})


In [66]:
#Splitting df_train in train and test
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

#### Modelling:

In [67]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

logreg = LogisticRegression(solver='liblinear', fit_intercept=True)

logreg.fit(X_train, y_train)

y_prob_train = logreg.predict_proba(X_train)[:,1]
y_pred_train = logreg.predict (X_train)

print('Confusion Matrix - Train: ', '\n', confusion_matrix(y_train, y_pred_train))
print('Overall accuracy - Train: ', accuracy_score(y_train, y_pred_train))


y_prob = logreg.predict_proba(X_test)[:,1]
y_pred = logreg.predict (X_test)

print('Confusion Matrix - Test: ','\n', confusion_matrix(y_test, y_pred))
print('Overall accuracy - Test: ', accuracy_score(y_test, y_pred))

Confusion Matrix - Train:  
 [[16 13]
 [ 9 26]]
Overall accuracy - Train:  0.65625
Confusion Matrix - Test:  
 [[13  4]
 [ 0 11]]
Overall accuracy - Test:  0.8571428571428571


In [68]:
#Fitting whole dataset
log_r = logreg.fit(X, y)

#### Saving model in pickle:

In [69]:
#make a prediction with a stacking ensemble
from sklearn.datasets import make_classification
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
import xgboost as xgb
from sklearn import ensemble


# define the base models
level0 = list()
level0.append(('lr', LogisticRegression()))
#level0.append(('knn', KNeighborsClassifier()))
#level0.append(('cart', DecisionTreeClassifier()))
#level0.append(('CB',CatBoostClassifier(iterations=500)))

level0.append(('xgb',xgb.XGBClassifier()))
level0.append(('rf', ensemble.RandomForestClassifier()))
#level0.append(('svm', SVC()))


# define meta learner model
#level1 = LogisticRegression()
level1 = xgb.XGBClassifier()
# define the stacking ensemble
model = StackingClassifier(estimators=level0, final_estimator=level1, cv=10)

# fit the model on all available data
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
ac = accuracy_score(y_test, y_pred)
print('Testing',ac)
cm = confusion_matrix(y_test, y_pred)
cm

Testing 0.6071428571428571


array([[9, 8],
       [3, 8]])

In [70]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

y_pred = model.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[9 8]
 [3 8]]
              precision    recall  f1-score   support

           0       0.75      0.53      0.62        17
           1       0.50      0.73      0.59        11

    accuracy                           0.61        28
   macro avg       0.62      0.63      0.61        28
weighted avg       0.65      0.61      0.61        28

