In [None]:
import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'fetal-health-classification:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F916586%2F1553068%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240816%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240816T034152Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D53e324b11b5b4d196006fab53130368f66c476b05ca6a3c100baf200662555e66718fc48369f1a6c1581387677eb4dfe3a06dc0d6d56a10053c9c7a7ec4ea774e95b375ec4e4b42602d6a043c16492a562f4e7fa0d0c97fc9c8122459ef8aa2ad6848609ca1399a7bef6db947d0668affb62661d8ebcda83b163c63c22ace7218da52f6c9a9650ec9e304a3063b42829ab69da1aca6d7644dfcb0370e85f1d338e7388e7ce0352d271dc27ea1d19e7556359f5481d434774196cbdd24839398dd036b14dbb6c59db625f5b6f7189de963ccb44d18a8e573efce09c2c6d07e70792bd8aabf97ca44d61b96e40e22ebf2304e300ea9479b7cedf23890788c31d17,fatalhealthanalysis/scikitlearn/default/1:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-models-data%2F79467%2F94771%2Fbundle%2Farchive.tar.gz%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240816%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240816T034152Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D1398e5e694d8ea09ed5c4bed4a6802d4392f72b4db32dd97fe0c82d70e3a4ed0891af4b1a90dc768dc6b15dcb90d18971ea142ebaeacb3db162aa75fc4eba1dcc8777bdf2801f10ac983337c30a04f5033fff00f66219fd3327adc171dde0b3d36d995fd5a161d87cf6ecfb3cf94d48d0162ee22bad03b2c8952b35bdca955eb14618afa572c94c724d6530fac3904e4b3147e81345177b29d09cefe11e54d037caa42fcca05a1c6cebe716d0fe8ca7cb33642ad6d05220e7515c949cb14ce0ecede8697d942747d0f472db01811ab2c6b45c77909351673cc0fe05d160bbe8f18be82cbb69a55659a136a7f1bb3ca36303c1708894bcdb3d519a2ee0dbc3ba7'

# Directories that the rest of the notebook reads from / writes to.
KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
# NOTE(review): KAGGLE_SYMLINK is not referenced anywhere else in the visible cells.
KAGGLE_SYMLINK='kaggle'

# IPython shell escape: unmount any existing /kaggle/input mount; stderr is discarded.
!umount /kaggle/input/ 2> /dev/null
# Start from an empty input directory (errors while removing are ignored).
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

# Create ../input and ../working symlinks pointing at the directories above;
# an already existing link is left as it is.
try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every data source listed in DATA_SOURCE_MAPPING and unpack it under
# KAGGLE_INPUT_PATH/<directory>.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Each entry has the form "<directory>:<percent-encoded download URL>".
    directory, download_url_encoded = data_source_mapping.split(':')
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            # The header may be absent; in that case only a byte counter is shown
            # instead of crashing on int(None).
            expected_bytes = int(total_length) if total_length else 0
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                if expected_bytes > 0:
                    done = int(50 * dl / expected_bytes)
                    sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                else:
                    sys.stdout.write(f"\r{dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Make sure every buffered byte is in the file and rewind it, so the
            # archive readers below see the complete download.
            tfile.flush()
            tfile.seek(0)
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # The archive object gets its own name: binding it to `tarfile`
                # would replace the module and break the next tar download.
                with tarfile.open(fileobj=tfile) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
import hashlib
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis

# accuracy
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedShuffleSplit,StratifiedKFold,GridSearchCV,cross_val_score
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

import os

In [None]:
# Print the full path of every file found below the input directory.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        file_path = os.path.join(folder, file_name)
        print(file_path)

In [None]:
# Load the fetal-health CSV from the dataset directory under /kaggle/input.
df = pd.read_csv('/kaggle/input/fetal-health-classification/fetal_health.csv')

In [None]:
df.head()

In [None]:
df[df.fetal_health==3]

In [None]:
df.describe().T

In [None]:
df.hist(bins=50,figsize=(25,15))
plt.show()

Split the data into a train set and a test set by hashing each row's id, so that every row lands in the same set no matter how many times this notebook is re-run.

In [None]:
def test_set_check(identifier, test_ratio, hash):
    return hash(np.int64(identifier)).digest()[-1]<256*test_ratio

In [None]:
def split_train_test_by_id(data,test_ratio,id_column,hash=hashlib.md5):
    """Split ``data`` into (train, test) frames using a hash of ``id_column``.

    Every value of ``id_column`` is passed to ``test_set_check`` together with
    ``test_ratio`` and ``hash``; rows for which it returns True form the test
    frame, all remaining rows form the train frame.
    """
    is_test_row = data[id_column].apply(
        lambda row_id: test_set_check(row_id, test_ratio, hash)
    )
    train_part = data.loc[~is_test_row]
    test_part = data.loc[is_test_row]
    return train_part, test_part

In [None]:
# reset_index() turns the current row index into an 'index' column, which the
# hash-based split below uses as the row id.
df_with_idx = df.reset_index()

In [None]:
df_with_idx.head()

In [None]:
# Hash-based split keyed on the 'index' column with test_ratio=0.2.
train_set,test_set = split_train_test_by_id(df_with_idx,0.2,'index')

In [None]:
train_set.shape,test_set.shape

In [None]:
train_set.head()

In [None]:
plt.figure(figsize=(10,4))
plt.hist(df['fetal_health'],bins=50,range=(0,10))
plt.show()

In [None]:
plt.figure(figsize=(10,4))
plt.hist(df['uterine_contractions'],bins=50)
plt.show()

In [None]:
# Build the 'uterine_cat' helper column that the stratified split below is keyed on.
contractions = df['uterine_contractions']
# Round up, then replace every row whose contraction value is not below 0.005 with 3.
df['uterine_cat'] = np.ceil(contractions/1).where(contractions<0.005,3)

In [None]:
df['uterine_cat'].value_counts()

In [None]:
df['uterine_cat'].value_counts()/len(df)

For even distribution I'm using StratifiedShuffleSplit

In [None]:
# One stratified shuffle with 20% of the rows held out; random_state fixes the shuffle.
split = StratifiedShuffleSplit(n_splits=1,test_size=0.2,random_state=42)

In [None]:
# The splitter is configured with n_splits=1, so it yields a single
# (train positions, test positions) pair.
train_idx, test_idx = next(split.split(df,df['uterine_cat']))
# split() returns positional indices, hence iloc rather than label-based loc.
# copy() gives two independent frames, so the in-place column drops in later
# cells never act on a view of df.
strait_train_set = df.iloc[train_idx].copy()
strait_test_set = df.iloc[test_idx].copy()

In [None]:
strait_train_set.head()

In [None]:
# Remove the stratification helper column from both splits.
# The loop variable is deliberately not called `set`, which would hide the
# builtin for every later cell. errors='ignore' keeps the cell re-runnable once
# the column has already been removed.
for split_frame in (strait_train_set,strait_test_set):
    split_frame.drop('uterine_cat',axis=1,inplace=True,errors='ignore')

In [None]:
def plot(x: str ,y: str = None, data: pd.DataFrame = df, Plot=plt.scatter,alpha=None) -> None:
    """Plot column ``x`` against column ``y`` of ``data`` and show the figure.

    Args:
        x: Name of the column drawn on (and used to label) the horizontal axis.
        y: Name of the column drawn on (and used to label) the vertical axis.
            Required, despite the ``None`` default in the signature.
        data: Frame the two columns are read from; defaults to the notebook's ``df``.
        Plot: Plotting callable, invoked as ``Plot(x_values, y_values, alpha=alpha)``.
        alpha: Opacity, forwarded to ``Plot`` by keyword.

    Raises:
        ValueError: If ``y`` is not given.
    """
    if y is None:
        raise ValueError("plot() needs a column name for y")
    # Read from `data` (not the global df) so the argument is honoured, and keep
    # x on the horizontal axis so the axis labels match what is drawn.
    plotx = data[x]
    ploty = data[y]
    # alpha must go by keyword: the third positional argument of plt.scatter is
    # the marker size, not the opacity.
    Plot(plotx,ploty,alpha=alpha)
    plt.xlabel(x)
    plt.ylabel(y)
    plt.show()

In [None]:
df.columns

In [None]:
# Names of the integer and float columns of df, as a plain list.
numeric_frame = df.select_dtypes(['int','float'])
num_cols = numeric_frame.columns.tolist()
num_cols

In [None]:
plot('baseline value','accelerations',alpha=0.6)

In [None]:
plt.bar(df['fetal_health'],df['prolongued_decelerations'])
plt.show()

In [None]:
# Measure correlations on the training split only: the held-out rows (and the
# 'uterine_cat' helper column that was added to df) must not influence which
# features are kept.
corr_matrix = strait_train_set.corr()

In [None]:
corr_matrix['fetal_health'].sort_values(ascending=False)

**Let's drop the columns that have a low correlation with the output**

In [None]:
# Map every column name to its correlation with 'fetal_health'.
corr_dict = dict(corr_matrix['fetal_health'])
corr_dict

In [None]:
# Columns whose absolute correlation with 'fetal_health' is below 0.05.
drop_cols = [
    column
    for column, correlation in corr_dict.items()
    if abs(correlation) < 0.05
]

In [None]:
drop_cols

In [None]:
# Drop the weakly correlated columns from both splits.
# The loop variable is deliberately not called `set`, which would hide the
# builtin for every later cell. errors='ignore' skips names that are not (or no
# longer) present, so the cell can be re-run without a KeyError.
for split_frame in (strait_train_set,strait_test_set):
    split_frame.drop(drop_cols,axis=1,inplace=True,errors='ignore')

In [None]:
strait_train_set.head(3)

In [None]:
plot('fetal_health','prolongued_decelerations')

In [None]:
plt.bar('abnormal_short_term_variability','fetal_health',data=df)
plt.show()

In [None]:
plt.bar('percentage_of_time_with_abnormal_long_term_variability','fetal_health',data=df)
plt.show()

In [None]:
df.isna().sum()

In [None]:
# Separate the feature columns from the 'fetal_health' label, split by split.
train_inputs, train_target = (
    strait_train_set.drop(columns='fetal_health'),
    strait_train_set['fetal_health'],
)
test_inputs, test_target = (
    strait_test_set.drop(columns='fetal_health'),
    strait_test_set['fetal_health'],
)

In [None]:
train_inputs.head()

In [None]:
# Preprocessing steps: mean imputation for missing values, then min-max scaling.
imputer = SimpleImputer(strategy = 'mean')
scaler = MinMaxScaler()

In [None]:
# Fit the imputer and the scaler on the training inputs only ...
prepared_train_inputs = scaler.fit_transform(imputer.fit_transform(train_inputs))
# ... and apply those already-fitted statistics to the test inputs. Calling
# fit_transform on the test data would leak it into preprocessing and put the two
# splits on different scales.
prepared_test_inputs = scaler.transform(imputer.transform(test_inputs))

### **Now let's relabel the targets as [0,2] instead of [1,3] for convenience**

In [None]:
# Shift the labels down by one so they start at 0.
# The values are taken from the split frames rather than from the current
# train_target/test_target, so re-running the cell cannot subtract 1 twice.
train_target = strait_train_set['fetal_health'] - 1
test_target = strait_test_set['fetal_health'] - 1

In [None]:
prepared_train_inputs[2]

### Model Selection (all machine learning / scikit-learn classifier)

In [None]:
# List of classifiers, keyed by the display name used in the printed reports and
# in the hyper-parameter grids.
# Every estimator that accepts a seed gets the same random_state as the data
# splits, so that scores are reproducible from one run to the next.
classifiers = {
    'Logistic Regression': LogisticRegression(max_iter=500),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Gradient Boosting': XGBClassifier(objective='multi:softmax', num_class=3,n_jobs=-1,random_state=42),
    'AdaBoost': AdaBoostClassifier(random_state=42),
    'Support Vector Machine': SVC(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Naive Bayes': GaussianNB(),
    'Random Forest': RandomForestClassifier(n_jobs=-1,random_state=42),
    'Neural Network': MLPClassifier(max_iter=500,random_state=42)
}

#### using cross validation for determining suitable model  

In [None]:
# 5-fold stratified cross-validation; shuffling is seeded so the folds are repeatable.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
# Hyper-parameter search spaces, keyed by the same names as `classifiers`.
# A value is either one grid (dict) or a list of grids; GridSearchCV accepts both.
param_grids =  {
    # NOTE(review): 'multi_class' is deprecated in recent scikit-learn releases -
    # confirm against the installed version.
    'Logistic Regression' : {
        'multi_class' : ['multinomial','ovr'],
        'C': [0.01, 0.1, 1, 10, 100],
        'solver': ['newton-cg'],
        'penalty': ['l2'],
        'max_iter': [100, 200, 300]
    },
    'Decision Tree': {
        'max_depth': [None, 10, 20, 30, 40],
        'min_samples_split': [2, 10, 20, 50],
        'criterion': ['gini', 'entropy'],
        'splitter': ['best', 'random'],
        'max_features': [None, 'sqrt', 'log2'],
        'min_samples_leaf': [1, 2, 4],
        'max_leaf_nodes': [None, 10, 20, 30]
    },
    'Gradient Boosting': {
        'learning_rate': [0.01, 0.05, 0.1, 0.2],
        'n_estimators': [100, 200, 300, 500],
        'max_depth': [3, 5, 10, 20],
        'subsample': [0.8, 0.9, 1.0],
    },
    # NOTE(review): 'SAMME.R' is deprecated in recent scikit-learn releases -
    # confirm against the installed version.
    'AdaBoost': {
        'n_estimators': [50, 100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.5, 1],
        'algorithm': ['SAMME', 'SAMME.R']
    },
    # One grid per kernel family, so that parameters a kernel ignores are not
    # multiplied into the search: 'degree' only affects 'poly', and 'gamma' is
    # unused by 'linear'. The set of distinct models searched is unchanged.
    'Support Vector Machine': [
        {
            'kernel': ['linear'],
            'C': [0.1, 1, 10, 100],
            'class_weight': [None, 'balanced']
        },
        {
            'kernel': ['poly'],
            'C': [0.1, 1, 10, 100],
            'degree': [3, 4, 5],
            'gamma': ['scale', 'auto'],
            'class_weight': [None, 'balanced']
        },
        {
            'kernel': ['rbf', 'sigmoid'],
            'C': [0.1, 1, 10, 100],
            'gamma': ['scale', 'auto'],
            'class_weight': [None, 'balanced']
        },
    ],
    # 'minkowski' with p=1 / p=2 is the same distance as 'manhattan' / 'euclidean',
    # and p is ignored by those two metrics, so listing them all only repeats
    # identical models.
    'K-Nearest Neighbors': {
        'n_neighbors': [3, 5, 7, 9],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan']
    },
    'Naive Bayes': {
        'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3]
    }
}


In [None]:
# Baseline: fit every classifier as configured and report its accuracy on the
# same data it was fitted on (no held-out data is involved in this cell).
for name, clf in classifiers.items():
    preds = clf.fit(prepared_train_inputs, train_target).predict(prepared_train_inputs)
    train_accuracy = accuracy_score(train_target, preds)
    print(f"Accuracy of {name}: {train_accuracy}")

#### let's tune the hyper parameters of Decision tree, random forest, Gradient boosting, Neural Networks, K Nearest Neighbors

In [None]:
# Grid-search every classifier that has a search space in param_grids and keep
# the best estimator found for each.
best_classifiers ={}
for name,clf in classifiers.items():
    # Models without a grid here (Random Forest, Neural Network) are skipped
    # instead of failing on a missing key.
    if name not in param_grids:
        continue
    grid = GridSearchCV(estimator=clf,param_grid=param_grids[name], cv=cv,scoring='accuracy',n_jobs=-1)
    grid.fit(prepared_train_inputs,train_target)
    best_classifiers[name]=grid.best_estimator_
    print(f"best parameters for {name}: {grid.best_estimator_}")
    print(f'best cross-validation for {name}: {grid.best_score_:.2f}')


#### Random Forest takes a lot of time, so let's compute it separately

In [None]:
best_classifiers['Decision Tree']

In [None]:
# Search spaces for the two models that are tuned outside the main grid-search loop.
late_run_model = {'Random Forest': {
        'n_estimators': [200, 300, 500],
        'max_depth': [20,30],
        'criterion': ['entropy'],
        'max_features': [None, 'sqrt', 'log2'],
        'min_samples_split': [10,20],
        'min_samples_leaf': [2, 4],
        'bootstrap': [True, False]
    },
       # NOTE(review): in the visible cells this grid is only referenced from
       # commented-out code.
       'Neural Network': {
        'hidden_layer_sizes': [(100,), (50, 50), (100, 50, 25)],
        'activation': ['logistic', 'tanh', 'relu'],
        'solver': ['adam'],
        'alpha': [0.01,0.1,1],  # Regularization term
        'learning_rate': ['constant', 'invscaling', 'adaptive'],
        'max_iter': [500]
    }
}

In [None]:
# Grid-search the random forest on its own and store the best estimator found.
grid = GridSearchCV(
    estimator=classifiers['Random Forest'],
    param_grid=late_run_model['Random Forest'],
    cv=cv,
    scoring='accuracy',
    n_jobs=-1,
).fit(prepared_train_inputs, train_target)
best_classifiers['Random Forest'] = grid.best_estimator_
for report_line in (
    f'Best Estimator for Random Forest: {grid.best_params_}',
    f'best cross validation accuracy for Random Forest: {grid.best_score_:.2f}',
):
    print(report_line)

In [None]:
# Per-fold accuracy of the forest stored in `classifiers` (not the tuned one
# kept in best_classifiers).
scores = cross_val_score(
    classifiers['Random Forest'],
    prepared_train_inputs,
    train_target,
    cv=cv,
    scoring='accuracy',
)
print(f"Accuracy for each fold: {scores}")
mean_accuracy = np.mean(scores)
print(f"Mean Accuracy: {mean_accuracy}")


#### Let's build a multi-layer perceptron

In [None]:
# grid = GridSearchCV(estimator=classifiers['Neural Network'],param_grid=late_run_model['Neural Network'], cv=cv,scoring='accuracy',n_jobs=-1)
# grid.fit(prepared_train_inputs, train_target)
# best_classifiers['Neural Network'] = grid.best_estimator_
# print(f'Best Estimator for Neural Network: {grid.best_params_}')
# print(f'best cross validation accuracy for Neural Network: {grid.best_score_:.2f}')

In [None]:
# Cross-validate a hand-picked multi-layer perceptron.
# random_state pins weight initialisation and batch shuffling, so the fold
# scores are the same on every run.
mlp = MLPClassifier(
    activation='relu',
    alpha=0.01,
    hidden_layer_sizes=(100,50,25,10),
    learning_rate='adaptive',
    max_iter=500,
    solver='adam',
    random_state=42,
)
scores = cross_val_score(mlp,prepared_train_inputs,train_target,cv=cv, scoring='accuracy')
print(f"Accuracy for each fold: {scores}")
print(f"Mean Accuracy: {np.mean(scores)}")

In [None]:
# Model used for the final evaluation: the tuned estimator stored under
# 'Gradient Boosting' (chosen by name here, not computed from the scores).
best_model = best_classifiers['Gradient Boosting']

In [None]:
# Score the selected model on the prepared test inputs.
test_preds = best_model.predict(prepared_test_inputs)
print(f"accuracy of best model in test set : {accuracy_score(test_target,test_preds)}")

In [None]:
def new_input(data: dict):
    """Predict the class of one new record with ``best_model``.

    Args:
        data: Mapping of feature name -> value for a single record. Keys that
            are not columns of ``train_inputs`` are ignored; columns of
            ``train_inputs`` missing from the mapping are left as NaN and
            filled in by the fitted imputer.

    Returns:
        The result of ``best_model.predict`` for the record. The targets in
        this notebook were shifted down by one before training, so add 1 to get
        back the original ``fetal_health`` label.
    """
    # Use exactly the feature columns, in the order, that the model was trained on.
    # A correlation filter cannot be recomputed here: a single row has no
    # correlations and carries no 'fetal_health' column.
    features = pd.DataFrame([data]).reindex(columns=train_inputs.columns)
    # transform(), not fit_transform(): re-fitting on one row would overwrite the
    # fitted imputer/scaler state and make every scaled feature meaningless.
    df_prepared = imputer.transform(features)
    df_prepared = scaler.transform(df_prepared)
    return best_model.predict(df_prepared)

### now we go!
#### **Note: First I tested all the models, then removed the redundant ones that are not required or need too much time to evaluate**