# Import

In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
#Ensemble classifications models
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
#Model evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score,confusion_matrix, classification_report, roc_auc_score,
f1_score, recall_score, precision_score)
import pickle
from sklearn.preprocessing import OneHotEncoder
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
import numpy as np
from collections import Counter
from sklearn.ensemble import AdaBoostClassifier
import matplotlib.pyplot as plt

In [2]:
Modelname = 'RandomForestClassifier_model.sav'

# Functions

In [3]:
def baseline_models(data=[], verbose=False):
    #List of models to be used
    models=[DecisionTreeClassifier(),LogisticRegression(max_iter=100),
            RandomForestClassifier(n_estimators=100,criterion='entropy',random_state=101),
            GradientBoostingClassifier(learning_rate=0.1,random_state=1,n_estimators=100,min_samples_split=5,min_samples_leaf=1,subsample=0.8,max_depth=3),
            AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),n_estimators=200,random_state=1,learning_rate=1)
]
    #Create training and testing data sets depending on wheather or not they have been generated previously.
    if len(data)== 2:
        X_train, X_test, y_train, y_test = train_test_split(data[0],data[1],random_state=123)
    else:
        X_train, X_test, y_train, y_test = data[0],data[1],data[2],data[3]
    #Instantiate lists to store each of the models results
    accuracy = []
    f1 = []
    auc = []
    recall = []
    precision = []
    #Run thorugh each of the models to get their performance metrics
    for model in models:
        clf = model
        clf.fit(X_train, y_train)
        test_preds = clf.predict(X_test)
        f1.append(f1_score(y_test, test_preds,average='macro'))
        accuracy.append(accuracy_score(y_test, test_preds))
        # auc.append(roc_auc_score(y_test, test_preds,multi_class="ovr" ))
        recall.append(recall_score(y_test, test_preds,average='macro'))
        precision.append(precision_score(y_test, test_preds,average='macro'))
        #Print the model and its report
        if verbose:
            print('Classification Model: ',model,'\n')
            print(classification_report(y_test, test_preds),'\n')
    #store results in dataframe
    results = pd.DataFrame([f1, accuracy, precision,recall],
                      index= ['f1','accuracy','precision','recall',],
                           columns=['DecisionTree','LogisticRegression','RandomForest','Gradient Boosting','AdaBoostClassifier'])
    #Change orientation of the dataframe
    return results.transpose()

# ML

In [4]:
path='./data/aneurysm_data_apriori.csv'

In [5]:
aneurysm_DataSet=pd.read_csv(path)

In [6]:
aneurysm_DataSet = aneurysm_DataSet.drop(columns=['Record ID', 'Status of aneurysm_Un-Ruptured', 'Status of aneurysm_Unknown'], axis=1)

#Moving Target to the final col
column_to_move = aneurysm_DataSet.pop("Status of aneurysm_Ruptured")
aneurysm_DataSet['Status of aneurysm_Ruptured'] = column_to_move
aneurysm_DataSet['Status of aneurysm_Ruptured'] = aneurysm_DataSet['Status of aneurysm_Ruptured'] .map({1:'yes', 0:'no'})

target = aneurysm_DataSet['Status of aneurysm_Ruptured']

print(f"Original class counts: {Counter(target)}")

Original class counts: Counter({'no': 1541, 'yes': 393})


In [7]:
target

0       no
1       no
2       no
3       no
4       no
        ..
1929    no
1930    no
1931    no
1932    no
1933    no
Name: Status of aneurysm_Ruptured, Length: 1934, dtype: object

In [8]:
# load the dataset
dataset =  aneurysm_DataSet
data = dataset.values
# separate into input and output columns
X = data[:, :-1].astype(str)
y = data[:, -1].astype(str)

# split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1)
print('X_train',X_train.shape,'X_test',X_test.shape)

print(f"Original class counts y_train: {Counter(y_train)}")
print(f"Original class counts y_test: {Counter(y_test)}")



X_train (1353, 52) X_test (581, 52)
Original class counts y_train: Counter({'no': 1072, 'yes': 281})
Original class counts y_test: Counter({'no': 469, 'yes': 112})


In [9]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

oversample = RandomOverSampler(sampling_strategy='minority')
X_over, y_over = oversample.fit_resample(X, y)


undersample = RandomUnderSampler(sampling_strategy='majority')
X_under, y_under = undersample.fit_resample(X, y)


In [10]:
y = np.array(y)
yes_indices = np.where(y == 'yes')[0]
no_indices = np.where(y == 'no')[0]

minority_class_count = min(len(yes_indices), len(no_indices))
undersampled_no_indices = np.random.choice(no_indices, minority_class_count, replace=False)
undersampled_indices = np.concatenate([yes_indices, undersampled_no_indices])

# Undersampled y
x_undersampled = X[undersampled_indices]
y_undersampled = y[undersampled_indices]


unique_y, counts_y = np.unique(y, return_counts=True)
print("Unique values in y_resampled:", unique_y)
print("Counts of unique values in y_resampled:", counts_y)

Unique values in y_resampled: ['no' 'yes']
Counts of unique values in y_resampled: [1541  393]


In [12]:

# one-hot encode input variables
onehot_encoder = OneHotEncoder(handle_unknown='ignore')
onehot_encoder.fit(X_train)
X_train = onehot_encoder.transform(X_train)
X_test = onehot_encoder.transform(X_test)
# ordinal encode target variable
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
y_train = label_encoder.transform(y_train)
y_test = label_encoder.transform(y_test)


In [13]:
results = baseline_models(data=[X_train, X_test, y_train, y_test])
results.sort_values('f1',ascending=False)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0,f1,accuracy,precision,recall
RandomForest,0.696764,0.824441,0.714802,0.683969
AdaBoostClassifier,0.694965,0.829604,0.7249,0.676972
LogisticRegression,0.690898,0.833046,0.733794,0.66891
Gradient Boosting,0.680013,0.822719,0.710778,0.662513
DecisionTree,0.616577,0.757315,0.614758,0.618603


# RandomForestClassifier

In [14]:
df_train = aneurysm_DataSet.drop("Status of aneurysm_Ruptured", axis=1)
train_labels = aneurysm_DataSet["Status of aneurysm_Ruptured"]

In [15]:
encoders = []
features = []
for i in range(df_train.shape[1]):
    encoder_i = OneHotEncoder(handle_unknown='ignore')
    i_column = df_train.columns[i]
    encoder_i.fit(df_train[i_column].values.reshape(-1, 1))
    encoded_feature = encoder_i.transform(df_train[i_column].values.reshape(-1, 1))
    encoded_feature = encoded_feature.toarray()
    features.append(encoded_feature)
    encoders.append(encoder_i)

def get_oh(df):
    features_test = []
    for i in range(88):
        encoder_i = encoders[i]
        i_column = aneurysm_DataSet.columns[i]
        encoded_test = encoder_i.transform(df[i_column].values.reshape(-1, 1))
        encoded_test = encoded_test.toarray()
        features_test.append(encoded_test)
    return  np.concatenate(features_test, axis=1)

train_features = np.concatenate(features, axis=1)


In [16]:
train_features.shape, train_labels.values.shape

((1934, 104), (1934,))

In [17]:
# separate into input and output columns
X = train_features
y = train_labels
# split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1)
# one-hot encode input variables
# ordinal encode target variable
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
y_train = label_encoder.transform(y_train)
y_test = label_encoder.transform(y_test)

In [18]:
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
# save the model
pickle.dump(clf, open(path+Modelname, 'wb'))