> Advanced model

In [None]:
# Import statements

import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, f1_score
%matplotlib inline

### Data Loading

In [None]:
# Load the dataset into a DataFrame and preview its first rows.
# NOTE(review): read_csv is called without a path here; its first argument
# (file path or buffer) is required, so this cell raises a TypeError as
# written — TODO supply the dataset location.
data = pd.read_csv()
data.head()

### Feature Engineering

In [None]:
# Separate the predictors (x) from the target (y).
# The target is removed by name rather than by position, so the label cannot
# end up among the features if the column order of the CSV ever changes.

target_column = 'PCOS_Diagnosis'
feature_columns = data.columns.drop(target_column)

x_data = data[feature_columns]
y_data = data[target_column]

In [None]:
# Preview the feature matrix to check which columns were kept as model inputs
x_data.head()

### Label Encoding

Note: Feature scaling and one-hot encoding are generally not required for tree-based algorithms.

- For categorical columns, label encoding (assigning integers to categories) is typically sufficient for tree-based models like Decision Trees and Random Forests. One-hot encoding can lead to unnecessary splits and higher dimensionality, which may reduce model efficiency.
- Feature scaling (e.g., normalization or standardization) is not needed because tree-based models are not sensitive to feature magnitude.

### Dataset Splitting

In [None]:
# One stratified shuffle split holding out 30% of the rows for testing;
# the fixed random_state keeps the split reproducible across runs.
strat_shuf_split = StratifiedShuffleSplit(n_splits = 1,
                                          test_size = 0.3,
                                          random_state = 42)

# split() yields arrays of row *positions* (0-based), not index labels
train_idx, test_idx = next(strat_shuf_split.split(x_data, y_data))

# Create train and test dataframes
# .iloc is used because the indices are positional; label-based access
# (.loc / plain []) only gives the right rows while the frame still has a
# default 0..n-1 index, and breaks after any filtering or re-indexing.

X_train = x_data.iloc[train_idx]
y_train = y_data.iloc[train_idx]

X_test = x_data.iloc[test_idx]
y_test = y_data.iloc[test_idx]

### Model training

In [None]:
# suppress warnings about too few trees form the early models

import warnings
warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', category=RuntimeWarning)

In [None]:
# Sweep the forest size and record the out-of-bag (OOB) error at each size.
#
# The number of trees is intentionally not set at construction time; it is
# raised step by step inside the loop. With warm_start=True each fit() call
# reuses the trees from the previous call and only adds the missing ones
# instead of training the whole ensemble from scratch.

RF = RandomForestClassifier(oob_score = True,
                            random_state = 42,
                            warm_start = True,
                            n_jobs = -1)


oob_list = list()  # one {'n_trees', 'oob'} record per forest size

# Iterate through all of the candidate tree counts (kept in increasing order,
# since each step grows the already-fitted ensemble)

for n_trees in [15, 20, 30, 40, 50, 100, 150, 200, 300, 400]:

    # Grow the ensemble to n_trees estimators and refit
    RF.set_params(n_estimators = n_trees)

    RF.fit(X_train, y_train)

    # OOB error is the complement of the OOB score
    oob_error = 1 - RF.oob_score_

    oob_list.append({'n_trees': n_trees, 'oob': oob_error})

# Build the frame once from plain records so that n_trees stays an integer
# index; packing each row into a pd.Series upcast the tree counts to float.
rf_oob_df = pd.DataFrame(oob_list).set_index('n_trees')
rf_oob_df

In [None]:
# Plot the out-of-bag error against the number of trees.
# Seaborn theme: larger "talk" font scaling on a plain white background.
sns.set_context('talk')
sns.set_style('white')

ax = rf_oob_df.plot.line(
    legend = False,
    marker = 'o',
    figsize = (14, 7),
    linewidth = 5,
)
ax.set(ylabel = 'out-of-bag error')

### Hyper-parameter Tuning & Cross-validation