In [1]:
# Data Processing
import pandas as pd
import numpy as np
 
# Modelling
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint
 
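# Graphviz must be installed before the tree-visualisation cells will run.
# Run these in a terminal:
#   pip install graphviz      (the Python `graphviz` package)
#   conda install graphviz    (the Graphviz system binaries used for rendering)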

In [2]:
# Tree Visualisation
from sklearn.tree import export_graphviz
from IPython.display import Image
import graphviz

In [3]:
# All three CSV extracts live in the same folder of the D2I tutorials repo,
# so the shared prefix is written once and the file name appended per table.
DATA_URL = 'https://raw.githubusercontent.com/data-to-insight/D2I-Jupyter-Notebook-Tools/refs/heads/main/ml-data%20science%20tutorials/data/'

cin = pd.read_csv(DATA_URL + 'CINdetails.csv')
characteristics = pd.read_csv(DATA_URL + 'ChildCharacteristics.csv')
identifiers = pd.read_csv(DATA_URL + 'ChildIdentifiers.csv')

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only adds a stray "None" line after each summary.
cin.info()
identifiers.info()
characteristics.info()

In [None]:
# Reduce the CIN table to one row per child: order by referral date, newest
# first, and keep the first row seen for each LAchildID. Every child that
# remains is flagged with CINplan = 1.
# NOTE(review): CINreferralDate is sorted as loaded (not parsed to datetime);
# this is chronological only if the dates are ISO-formatted - confirm.
cin = (
    cin
    .sort_values('CINreferralDate', ascending=False)
    .drop_duplicates('LAchildID', keep='first')
    .assign(CINplan=1)
)

cin

In [None]:
# Start from the characteristics table and left-join identifiers and the
# de-duplicated CIN table on LAchildID, so children with no CIN row are kept
# (their CINplan is NaN at this point).
df = characteristics.merge(identifiers, how='left', on='LAchildID')
df = df.merge(cin, how='left', on='LAchildID')

# Age in years as of today. The reference date is normalised to midnight so
# the value does not drift with the time of day the cell is run, and a year is
# taken as 365.25 days so leap years do not inflate ages.
reference_date = pd.Timestamp.today().normalize()
birth_dates = pd.to_datetime(df['PersonBirthDate'], format='%Y-%m-%d')
df['Age'] = (reference_date - birth_dates) / pd.Timedelta(days=365.25)

cols_to_keep = ['Age', 'Ethnicity', 'GenderCurrent', 'CINplan']

# .copy() makes the subset an independent frame; without it the later
# `df[...] = ...` cells assign into a slice and raise SettingWithCopyWarning.
df = df[cols_to_keep].copy()

df

In [None]:
# Rows with no CINplan value after the left join get the negative label 0.
df = df.fillna({'CINplan': 0})

df

In [None]:
# Lookup from the numeric GenderCurrent codes to text labels; codes 0 and 9
# share the 'other' label.
gender_dict = {
    1: 'male',
    2: 'female',
    **dict.fromkeys((0, 9), 'other'),
}

# Values without an entry in gender_dict come out of .map() as NaN.
df = df.assign(GenderCurrent=df['GenderCurrent'].map(gender_dict))

df

In [None]:
def ethnic_check(row):
    """Collapse an ethnicity code into a broad group based on its first letter.

    Parameters
    ----------
    row : object
        A single value from the ``Ethnicity`` column (one cell, not a
        DataFrame row, despite the parameter name). Normally a string code.

    Returns
    -------
    str
        'white' for codes starting with W, 'black' for B, 'asian' for A or C,
        'Mixed' for M, and 'other' for anything else. Missing values (NaN,
        None), other non-string values and empty strings also give 'other'.
    """
    # Missing cells arrive as float NaN / None and cannot be indexed, and an
    # empty string has no first character; treat both as 'other' instead of
    # raising in the middle of an .apply().
    if not isinstance(row, str) or not row:
        return 'other'

    # First-letter lookup; matching is case-sensitive, as it always was.
    groups = {
        'W': 'white',
        'B': 'black',
        'A': 'asian',
        'C': 'asian',
        'M': 'Mixed',
    }
    return groups.get(row[0], 'other')
 
 
# Replace each ethnicity code with its broad group, element by element.
# Note: this overwrites the codes, so the cell is not meant to be run twice
# on the same frame.
df = df.assign(Ethnicity=df['Ethnicity'].map(ethnic_check))

df

In [None]:
# One-hot encode the two categorical columns.
categorical_cols = ['GenderCurrent', 'Ethnicity']
df_encoded = pd.get_dummies(df[categorical_cols])

# Put the numeric feature and the label back alongside the dummy columns,
# matching rows on the shared index.
full_df = df[['Age', 'CINplan']].join(df_encoded, how='left')

full_df

In [None]:
# Split the table into the feature matrix and the label column.
X = full_df.drop('CINplan', axis=1)
y = full_df['CINplan']

# Hold out 20% of the rows for testing; the seed fixes which rows those are.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Baseline forest with default hyperparameters. It is seeded with the same
# value as the split: an unseeded forest bootstraps differently on every run,
# so the accuracy below and the trees plotted later would not be repeatable.
rf = RandomForestClassifier(random_state=1)

# Fit on the training rows only.
rf.fit(X_train, y_train)

# Predict the held-out rows.
y_pred = rf.predict(X_test)

# Share of held-out rows predicted correctly.
print(accuracy_score(y_test, y_pred))

In [None]:
# Draw the first tree of the baseline forest, truncated to three levels.
tree = rf.estimators_[0]

export_options = dict(feature_names=X_train.columns, filled=True, max_depth=3)
dot_data = export_graphviz(tree, **export_options)

graph = graphviz.Source(dot_data)
display(graph)

In [None]:
# Draw the first three trees of the baseline forest, each truncated to three
# levels, one after another.
for i, tree in enumerate(rf.estimators_[:3]):
    dot_data = export_graphviz(
        tree,
        feature_names=X_train.columns,
        filled=True,
        max_depth=3,
    )
    graph = graphviz.Source(dot_data)
    display(graph)

In [None]:
# Fresh forest to tune. Seeded so each candidate is fitted deterministically.
new_rf = RandomForestClassifier(random_state=1)

# Ranges to sample from: number of trees in [50, 500) and depth in [1, 20).
param_distributions = {
    'n_estimators': randint(50, 500),
    'max_depth': randint(1, 20),
}

# Try 10 random combinations with cross-validation. The search is seeded as
# well; otherwise a different set of candidates is drawn on every run and
# best_params_ is not reproducible.
rand_search = RandomizedSearchCV(new_rf,
                                 param_distributions=param_distributions,
                                 n_iter=10,
                                 random_state=1)

rand_search.fit(X_train, y_train)

print(rand_search.best_params_)

# Model refitted on the training data with the best combination found.
best_rf = rand_search.best_estimator_

In [None]:
# Score the tuned model on the held-out rows (this replaces the baseline
# model's predictions previously stored in y_pred).
y_pred = best_rf.predict(X_test)

# Counts of actual vs. predicted class.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

cm_display = ConfusionMatrixDisplay(confusion_matrix=cm)
cm_display.plot()

In [None]:
# Feature importances of the tuned model. The preceding cells select and
# evaluate best_rf, so the importances are taken from it rather than from the
# untuned baseline `rf`, keeping the chart consistent with the confusion matrix.
feature_importance_df = (
    pd.DataFrame({'feature importance': best_rf.feature_importances_,
                  'feature': X_train.columns})
    .set_index('feature')
    # Largest first so the most influential features are read off immediately.
    .sort_values('feature importance', ascending=False)
)

feature_importance_df.plot.bar(title='Feature importance (tuned random forest)',
                               ylabel='importance')