In [193]:
# Import Dependencies
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [194]:
# Create Random Data and place into Df
# Create Random Data
col_1 = np.random.randint(0, 10001, (501,1), dtype='int64')
col_2 = np.random.rand(501,1)*1000
col_3= np.random.rand(501,1)
col_4= np.random.rand(501,1)*50000
col_5 = np.random.randint(0, 86, (501,1), dtype='int64')
col_6 = np.random.randint(0, 2, (501,1), dtype='int64')
data = [col_1, col_2, col_3, col_4, col_5, col_6]
columns = ['Column 1', 'Column 2', 'Column 3', 'Column 4', 'Column 5', 'Column 6']
# Create Random Data df
mockdata_df=pd.DataFrame(index=list(range(0,501)))
for i in list(range(0,6)):
    mockdata_df[columns[i]]=data[i]
mockdata_df.head()


Unnamed: 0,Column 1,Column 2,Column 3,Column 4,Column 5,Column 6
0,3681,527.08912,0.019522,6426.184327,28,1
1,6236,194.673393,0.857566,31404.39935,5,0
2,6803,836.749017,0.654375,38310.547838,4,0
3,7118,214.393582,0.48506,4868.538434,20,0
4,4734,382.683636,0.619489,13044.214587,6,1


In [195]:
# define features and target
X=mockdata_df.copy()
X=X.drop('Column 6', axis=1,)
y=mockdata_df['Column 6']


In [196]:
# Create Training and Testing Data sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=5)

In [197]:
# Scale the Data
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [198]:
# Create Random Forest Instance
rf_model=RandomForestClassifier(n_estimators=500, random_state=5)

In [199]:
# Fit the Model with training sets
rf_model.fit(X_train_scaled, y_train)

RandomForestClassifier(n_estimators=500, random_state=5)

In [200]:
# Make Predictions
predictions = rf_model.predict(X_test_scaled)

In [201]:
# Assess the model 
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(cm, index=['Actual 0', 'Actual 1'], columns=['Predicted 0', 'Predicted 1'])
acc_score = accuracy_score(y_test, predictions)
# Display the Results
print('Confusion Matrix')
display(cm_df)
print(f'Accuracy Score: {acc_score}')
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,36,24
Actual 1,36,30


Accuracy Score: 0.5238095238095238
              precision    recall  f1-score   support

           0       0.50      0.60      0.55        60
           1       0.56      0.45      0.50        66

    accuracy                           0.52       126
   macro avg       0.53      0.53      0.52       126
weighted avg       0.53      0.52      0.52       126



In [202]:
# Determine Feature Importances and sort in descending order
sorted(zip(rf_model.feature_importances_,X.columns), reverse=True)

[(0.21231240485339692, 'Column 4'),
 (0.20836989110294968, 'Column 3'),
 (0.20297616726084747, 'Column 1'),
 (0.19677175640633182, 'Column 2'),
 (0.1795697803764742, 'Column 5')]

In [203]:
# Create new data for the model to make predictions with 
# Create Random Data and place into Df
# Create Random Data
col_1 = np.random.randint(0, 10001, (501,1), dtype='int64')
col_2 = np.random.rand(501,1)*1000
col_3= np.random.rand(501,1)
col_4= np.random.rand(501,1)*50000
col_5 = np.random.randint(0, 86, (501,1), dtype='int64')
data = [col_1, col_2, col_3, col_4, col_5]
columns = ['Column 1', 'Column 2', 'Column 3', 'Column 4', 'Column 5']
# Create Random Data df
newdata_df=pd.DataFrame(index=list(range(0,501)))
for i in list(range(0,5)):
    newdata_df[columns[i]]=data[i]
newdata_df.head()

Unnamed: 0,Column 1,Column 2,Column 3,Column 4,Column 5
0,7538,166.266051,0.230211,33371.82571,43
1,2007,323.146614,0.772611,10529.291222,48
2,2364,348.914007,0.225228,39763.582434,21
3,4525,255.428136,0.483272,3689.768227,8
4,9488,850.711051,0.913506,8742.182866,40


In [204]:
# Use the model to make predictions about the new data
new_predictions = rf_model.predict(newdata_df)
# add the new predictions to the new dataframe
newdata_df['Predictions']=new_predictions
newdata_df.head()

Unnamed: 0,Column 1,Column 2,Column 3,Column 4,Column 5,Predictions
0,7538,166.266051,0.230211,33371.82571,43,0
1,2007,323.146614,0.772611,10529.291222,48,0
2,2364,348.914007,0.225228,39763.582434,21,0
3,4525,255.428136,0.483272,3689.768227,8,0
4,9488,850.711051,0.913506,8742.182866,40,0


In [205]:
# Create a Function to change encoded labels to informative labels
def code_to_string(code):
    if code == 1:
        code = 'Treatment'
    else:
        code = 'No Treatment'
    return code

In [206]:
# apply function to predictions column
newdata_df['Predictions'] = newdata_df['Predictions'].apply(code_to_string)
newdata_df.head()

Unnamed: 0,Column 1,Column 2,Column 3,Column 4,Column 5,Predictions
0,7538,166.266051,0.230211,33371.82571,43,No Treatment
1,2007,323.146614,0.772611,10529.291222,48,No Treatment
2,2364,348.914007,0.225228,39763.582434,21,No Treatment
3,4525,255.428136,0.483272,3689.768227,8,No Treatment
4,9488,850.711051,0.913506,8742.182866,40,No Treatment


In [207]:
# Count the Predictions
newdata_df['Predictions'].value_counts()


No Treatment    498
Treatment         3
Name: Predictions, dtype: int64