In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

### Generating dataset for our MODEL

In [13]:
# Define random seed (reproducibility)
np.random.seed(3025)

# Define data size
sample_size = 600

# Set column elements, a and b
a = np.random.normal(loc=8, scale=2, size=sample_size)
b = np.random.normal(loc=2, scale=1, size=sample_size)

# Create a ondition for our target variable
output = (a + b ) > 11

# create dataframe with the elements
df = pd.DataFrame({'bonus_july': a, 'bonus_august': b, 'result': output})
df.head(2)

Unnamed: 0,bonus_july,bonus_august,result
0,6.531691,1.427976,False
1,7.528378,1.821702,False


In [7]:
df.info

<bound method DataFrame.info of      bonus_june  bonus_August  result
0      6.531691      1.427976   False
1      7.528378      1.821702   False
2     14.524738      3.044921    True
3      8.459988      2.399132   False
4      7.452589      2.529509   False
..          ...           ...     ...
595   11.108997      1.993962    True
596   10.927098      1.116994    True
597   14.189641      2.244087    True
598    7.778402      3.035820   False
599    8.969353      3.500759    True

[600 rows x 3 columns]>

### RANDOM FOREST: now SPLIT data into 2, test and train. CREATE Classifier Object with HyperParameters

In [23]:
# Split data
X = df[['bonus_july', 'bonus_august']]
y = df['result']

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=3025)

In [26]:
# Classifier
r_foresifier = RandomForestClassifier(
    # Number of decision trees
    n_estimators = 60,
    
    # Maximum depth of the trees 
    max_depth = None, 
    
    # Minimum number of samples required to split an internal node
    min_samples_split = 2, 
    
    # Minimum number of samples required to be at a leaf node
    min_samples_leaf = 1,
    
    # Random seed for reproducibility
    random_state = 3025        
)

In [27]:
# Train Classifier with the training data
r_foresifier.fit(x_train, y_train)

In [28]:
# Prediction based on the test data
rf_predict = r_foresifier.predict(X_test)

In [31]:
# Evaluate model's accuracy on other metrics
m_accuracy = accuracy_score(y_test, rf_predict)

# Classification Report
c_report = classification_report(y_test, rf_predict)

In [45]:
# Print result
print("Random Forest Model Accuracy: ",)
print(m_accuracy)
print("Classificiation Report: ",) 
print(c_report)

Random Forest Model Accuracy: 
0.9833333333333333
Classificiation Report: 
              precision    recall  f1-score   support

       False       0.98      1.00      0.99        81
        True       1.00      0.95      0.97        39

    accuracy                           0.98       120
   macro avg       0.99      0.97      0.98       120
weighted avg       0.98      0.98      0.98       120



##### Splitted the dataset into training and testing sets, created a Random Forest Classifier with 60 decision trees and trained it on the training data. The model achieved an accuracy of 0.983 on the test data.
##### The classification reports a comprehensive evaluation of the model's performance such as precision, recall, and F1-score for each class (False and True), including their average.