In [1]:
#Based on tutorial: https://machinelearningmastery.com/random-forest-ensemble-in-python/
#Run this code before you can classify

# Use numpy to convert to arrays
import numpy as np
from numpy import mean, std

# Pandas is used for data manipulation
import pandas as pd

# Using scikit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split

# Import the model we are using
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold

def buildModel(features, labelDimension, test_size=0.30, n_estimators=1500, random_state=None):
    """Train a random forest classifier and print its cross-validated accuracy.

    Parameters
    ----------
    features : pandas.DataFrame
        Data set holding both the feature columns and the label column.
    labelDimension : str
        Name of the column in ``features`` that stores the class labels.
    test_size : float, optional
        Fraction of the rows held out by ``train_test_split`` (default 0.30).
    n_estimators : int, optional
        Number of decision trees in the forest (default 1500).
    random_state : int or None, optional
        Seed forwarded to the train/test split, the forest and the
        cross-validation splitter. The default ``None`` leaves all three
        unseeded, so repeated runs give slightly different results; pass an
        integer to make a run reproducible.

    Returns
    -------
    RandomForestClassifier
        The forest fitted on the training split.

    Raises
    ------
    KeyError
        If ``labelDimension`` is not a column of ``features``.
    """
    if labelDimension not in features.columns:
        raise KeyError(f"label column {labelDimension!r} not found in the data set")

    # Labels are the values we want to predict
    labels = np.array(features[labelDimension])

    # Every remaining column is a feature (axis 1 refers to the columns).
    # The caller's DataFrame is not modified; drop() returns a new frame.
    feature_matrix = np.array(features.drop(labelDimension, axis=1))

    # Split the data into training and testing sets.
    # NOTE: the held-out split is only used for the shape report below; the
    # accuracy printed at the end comes from cross-validation on the full data.
    train_features, test_features, train_labels, test_labels = train_test_split(
        feature_matrix, labels, test_size=test_size, random_state=random_state
    )

    print('Training Features Shape:', train_features.shape)
    print('Training Labels Shape:', train_labels.shape)
    print('Testing Features Shape:', test_features.shape)
    print('Testing Labels Shape:', test_labels.shape)

    # Instantiate the forest and train it on the training split
    rf = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    rf.fit(train_features, train_labels)
    print("done!")

    # Evaluate with stratified 10-fold cross-validation on the full data set.
    # cross_val_score fits clones of the estimator, so the fitted `rf` that is
    # returned below is left untouched by the evaluation.
    print("evaluating:")
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=1, random_state=random_state)
    n_scores = cross_val_score(
        rf, feature_matrix, labels,
        scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise'
    )

    # Report per-fold scores followed by their mean and standard deviation
    print(n_scores)
    print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

    return rf

In [3]:
# Location of the HELOC data set on disk
dataset_path = 'F:\\study\\visual analytics\\4. Explaining HELOC\\data\\heloc_dataset_v1.csv'

# Name of the column that stores the labels
labelDimension = "RiskPerformance"

# Read the full table (feature columns plus the label column)
features = pd.read_csv(dataset_path)

# Train the random forest classifier on the loaded table
model = buildModel(features=features, labelDimension=labelDimension)

Training Features Shape: (7321, 23)
Training Labels Shape: (7321,)
Testing Features Shape: (3138, 23)
Testing Labels Shape: (3138,)
done!
evaluating:
[0.73900574 0.71510516 0.71797323 0.70076482 0.72179732 0.73231358
 0.74665392 0.70650096 0.71988528 0.71961722]
Accuracy: 0.722 (0.013)


In [4]:
# Get the first data row of the dataset (by position, independent of the index labels)
row = features.iloc[0]

# Remove the label by name rather than by position, mirroring how buildModel
# drops it, so the remaining values line up with the columns the model was trained on
instance = row.drop(labelDimension)

# Use the forest's predict method on this single instance.
# predict expects a 2-D array, hence the reshape to one row.
# NOTE: this row comes from the full dataset and may have been part of the training split.
prediction = model.predict(instance.to_numpy().reshape(1, -1))

# Print the predicted label
print(prediction)

['Bad']
