In [12]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [13]:
#using synthetic minority oversampling technique to perform oversampling on data to address class imbalance 
#this is done to increase the representation of the minority class by generating synthetic samples

def dataprocessing(file):
    data = pd.read_csv(file)
    targets = data['Threat']
    attributes = data.drop('Threat', axis=1)

    smote = SMOTE()
    attributes, targets = smote.fit_resample(attributes, targets)

    train_attributes, test_attributes, train_targets, test_targets \
        = train_test_split(attributes, targets)

    return train_attributes, test_attributes, train_targets, test_targets
    

In [14]:
#to instintiate a decision tree classifier from scikit-learn
#Trains the classifier using the training data (features and labels)
def modeltraining(train_attributes, train_targets):
    classify = DecisionTreeClassifier()
    classify.fit(train_attributes, train_targets)
    return classify

In [15]:
#Using the trained classifier to predict labels for the test data.
#generating a classification report using the classification_report function from scikit-learn
#comparing the predicted labels against the actual labels
def modeltesting(test_attributes, test_targets, classify):
    predictions = classify.predict(test_attributes)
    report = classification_report(test_targets, predictions)
    return report

In [16]:
#it trains a decision tree classifier using the preprocessed training data
if __name__ == "__main__":
    train_x, test_x, train_y, test_y = dataprocessing("/Users/rohanshenoy/Downloads/threats.csv")
    #generating classification report
    classification_model = modeltraining(train_x, train_y)
    #Tests the trained classifier using the preprocessed test data
    rep = modeltesting(test_x, test_y, classification_model)
    #prints the classification report
    print(rep)

              precision    recall  f1-score   support

           0       1.00      0.99      0.99    249544
           1       0.99      1.00      0.99    249502

    accuracy                           0.99    499046
   macro avg       0.99      0.99      0.99    499046
weighted avg       0.99      0.99      0.99    499046

