(Random Forest, Decision Tree, SVM, Naive Bayes, AdaBoost, Regression, XGBoost) 

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
from sklearn.exceptions import DataConversionWarning


warnings.filterwarnings(action='ignore', category=DataConversionWarning)

In [2]:
dataset = pd.read_csv('final_features.csv')


In [3]:
# pd.set_option('display.max_columns', None)
dataset.shape

(1038, 66647)

In [4]:
# spliting data for Train and test
input_dataset = dataset.drop(['app_name', 'class'] , axis=1)  # Features
output_dataset = dataset[['class']]              # Target


X_train, X_test, y_train, y_test = train_test_split(input_dataset, output_dataset, test_size=0.25, random_state=42) # Test size 25%


In [5]:
# Now traning the models

models = {
    "Random Forest": RandomForestClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "SVM": SVC(probability=True),
    "Naive Bayes": GaussianNB(),
    "AdaBoost": AdaBoostClassifier(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "XGBoost": xgb.XGBClassifier(eval_metric='mlogloss')
}


In [6]:
results = []
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average='weighted')
    cm = confusion_matrix(y_test, y_pred)
    results.append({
        "Model": name,
        "Accuracy": acc,
        "Precision": prec,
        "Confusion Matrix": cm
    })



In [7]:
# Create a results table with Confusion Matrix as a string
results_df = pd.DataFrame(results)
results_df['Confusion Matrix'] = results_df['Confusion Matrix'].apply(lambda x: np.array2string(x))
results_df = results_df[['Model', 'Accuracy', 'Precision', 'Confusion Matrix']]
results_df

Unnamed: 0,Model,Accuracy,Precision,Confusion Matrix
0,Random Forest,0.961538,0.964209,[[116 10]\n [ 0 134]]
1,Decision Tree,0.984615,0.985061,[[122 4]\n [ 0 134]]
2,SVM,0.961538,0.962473,[[118 8]\n [ 2 132]]
3,Naive Bayes,0.961538,0.96163,[[120 6]\n [ 4 130]]
4,AdaBoost,1.0,1.0,[[126 0]\n [ 0 134]]
5,Logistic Regression,0.988462,0.988714,[[123 3]\n [ 0 134]]
6,XGBoost,0.988462,0.988488,[[124 2]\n [ 1 133]]
