# Drug Use Prediction

## Introduction

In this project, we explore the classification of drug consumption patterns using machine learning algorithms, specifically focusing on decision trees and random forests. The dataset utilized in this analysis originates from the UCI Machine Learning Repository and contains various features related to drug consumption behaviors.

The primary goal of this project is to predict whether individuals consume specific drugs (e.g., chocolate and mushrooms) based on a set of demographic and behavioral features. We will first apply a decision tree classifier to visualize the decision-making process and evaluate its accuracy. Subsequently, we will enhance our model using a random forest classifier, which aggregates multiple decision trees to improve classification performance. For comparison, we also train a support vector machine, a gradient boosting ensemble, a multi-layer perceptron, and a k-nearest neighbors classifier, and compare all models using confusion matrices and ROC curves.

This analysis aims to demonstrate the effectiveness of these machine learning techniques in handling classification tasks and to provide insights into the underlying patterns of drug consumption behaviors.

To start the virtual environment, run `source sklearn-env/bin/activate`.

## Libraries Installed

In [None]:
import ssl
import pandas as pd
from ucimlrepo import fetch_ucirepo 
import sys
print(sys.executable)
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, roc_curve, roc_auc_score
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier

## Data Preprocessing

In [116]:
# NOTE(review): this swaps the default HTTPS context factory for the
# unverified one, so certificate checks are skipped for every later HTTPS
# request that goes through the default context in this process, not only
# for the dataset download below. Confirm this is acceptable outside of a
# local experiment.
ssl._create_default_https_context = ssl._create_unverified_context

# Download dataset id 373 through ucimlrepo.
drug_consumption_quantified = fetch_ucirepo(id=373)

# Feature and target tables exposed by the fetched dataset object.
X = drug_consumption_quantified.data.features
y = drug_consumption_quantified.data.targets

# Put features and targets side by side in one table.
df = pd.concat((X, y), axis="columns")

# Model inputs: the first 12 columns of the combined table.
user_data = df.iloc[:, :12]

# Raw multi-level consumption labels for the two substances modelled below.
shrooms = df["mushrooms"]
choco = df["choc"]


In [None]:
# Collapse the multi-level consumption labels into a binary target:
# 'CL0' and 'CL1' become class 0, every other label becomes class 1.
_NEGATIVE_LEVELS = ('CL0', 'CL1')


def _to_binary_target(labels):
    """Map consumption labels to a 0/1 integer array.

    Args:
        labels: iterable of label values (e.g. a pandas Series of strings).

    Returns:
        numpy array of the same length holding 0 where the label is
        'CL0' or 'CL1' and 1 for anything else.
    """
    # One vectorised membership test replaces the two hand-written loops.
    is_negative = np.isin(np.asarray(labels), _NEGATIVE_LEVELS)
    return np.where(is_negative, 0, 1)


choc_binary = _to_binary_target(choco)
mushrooms_binary = _to_binary_target(shrooms)
# Debug aid: print(user_data.shape, choc_binary.shape) to check that the
# feature table and the target array have the same number of rows.

# Hold out 33% of the rows for testing. Both targets are split from the same
# feature table with the same seed.
X_choco_train, X_choco_test, y_choco_train, y_choco_test = train_test_split(
    user_data, choc_binary, test_size=0.33, random_state=42
)
X_mushroom_train, X_mushroom_test, y_mushroom_train, y_mushroom_test = train_test_split(
    user_data, mushrooms_binary, test_size=0.33, random_state=42
)
# Debug aid: print the .shape of each of the eight split outputs above to
# verify the train/test sizes.

## Model Training and Fitting for Chocolate



In [None]:
def _fit_and_report(title, classifier, X_train, y_train, X_test, y_test):
    """Fit a classifier, predict the test split and show its confusion matrix.

    Args:
        title: text placed above the confusion-matrix figure.
        classifier: unfitted scikit-learn style estimator; fitted in place.
        X_train, y_train: training features and labels.
        X_test, y_test: held-out features and labels.

    Returns:
        Tuple ``(predictions, matrix, display)``: the predicted test labels,
        the confusion matrix array, and the ConfusionMatrixDisplay that was
        plotted.
    """
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)

    matrix = confusion_matrix(y_test, predictions, labels=classifier.classes_)
    display = ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=classifier.classes_)
    display.plot()
    # Several models are reported back to back; without a title the figures
    # cannot be told apart.
    display.ax_.set_title(title)
    plt.show()
    return predictions, matrix, display


# The same train/test split is passed to every chocolate model.
choco_split = (X_choco_train, y_choco_train, X_choco_test, y_choco_test)

# Single Decision Tree
choco_tree = tree.DecisionTreeClassifier(random_state=42)
y_tree_choco_pred, cm, disp = _fit_and_report(
    "Chocolate: Decision Tree", choco_tree, *choco_split
)

# Random Forest Learner
choco_random_forest = RandomForestClassifier(random_state=42)
y_random_forest_choco_pred, cm, disp = _fit_and_report(
    "Chocolate: Random Forest", choco_random_forest, *choco_split
)

# SVM (Support Vector Machine); probability=True so predict_proba is available.
choco_svm = svm.SVC(random_state=42, probability=True)
y_svm_choco_pred, cm, disp = _fit_and_report(
    "Chocolate: SVM", choco_svm, *choco_split
)

# Gradient Boosting Ensemble
choco_gb = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
y_gb_choco_pred, cm, disp = _fit_and_report(
    "Chocolate: Gradient Boosting", choco_gb, *choco_split
)

# Multi-Layer Perceptron
choco_mlp = MLPClassifier(random_state=1, max_iter=500)
y_mlp_choco_pred, cm, disp = _fit_and_report(
    "Chocolate: Multi-Layer Perceptron", choco_mlp, *choco_split
)

# K-Nearest Neighbors
choco_knn = KNeighborsClassifier(n_neighbors=3)
y_knn_choco_pred, cm, disp = _fit_and_report(
    "Chocolate: K-Nearest Neighbors", choco_knn, *choco_split
)

## ROC Curves for Chocolate

In [None]:
# Create a figure for the ROC curve
plt.figure(figsize=(10, 8))


def plot_roc_curve(fpr, tpr, label):
    """Draw one ROC curve on the current matplotlib figure.

    Args:
        fpr: false positive rates.
        tpr: true positive rates, same length as ``fpr``.
        label: legend entry for this curve.
    """
    plt.plot(fpr, tpr, linewidth=2, label=label)


# Hard class predictions per classifier. Kept under its original name in case
# other cells read it; the ROC computation below no longer uses it.
models = {
    "Decision Tree": y_tree_choco_pred,
    "Random Forest": y_random_forest_choco_pred,
    "SVM": y_svm_choco_pred,
    "Gradient Boosting": y_gb_choco_pred,
    "Multi-Layer Perceptron": y_mlp_choco_pred,
    "K-Nearest Neighbors": y_knn_choco_pred,
}

# Fitted estimators, used to obtain class probabilities for the ROC curves.
choco_estimators = {
    "Decision Tree": choco_tree,
    "Random Forest": choco_random_forest,
    "SVM": choco_svm,
    "Gradient Boosting": choco_gb,
    "Multi-Layer Perceptron": choco_mlp,
    "K-Nearest Neighbors": choco_knn,
}

# Loop through each model to calculate and plot ROC curves
for name, estimator in choco_estimators.items():
    # A ROC curve is traced by sweeping a threshold over continuous scores.
    # Feeding it hard 0/1 predictions yields a single operating point and an
    # AUC that is not comparable with one computed from probabilities, so
    # every model is scored with predict_proba.
    positive_column = list(estimator.classes_).index(1)
    y_pred_proba = estimator.predict_proba(X_choco_test)[:, positive_column]

    # Calculate ROC curve
    fpr, tpr, _ = roc_curve(y_choco_test, y_pred_proba)
    roc_auc = roc_auc_score(y_choco_test, y_pred_proba)
    plot_roc_curve(fpr, tpr, f"{name} (AUC = {roc_auc:.2f})")

# Finalize the ROC curve plot
plt.title("Receiver Operating Characteristic")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc="lower right")
plt.grid()
plt.show()

## Model Training and Fitting for Mushrooms



In [None]:
def _fit_and_report(title, classifier, X_train, y_train, X_test, y_test):
    """Fit a classifier, predict the test split and show its confusion matrix.

    Args:
        title: text placed above the confusion-matrix figure.
        classifier: unfitted scikit-learn style estimator; fitted in place.
        X_train, y_train: training features and labels.
        X_test, y_test: held-out features and labels.

    Returns:
        Tuple ``(predictions, matrix, display)``: the predicted test labels,
        the confusion matrix array, and the ConfusionMatrixDisplay that was
        plotted.
    """
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)

    matrix = confusion_matrix(y_test, predictions, labels=classifier.classes_)
    display = ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=classifier.classes_)
    display.plot()
    # Several models are reported back to back; without a title the figures
    # cannot be told apart.
    display.ax_.set_title(title)
    plt.show()
    return predictions, matrix, display


# The same train/test split is passed to every mushroom model.
mushroom_split = (X_mushroom_train, y_mushroom_train, X_mushroom_test, y_mushroom_test)

# Single Decision Tree
mushroom_tree = tree.DecisionTreeClassifier(random_state=42)
y_tree_mushroom_pred, cm, disp = _fit_and_report(
    "Mushrooms: Decision Tree", mushroom_tree, *mushroom_split
)

# Random Forest Learner
mushroom_random_forest = RandomForestClassifier(random_state=42)
y_random_forest_mushroom_pred, cm, disp = _fit_and_report(
    "Mushrooms: Random Forest", mushroom_random_forest, *mushroom_split
)

# SVM (Support Vector Machine); probability=True so predict_proba is available.
mushroom_svm = svm.SVC(random_state=42, probability=True)
y_svm_mushroom_pred, cm, disp = _fit_and_report(
    "Mushrooms: SVM", mushroom_svm, *mushroom_split
)

# Gradient Boosting Ensemble
mushroom_gb = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
y_gb_mushroom_pred, cm, disp = _fit_and_report(
    "Mushrooms: Gradient Boosting", mushroom_gb, *mushroom_split
)

# Multi-Layer Perceptron
mushroom_mlp = MLPClassifier(random_state=1, max_iter=500)
y_mlp_mushroom_pred, cm, disp = _fit_and_report(
    "Mushrooms: Multi-Layer Perceptron", mushroom_mlp, *mushroom_split
)

# K-Nearest Neighbors
mushroom_knn = KNeighborsClassifier(n_neighbors=3)
y_knn_mushroom_pred, cm, disp = _fit_and_report(
    "Mushrooms: K-Nearest Neighbors", mushroom_knn, *mushroom_split
)

## ROC Curves for Mushrooms

In [None]:
# Create a figure for the ROC curve
plt.figure(figsize=(10, 8))


def plot_roc_curve(fpr, tpr, label):
    """Draw one ROC curve on the current matplotlib figure.

    Args:
        fpr: false positive rates.
        tpr: true positive rates, same length as ``fpr``.
        label: legend entry for this curve.
    """
    plt.plot(fpr, tpr, linewidth=2, label=label)


# Hard class predictions per classifier. Kept under its original name in case
# other cells read it; the ROC computation below no longer uses it.
models = {
    "Decision Tree": y_tree_mushroom_pred,
    "Random Forest": y_random_forest_mushroom_pred,
    "SVM": y_svm_mushroom_pred,
    "Gradient Boosting": y_gb_mushroom_pred,
    "Multi-Layer Perceptron": y_mlp_mushroom_pred,
    "K-Nearest Neighbors": y_knn_mushroom_pred,
}

# Fitted estimators, used to obtain class probabilities for the ROC curves.
mushroom_estimators = {
    "Decision Tree": mushroom_tree,
    "Random Forest": mushroom_random_forest,
    "SVM": mushroom_svm,
    "Gradient Boosting": mushroom_gb,
    "Multi-Layer Perceptron": mushroom_mlp,
    "K-Nearest Neighbors": mushroom_knn,
}

# Loop through each model to calculate and plot ROC curves
for name, estimator in mushroom_estimators.items():
    # A ROC curve is traced by sweeping a threshold over continuous scores.
    # Feeding it hard 0/1 predictions yields a single operating point and an
    # AUC that is not comparable with one computed from probabilities, so
    # every model is scored with predict_proba.
    positive_column = list(estimator.classes_).index(1)
    y_pred_proba = estimator.predict_proba(X_mushroom_test)[:, positive_column]

    # Calculate ROC curve
    fpr, tpr, _ = roc_curve(y_mushroom_test, y_pred_proba)
    roc_auc = roc_auc_score(y_mushroom_test, y_pred_proba)
    plot_roc_curve(fpr, tpr, f"{name} (AUC = {roc_auc:.2f})")

# Finalize the ROC curve plot
plt.title("Receiver Operating Characteristic")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc="lower right")
plt.grid()
plt.show()

## Class Balancing for Chocolate