# Neural Network model for drug-consumption dataset

In [254]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler

import torch 
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [91]:
# Importing the combined dataset
df = pd.read_csv("../data/df_tot.csv")

In [92]:
df.head()

Unnamed: 0,Age,Gender,Education,Country,Nscore,Escore,Oscore,Ascore,Cscore,Impulsive,...,CNS_stimulants,Cannabis,Nicotine,Hallucinogens,GDP,Politics,Poverty,Serious assault,Sexual violence,Unemployment_Rate
0,2,1,1,UK,39.0,36.0,42.0,37.0,42.0,-0.21712,...,2,0,2,0,40433.93305,0.362917,11.881818,134.043352,75.642036,6.503273
1,1,-1,2,UK,29.0,52.0,55.0,48.0,41.0,-0.71126,...,4,4,4,2,40433.93305,0.362917,11.881818,134.043352,75.642036,3.527909
2,2,-1,1,UK,31.0,45.0,40.0,32.0,34.0,-1.37983,...,0,3,0,1,40433.93305,0.362917,11.881818,134.043352,75.642036,6.948545
3,0,1,2,UK,34.0,34.0,46.0,47.0,46.0,-1.37983,...,2,2,2,0,40433.93305,0.362917,11.881818,134.043352,75.642036,3.315364
4,2,1,2,UK,43.0,28.0,43.0,41.0,50.0,-0.21712,...,1,3,2,2,40433.93305,0.362917,11.881818,134.043352,75.642036,3.315364


In [93]:
# Convert Country values to numerical with One-Hot encoding

country_dummies = pd.get_dummies(df["Country"])
df = pd.concat([df, country_dummies], axis=1)

# Drop the original 'Country' column 
df = df.drop("Country", axis=1)

# Converting booleans values with numerical 
pd.set_option('future.no_silent_downcasting', True)  # in a future version of pandas, .replace() will be deprecated.
df[country_dummies.columns] = df[country_dummies.columns].replace({True: 1, False: 0})
df[country_dummies.columns] = df[country_dummies.columns].astype(int)

df.head()

Unnamed: 0,Age,Gender,Education,Nscore,Escore,Oscore,Ascore,Cscore,Impulsive,SS,...,Politics,Poverty,Serious assault,Sexual violence,Unemployment_Rate,Australia,Canada,Ireland,UK,USA
0,2,1,1,39.0,36.0,42.0,37.0,42.0,-0.21712,-1.18084,...,0.362917,11.881818,134.043352,75.642036,6.503273,0,0,0,1,0
1,1,-1,2,29.0,52.0,55.0,48.0,41.0,-0.71126,-0.21575,...,0.362917,11.881818,134.043352,75.642036,3.527909,0,0,0,1,0
2,2,-1,1,31.0,45.0,40.0,32.0,34.0,-1.37983,0.40148,...,0.362917,11.881818,134.043352,75.642036,6.948545,0,0,0,1,0
3,0,1,2,34.0,34.0,46.0,47.0,46.0,-1.37983,-1.18084,...,0.362917,11.881818,134.043352,75.642036,3.315364,0,0,0,1,0
4,2,1,2,43.0,28.0,43.0,41.0,50.0,-0.21712,-0.21575,...,0.362917,11.881818,134.043352,75.642036,3.315364,0,0,0,1,0


In [94]:
df.dtypes

Age                    int64
Gender                 int64
Education              int64
Nscore               float64
Escore               float64
Oscore               float64
Ascore               float64
Cscore               float64
Impulsive            float64
SS                   float64
Alcohol                int64
CNS_depressants        int64
CNS_stimulants         int64
Cannabis               int64
Nicotine               int64
Hallucinogens          int64
GDP                  float64
Politics             float64
Poverty              float64
Serious assault      float64
Sexual violence      float64
Unemployment_Rate    float64
Australia              int32
Canada                 int32
Ireland                int32
UK                     int32
USA                    int32
dtype: object

In [95]:
# Re-ordering the columns
demographic = ['Australia', 'Canada', 'Ireland', 'UK', 'USA', 'Age', 'Gender','Education']
drugs = ['Alcohol', 'CNS_depressants','CNS_stimulants', 'Cannabis', 'Nicotine', 'Hallucinogens']
psycho_scores = ['Nscore', 'Escore', 'Oscore', 'Ascore','Cscore', 'Impulsive', 'SS']
politics = ['GDP','Politics', 'Poverty', 'Serious assault', 'Sexual violence','Unemployment_Rate']

new_order = demographic + psycho_scores + politics + drugs

df = df[new_order]
df.columns

Index(['Australia', 'Canada', 'Ireland', 'UK', 'USA', 'Age', 'Gender',
       'Education', 'Nscore', 'Escore', 'Oscore', 'Ascore', 'Cscore',
       'Impulsive', 'SS', 'GDP', 'Politics', 'Poverty', 'Serious assault',
       'Sexual violence', 'Unemployment_Rate', 'Alcohol', 'CNS_depressants',
       'CNS_stimulants', 'Cannabis', 'Nicotine', 'Hallucinogens'],
      dtype='object')

## Data scaling -  Standardization

In [161]:
# Standardizing only columns with continuous numerical values. 
# The other columns not to be standardized are either binary or categorical ordinal values converted to numerical with labeling

# Creating a copy of the original dataset
df_standard = df.copy()

col_to_standard = ['Nscore', 'Escore', 'Oscore', 'Ascore','Cscore', 'Impulsive', 'SS',
                   'GDP','Politics', 'Poverty', 'Serious assault', 'Sexual violence','Unemployment_Rate']


scaler = StandardScaler()
df_standard[col_to_standard] = scaler.fit_transform(df_standard[col_to_standard])



In [162]:
df_standard.head()

Unnamed: 0,Australia,Canada,Ireland,UK,USA,Age,Gender,Education,Nscore,Escore,...,Poverty,Serious assault,Sexual violence,Unemployment_Rate,Alcohol,CNS_depressants,CNS_stimulants,Cannabis,Nicotine,Hallucinogens
0,0,0,0,1,0,2,1,1,0.335075,-0.536787,...,-0.674831,-0.686607,0.822048,0.059637,5,2,2,0,2,0
1,0,0,0,1,0,1,-1,2,-0.764166,1.822232,...,-0.674831,-0.686607,0.822048,-0.941827,5,2,4,4,4,2
2,0,0,0,1,0,2,-1,1,-0.544318,0.790161,...,-0.674831,-0.686607,0.822048,0.20951,6,0,0,3,0,1
3,0,0,0,1,0,0,1,2,-0.214545,-0.831665,...,-0.674831,-0.686607,0.822048,-1.013367,4,3,2,2,2,0
4,0,0,0,1,0,2,1,2,0.774772,-1.716297,...,-0.674831,-0.686607,0.822048,-1.013367,4,1,1,3,2,2


In [237]:
# Save standardized df

df_standard.to_csv("../data/Dataset_standard.csv")

In [313]:
# Convert all values of the target columns to binary (used/not-used)
# 0 "Non-user" (from 0 to 2) or 1 for "User" individual (from 3 to 6)

target_columns = ['Alcohol', 'CNS_depressants','CNS_stimulants', 'Cannabis', 'Nicotine', 'Hallucinogens']

df_binary = df_standard.copy()

df_binary[target_columns] = df_binary[target_columns]

for index, row in df_binary[target_columns].iterrows():
    # Check each drug column value for the current row
    for drug_column, value in row.items():
        if 3 <= value <=6:
            df_binary.at[index, drug_column] = 1
        else:
            df_binary.at[index, drug_column] = 0

In [314]:
# Define features (X) and target (y)
X = df_binary.drop(columns = target_columns, axis = 1)
y = df_binary[target_columns]

In [315]:
# Convert the df to numpy arrays

X = X.values
y = y.values

In [402]:
# Splitting the dataset to train (80%) and test (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=20)

In [403]:
# Convert X features and y numpy Arrays to TENSORS

X_train = torch.FloatTensor(X_train) # The features are floats and integers
X_test = torch.FloatTensor(X_test)

y_train = torch.FloatTensor(y_train) # LongTensors are Integers
y_test = torch.FloatTensor(y_test)

## Neural Net Model 

- 21 features
- 6 targets/classes (Alcohol, CNS_depressants, CNS_stimulants, Cannabis, Nicotine, Hallucinogens) - binary values

In [419]:
# Hyperparameters

input_size = 21 # number of features
hidden_size1 = 120 # number of neurons in the first hidden layer
hidden_size2 = 100 # number of neurons in the second hidden layer
output_size = 6 # number of classes (output neurons)
learning_rate = 0.01

In [420]:
# Including batch to improve training
dataset = TensorDataset(X_train, y_train)

# Define the DataLoader
batch_size = 100
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [421]:
# Create a Model Class that inherits nn.Module

class NeuralNet(nn.Module):
    
    def __init__(self, input_size, hidden_size1, hidden_size2, output_size):
        super(NeuralNet, self).__init__() # Instantiate out nn.Model 
        # Initiating the layers (fc = fully connected)
        self.fc1 = nn.Linear(input_size, hidden_size1)
        self.fc2 = nn.Linear(hidden_size1, hidden_size2)
        self.out = nn.Linear(hidden_size2, output_size)
        self.sigmoid = nn.Sigmoid() # activation function for binary classification, squashes the output values between 0 and 1
        
    def forward(self, x):  # Define the function to move forward in the network
        out = F.relu(self.fc1(x))
        out = F.relu(self.fc2(out))
        out = self.out(out)
        out = self.sigmoid(out)
    
        return out

torch.manual_seed(40)    
model = NeuralNet(input_size, hidden_size1, hidden_size2, output_size)
print(f"Model architecture:{model}")

Model architecture:NeuralNet(
  (fc1): Linear(in_features=21, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=100, bias=True)
  (out): Linear(in_features=100, out_features=6, bias=True)
  (sigmoid): Sigmoid()
)


In [422]:
# Loss and Optimizer definition

criterion = nn.BCELoss() # binary cross entropy loss
optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate)

In [423]:
# Training loop 
num_epochs = 150

for epoch in range(1, num_epochs + 1):
    for inputs, targets in dataloader:
        # Forward pass
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()
    
    # Print loss every 10 epochs
    if epoch % 10 == 0:
        print(f"Epoch: {epoch}, Loss: {loss.item()}")

Epoch: 10, Loss: 0.34752175211906433
Epoch: 20, Loss: 0.4064893424510956
Epoch: 30, Loss: 0.2312968373298645
Epoch: 40, Loss: 0.22130347788333893
Epoch: 50, Loss: 0.20894403755664825
Epoch: 60, Loss: 0.10928468406200409
Epoch: 70, Loss: 0.0771176666021347
Epoch: 80, Loss: 0.0675385445356369
Epoch: 90, Loss: 0.057024575769901276
Epoch: 100, Loss: 0.012363183312118053
Epoch: 110, Loss: 0.005349261220544577
Epoch: 120, Loss: 0.0048149703070521355
Epoch: 130, Loss: 0.0014799109194427729
Epoch: 140, Loss: 0.003182952292263508
Epoch: 150, Loss: 0.0014624696923419833


In [431]:
# Now only forward in the model with the test set

with torch.no_grad():
    y_eval = model.forward(X_test) # X_test are features from the test set, y_eval will be predictions
    loss = criterion(y_eval, y_test) # Find the loss for the test set 

class_labels = {
    0: 'Alcohol',
    1: 'CNS_depressants',
    2: 'CNS_stimulants',
    3: 'Cannabis',
    4: 'Nicotine',
    5: 'Hallucinogens'
}

correct_per_class = [0] * 6  # Initialize a list to store the number of correct predictions for each class

with torch.no_grad():
    for i in range(len(y_test)):
        # Get the predicted values for the current sample
        y_val = model(X_test[i])

        # Convert predicted and target values to integers (0 or 1)
        y_pred_class = torch.round(y_val).to(torch.int)  # Round to the nearest integer (0 or 1)
        y_true_class = y_test[i].to(torch.int)  # Convert to integer tensor
        
        # Convert integer labels to string labels
        y_true_labels = ["Non-User" if val == 0 else "User" for val in y_true_class]
        y_pred_labels = ["Non-User" if val == 0 else "User" for val in y_pred_class]

        # Check if each element of the predicted and target tensors is equal
        for j in range(6):  # 6 classes (drugs)
            if y_pred_class[j] == y_true_class[j]:
                correct_per_class[j] += 1

        print(f"Sample {i+1}: True: {y_true_labels}, Predicted: {y_pred_labels}")

print()
print("Correct predictions per class (accuracy):")
for i, correct_count in enumerate(correct_per_class):
    accuracy = correct_count / len(y_test)  # Calculate accuracy for the class
    label = class_labels[i]  # Get the label for the current class index
    print(f"{label}: {accuracy:.2%}")

Sample 1: True: ['User', 'User', 'User', 'User', 'User', 'Non-User'], Predicted: ['User', 'User', 'User', 'User', 'User', 'Non-User']
Sample 2: True: ['User', 'Non-User', 'Non-User', 'Non-User', 'Non-User', 'Non-User'], Predicted: ['User', 'User', 'Non-User', 'Non-User', 'Non-User', 'Non-User']
Sample 3: True: ['User', 'Non-User', 'Non-User', 'Non-User', 'Non-User', 'Non-User'], Predicted: ['User', 'Non-User', 'Non-User', 'Non-User', 'Non-User', 'Non-User']
Sample 4: True: ['User', 'User', 'User', 'User', 'User', 'User'], Predicted: ['User', 'User', 'Non-User', 'User', 'User', 'Non-User']
Sample 5: True: ['User', 'User', 'User', 'User', 'User', 'Non-User'], Predicted: ['User', 'Non-User', 'Non-User', 'Non-User', 'Non-User', 'Non-User']
Sample 6: True: ['User', 'Non-User', 'User', 'User', 'User', 'User'], Predicted: ['User', 'Non-User', 'User', 'User', 'User', 'User']
Sample 7: True: ['User', 'Non-User', 'Non-User', 'User', 'User', 'Non-User'], Predicted: ['User', 'Non-User', 'User', 'U

In [388]:
# Save the Model with the hyperparameters leading to the highest accuracy

class NeuralNet(nn.Module):
    
    def __init__(self, input_size=21, hidden_size1=120, hidden_size2=100, output_size=6):
        super(NeuralNet, self).__init__() # Instantiate out nn.Model 
        # Initiating the layers (fc = fully connected)
        self.fc1 = nn.Linear(input_size, hidden_size1)
        self.fc2 = nn.Linear(hidden_size1, hidden_size2)
        self.out = nn.Linear(hidden_size2, output_size)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):  # Define the function to move forward in the network
        out = F.relu(self.fc1(x))
        out = F.relu(self.fc2(out))
        out = self.out(out)
        out = self.sigmoid(out)
    
        return out

torch.manual_seed(40)    
model = NeuralNet()
print(f"Model architecture:{model}")

Model architecture:NeuralNet(
  (fc1): Linear(in_features=21, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=100, bias=True)
  (out): Linear(in_features=100, out_features=6, bias=True)
  (sigmoid): Sigmoid()
)


In [389]:
torch.save(model.state_dict(), "NeuralNet_drugs.pt")