In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler


### Task 1: Read the dataset and do data pre-processing

In [None]:
# Load the drug-classification data from the CSV file (path is relative to the working directory)
df = pd.read_csv(filepath_or_buffer='drug200.csv')

In [None]:
# Preview the first five rows of the raw data
print(df.head(n=5))

   Age Sex      BP Cholesterol  Na_to_K   Drug
0   23   F    HIGH        HIGH   25.355  DrugY
1   47   M     LOW        HIGH   13.093  drugC
2   47   M     LOW        HIGH   10.114  drugC
3   28   F  NORMAL        HIGH    7.798  drugX
4   61   F     LOW        HIGH   18.043  DrugY


In [None]:
# Preview the last five rows of the raw data
print(df.tail(n=5))

     Age Sex      BP Cholesterol  Na_to_K   Drug
195   56   F     LOW        HIGH   11.567  drugC
196   16   M     LOW        HIGH   12.006  drugC
197   52   M  NORMAL        HIGH    9.894  drugX
198   23   M  NORMAL      NORMAL   14.020  drugX
199   40   F     LOW      NORMAL   11.349  drugX


In [None]:
# Summary statistics for every column: count/unique/top/freq for the categorical
# columns, mean/std/quantiles for the numeric ones
df.describe(include='all')

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
count,200.0,200,200,200,200.0,200
unique,,2,3,2,,5
top,,M,HIGH,HIGH,,DrugY
freq,,104,77,103,,91
mean,44.315,,,,16.084485,
std,16.544315,,,,7.223956,
min,15.0,,,,6.269,
25%,31.0,,,,10.4455,
50%,45.0,,,,13.9365,
75%,58.0,,,,19.38,


In [None]:
# Count missing values per column (this only reports them; nothing is filled or dropped)
df.isna().sum()

Age            0
Sex            0
BP             0
Cholesterol    0
Na_to_K        0
Drug           0
dtype: int64

In [None]:
# Show each column's dtype and non-null count, plus the frame's memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Age          200 non-null    int64  
 1   Sex          200 non-null    object 
 2   BP           200 non-null    object 
 3   Cholesterol  200 non-null    object 
 4   Na_to_K      200 non-null    float64
 5   Drug         200 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 9.5+ KB


In [None]:
# NOTE(review): this encoder is created again in the label-encoding cell further down,
# which replaces this instance before it is ever fitted — this cell looks redundant.
label_encoder=LabelEncoder()

In [None]:
# Separate the target column ('Drug') from the predictor columns
y = df['Drug']
X = df.drop(columns='Drug')

In [None]:
# Encode the target column (drug names) as integer class ids.
# The encoder is fitted on the original df['Drug'] strings rather than on `y`, so
# re-running this cell cannot refit it on the already-encoded integers. Refitting on
# integers would turn label_encoder.classes_ into integers, and the later
# inverse_transform would then return numbers instead of drug names.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['Drug'])
y

array([0, 3, 3, 4, 0, 4, 0, 3, 0, 0, 3, 0, 0, 0, 4, 0, 4, 1, 3, 0, 0, 0,
       0, 0, 0, 0, 0, 4, 0, 0, 4, 2, 4, 0, 4, 4, 1, 4, 4, 4, 0, 2, 0, 4,
       4, 4, 1, 3, 0, 0, 0, 4, 0, 0, 2, 3, 2, 0, 4, 0, 0, 1, 0, 4, 2, 0,
       1, 4, 0, 0, 2, 0, 4, 0, 0, 0, 1, 0, 1, 4, 2, 4, 3, 1, 3, 2, 4, 0,
       0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 1, 1, 3, 4, 0, 4, 4, 0, 2, 0,
       1, 4, 4, 4, 4, 0, 4, 4, 1, 0, 0, 0, 0, 0, 2, 0, 0, 4, 0, 4, 0, 0,
       4, 0, 0, 4, 2, 1, 2, 4, 1, 0, 2, 0, 1, 4, 4, 1, 4, 3, 1, 2, 4, 4,
       0, 3, 1, 0, 3, 4, 4, 2, 4, 0, 0, 0, 0, 4, 0, 1, 4, 4, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 4, 4, 0, 0, 0, 2, 1, 0, 0, 0, 1, 0, 3, 0, 3, 3, 4,
       4, 4])

In [None]:
# One-hot encode the categorical feature columns (Sex, BP, Cholesterol);
# the numeric columns (Age, Na_to_K) are carried through unchanged
X = pd.get_dummies(data=X)

In [None]:
# Display the one-hot encoded feature table
X

Unnamed: 0,Age,Na_to_K,Sex_F,Sex_M,BP_HIGH,BP_LOW,BP_NORMAL,Cholesterol_HIGH,Cholesterol_NORMAL
0,23,25.355,1,0,1,0,0,1,0
1,47,13.093,0,1,0,1,0,1,0
2,47,10.114,0,1,0,1,0,1,0
3,28,7.798,1,0,0,0,1,1,0
4,61,18.043,1,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...
195,56,11.567,1,0,0,1,0,1,0
196,16,12.006,0,1,0,1,0,1,0
197,52,9.894,0,1,0,0,1,1,0
198,23,14.020,0,1,0,0,1,0,1


In [None]:
# Standardize every column of X — the one-hot indicator columns as well as Age and Na_to_K.
# The result is a NumPy array, so the column names are no longer attached to X.
# NOTE(review): the scaler is fitted on the full dataset before the train/test split
# below, so test-set statistics influence the scaling; consider fitting on the
# training rows only.
scaler = StandardScaler()
X = scaler.fit(X).transform(X)


In [None]:
# Display the standardized feature matrix (now a NumPy array)
X

array([[-1.29159102,  1.28652212,  1.040833  , ..., -0.64686916,
         0.97043679, -0.97043679],
       [ 0.16269866, -0.4151454 , -0.96076892, ..., -0.64686916,
         0.97043679, -0.97043679],
       [ 0.16269866, -0.82855818, -0.96076892, ..., -0.64686916,
         0.97043679, -0.97043679],
       ...,
       [ 0.46567567, -0.85908883, -0.96076892, ...,  1.54590766,
         0.97043679, -0.97043679],
       [-1.29159102, -0.28650033, -0.96076892, ...,  1.54590766,
        -1.03046381,  1.03046381],
       [-0.26146916, -0.6571702 ,  1.040833  , ..., -0.64686916,
        -1.03046381,  1.03046381]])

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Task 2: Build the ANN model (input layer, at least 3 hidden layers, and an output layer)

In [None]:
from keras.models import Sequential
from keras.layers import Dense

In [None]:
# Build the ANN as a stack of fully connected layers.
n_features = X_train.shape[1]              # width of one input row
n_classes = len(label_encoder.classes_)    # one output unit per drug class

model = Sequential([
    # First Dense layer; input_dim declares the size of the input it receives
    Dense(units=64, activation='relu', input_dim=n_features),
    # Progressively narrower ReLU hidden layers
    Dense(units=32, activation='relu'),
    Dense(units=16, activation='relu'),
    Dense(units=8, activation='relu'),
    # Softmax output: one probability per class
    Dense(units=n_classes, activation='softmax'),
])


In [None]:
# Number of distinct drug classes; this is the unit count used for the softmax output layer
len(label_encoder.classes_)

5

### Task 3: Test the model with random data

In [None]:
import numpy as np

In [None]:
# Generate one random sample with the same number of features as the training data.
# A fixed seed makes the sample repeatable across kernel restarts; np.random.seed()
# with no argument reseeds from fresh entropy, so every run produced different data.
# The value 42 matches the random_state used for the train/test split.
# NOTE(review): np.random.rand draws uniformly from [0, 1), which does not resemble
# the standardized training features — confirm this is the intended kind of test input.
np.random.seed(42)
random_data = np.random.rand(1, X_train.shape[1])

In [None]:
# Predict class probabilities for the random sample and map the winner back to a drug name.
# NOTE(review): no compile()/fit() call appears between building the model and this
# prediction, so the result presumably reflects the randomly initialised weights — confirm.
class_probabilities = model.predict(random_data)
predicted_class = np.argmax(class_probabilities)  # index of the most probable class
predicted_drug = label_encoder.inverse_transform([predicted_class])[0]

print("Random Data:", random_data)
print("Predicted Drug:", predicted_drug)

Random Data: [[0.60791007 0.64696784 0.99085544 0.51128876 0.69178517 0.80441756
  0.38201367 0.65282146 0.7848421 ]]
Predicted Drug: drugC
