# Import Data and Packages

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, mean_squared_error
data=pd.read_csv('Final_Data_R_Manipulated.csv')

## Explore Data

In [4]:
print(data.head())

   PatientID PatientName Gender  Age Location ReasonForVisit  Duration  \
0          1   Patient 1   Male   56     West        Checkup       109   
1          1   Patient 1   Male   56     West      Follow-up        71   
2          1   Patient 1   Male   56     West      Emergency        76   
3          1   Patient 1   Male   56     West      Follow-up        35   
4          1   Patient 1   Male   56     West      Follow-up       140   

   ClinicID Specialization Age_group  LocationCode  GenderCode  VisitCode  \
0         8      Neurology     51-60             4           2          1   
1         9        General     51-60             4           2          2   
2        12      Neurology     51-60             4           2          3   
3        12      Neurology     51-60             4           2          2   
4        17     Cardiology     51-60             4           2          2   

   SpecializationCode  
0                   1  
1                   2  
2                   

In [5]:
data.columns.tolist()

['PatientID',
 'PatientName',
 'Gender',
 'Age',
 'Location',
 'ReasonForVisit',
 'Duration',
 'ClinicID',
 'Specialization',
 'Age_group',
 'LocationCode',
 'GenderCode',
 'VisitCode',
 'SpecializationCode']

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   PatientID           500 non-null    int64 
 1   PatientName         500 non-null    object
 2   Gender              500 non-null    object
 3   Age                 500 non-null    int64 
 4   Location            500 non-null    object
 5   ReasonForVisit      500 non-null    object
 6   Duration            500 non-null    int64 
 7   ClinicID            500 non-null    int64 
 8   Specialization      500 non-null    object
 9   Age_group           500 non-null    object
 10  LocationCode        500 non-null    int64 
 11  GenderCode          500 non-null    int64 
 12  VisitCode           500 non-null    int64 
 13  SpecializationCode  500 non-null    int64 
dtypes: int64(8), object(6)
memory usage: 54.8+ KB


In [7]:
data.isnull().sum()

PatientID             0
PatientName           0
Gender                0
Age                   0
Location              0
ReasonForVisit        0
Duration              0
ClinicID              0
Specialization        0
Age_group             0
LocationCode          0
GenderCode            0
VisitCode             0
SpecializationCode    0
dtype: int64

In [8]:
data.nunique()

PatientID              98
PatientName            98
Gender                  2
Age                    46
Location                4
ReasonForVisit          4
Duration              154
ClinicID               20
Specialization          5
Age_group               6
LocationCode            4
GenderCode              2
VisitCode               4
SpecializationCode      5
dtype: int64

In [9]:
data.dtypes

PatientID              int64
PatientName           object
Gender                object
Age                    int64
Location              object
ReasonForVisit        object
Duration               int64
ClinicID               int64
Specialization        object
Age_group             object
LocationCode           int64
GenderCode             int64
VisitCode              int64
SpecializationCode     int64
dtype: object

## Preparing Data for Modeling

### One-hot ecoding catagorical values

In [12]:
pd.get_dummies(data, columns=['Age', 'Duration', 'VisitCode', 'SpecializationCode', 'LocationCode'])

Unnamed: 0,PatientID,PatientName,Gender,Location,ReasonForVisit,ClinicID,Specialization,Age_group,GenderCode,Age_19,...,VisitCode_4,SpecializationCode_1,SpecializationCode_2,SpecializationCode_3,SpecializationCode_4,SpecializationCode_5,LocationCode_1,LocationCode_2,LocationCode_3,LocationCode_4
0,1,Patient 1,Male,West,Checkup,8,Neurology,51-60,2,False,...,False,True,False,False,False,False,False,False,False,True
1,1,Patient 1,Male,West,Follow-up,9,General,51-60,2,False,...,False,False,True,False,False,False,False,False,False,True
2,1,Patient 1,Male,West,Emergency,12,Neurology,51-60,2,False,...,False,True,False,False,False,False,False,False,False,True
3,1,Patient 1,Male,West,Follow-up,12,Neurology,51-60,2,False,...,False,True,False,False,False,False,False,False,False,True
4,1,Patient 1,Male,West,Follow-up,17,Cardiology,51-60,2,False,...,False,False,False,True,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,100,Patient 100,Female,South,Checkup,2,Cardiology,31-40,1,False,...,False,False,False,True,False,False,False,True,False,False
496,100,Patient 100,Female,South,Illness,4,Orthopedics,31-40,1,False,...,True,False,False,False,False,True,False,True,False,False
497,100,Patient 100,Female,South,Emergency,12,Neurology,31-40,1,False,...,False,True,False,False,False,False,False,True,False,False
498,100,Patient 100,Female,South,Emergency,13,Orthopedics,31-40,1,False,...,False,False,False,False,False,True,False,True,False,False


### Normalize numerical values

### Feature engineering (select relevant features)

In [15]:
X = data[['Age', 'SpecializationCode', 'VisitCode', 'LocationCode', 'Duration']]
y = data["GenderCode"] 

### Split data into training and testing sets

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Standardize features

In [19]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Model evaluation function

In [21]:
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    print(f"Accuracy: {accuracy}, ROC-AUC: {roc_auc}")

### Experimenting with different models

In [23]:
print("Logistic Regression:")
logreg_model = LogisticRegression()
logreg_model.fit(X_train_scaled, y_train)
evaluate_model(logreg_model, X_test_scaled, y_test)

Logistic Regression:
Accuracy: 0.59, ROC-AUC: 0.525


In [24]:
print("Decision Tree:")
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train_scaled, y_train)
evaluate_model(dt_model, X_test_scaled, y_test)

Decision Tree:
Accuracy: 0.83, ROC-AUC: 0.8291666666666666


In [25]:
print("Random Forest:")
rf_model = RandomForestClassifier()
rf_model.fit(X_train_scaled, y_train)
evaluate_model(rf_model, X_test_scaled, y_test)

Random Forest:
Accuracy: 0.7, ROC-AUC: 0.6708333333333333
