In [24]:
pip install ucimlrepo




In [25]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
aids_clinical_trials_group_study_175 = fetch_ucirepo(id=890) #dataset id
  
# data (as pandas dataframes) 
X = aids_clinical_trials_group_study_175.data.features 
y = aids_clinical_trials_group_study_175.data.targets 
  
# metadata 
print(aids_clinical_trials_group_study_175.metadata) 
  
# variable information 
print(aids_clinical_trials_group_study_175.variables) 


{'uci_id': 890, 'name': 'AIDS Clinical Trials Group Study 175', 'repository_url': 'https://archive.ics.uci.edu/dataset/890/aids+clinical+trials+group+study+175', 'data_url': 'https://archive.ics.uci.edu/static/public/890/data.csv', 'abstract': 'The AIDS Clinical Trials Group Study 175 Dataset contains healthcare statistics and categorical information about patients who have been diagnosed with AIDS. This dataset was initially published in 1996. The prediction task is to predict whether or not each patient died within a certain window of time or not. ', 'area': 'Health and Medicine', 'tasks': ['Classification', 'Regression'], 'characteristics': ['Tabular', 'Multivariate'], 'num_instances': 2139, 'num_features': 23, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Sexual Orientation', 'Race', 'Gender'], 'target_col': ['cid'], 'index_col': ['pidnum'], 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1996, 'last_updated': 'Fri Nov 

In [27]:
import pandas as pd
df = pd.concat([X, y], axis=1)
df

Unnamed: 0,time,trt,age,wtkg,hemo,homo,drugs,karnof,oprior,z30,...,str2,strat,symptom,treat,offtrt,cd40,cd420,cd80,cd820,cid
0,948,2,48,89.8128,0,0,0,100,0,0,...,0,1,0,1,0,422,477,566,324,0
1,1002,3,61,49.4424,0,0,0,90,0,1,...,1,3,0,1,0,162,218,392,564,1
2,961,3,45,88.4520,0,1,1,90,0,1,...,1,3,0,1,1,326,274,2063,1893,0
3,1166,3,47,85.2768,0,1,0,100,0,1,...,1,3,0,1,0,287,394,1590,966,0
4,1090,0,43,66.6792,0,1,0,100,0,1,...,1,3,0,0,0,504,353,870,782,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2134,1091,3,21,53.2980,1,0,0,100,0,1,...,1,3,0,1,1,152,109,561,720,0
2135,395,0,17,102.9672,1,0,0,100,0,1,...,1,3,0,0,1,373,218,1759,1030,0
2136,1104,2,53,69.8544,1,1,0,90,0,1,...,1,3,0,1,0,419,364,1391,1041,0
2137,465,0,14,60.0000,1,0,0,100,0,0,...,0,1,0,0,0,166,169,999,1838,1


Preprocessing: We can test p value to see if the variables significantly impact the cd420, and to decide where we keep the variables.

## Classification
Our purpose is to examine the performance of different types of AIDS treatments.

In [28]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score, classification_report

# encode categorical variables, drop unrelated or dublicated features etc.
df1 = pd.get_dummies(df, columns=['trt', 'strat']) 
X = df1.drop(['cid'], axis=1)  # Features
y = df1['cid']  # Target variable

#df1['age_wtkg_interaction'] = df1['age'] * df1['wtkg']
#df1['cd4_difference'] = df1['cd420']-df1['cd40']
#df1['cd8_difference'] = df1['cd820']-df1['cd80']

# Perform feature selection using RandomForestClassifier.

In [29]:
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X, y)

feature_importances = pd.Series(clf.feature_importances_, index=X.columns)
feature_importances_sorted = feature_importances.sort_values(ascending=False)

# Create a DataFrame to store feature importances with ranking
feature_importances_df = pd.DataFrame({'Feature': feature_importances_sorted.index, 'Importance': feature_importances_sorted.values})
feature_importances_df

Unnamed: 0,Feature,Importance
0,time,0.36021
1,cd420,0.130927
2,cd40,0.068247
3,cd80,0.057626
4,wtkg,0.057553
5,cd820,0.053678
6,age,0.050141
7,offtrt,0.043444
8,preanti,0.042427
9,karnof,0.013966


Random forests and GBM are ensembles of decision trees and are not sensitive to feature scaling.

# Linear Regression model

In [8]:
# Define features (X) and target variable (y)
X = df1.drop(['cid','treat','str2'], axis=1)  # Features ('treat','str2' are dublicated variable)
y = df1['cid']  # Target variable

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


X = df1.drop('cd420', axis=1)  # Features (predictor variables)
y = df1['cd420']                # Target variable

# Splitting the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Creating a Linear Regression model
model = LinearRegression()

# Training the model
model.fit(X_train, y_train)

# Making predictions on the test set
y_pred = model.predict(X_test)

# Evaluating the model using Root Mean Squared Error (RMSE)
rmse = mean_squared_error(y_test, y_pred, squared=False)
print("Root Mean Squared Error (RMSE):", rmse)
r_squared = model.score(X_test, y_test)
print("R-squared score:", r_squared)

Root Mean Squared Error (RMSE): 92.80575296902236
R-squared score: 0.556151851963653




### Understand the results

RMSE ≈ 92.81: On average, the model's predictions are 92.81 units away from actual values.
<br>
R² ≈ 0.56: The model explains 55.6% of the variance in cd420, which is moderate (not perfect but acceptable).

## Perform classification using RandomForestClassifier.

In [34]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

param_grid = {
    'n_estimators': [150,200],   # Number of trees in the forest
    'max_depth': [None, 10, 20],       # Maximum depth of the trees
    'min_samples_split': [2],   # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1]      # Minimum number of samples required to be at a leaf node
}

# Create a RandomForestClassifier
rf_classifier = RandomForestClassifier(random_state=42)

# Create a GridSearchCV object
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Perform grid search on the training data
grid_search.fit(X_train, y_train)

# Print the best parameters found by grid search
print("Best Parameters:", grid_search.best_params_)

# Print the best cross-validation score found by grid search
print("Best Cross-Validation Score:", grid_search.best_score_)

# Get the best model
best_classifier = grid_search.best_estimator_

# Make predictions on the test set using the best model
y_pred = best_classifier.predict(X_test)

# Evaluate the best model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print(confusion_matrix(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Check feature_importances to see which features are highly correlated with the target variable
# Determine which treatment is better based on the model's performance and feature importance

Best Parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 150}
Best Cross-Validation Score: 0.8901215624094249
Accuracy: 0.8785046728971962
[[315  12]
 [ 40  61]]
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.96      0.92       327
           1       0.84      0.60      0.70       101

    accuracy                           0.88       428
   macro avg       0.86      0.78      0.81       428
weighted avg       0.88      0.88      0.87       428



The importances of the treatment options (trt_0, trt_1, trt_2, trt_3) are relatively low and close to each other. None of them stands out significantly as the most important feature. Therefore, based solely on feature importances, we can't conclusively determine which treatment is better.

In [31]:
importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': best_classifier.feature_importances_})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

print("Feature Importances:")
print(importance_df)

Feature Importances:
    Feature  Importance
0      time    0.379224
18    cd420    0.135809
17     cd40    0.072081
20    cd820    0.048432
19     cd80    0.048201
2      wtkg    0.047092
1       age    0.046159
16   offtrt    0.045579
10  preanti    0.045014
6    karnof    0.013226
14  symptom    0.012339
27  strat_3    0.010990
11     race    0.010481
4      homo    0.008881
8       z30    0.007595
12   gender    0.007514
3      hemo    0.007442
21    trt_0    0.007062
15    treat    0.006736
23    trt_2    0.006720
24    trt_3    0.006074
13     str2    0.005293
22    trt_1    0.005273
26  strat_2    0.004736
5     drugs    0.004704
25  strat_1    0.004577
7    oprior    0.002765
9    zprior    0.000000


# Logistic Regression

Perform classification using logistic regression. To penalize less important features in logistic regression, we will use L1 (Lasso) regularization.

We will perform feature scaling for logistic regession. Although logistic regression and neural networks are theoretically not sensitive to feature scaling, feature scaling can help improve convergence speed and performance in practice.

In [32]:
# Feature scaling
from sklearn.preprocessing import StandardScaler

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit the scaler on the training data and transform both the training and testing data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [37]:
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression(penalty='l1', solver='liblinear', max_iter=1000) 
log_reg.fit(X_train_scaled, y_train)

# Make predictions
y_pred = log_reg.predict(X_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print(confusion_matrix(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.8504672897196262
[[311  16]
 [ 48  53]]
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.95      0.91       327
           1       0.77      0.52      0.62       101

    accuracy                           0.85       428
   macro avg       0.82      0.74      0.77       428
weighted avg       0.84      0.85      0.84       428



The coefficient of the treatments are low. 

In [22]:
coef_df = pd.DataFrame({'Feature': X.columns, 'Coefficient': log_reg.coef_[0]})
# Print the DataFrame
coef_df = coef_df.sort_values(by='Coefficient', ascending=False)
print(coef_df)

               Feature  Coefficient
17               cd820     0.283082
7                  z30     0.239190
9              preanti     0.176841
12             symptom     0.171146
28  age_group_above 50     0.155690
6               oprior     0.085557
1                 wtkg     0.071059
22             strat_1     0.057531
26     age_group_31-40     0.050896
3                 homo     0.037335
14                cd40     0.034451
18               trt_0     0.022927
11              gender     0.014025
25     age_group_19-30     0.000000
24             strat_3     0.000000
19               trt_1     0.000000
20               trt_2     0.000000
8               zprior     0.000000
27     age_group_41-50    -0.002103
21               trt_3    -0.055126
16                cd80    -0.061539
4                drugs    -0.077532
23             strat_2    -0.141778
2                 hemo    -0.150307
5               karnof    -0.209532
10                race    -0.315147
15               cd420    -0

# Naive bayes classifier

In [36]:
from sklearn.naive_bayes import GaussianNB

# Initialize and train the Naive Bayes classifier
nb_classifier = GaussianNB()
nb_classifier.fit(X_train, y_train)

# Make predictions
y_pred = nb_classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print(confusion_matrix(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.8084112149532711
[[288  39]
 [ 43  58]]
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.88      0.88       327
           1       0.60      0.57      0.59       101

    accuracy                           0.81       428
   macro avg       0.73      0.73      0.73       428
weighted avg       0.81      0.81      0.81       428



## Subgroup Analysis

In [16]:
# Define age groups based on specified ranges
def get_age_group(age):
    if age <= 18:
        return '0-18'
    elif 19 <= age <= 30:
        return '19-30'
    elif 31 <= age <= 40:
        return '31-40'
    elif 41 <= age <= 50:
        return '41-50'
    else:
        return 'above 50'

# Apply age group function to create a new column 'age_group'
df1['age_group'] = df1['age'].apply(get_age_group)
df1['age_group']

0          41-50
1       above 50
2          41-50
3          41-50
4          41-50
          ...   
2134       19-30
2135        0-18
2136    above 50
2137        0-18
2138       41-50
Name: age_group, Length: 2139, dtype: object

In [17]:
# Define features (X) and target variable (y)
df1['age_group'] = pd.Categorical(df1['age_group'], categories=['0-18', '19-30', '31-40', '41-50', 'above 50'], ordered=True)
df1 = pd.get_dummies(df1, columns=['age_group'], drop_first=True)
X = df1.drop(['cid', 'treat', 'str2', 'age'], axis=1)  # Features (excluding 'age' as it's replaced with 'age_group')
y = df1['cid']  # Target variable

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
best_classifier.fit(X_train, y_train)
y_pred = best_classifier.predict(X_test)

# Evaluate the best model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.8785046728971962
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.96      0.92       327
           1       0.84      0.60      0.70       101

    accuracy                           0.88       428
   macro avg       0.86      0.78      0.81       428
weighted avg       0.88      0.88      0.87       428



In [19]:
importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': best_classifier.feature_importances_})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

print("Feature Importances:")
print(importance_df)

#create the classfication report graph for visualization

Feature Importances:
               Feature  Importance
0                 time    0.384351
15               cd420    0.139519
14                cd40    0.074188
17               cd820    0.051710
1                 wtkg    0.049678
16                cd80    0.049205
9              preanti    0.044468
13              offtrt    0.043255
12             symptom    0.015033
5               karnof    0.014923
24             strat_3    0.011068
10                race    0.011068
3                 homo    0.010093
18               trt_0    0.009274
26     age_group_31-40    0.007952
2                 hemo    0.007816
25     age_group_19-30    0.007782
7                  z30    0.007738
22             strat_1    0.007644
19               trt_1    0.007201
20               trt_2    0.007049
27     age_group_41-50    0.006860
11              gender    0.006739
21               trt_3    0.006398
28  age_group_above 50    0.005650
4                drugs    0.005070
23             strat_2    0.004540