## Random forest

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Load the training data
df = pd.read_csv('train_hw03.csv')

# Number of consecutive measurement columns that are averaged into one region
COLUMNS_PER_REGION = 115

# Everything from the fifth column onward is treated as activity measurements
# (assumes the first four columns hold metadata such as age/sex/y -- TODO confirm)
activity = df.iloc[:, 4:]

# Calculate the mean activity for each brain region: column i belongs to
# region i // COLUMNS_PER_REGION. Grouping the transposed frame gives the same
# result as the deprecated `groupby(..., axis=1)`.
region_ids = np.arange(activity.shape[1]) // COLUMNS_PER_REGION
brain_region_means = activity.T.groupby(region_ids).mean().T

# Give the region columns string labels: scikit-learn refuses DataFrames whose
# column names mix ints (region ids) and strings ('age', 'sex_*').
brain_region_means.columns = [f'region_{i}' for i in brain_region_means.columns]

# One-hot encode the 'sex' column
sex_onehot = pd.get_dummies(df['sex'], prefix='sex')

# Assemble the feature matrix: age, region means, then the one-hot columns
processed_data = pd.concat([df[['age']], brain_region_means, sex_onehot], axis=1)

# Separate features and target
X = processed_data
y = df['y']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Work on explicit copies so the assignments below do not write into views of X
X_train = X_train.copy()
X_test = X_test.copy()

# Normalize the data (excluding the one-hot encoded 'sex' columns).
# The scaler is fitted on the training split only and then applied to the test split.
onehot_cols = set(sex_onehot.columns)
scaled_cols = [col for col in X.columns if col not in onehot_cols]
scaler = StandardScaler()
X_train[scaled_cols] = scaler.fit_transform(X_train[scaled_cols])
X_test[scaled_cols] = scaler.transform(X_test[scaled_cols])

# Train a Random Forest classifier
model = RandomForestClassifier(n_estimators=1000, random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))




              precision    recall  f1-score   support

      autism       0.33      0.27      0.30        49
     control       0.56      0.63      0.59        71

    accuracy                           0.48       120
   macro avg       0.44      0.45      0.44       120
weighted avg       0.46      0.48      0.47       120





## Random forest (best parameters)

In [2]:
from sklearn.model_selection import GridSearchCV

# Define the hyperparameters to optimize
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Initialize a RandomForestClassifier. The seed is fixed so that every
# candidate is evaluated with the same randomness and the search is
# reproducible; it matches the seed used for the final model.
rf = RandomForestClassifier(random_state=42)

# Initialize GridSearchCV (3-fold cross-validation, all available cores)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1)

# Fit GridSearchCV
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_

# GridSearchCV refits the best configuration on the whole training set by
# default (refit=True), so the tuned model is already available; with the
# seed set on `rf` this equals RandomForestClassifier(**best_params, random_state=42).
model = grid_search.best_estimator_

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))




              precision    recall  f1-score   support

      autism       0.45      0.37      0.40        49
     control       0.61      0.69      0.65        71

    accuracy                           0.56       120
   macro avg       0.53      0.53      0.53       120
weighted avg       0.55      0.56      0.55       120





In [3]:
# Display the hyperparameter combination stored from grid_search.best_params_
best_params

{'bootstrap': True,
 'max_depth': None,
 'min_samples_leaf': 4,
 'min_samples_split': 10,
 'n_estimators': 100}

## XGBoost

In [None]:
# One-time setup (run once, then restart the kernel): %pip install xgboost==1.7.5

Collecting xgboost
  Downloading xgboost-1.7.5-py3-none-win_amd64.whl (70.9 MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.7.5
Note: you may need to restart the kernel to use updated packages.


In [7]:
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier  # imported here so the cell runs on a fresh kernel

# XGBoost needs integer class labels, so the string labels are encoded.
le = LabelEncoder()
le.fit(y)

# Encode into a NEW variable instead of overwriting y_train / y_test: the
# other cells keep seeing the original string labels and this cell can be
# re-run safely. The labels are looked up in `y` through the training rows'
# index, so the result does not depend on y_train having been re-encoded
# by some other cell.
y_train_enc = le.transform(y.loc[X_train.index])

# Train the XGBoost model on the encoded labels
model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
model.fit(X_train, y_train_enc)

# Make predictions; they are numeric, so map them back to the original
# class names for the classification report
y_pred = le.inverse_transform(model.predict(X_test))

# Evaluate the model
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

      autism       0.47      0.51      0.49        49
     control       0.64      0.61      0.62        71

    accuracy                           0.57       120
   macro avg       0.56      0.56      0.56       120
weighted avg       0.57      0.57      0.57       120



## XGBoost (best parameters)

In [10]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

# XGBoost needs integer class labels, so the string labels are encoded.
le = LabelEncoder()
le.fit(y)

# Encode into a NEW variable instead of overwriting y_train / y_test: the
# other cells keep seeing the original string labels and this cell can be
# re-run safely. The labels are looked up in `y` through the training rows'
# index, so the result does not depend on y_train having been re-encoded
# by some other cell.
y_train_enc = le.transform(y.loc[X_train.index])

# Define the hyperparameters to optimize
param_grid = {
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [100, 200, 500],
    'max_depth': [3, 5, 7, 10],
    'subsample': [0.5, 0.7, 1.0],
    'colsample_bytree': [0.5, 0.7, 1.0]
}

# Initialize XGBoost classifier
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

# Initialize GridSearchCV (3-fold cross-validation, all available cores)
grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=3, n_jobs=-1)

# Fit GridSearchCV
grid_search.fit(X_train, y_train_enc)

# Get the best parameters
best_params = grid_search.best_params_

# GridSearchCV refits the best configuration on the whole training set by
# default (refit=True), so no second training run is needed.
model = grid_search.best_estimator_

# Make predictions; they are numeric, so map them back to the original
# class names for the classification report
y_pred = le.inverse_transform(model.predict(X_test))

# Evaluate the model
print(classification_report(y_test, y_pred))




              precision    recall  f1-score   support

      autism       0.49      0.51      0.50        49
     control       0.65      0.63      0.64        71

    accuracy                           0.58       120
   macro avg       0.57      0.57      0.57       120
weighted avg       0.59      0.58      0.58       120

