# Hack! Template

Please use this template as the starting point for your code.

- Remove any libraries and lines you do not need.


### Load libraries

In [None]:
import networkx as nx
import psutil, time, os, gc, statistics, warnings
import pandas as pd
import numpy as np
import glob
from memory_profiler import memory_usage
from codecarbon import EmissionsTracker
from sklearn.model_selection import train_test_split, GridSearchCV, validation_curve
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt

# Install a global "ignore" filter so no warnings are printed in the notebook
# output. Note that this also hides potentially useful library diagnostics.
warnings.filterwarnings('ignore')

### Functions for normalization and tracking memory usage

In [None]:
# Function to get memory usage
def get_memory_usage():
    """Return the resident set size (RSS) of the current process in MB."""
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    # Two successive divisions convert bytes -> KiB -> MiB.
    return rss_bytes / 1024 / 1024

# Function to normalize metrics
def normalize_metric(values, maximize=True):
    """Min-max normalize a collection of metric values into the range [0, 1].

    Args:
        values: Any iterable of numbers (list, tuple, generator, array, ...).
            It is materialized once, so single-pass iterables are supported.
        maximize: When True, the largest value maps to 1.0 and the smallest
            to 0.0 (higher is better). When False the mapping is reversed
            (lower is better).

    Returns:
        A list with one normalized score per input value. An empty input
        yields an empty list; if all values are equal, every score is 1.0.
    """
    # Materialize first: truth-testing a numpy array raises, and a generator
    # would be exhausted by min() before max() could read it.
    values = list(values)
    if not values:
        return []

    min_val = min(values)
    max_val = max(values)
    if max_val == min_val:
        # No spread to scale by; treat every entry as equally good.
        return [1.0] * len(values)

    span = max_val - min_val
    if maximize:
        return [(x - min_val) / span for x in values]
    return [(max_val - x) / span for x in values]

### Function for plotting the ML output

In [None]:
def plot_validation_curve(estimator, X, y, param_name, param_range, cv=5, title="Validation Curve"):
    """Plot a validation curve for one hyperparameter of ``estimator``.

    Uses ``validation_curve`` to compute cross-validated training and test
    scores for every value in ``param_range``, then plots the mean score of
    each with a shaded band of one standard deviation around it. The figure
    is labeled with a title, axis labels, a legend, and a grid.

    Args:
        estimator: Estimator passed through to ``validation_curve``.
        X: Feature matrix.
        y: Target values.
        param_name: Name of the estimator parameter that is varied.
        param_range: Values to evaluate. Entries may be ``None``; the
            remaining entries are assumed to be numeric so that they can be
            placed on the x-axis.
        cv: Cross-validation setting forwarded to ``validation_curve``.
        title: Title of the figure.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # None has no position on a numeric axis, so it is drawn at this value.
    none_placeholder = 100

    param_values = list(param_range)
    train_scores, test_scores = validation_curve(
        estimator, X, y, param_name=param_name, param_range=param_values, cv=cv
    )

    x_values = np.array(
        [none_placeholder if value is None else value for value in param_values],
        dtype=float,
    )
    # Sort by x so the line does not double back when the placeholder (or an
    # unsorted param_range) puts the points out of order.
    order = np.argsort(x_values)
    x_values = x_values[order]

    # Scores have one row per parameter value and one column per CV fold.
    train_mean = train_scores.mean(axis=1)[order]
    train_std = train_scores.std(axis=1)[order]
    test_mean = test_scores.mean(axis=1)[order]
    test_std = test_scores.std(axis=1)[order]

    plt.figure(figsize=(8, 5))
    plt.plot(x_values, train_mean, marker="o", label="Training score")
    plt.fill_between(x_values, train_mean - train_std, train_mean + train_std, alpha=0.2)
    plt.plot(x_values, test_mean, marker="o", label="Cross-validation score")
    plt.fill_between(x_values, test_mean - test_std, test_mean + test_std, alpha=0.2)

    plt.title(title)
    plt.xlabel(param_name)
    plt.ylabel("Score")
    plt.legend(loc="best")
    plt.grid(True)
    plt.show()


### Functions for running the ML analysis repeatedly

In [None]:
# Function to run analysis with multiple iterations
def run_analysis_with_repeats(data_dir, num_runs=1):
    """Run ``analyze_ml_models`` several times and average its metrics.

    Args:
        data_dir: Directory handed to ``analyze_ml_models`` on every run.
        num_runs: Number of times the analysis is repeated.

    Returns:
        A tuple ``(aggregated_results, (features, targets))`` where
        ``aggregated_results`` is a list with one dict of mean metrics per
        method, and ``features`` / ``targets`` are a DataFrame and a Series
        built from the rows and targets returned by the last run.
    """
    metric_names = (
        "Accuracy",
        "Train Time (s)",
        "Pred Time (s)",
        "Memory (MB)",
        "Emissions (kg CO2eq)",
        "Composite Score",
    )

    per_run = []
    last_rows = None
    last_targets = None

    for _ in range(num_runs):
        gc.collect()  # Clear memory before each run
        outcome, last_rows, last_targets = analyze_ml_models(data_dir)
        per_run.append(outcome)

    # Average each "<method> <metric>" entry across all runs
    aggregated_results = []
    for method in ('LR', 'RF', 'DT'):  # Replace with actual method names used in results
        summary = {"Method": method}
        for metric in metric_names:
            key = f"{method} {metric}"
            summary[metric] = statistics.mean([run[key] for run in per_run])
        aggregated_results.append(summary)

    features = pd.DataFrame(last_rows)
    labels = pd.Series(last_targets)
    return aggregated_results, (features, labels)

# TODO: Students, implement the analyze_ml_models function
# Instructions: This function processes graph data, trains ML models, and evaluates them
# Steps:
# 1. Read graph files from data_dir using networkx, compute onion layers, and create features
# 2. Preprocess data: convert to DataFrame, handle missing/non-finite values, scale features
# 3. Split data into training and test sets (50% split, random_state=33)
# 4. Define models (LogisticRegression, RandomForestClassifier, DecisionTreeClassifier) with GridSearchCV
# 5. Train models, measure training/prediction time, memory usage, and emissions using EmissionsTracker
# 6. Compute accuracy, confusion matrix, and classification report for each model
# 7. Normalize metrics and compute composite score using weights
# 8. Return results dictionary, feature DataFrame, and targets
def analyze_ml_models(data_dir):
    """Scaffold for the graph-feature ML pipeline; the steps are still to be filled in.

    As written, none of the steps below is implemented, so ``data_dir`` is
    not read and the function returns empty containers.

    Args:
        data_dir: Directory that is meant to contain the graph files.

    Returns:
        A tuple ``(results, rows, targets)``: the results dictionary, the
        list of feature rows, and the list of targets.
    """
    rows, targets = [], []

    # Step 1: Read and process graph data
    # Hint: Use nx.read_adjlist and nx.onion_layers
    # Create a feature vector for each graph based on layer sizes (1 to 100)

    # Step 2: Preprocess data
    # Hint: Create DataFrame, handle missing/non-finite values, use StandardScaler

    # Step 3: Split data
    # Hint: Use train_test_split with train_size=0.5, random_state=33

    # Step 4: Define models and parameter grids
    models = [
        # Define LogisticRegression with param_grid for 'C'
        # Define RandomForestClassifier with param_grid for 'n_estimators', 'max_depth', 'min_samples_split'
        # Define DecisionTreeClassifier with param_grid for 'max_depth', 'min_samples_split'
    ]

    # One list of raw measurements per metric, filled while evaluating models
    metric_keys = ('accuracy', 'train_time', 'pred_time', 'memory', 'emissions')
    raw_metrics = {key: [] for key in metric_keys}
    results = {}

    # Step 5-7: Train, evaluate, and compute metrics for each model
    # Hint: Use GridSearchCV, memory_usage, EmissionsTracker, accuracy_score, etc.

    # Step 8: Return results
    return results, rows, targets

### Main function

In [None]:
def main():
    """Run the repeated analysis, report the averaged metrics, and plot validation curves.

    Prints an error and returns early when the ``data`` directory does not
    exist. Otherwise the metrics table is printed, written to
    ``ml_analysis_comparison.csv``, and one validation curve is plotted per
    model.
    """
    data_dir = "data"

    if not os.path.exists(data_dir):
        print(f"Error: {data_dir} directory not found.")
        return

    num_runs = 5  # Set to 1 for single run as in original code
    ml_metrics, dataset = run_analysis_with_repeats(data_dir, num_runs)
    X, y = dataset

    print("\nTable: Machine Learning Performance Comparison")
    ml_df = pd.DataFrame(ml_metrics)
    print(ml_df.round(6))
    ml_df.to_csv("ml_analysis_comparison.csv", index=False)
    print("\nMachine learning results saved to ml_analysis_comparison.csv")

    # TODO: Validation curves
    # modify the following entries to plot validation curves for each model used in the analysis
    # Each entry: (estimator class, constructor kwargs, parameter name, parameter range, title)
    curve_specs = [
        (
            LogisticRegression,
            {'class_weight': 'balanced'},
            'C',
            [0.01, 0.1, 1, 10, 100],
            "Validation Curve for Logistic Regression",
        ),
        (
            RandomForestClassifier,
            {'class_weight': 'balanced', 'random_state': 33},
            'max_depth',
            [None, 5, 10, 15, 20],
            "Validation Curve for Random Forest (max_depth)",
        ),
        (
            DecisionTreeClassifier,
            {'class_weight': 'balanced', 'random_state': 33},
            'max_depth',
            [None, 5, 10, 15, 20],
            "Validation Curve for Decision Tree (max_depth)",
        ),
    ]
    for model_cls, model_kwargs, swept_param, swept_values, curve_title in curve_specs:
        # Build each estimator right before it is plotted
        plot_validation_curve(
            model_cls(**model_kwargs),
            X,
            y,
            param_name=swept_param,
            param_range=swept_values,
            title=curve_title,
        )
    

# Run the analysis only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()