# Darwin Dataset

## Optimization and Baseline Model Creation

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Step 1: Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np

# Step 2: Load the dataset
file_path = r"/content/data.csv"
data = pd.read_csv(file_path)

# Step 3: Basic data exploration
print("Dataset Info:")
print(data.info())
print("\nFirst 5 rows of the dataset:")
print(data.head())

# Step 4: Handling missing values
# Checking for missing values
missing_values = data.isnull().sum()

# Display columns with missing values
print("\nColumns with missing values:")
print(missing_values[missing_values > 0])

# Strategy 1: If a column has more than 50% missing values, drop it
threshold = 0.5 * len(data)  # 50% of the row count
data = data.dropna(axis=1, thresh=threshold)

# Strategy 2: For numerical columns, fill missing values with the mean
numerical_columns = data.select_dtypes(include=[np.number]).columns
for column in numerical_columns:
    data[column] = data[column].fillna(data[column].mean())

# Strategy 3: For categorical columns, fill missing values with the mode
categorical_columns = data.select_dtypes(include=['object']).columns
for column in categorical_columns:
    data[column] = data[column].fillna(data[column].mode()[0])

# Verify if missing values are handled
print("\nMissing values after handling:")
print(data.isnull().sum().sum())

print("Contents of the 'class' column:")
print(data['class'].unique())

# Step 5: Separate features and target variable
X = data.drop(['ID', 'class'], axis=1)  # Drop non-numeric and target columns
y = data['class']

# Encode target variable if it's categorical
le = LabelEncoder()
y = le.fit_transform(y)

# Step 6: Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 7: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Step 8: Build a baseline model (Random Forest for example)
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Step 9: Make predictions
y_pred = model.predict(X_test)

# Step 10: Evaluate the model
accuracy_baseline = accuracy_score(y_test, y_pred)
print(f"\nAccuracy: {accuracy_baseline * 100:.2f}%")

# Detailed classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Feature Importance (Optional)
importances = model.feature_importances_
feature_importance = pd.Series(importances, index=data.drop(['ID', 'class'], axis=1).columns)
print("\nTop 10 Important Features:")
print(feature_importance.sort_values(ascending=False).head(10))

# Step 11: Store model performance for comparison
# Define baseline model's performance metrics for comparison
baseline_r2 = accuracy_baseline  # Using accuracy in place of R2 score for classification tasks
baseline_features = len(model.feature_importances_)

model_performances = {
    'Baseline': {'R2 Score': baseline_r2, 'Number of Features': baseline_features},
}

# Step 12: Print model performances
print("\nModel Performances:")
for model_name, performance in model_performances.items():
    print(f"{model_name}: R2 Score = {performance['R2 Score']}, Number of Features = {performance['Number of Features']}")


Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 174 entries, 0 to 173
Columns: 452 entries, ID to class
dtypes: float64(300), int64(150), object(2)
memory usage: 614.6+ KB
None

First 5 rows of the dataset:
     ID  air_time1  disp_index1  gmrt_in_air1  gmrt_on_paper1  \
0  id_1       5160     0.000013    120.804174       86.853334   
1  id_2      51980     0.000016    115.318238       83.448681   
2  id_3       2600     0.000010    229.933997      172.761858   
3  id_4       2130     0.000010    369.403342      183.193104   
4  id_5       2310     0.000007    257.997131      111.275889   

   max_x_extension1  max_y_extension1  mean_acc_in_air1  mean_acc_on_paper1  \
0               957              6601          0.361800            0.217459   
1              1694              6998          0.272513            0.144880   
2              2333              5802          0.387020            0.181342   
3              1756              8159          0.556879            0.1

In [5]:
print(data['class'].unique())

['P' 'H']


## Applying Stochastic Diffusion Search (SDS)

In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from random import sample

# Load dataset
file_path = r"/content/data.csv"
data = pd.read_csv(file_path)

# Handle missing values (similar to previous steps)
numerical_columns = data.select_dtypes(include=[np.number]).columns
for column in numerical_columns:
    data[column] = data[column].fillna(data[column].mean())

categorical_columns = data.select_dtypes(include=['object']).columns
for column in categorical_columns:
    data[column] = data[column].fillna(data[column].mode()[0])

# Separate features and target variable
X = data.drop(['ID', 'class'], axis=1)  # Adjust columns as needed
y = data['class']

# Encode target variable if necessary
le = LabelEncoder()
y = le.fit_transform(y)

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# SDS Algorithm Parameters
n_agents = 20  # Number of agents
iterations = 50  # Number of iterations
population_size = X_scaled.shape[1]  # Number of features
n_selected_features = 10  # Number of features each agent selects
success_threshold = 0.75  # Success threshold for feature selection

# Initialize agents (each agent selects a random subset of features)
agents = [sample(range(population_size), n_selected_features) for _ in range(n_agents)]

def evaluate_agent(agent_features):
    """Train a model using the features selected by the agent and return its accuracy."""
    X_selected = X_scaled[:, agent_features]
    X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)

# Stochastic Diffusion Search process
for iteration in range(iterations):
    print(f"Iteration {iteration + 1}/{iterations}")
    agent_accuracies = []

    # Evaluate each agent's hypothesis (selected features)
    for i, agent in enumerate(agents):
        accuracy = evaluate_agent(agent)
        agent_accuracies.append(accuracy)

    # Communication phase: Diffusion of information between agents
    for i in range(n_agents):
        if agent_accuracies[i] < success_threshold:
            # The agent was not successful, so it copies a better agent's hypothesis
            better_agent_index = np.argmax(agent_accuracies)
            agents[i] = agents[better_agent_index]

# After iterations, the best agent is selected based on accuracy
best_agent_index = np.argmax(agent_accuracies)
best_agent_features = agents[best_agent_index]
print(f"\nBest selected features by SDS (Indices): {best_agent_features}")
print(f"Feature names: {X.columns[best_agent_features]}")

# Train a final model with the selected features
X_final_selected = X_scaled[:, best_agent_features]
X_train, X_test, y_train, y_test = train_test_split(X_final_selected, y, test_size=0.2, random_state=42)

final_model = RandomForestClassifier(random_state=42)
final_model.fit(X_train, y_train)
y_pred = final_model.predict(X_test)

# Evaluate final model performance
sds_r2 = accuracy_score(y_test, y_pred)  # Use accuracy as classification metric
sds_features = len(best_agent_features)  # Number of selected features by the best agent

# Define the model performance dictionary for R2 Score and Number of Features
sds_performance = {
    'R2 Score': sds_r2,
    'Number of Features': sds_features
}

print(f"\nFinal Model Accuracy after SDS: {sds_r2 * 100:.2f}%")
print(f"Number of Features Selected by SDS: {sds_features}")

# Print classification report and confusion matrix if needed
from sklearn.metrics import classification_report, confusion_matrix
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Print SDS performance metrics
print("\nSDS Model Performance:")
print(sds_performance)


Iteration 1/50
Iteration 2/50
Iteration 3/50
Iteration 4/50
Iteration 5/50
Iteration 6/50
Iteration 7/50
Iteration 8/50
Iteration 9/50
Iteration 10/50
Iteration 11/50
Iteration 12/50
Iteration 13/50
Iteration 14/50
Iteration 15/50
Iteration 16/50
Iteration 17/50
Iteration 18/50
Iteration 19/50
Iteration 20/50
Iteration 21/50
Iteration 22/50
Iteration 23/50
Iteration 24/50
Iteration 25/50
Iteration 26/50
Iteration 27/50
Iteration 28/50
Iteration 29/50
Iteration 30/50
Iteration 31/50
Iteration 32/50
Iteration 33/50
Iteration 34/50
Iteration 35/50
Iteration 36/50
Iteration 37/50
Iteration 38/50
Iteration 39/50
Iteration 40/50
Iteration 41/50
Iteration 42/50
Iteration 43/50
Iteration 44/50
Iteration 45/50
Iteration 46/50
Iteration 47/50
Iteration 48/50
Iteration 49/50
Iteration 50/50

Best selected features by SDS (Indices): [55, 308, 229, 178, 270, 417, 343, 440, 322, 25]
Feature names: Index(['disp_index4', 'gmrt_in_air18', 'num_of_pendown13', 'pressure_var10',
       'air_time16', 'gmrt

## Applying Principal Component Analysis (PCA)

In [7]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, confusion_matrix, classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import numpy as np

# Load the dataset
file_path = r"/content/data.csv"
data = pd.read_csv(file_path)

# Separate features and target variable
target_column = 'class'  # Replace 'class' with the actual target column name
X = data.drop(columns=[target_column])  # Features
y = data[target_column]  # Target

# Drop non-numeric columns in features (e.g., 'ID')
X_numeric = X.select_dtypes(include=[np.number])

# Standardize the numeric data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_numeric)

# Apply PCA
pca = PCA()
pca.fit(X_scaled)

# Calculate cumulative explained variance
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
pca_features = np.argmax(cumulative_variance >= 0.95) + 1  # Selecting features that explain 95% of variance

# Applying PCA with the optimal number of components
pca_optimal = PCA(n_components=pca_features)
X_pca = pca_optimal.fit_transform(X_scaled)

# Calculate R² score
pca_r2 = r2_score(X_scaled, pca_optimal.inverse_transform(X_pca))

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.3, random_state=42)

# Classification model (Random Forest)
classifier = RandomForestClassifier(random_state=42)
classifier.fit(X_train, y_train)

# Predictions
y_pred = classifier.predict(X_test)

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Calculate and store accuracy
accuracy = accuracy_score(y_test, y_pred)
accuracy_variable = accuracy  # Store in a variable for future use

# Output results
print("R² Score (pca_r2):", pca_r2)
print("Number of Features (pca_features):", pca_features)
print("Accuracy (Stored in Variable): {:.2f}%".format(accuracy_variable * 100))
print("Confusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", report)


R² Score (pca_r2): 0.95052617856783
Number of Features (pca_features): 101
Accuracy (Stored in Variable): 84.91%
Confusion Matrix:
 [[21  7]
 [ 1 24]]

Classification Report:
               precision    recall  f1-score   support

           H       0.95      0.75      0.84        28
           P       0.77      0.96      0.86        25

    accuracy                           0.85        53
   macro avg       0.86      0.85      0.85        53
weighted avg       0.87      0.85      0.85        53



## Applying Recursive Feature Elimination (RFE)

In [8]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load dataset
file_path = r"/content/data.csv"
data = pd.read_csv(file_path)

# Handle missing values (as per previous method)
numerical_columns = data.select_dtypes(include=[np.number]).columns
for column in numerical_columns:
    data[column] = data[column].fillna(data[column].mean())

categorical_columns = data.select_dtypes(include=['object']).columns
for column in categorical_columns:
    data[column] = data[column].fillna(data[column].mode()[0])

# Separate features and target variable
X = data.drop(['ID', 'class'], axis=1, errors='ignore')  # Adjust columns as needed
y = data['class']

# Encode target variable
le = LabelEncoder()
y = le.fit_transform(y)

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Create a baseline model (Random Forest Classifier)
baseline_model = RandomForestClassifier(random_state=42)

# Use RFE to recursively eliminate features
n_features_to_select = 15  # Adjust number of features to select
rfe = RFE(estimator=baseline_model, n_features_to_select=n_features_to_select)
rfe.fit(X_train, y_train)

# Get the selected features
selected_features = np.where(rfe.support_)[0]
print(f"Selected feature indices: {selected_features}")

# Use selected features for training and testing
X_train_selected = rfe.transform(X_train)
X_test_selected = rfe.transform(X_test)

# Hyperparameter tuning using GridSearchCV for RandomForestClassifier
param_grid = {
    'n_estimators': [100, 200, 300],       # Number of trees in the forest
    'max_depth': [10, 20, 30, None],       # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],       # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],         # Minimum number of samples required to be at a leaf node
    'bootstrap': [True, False]             # Whether bootstrap samples are used when building trees
}

# Initialize RandomForestClassifier
rf = RandomForestClassifier(random_state=42)

# Use GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2, scoring='accuracy')
grid_search.fit(X_train_selected, y_train)

# Best hyperparameters
best_params = grid_search.best_params_
print(f"Best Hyperparameters: {best_params}")

# Train the final model with the best hyperparameters
final_model = RandomForestClassifier(random_state=42, **best_params)
final_model.fit(X_train_selected, y_train)
y_pred = final_model.predict(X_test_selected)

# Final model accuracy (RFEfinal_r2_score)
rfe_r2 = accuracy_score(y_test, y_pred)
print(f"\nFinal Model Accuracy after RFE and Hyperparameter Tuning (R2 Score equivalent): {rfe_r2 * 100:.2f}%")

# Number of selected features
rfe_features = len(selected_features)
print(f"Number of Features Selected by RFE: {rfe_features}")

# Classification report and confusion matrix
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Define R2 Score and Number of Features
results = {'R2 Score': rfe_r2, 'Number of Features': rfe_features}
print("\nResults:", results)


Selected feature indices: [ 90 107 125 161 216 233 252 269 302 305 337 340 392 396 413]
Fitting 5 folds for each of 216 candidates, totalling 1080 fits
Best Hyperparameters: {'bootstrap': True, 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}

Final Model Accuracy after RFE and Hyperparameter Tuning (R2 Score equivalent): 85.71%
Number of Features Selected by RFE: 15

Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.93      0.85        15
           1       0.94      0.80      0.86        20

    accuracy                           0.86        35
   macro avg       0.86      0.87      0.86        35
weighted avg       0.87      0.86      0.86        35


Confusion Matrix:
[[14  1]
 [ 4 16]]

Results: {'R2 Score': 0.8571428571428571, 'Number of Features': 15}


## Applying Filter Method - Mutual Information

In [9]:
pip install mlxtend




In [10]:
from sklearn.feature_selection import mutual_info_classif
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Mutual Information Feature Selection (Filter Method)
mi_scores = mutual_info_classif(X_scaled, y, random_state=42)
mi_scores_series = pd.Series(mi_scores, index=data.drop(['ID', 'class'], axis=1).columns)

# Rank features by Mutual Information scores
mi_scores_sorted = mi_scores_series.sort_values(ascending=False)
print("Top 10 Features by Mutual Information:\n", mi_scores_sorted.head(10))

# Select the top N features based on MI scores
N = 20  # Select top 20 features
top_features = mi_scores_sorted.head(N).index
X_top_features = data[top_features]

# Split the dataset using selected features
X_train, X_test, y_train, y_test = train_test_split(X_top_features, y, test_size=0.2, random_state=42)

# Train and test a Random Forest classifier with selected features
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluate model performance
mi_accuracy = accuracy_score(y_test, y_pred)  # Updated variable name to 'mi_accuracy' for consistency
print(f"\nAccuracy with Top {N} Features (MI): {mi_accuracy * 100:.2f}%")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Top 10 Features by Mutual Information:
 max_x_extension19        0.254768
mean_speed_on_paper10    0.253806
total_time15             0.246109
total_time9              0.242358
total_time23             0.237621
paper_time23             0.237520
total_time25             0.231919
total_time17             0.231053
air_time19               0.221133
air_time23               0.219521
dtype: float64

Accuracy with Top 20 Features (MI): 91.43%

Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.93      0.90        15
           1       0.95      0.90      0.92        20

    accuracy                           0.91        35
   macro avg       0.91      0.92      0.91        35
weighted avg       0.92      0.91      0.91        35


Confusion Matrix:
[[14  1]
 [ 2 18]]


## Model Performance Comparison and Visualization of Models

In [11]:
import plotly.graph_objects as go

# Create dictionary for model performances
model_performances = {
    'Baseline': {'Accuracy': accuracy_baseline, 'Number of Features': baseline_features},
    'SDS': {'Accuracy': sds_r2, 'Number of Features': sds_features},
    'PCA': {'Accuracy': accuracy_variable, 'Number of Features': pca_features},
    'RFE': {'Accuracy': rfe_r2, 'Number of Features': rfe_features},

}

# Create the figure
fig = go.Figure()

# Add bar chart for Accuracy (left y-axis)
fig.add_trace(go.Bar(
    x=list(model_performances.keys()),  # model names on x-axis
    y=[model_performances[model]['Accuracy'] * 100 for model in model_performances],  # Accuracy in percentage
    name="Accuracy (%)",
    marker_color=["#FF7F0E", "#1F77B4", "#2CA02C", "#D62728"]  # Distinct colors
))

# Add line chart for Number of Features (right y-axis)

fig.add_trace(go.Scatter(
    x=list(model_performances.keys()),  # model names on x-axis
    y=[model_performances[model]['Number of Features'] for model in model_performances],  # Feature counts on y-axis
    name="Number of Features",
    yaxis='y2',
    mode='lines+markers'
))

# Layout adjustments
fig.update_layout(
    title="Comparison of Model Performances",
    xaxis=dict(title='Model'),
    yaxis=dict(
        title='Accuracy (%)',
        range=[0, 100]  # Set range for Accuracy in percentage
    ),
    yaxis2=dict(
        title='Number of Features',
        overlaying='y',
        side="right",
        showgrid=False  # Hide secondary grid
    ),
    legend=dict(
        x=1.03,  # Legend outside the graph
        y=1.3,
        xanchor="left",
        yanchor="top",
        bordercolor="Black",
        borderwidth=1
    ),
    template='plotly_white',  # Clean white background
    margin=dict(r=200)  # Adjust right margin for legend
)

# Show plot
fig.show()


In [12]:
import plotly.graph_objects as go

# Create dictionary for model performances
model_performances = {
    'Baseline': {'Accuracy': accuracy_baseline, 'Number of Features': baseline_features},
    'SDS': {'Accuracy': sds_r2, 'Number of Features': sds_features},          # Keeping original variable names
    'PCA': {'Accuracy': accuracy_variable, 'Number of Features': pca_features},
    'RFE': {'Accuracy': rfe_r2, 'Number of Features': rfe_features},
    'MIFS': {'Accuracy': mi_accuracy, 'Number of Features': len(top_features)}  # Mutual Information performance
}

# Create the figure
fig = go.Figure()

# Add bar chart for Accuracy (left y-axis)
fig.add_trace(go.Bar(
    x=list(model_performances.keys()),  # Model names on x-axis
    y=[model_performances[model]['Accuracy'] * 100 for model in model_performances],  # Accuracy in percentage
    name="Accuracy (%)",
    marker_color=["#FF7F0E", "#1F77B4", "#2CA02C", "#D62728", "#9467BD"]  # Distinct colors
))

# Add line chart for Number of Features (right y-axis)
fig.add_trace(go.Scatter(
    x=list(model_performances.keys()),  # Model names on x-axis
    y=[model_performances[model]['Number of Features'] for model in model_performances],  # Feature counts on y-axis
    name="Number of Features",
    yaxis='y2',
    mode='lines+markers'
))

# Layout adjustments
fig.update_layout(
    title="Comparison of Model Performances",
    xaxis=dict(title='Model'),
    yaxis=dict(
        title='Accuracy (%)',
        range=[0, 100]  # Set range for Accuracy in percentage
    ),
    yaxis2=dict(
        title='Number of Features',
        overlaying='y',
        side="right",
        showgrid=False  # Hide secondary grid
    ),
    legend=dict(
        x=1.03,  # Legend outside the graph
        y=1.3,
        xanchor="left",
        yanchor="top",
        bordercolor="Black",
        borderwidth=1
    ),
    template='plotly_white',  # Clean white background
    margin=dict(r=200)  # Adjust right margin for legend
)

# Show plot
fig.show()
