In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.impute import SimpleImputer
import seaborn as sns
import matplotlib.pyplot as plt

# Read the source CSV into a DataFrame; everything below works off `data`.
file_path = '/content/dataset.csv'  # Adjust as needed
data = pd.read_csv(filepath_or_buffer=file_path)

# Helper used to coerce feature columns to numeric values (unparseable entries become NaN)
def safe_convert_to_numeric(series):
    """Return *series* converted to numeric values.

    Entries that cannot be parsed as numbers are replaced with NaN
    (``errors='coerce'``) instead of raising, so the result can be handed
    to an imputer afterwards.
    """
    numeric_values = pd.to_numeric(series, errors='coerce')
    return numeric_values

# Separate features/target. The listed columns are excluded from the features;
# errors='ignore' tolerates any of them being absent from this CSV.
X = data.drop(columns=['HEA combinations', 'Phases', 'Unnamed: 20'], errors='ignore')
X = X.apply(safe_convert_to_numeric)
y = data['Phases']

# Rows without a target label cannot be used for supervised training, and
# pd.factorize would encode them as -1, a code with no entry in unique_labels.
labeled_rows = y.notna()
X = X.loc[labeled_rows]
y = y.loc[labeled_rows]

# Encode target variable as integer codes; unique_labels[i] is the original
# label behind code i.
y_mapped, unique_labels = pd.factorize(y)

# Train-test split. This happens BEFORE imputation so that the held-out rows
# cannot influence the statistics used to fill missing values.
X_train, X_test, y_train, y_test = train_test_split(X, y_mapped, test_size=0.2, random_state=42)

# Impute missing values with column means learned from the training rows only,
# then apply the same means to the test rows.
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

# Fully imputed feature matrix, kept for any later code that refers to it.
X_imputed = imputer.transform(X)

# Search space for the Random Forest: 3 * 3 * 3 * 3 * 2 = 162 candidate settings.
param_grid = dict(
    n_estimators=[50, 100, 200],     # number of trees in the forest
    max_depth=[5, 10, None],         # depth cap per tree (None = unlimited)
    min_samples_split=[2, 5, 10],    # samples a node needs before it may split
    min_samples_leaf=[1, 2, 4],      # samples every leaf must keep
    bootstrap=[True, False],         # whether trees are built on bootstrap samples
)

# Base estimator; the fixed seed keeps runs repeatable.
rf_model = RandomForestClassifier(random_state=42)

# Exhaustive grid search scored by accuracy with 5-fold cross-validation,
# using all available cores (n_jobs=-1).
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Keep the refitted winner and report the settings that produced it.
best_rf_model = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)

# Make predictions on the test set
y_pred_rf = best_rf_model.predict(X_test)

# Pin the evaluated label set to every encoded class. Without this, sklearn
# infers the classes from y_test/y_pred, so a class missing from the test
# split would shrink the confusion matrix (misaligning the tick labels) and
# make classification_report reject target_names for having the wrong length.
class_codes = np.arange(len(unique_labels))
# target_names and tick labels must be strings, whatever dtype the labels have.
class_names = [str(label) for label in unique_labels]

# Evaluate performance
accuracy_rf = accuracy_score(y_test, y_pred_rf)
conf_matrix_rf = confusion_matrix(y_test, y_pred_rf, labels=class_codes)

# Plot confusion matrix (rows = true class, columns = predicted class)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_rf, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names)
plt.title('Confusion Matrix - Random Forest')
plt.xlabel('Predicted Phases')
plt.ylabel('True Phases')
plt.show()

# Print accuracy and classification report
print(f'Random Forest Accuracy: {accuracy_rf:.2f}')
print("\nClassification Report:")
# zero_division=0 reports 0 (without a warning) for classes that have no
# true or predicted samples in the test split.
print(classification_report(y_test, y_pred_rf, labels=class_codes,
                            target_names=class_names, zero_division=0))
