In [None]:
# 📌 Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor

In [None]:
# 📌 Load the Dataset
# NOTE(review): this is an absolute, machine-specific location; the notebook
# only runs as-is where this exact file exists.
file_path = "C:/Users/Dell/Desktop/classificationml/dataset_excavate.xlsx - Sheet 1.csv"

# Read the CSV export into the working DataFrame used by every later cell.
data = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# 📌 Step 1: Data Cleaning - Remove missing values
# Drop every row (axis=0) that has at least one missing cell (how="any");
# these are the defaults, spelled out so the policy is visible.
data = data.dropna(axis=0, how="any")

In [None]:
# 📌 Step 2: Filter Only Insulators (Eg > 0.5 eV)
# Keep only rows whose PBE band gap exceeds 0.5. The explicit .copy() makes
# the result an independent DataFrame: a later cell assigns encoded columns
# back into `data`, and doing that on the bare result of a boolean-mask
# selection triggers pandas' SettingWithCopyWarning.
data = data.loc[data["PBE band gap"] > 0.5].copy()

In [None]:
# 📌 Step 3: Encode Categorical Features
# Fitted encoders are collected here, keyed by column name, so that later
# cells can translate raw labels with the same mapping.
label_encoders = dict()

# Columns that hold labels rather than numbers and therefore need encoding.
categorical_columns = [
    "functional group",
    "A",
    "A'",
    "Bi",
    "B'",
]

In [None]:
# Fit one LabelEncoder per categorical column and overwrite the column with
# its integer codes. After this cell `data[col]` no longer contains the
# original labels; they are only recoverable through `label_encoders[col]`.
for col in categorical_columns:
    fitted_encoder = LabelEncoder().fit(data[col])
    label_encoders[col] = fitted_encoder  # Keep the encoder for later use
    data[col] = fitted_encoder.transform(data[col])

In [None]:
# 📌 Step 4: Define Inputs (X) & Output (Y) for Regression
# Target first: the PBE band gap column (the plot labels give its unit as eV).
y = data["PBE band gap"]

# Features: every remaining column of the cleaned, encoded frame.
X = data.drop("PBE band gap", axis=1)

In [None]:
# 📌 Step 5: Split Data into Training (80%) & Testing (20%)
# A fixed random_state makes the shuffled split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

In [None]:
# 📌 Step 6: Normalize Numerical Features
# Every feature column that was not label-encoded is treated as numerical,
# in the order it appears in X.
numerical_columns = [
    feature_name
    for feature_name in X.columns
    if feature_name not in categorical_columns
]

# Unfitted here; a later cell fits it on the training split only.
scaler = StandardScaler()

In [None]:
# Apply StandardScaler
# train_test_split hands back row selections of X; assigning columns into
# them directly raises pandas' SettingWithCopyWarning. Take independent
# copies first so the in-place column replacement below is unambiguous.
X_train = X_train.copy()
X_test = X_test.copy()

# Fit on the training split only, then reuse the fitted statistics for the
# test split so that no information from the test rows influences scaling.
X_train[numerical_columns] = scaler.fit_transform(X_train[numerical_columns])
X_test[numerical_columns] = scaler.transform(X_test[numerical_columns])

In [None]:
# 📌 Step 7: Hyperparameter Tuning for XGBRegressor
# Candidate values for each hyperparameter; the randomized search in the
# next cell samples combinations from these lists.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[5, 10, 15],
    learning_rate=[0.01, 0.05, 0.1],
    subsample=[0.7, 0.8, 0.9],
    colsample_bytree=[0.7, 0.8, 1.0],
)

In [None]:
# Randomized search: 10 sampled parameter combinations, each scored with
# 5-fold cross-validation on negative MSE, using all available cores.
# random_state is set on the search itself as well as on the estimator;
# without it the sampled combinations (and therefore the selected model and
# every downstream metric) change from one run to the next.
random_search = RandomizedSearchCV(
    estimator=XGBRegressor(random_state=42),
    param_distributions=param_grid,
    n_iter=10,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    random_state=42,
)
random_search.fit(X_train, y_train)

In [None]:
# 📌 Step 8: Train the Best Model
# Report the winning parameter combination found by the search.
print("🔹 Best Hyperparameters:", random_search.best_params_)

# The search exposes the estimator built with those parameters; later cells
# use it under the name `regressor`.
regressor = random_search.best_estimator_

In [None]:
# 📌 Step 9: Make Predictions
# Predict the target for the held-out test rows (X_test has already had its
# numerical columns scaled with the scaler fitted on the training split).
y_pred = regressor.predict(X_test)

In [None]:
# 📌 Step 10: Evaluate Model Performance
# Both metrics compare the held-out targets with the model's predictions.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)

In [None]:
# Show both test-set metrics, one per line, rounded to four decimals.
print(
    "\n".join(
        [
            f"\n🔹 Model Mean Squared Error (MSE): {mse:.4f}",
            f"🔹 Model R² Score: {r2:.4f} (Higher is better, max = 1)",
        ]
    )
)

In [None]:
# 📌 Step 11: Scatter Plot of Actual vs. Predicted Values
fig, ax = plt.subplots(figsize=(8, 6))

# One point per test row: x is the true value, y is the model's prediction.
ax.scatter(y_test, y_pred, color="blue", alpha=0.5)

# Dashed diagonal spanning the range of the true values: points on this line
# are perfect predictions.
diagonal_ends = [min(y_test), max(y_test)]
ax.plot(diagonal_ends, diagonal_ends, color="red", linestyle="--")

ax.set_xlabel("Actual Band Gap (eV)")
ax.set_ylabel("Predicted Band Gap (eV)")
ax.set_title("Actual vs. Predicted Band Gap")
plt.show()

In [None]:
# ------------------------ USER INPUT REGRESSION ------------------------ #
def _encode_user_category(col, raw_value):
    """Translate one typed category into the integer code used for training.

    The typed text is compared with the labels that the column's LabelEncoder
    was fitted on: first exactly as typed, then in Title Case (the notebook's
    original normalisation), and finally ignoring case. If nothing matches, a
    warning is printed and the most frequent code of that column in ``data``
    is used instead.

    Args:
        col: Name of the categorical column being filled in.
        raw_value: The user's input with surrounding whitespace removed.

    Returns:
        The integer code for the matched (or fallback) category.
    """
    encoder = label_encoders[col]

    # Index the fitted labels by their text form so that membership tests are
    # plain string lookups regardless of the dtype of ``classes_``.
    labels_by_text = {str(label): label for label in encoder.classes_}
    labels_by_folded = {}
    for text, label in labels_by_text.items():
        # If two labels differ only by case, the first one wins.
        labels_by_folded.setdefault(text.strip().lower(), label)

    # Exact and Title Case spellings take priority so that anything the
    # previous implementation accepted still resolves to the same label.
    for spelling in (raw_value, raw_value.title()):
        if spelling in labels_by_text:
            return encoder.transform([labels_by_text[spelling]])[0]

    # Case-insensitive match: forcing Title Case alone would turn an input
    # like "OH" into "Oh" and make such labels impossible to select.
    folded = raw_value.lower()
    if folded in labels_by_folded:
        return encoder.transform([labels_by_folded[folded]])[0]

    print(f"⚠️ Warning: {raw_value} is not in the dataset. Assigning most common category.")
    # The encoding cell overwrote data[col] with integer codes, so the mode of
    # the column already IS the model-ready code. It must not be appended to
    # ``classes_`` or passed through ``transform`` again: that would corrupt
    # the fitted encoder and fail on the label/code type mismatch.
    return data[col].mode()[0]


def _read_numeric_feature(col):
    """Prompt for ``col`` until the user enters text that ``float()`` accepts."""
    while True:
        try:
            return float(input(f"Enter value for {col}: "))
        except ValueError:
            # A typo should not discard everything entered so far.
            print(f"⚠️ Warning: could not read a number for {col}. Please try again.")


def predict_band_gap():
    """Interactively collect one material's features and print its predicted band gap.

    Prompts for every column in ``categorical_columns`` and then every column
    in ``numerical_columns``, arranges the answers in the column order of
    ``X``, scales the numerical columns with the already-fitted ``scaler`` and
    prints the prediction of ``regressor``.

    Relies on the notebook-level names ``categorical_columns``,
    ``numerical_columns``, ``label_encoders``, ``data``, ``X``, ``scaler`` and
    ``regressor`` having been created by the earlier cells.

    Returns:
        None. The prediction is printed, not returned.
    """
    print("\n🔹 Enter Material Properties to Predict Band Gap (Eg in eV):")

    user_data = {}

    # Categorical features: typed label -> integer code.
    for col in categorical_columns:
        typed = input(f"Enter value for {col}: ").strip()
        user_data[col] = _encode_user_category(col, typed)

    # Numerical features: typed text -> float.
    for col in numerical_columns:
        user_data[col] = _read_numeric_feature(col)

    # Single-row frame laid out in the training column order; any training
    # column that was not prompted for is filled with 0.
    user_df = pd.DataFrame([user_data]).reindex(columns=X.columns, fill_value=0)

    # Apply the same scaling the model saw during training.
    user_df[numerical_columns] = scaler.transform(user_df[numerical_columns])

    prediction = regressor.predict(user_df)[0]
    print(f"\n🔹 Predicted Band Gap (Eg): **{prediction:.4f} eV**")

In [None]:
# Run user input function
# This call blocks on input() prompts, so the cell needs an interactive
# session; it prints the predicted band gap once all values are entered.
predict_band_gap()