# Feature Engineering & Model Training

### Step 1: Install Necessary Libraries

In [None]:
# Install pandas and scikit-learn if not already installed
%pip install pandas scikit-learn

### Step 2: Import Libraries

In [None]:
import tkinter as tk
from tkinter import filedialog, messagebox
import json
import pandas as pd
import os
import numpy as np

### Step 3: Select the Processed JSON File

In [None]:
selected_file_path = None  # Global to store the confirmed file path

def select_file():
    """Let the user pick a JSON file and preview it in the selector window.

    Opens a file dialog. When a file is chosen and parses as JSON, its path is
    stored in the module-level ``selected_file_path``, the pretty-printed
    content is shown in ``text_widget`` and the "Load File" button is revealed.

    When the dialog is cancelled, or the file cannot be read or parsed,
    ``selected_file_path`` is reset to ``None`` and the "Load File" button is
    hidden, so a stale or unusable path can never be confirmed.
    """
    global selected_file_path
    file_path = filedialog.askopenfilename(
        title="Select a file",
        filetypes=(("JSON Files", "*.json"), ("All Files", "*.*"))
    )

    if not file_path:
        # Dialog cancelled: forget any earlier choice so the stored path
        # agrees with what the window displays.
        selected_file_path = None
        label.config(text="No file selected")
        text_widget.delete(1.0, tk.END)
        button_confirm.pack_forget()
        return

    label.config(text=file_path)
    print(f"Selected File: {file_path}")

    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = json.load(file)
    except (OSError, ValueError) as e:
        # OSError covers unreadable files; ValueError covers both invalid JSON
        # (json.JSONDecodeError) and undecodable bytes (UnicodeDecodeError).
        selected_file_path = None
        text_widget.delete(1.0, tk.END)
        text_widget.insert(tk.END, f"Error reading file: {e}")
        button_confirm.pack_forget()
        return

    selected_file_path = file_path
    text_widget.delete(1.0, tk.END)
    text_widget.insert(tk.END, json.dumps(content, indent=4))
    button_confirm.pack(pady=10)

def confirm_file():
    """Destroy the file-selector window (callback of the "Load File" button)."""
    window.destroy()  # Close GUI

# --- GUI SETUP ---
# Build the file-selector window used by select_file() and confirm_file().
window = tk.Tk()
window.title("File Selector")
window.geometry("700x400+10+20")

# Shows the chosen path; select_file() rewrites its text.
label = tk.Label(window, text="No file selected", width=80)
label.pack(pady=20)

button_select = tk.Button(window, text="Select File", command=select_file)
button_select.pack(pady=10)

# Deliberately not packed here: select_file() shows and hides this button.
button_confirm = tk.Button(window, text="Load File", command=confirm_file)

# Preview area that select_file() fills with the file content or an error.
text_widget = tk.Text(window, width=80, height=10)
text_widget.pack(pady=10)

# Runs the Tk event loop; the code below executes once the window is gone.
window.mainloop()

# --- After GUI closes ---
# Report which path (if any) is left in the module-level selected_file_path.
if selected_file_path:
    print(f"\nConfirmed file path: {selected_file_path}")
else:
    print("\nNo file selected.")

### Step 4: Load Data from Selected File

In [None]:
def load_data(file_path):
    """Read a UTF-8 JSON file and return its content as a DataFrame.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        ``pandas.DataFrame`` built directly from the parsed JSON value.
    """
    with open(file_path, mode="r", encoding="utf-8") as handle:
        parsed = json.load(handle)
    frame = pd.DataFrame(parsed)
    return frame

# Load the file chosen in Step 3 into a DataFrame. The bare `data` expression
# on the last line makes the notebook render the frame as the cell output.
data = load_data(selected_file_path)
data

### Step 5: Hybrid Feature-Label Extraction

#### 5.A Structured Data Feature Extraction
- Manual Selection
- Auto Selection

##### 5.A.1 Define Manual Selection GUI Class

In [None]:
import tkinter as tk
from tkinter import messagebox, ttk

class ManualFeatureLabelSelector:
    """Tk window for hand-picking (label, features) combinations.

    Creating an instance builds the window and runs its event loop, so the
    constructor only returns once the window has been closed. The accepted
    combinations can then be read with :meth:`get_selected_sets`.
    """

    def __init__(self, columns):
        """Build the selection window for ``columns`` and run it."""
        self.columns = columns
        self.selected_sets = []
        self.used_combos = set()

        self.root = tk.Tk()
        self.root.title("Manual Feature-Label Selection")
        self.root.geometry("450x550")
        self.root.resizable(False, False)

        # Sections are created in display order.
        self._build_instructions()
        self._build_label_picker()
        self._build_feature_picker()
        self._build_buttons()

        self.root.protocol("WM_DELETE_WINDOW", self.on_close)
        self.root.mainloop()

    def _build_instructions(self):
        """Add the two-step instruction text at the top of the window."""
        instruction_frame = tk.Frame(self.root)
        instruction_frame.pack(fill=tk.X, padx=20, pady=10)

        instruction = tk.Label(
            instruction_frame,
            text="Step 1: Select a label\nStep 2: Select one or more features",
            font=("Arial", 11),
            anchor="w",
            justify="left",
        )
        instruction.pack(fill=tk.X)

    def _build_label_picker(self):
        """Add the read-only dropdown used to choose the label column."""
        label_frame = tk.Frame(self.root)
        label_frame.pack(pady=5, fill=tk.X, padx=20)

        caption = tk.Label(label_frame, text="Label:", font=("Arial", 10))
        caption.pack(anchor="w")

        self.label_var = tk.StringVar()
        self.label_dropdown = ttk.Combobox(
            label_frame,
            textvariable=self.label_var,
            state="readonly",
            font=("Arial", 10),
        )
        self.label_dropdown.configure(values=self.columns)
        # Choosing a label refreshes the feature list so the label is excluded.
        self.label_dropdown.bind("<<ComboboxSelected>>", self.update_feature_list)
        self.label_dropdown.pack(fill=tk.X, pady=5)

    def _build_feature_picker(self):
        """Add the multi-select listbox that holds the candidate features."""
        feature_frame = tk.Frame(self.root)
        feature_frame.pack(pady=5, fill=tk.BOTH, expand=True, padx=20)

        caption = tk.Label(feature_frame, text="Select Features:", font=("Arial", 10))
        caption.pack(anchor="w")

        self.feature_listbox = tk.Listbox(
            feature_frame,
            selectmode=tk.MULTIPLE,
            exportselection=False,
            height=12,
            font=("Arial", 10),
        )
        self.feature_listbox.pack(fill=tk.BOTH, expand=True, pady=5)

    def _build_buttons(self):
        """Add the "Add Combination" and "Finish & Close" buttons."""
        button_frame = tk.Frame(self.root)
        button_frame.pack(pady=15)

        self.submit_button = tk.Button(
            button_frame,
            text="Add Combination",
            width=18,
            font=("Arial", 10),
            command=self.submit_selection,
        )
        self.submit_button.pack(side=tk.LEFT, padx=10)

        self.finish_button = tk.Button(
            button_frame,
            text="Finish & Close",
            width=18,
            font=("Arial", 10),
            command=self.finish,
        )
        self.finish_button.pack(side=tk.RIGHT, padx=10)

    def update_feature_list(self, event=None):
        """Refill the feature listbox with every column except the chosen label."""
        chosen_label = self.label_var.get()
        self.feature_listbox.delete(0, tk.END)
        remaining = [name for name in self.columns if name != chosen_label]
        for name in remaining:
            self.feature_listbox.insert(tk.END, name)

    def submit_selection(self):
        """Validate the current choice and record it unless it is a duplicate."""
        label = self.label_var.get()
        if not label:
            messagebox.showerror("Error", "Please select a label.")
            return

        picked = self.feature_listbox.curselection()
        if not picked:
            messagebox.showerror("Error", "Please select at least one feature.")
            return

        features = [self.feature_listbox.get(position) for position in picked]
        # The feature set is order-insensitive for duplicate detection.
        combo_key = (label, frozenset(features))

        if combo_key in self.used_combos:
            messagebox.showwarning("Duplicate", "This combination already exists.")
            return

        self.used_combos.add(combo_key)
        self.selected_sets.append({"label": label, "features": features})
        messagebox.showinfo("Added", f"✅ Added:\nLabel = {label}\nFeatures = {', '.join(features)}")

    def finish(self):
        """Ask for confirmation and destroy the window when it is given."""
        confirmed = messagebox.askokcancel("Exit", "Are you sure you want to finish and close?")
        if confirmed:
            self.root.destroy()

    def on_close(self):
        """Handle the window-manager close request the same way as "Finish"."""
        self.finish()

    def get_selected_sets(self):
        """Return the list of accepted ``{"label": ..., "features": [...]}`` dicts."""
        return self.selected_sets

##### 5.A.2 Define Auto Feature-Label Generation

In [None]:
from itertools import combinations

def generate_auto_combinations(columns):
    """Enumerate every label paired with every non-empty subset of the other columns.

    Each column is used as the label in turn; the remaining columns are the
    feature candidates and all of their non-empty subsets are emitted in
    ``itertools.combinations`` order (smallest subsets first). Progress is
    printed for every combination.

    Args:
        columns: List of column names.

    Returns:
        List of ``{"label": <column>, "features": [<column>, ...]}`` dicts.
        The list grows exponentially with the number of columns.
    """
    all_combinations = []
    used_combos = set()

    candidates_by_label = [
        (label, [col for col in columns if col != label]) for label in columns
    ]

    # A label with m candidate features has 2**m - 1 non-empty subsets, so the
    # total can be computed directly instead of materialising every subset
    # list just to take its length.
    total_combinations = sum(
        2 ** len(candidates) - 1 for _, candidates in candidates_by_label
    )

    print(f"\n✅ Auto-generating {total_combinations} combinations.")

    processed_combinations = 0
    for label, features_candidates in candidates_by_label:
        for r in range(1, len(features_candidates) + 1):
            for feature_set in combinations(features_candidates, r):
                # Guards against repeats, which can occur when `columns`
                # itself contains duplicate names.
                combo_key = (label, frozenset(feature_set))
                if combo_key in used_combos:
                    continue
                used_combos.add(combo_key)
                all_combinations.append(
                    {"label": label, "features": list(feature_set)}
                )
                processed_combinations += 1
                print(
                    f"[Combination Processed: {processed_combinations}/{total_combinations}] "
                    f": {{'label': '{label}', 'features': {list(feature_set)}}}"
                )

    print(f"\n✅ Auto-generated {len(all_combinations)} combinations.")
    return all_combinations

##### 5.A.3 Prompt for Manual / Auto Selection Mode

In [None]:
def get_user_choice_window():
    """Ask whether combinations should be picked manually or generated.

    Opens a small Tk window with a "Manual" and an "Auto" button and runs its
    event loop until the window is gone.

    Returns:
        ``True`` if "Manual" was clicked, ``False`` if "Auto" was clicked, and
        ``None`` if the window was closed without clicking either button.
    """
    outcome = {"result": None}

    window = tk.Tk()
    window.title("Feature-Label Selection Mode")
    window.geometry("400x200")
    window.resizable(False, False)

    def set_choice(value):
        # Remember the answer, then destroy the window so mainloop() returns.
        outcome["result"] = value
        window.destroy()

    # Centered question
    question = tk.Label(
        window,
        text="How do you want to select\nlabel and feature combinations?",
        justify="center",
    )
    question.pack(pady=30)

    # Row holding the two answer buttons
    button_row = tk.Frame(window)
    button_row.pack(pady=10)

    manual_btn = tk.Button(
        button_row,
        text="Manual",
        width=12,
        font=("Arial", 10),
        command=lambda: set_choice(True),
    )
    manual_btn.pack(side=tk.LEFT, padx=15)

    auto_btn = tk.Button(
        button_row,
        text="Auto",
        width=12,
        font=("Arial", 10),
        command=lambda: set_choice(False),
    )
    auto_btn.pack(side=tk.RIGHT, padx=15)

    window.mainloop()
    return outcome["result"]

##### 5.A.4 Define Driver Function for Hybrid Selection

In [None]:
def select_feature_label_combinations(df):
    """Run the manual or automatic feature-label selection for ``df``.

    Args:
        df: DataFrame whose column names are offered as labels and features.

    Returns:
        A ``ManualFeatureLabelSelector`` when the user chose "Manual", the list
        from ``generate_auto_combinations`` when the user chose "Auto", and
        ``None`` when the mode window was closed without a choice.

    Raises:
        ValueError: If ``df`` has fewer than two columns.
    """
    column_names = df.columns.tolist()
    if len(column_names) < 2:
        raise ValueError("\n❌ Need at least two columns to form features and label.")

    wants_manual = get_user_choice_window()
    if wants_manual is True:
        return ManualFeatureLabelSelector(column_names)
    if wants_manual is False:
        return generate_auto_combinations(column_names)
    # Window closed without an answer.
    return None

##### 5.A.5 Prompt for Structured Hybrid Feature-Label Selection (Main Pipeline) 

In [None]:
# Structured-data path: let the user choose (or auto-generate) the
# feature-label combinations and list them.
if not selected_file_path:
    print("❌ No file path available to load.")
else:
    print(f"\n📂 Loading: {selected_file_path}")
    print(f"🧠 Data Preview:\n{data.head()}\n")

    feature_label_selector = select_feature_label_combinations(data)
    if feature_label_selector:
        # Manual mode hands back the selector object; auto mode hands back
        # the finished list of combinations.
        feature_label_list = (
            feature_label_selector.get_selected_sets()
            if isinstance(feature_label_selector, ManualFeatureLabelSelector)
            else feature_label_selector
        )

        print("\n\n📦 All selected/generated feature-label set(s):")
        for s in feature_label_list:
            print(s)

#### 5.B Unstructured Data Feature Extraction

In [None]:
# Unstructured-data path: the combination is fixed to predicting "tag" from
# "sentence" instead of being chosen interactively.
if not selected_file_path:
    print("❌ No file path available to load.")
else:
    print(f"\n📂 Loading: {selected_file_path}")
    print(f"🧠 Data Preview:\n{data.head()}\n")

    # Feature-label selection
    feature_label_list = [
        {"label": "tag", "features": ["sentence"]},
    ]
    if feature_label_list:
        print("\n\n📦 All feature-label set(s):")
        for s in feature_label_list:
            print(s)


### Step 6: Split into Train/Test Sets

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from collections import Counter

def split_combinations(df, selected_combinations, test_size=0.2, random_state=42):
    """Build one train/test split per selected label/feature combination.

    For every combination the rows are prepared as follows:

    1. A row whose label value is a list becomes one row per list item.
    2. List-valued feature cells are reduced to their first element (an empty
       list is treated as a missing value).
    3. Object-typed labels are converted to strings.
    4. Missing values in features and labels are filled with the most
       frequent value.
    5. The rows are split with ``train_test_split``, stratified on the label
       when every class occurs at least twice.

    Args:
        df: Source DataFrame.
        selected_combinations: Sequence of dicts with a ``"label"`` column name
            and a ``"features"`` list of column names.
        test_size: Forwarded to ``train_test_split``.
        random_state: Forwarded to ``train_test_split``.

    Returns:
        List of ``(X_train, X_test, y_train, y_test, feature_names)`` tuples,
        one per combination, where the X parts are DataFrames and the y parts
        are Series named after the label column.
    """
    def _first_or_missing(value):
        # An empty list has no first element; mark it missing so the imputer
        # fills it instead of raising IndexError.
        if isinstance(value, list):
            return value[0] if value else float("nan")
        return value

    split_sets = []
    total = len(selected_combinations)

    # Initialize the imputer (re-fitted for every use below)
    imputer = SimpleImputer(strategy='most_frequent')

    for idx, combo in enumerate(selected_combinations, start=1):
        label = combo['label']
        features = combo['features']

        # Step 1: Expand multi-label rows into multiple single-label rows
        expanded_rows = []
        for _, row in df.iterrows():
            label_value = row[label]
            label_items = label_value if isinstance(label_value, list) else [label_value]
            feature_values = {col: row[col] for col in features}
            for item in label_items:
                expanded_rows.append({**feature_values, label: item})

        # Step 2: Reconstruct the DataFrame
        df_expanded = pd.DataFrame(expanded_rows)

        # Step 3: Extract X and y from expanded dataframe
        X = df_expanded[features]
        y = df_expanded[label]

        # Step 4: Flatten list-type values in X (if needed)
        X = X.map(_first_or_missing)

        # Step 5: Keep y as original type unless it's object (category) or list
        if y.dtype == 'object' or y.map(lambda x: isinstance(x, list)).any():
            y = y.astype(str)

        # Optional debug
        print(f"\n🔍 Expanded label sample:\n{y.value_counts().head()}\n")

        # Apply imputer to fill missing values in X and y
        X_imputed = imputer.fit_transform(X)  # Impute features
        y_imputed = imputer.fit_transform(y.values.reshape(-1, 1))  # Impute labels

        # Decide on stratification from the labels that are actually split
        # (after imputation), not from the raw column that may still hold NaN.
        y_labels = y_imputed.ravel()
        class_counts = Counter(y_labels.tolist())
        use_stratify = len(class_counts) > 1 and min(class_counts.values()) >= 2
        if not use_stratify:
            print("⚠️ Skipping stratified split due to low class count")

        # Split into train/test
        try:
            X_train, X_test, y_train, y_test = train_test_split(
                X_imputed, y_imputed,
                test_size=test_size,
                stratify=y_labels if use_stratify else None,
                random_state=random_state
            )
        except ValueError as err:
            if not use_stratify:
                raise
            # Stratification can still be rejected, e.g. when a partition
            # would hold fewer rows than there are classes.
            print(f"⚠️ Stratified split failed ({err}); using an unstratified split instead")
            X_train, X_test, y_train, y_test = train_test_split(
                X_imputed, y_imputed,
                test_size=test_size,
                stratify=None,
                random_state=random_state
            )

        # Convert X_train and X_test back to DataFrames with column names.
        # For mixed-type input the imputer returns an object array;
        # infer_objects() restores numeric dtypes so numeric features are not
        # later treated as categorical text.
        X_train = pd.DataFrame(X_train, columns=X.columns).infer_objects()
        X_test = pd.DataFrame(X_test, columns=X.columns).infer_objects()
        y_train = pd.Series(y_train.ravel(), name=label)
        y_test = pd.Series(y_test.ravel(), name=label)

        # Append the valid split to the result list
        split_sets.append((X_train, X_test, y_train, y_test, X.columns.tolist()))

        print(f"\n[Split {idx}/{total}]\n✅ Predictor: '{label}', Features: {features}")
        print(f"➤ Train size: {len(X_train)}, Test size: {len(X_test)}\n")

    return split_sets

# Run the split function on the loaded data with the combinations chosen in
# Step 5; the result is one (X_train, X_test, y_train, y_test, features)
# tuple per combination.
split_sets = split_combinations(data, feature_label_list)

---

In [None]:
# Print the first rows of every split for a quick visual check.
# The loop variables are ordinary notebook globals, so after the loop they
# remain bound to the last split.
for idx, (X_train, X_test, y_train, y_test, features) in enumerate(split_sets):
    # y_test.name is the label column the Series was named after.
    print(f"\n✂️ Split {idx + 1}: {y_test.name}\n")
    print("X_train:")
    print(X_train.head())
    print("\ny_train:")
    print(y_train.head())
    print("\nX_test:")
    print(X_test.head())
    print("\ny_test:")
    print(y_test.head())

### Step 7: Training Model

<p style="text-decoration: underline; font-size: 18px; font-weight: bold;">
    Evaluation Metrics by Task Type
</p>

<!-- REGRESSION METRICS -->
<p style="color: cyan; font-weight: bold; font-size: 2vw;">📈 Regression Metrics</p>

<ol>
    <li><b>R² Score (Coefficient of Determination)</b>
        <ul>
            <li>Measures how well the model explains the variability of the target variable <code>y</code>.</li>
            <li><code>R² = 1.0</code> → Perfect prediction (explains all variance).</li>
            <li><code>R² = 0</code> → Model predicts no better than the mean.</li>
            <li><code>R² &lt; 0</code> → Model performs worse than a flat line at the mean.</li>
        </ul>
    </li>
    <li>
        <b>Mean Squared Error (MSE)</b>
        <ul>
            <li>The average of the squared differences between predicted and actual values.</li>
            <li>Smaller values indicate better performance.</li>
        </ul>
    </li>
    <li>
        <b>Root Mean Squared Error (RMSE)</b>
        <ul>
            <li>The square root of MSE.</li>
            <li>Error is in the same units as the target variable.</li>
        </ul>
    </li>
    <li>
        <b>Mean Absolute Percentage Error (MAPE)</b>
        <ul>
            <li>Average percentage error between actual and predicted values.</li>
            <li><code>MAPE = 0%</code> means perfect predictions.</li>
        </ul>
    </li>
</ol>

<!-- CLASSIFICATION METRICS -->
<p style="color: pink; font-weight: bold; font-size: 2vw;">🧠 Classification Metrics</p>

<ol>
    <li><b>Accuracy Score</b>
        <ul>
            <li>Percentage of correct predictions out of total predictions.</li>
            <li><span style="color: red; font-weight: bold;">Applicable only to classification tasks.</span></li>
        </ul>
    </li>
    <li>
        <b>Precision</b>
        <ul>
            <li><code>Precision = TP / (TP + FP)</code></li>
            <li>Measures how many predicted positives are actually correct.</li>
        </ul>
    </li>
    <li>
        <b>Recall (Sensitivity)</b>
        <ul>
            <li><code>Recall = TP / (TP + FN)</code></li>
            <li>Measures how many actual positives were correctly predicted.</li>
        </ul>
    </li>
    <li>
        <b>F1 Score</b>
        <ul>
            <li><code>F1 = 2 * (Precision * Recall) / (Precision + Recall)</code></li>
            <li>Harmonic mean of precision and recall.</li>
            <li>Best when you need a balance between precision and recall, especially in imbalanced datasets.</li>
        </ul>
    </li>
    <li>
        <b>Receiver Operating Characteristic - Area Under the Curve (ROC-AUC)</b>
        <ul>
            <li>Measures model's ability to distinguish between classes.</li>
            <li>Closer to 1.0 means better class separation.</li>
            <li><span style="color: red; font-weight: bold;">Applicable only to classification tasks.</span></li>
        </ul>
    </li>
    <li>
        <b>Confidence Score</b>
        <ul>
            <li>Represents the probability or certainty of the model's prediction for each sample.</li>
            <li>Higher confidence means the model is more certain about its prediction.</li>
            <li>For classifiers like KNeighborsClassifier, this is often the proportion of neighbors that voted for the predicted class.</li>
            <li>Useful for thresholding predictions or identifying uncertain cases.</li>
        </ul>
    </li>
    <li>
        <b>Confusion Matrix</b>
        <ul>
            <li>Summarizes the model’s correct and incorrect predictions for each class.</li>
            <li><span style="color: red; font-weight: bold;">Applicable only to classification tasks.</span></li>
        </ul>
        <table border="1" cellpadding="5" style="border-collapse: collapse; margin-top: 10px;">
            <tr>
                <th></th>
                <th>Predicted: Yes</th>
                <th>Predicted: No</th>
            </tr>
            <tr>
                <th>Actual: Yes</th>
                <td>TP (True Positive)</td>
                <td>FN (False Negative)</td>
            </tr>
            <tr>
                <th>Actual: No</th>
                <td>FP (False Positive)</td>
                <td>TN (True Negative)</td>
            </tr>
        </table><br/>
    </li>
</ol>

<hr style="margin-top: 20px;">

<p><b>Summary:</b></p>
<ul>
    <li>Use <span style="color: cyan;"><b>R², MSE, RMSE, MAPE</b></span> for <b>Regression</b> problems.</li>
    <li>Use <span style="color: pink;"><b>Accuracy, Precision, Recall, F1 Score, ROC-AUC, Confidence Score, Confusion Matrix</b></span> for <b>Classification</b> problems.</li>
</ul>


#### 7.1 Generic Training Function

In [None]:
import math
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, confusion_matrix, f1_score, precision_score, recall_score, roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix

def train_and_evaluate_model(model, X_train, X_test, y_train, y_test, task_type='regression', target_column='target'):
    """Fit ``model`` on one train/test split, print and return its metrics.

    The feature frames are cleaned on private copies (list cells become
    strings, all-string columns are lower-cased), so the caller's frames are
    left untouched.

    Regression: targets are coerced to numbers, rows whose target cannot be
    converted are dropped, and features are one-hot encoded with
    ``pd.get_dummies``.

    Classification: targets are label-encoded, test rows whose class was not
    seen in training are dropped, and features are either TF-IDF vectorised
    (single ``'sentence'`` feature predicting ``'tag'``) or one-hot encoded.

    Args:
        model: Unfitted scikit-learn style estimator.
        X_train: Training features (DataFrame).
        X_test: Test features (DataFrame).
        y_train: Training targets (Series).
        y_test: Test targets (Series).
        task_type: ``'regression'`` or ``'classification'``.
        target_column: Name of the predicted column, used in the description
            and for detecting the text-classification case.

    Returns:
        Dict with the fitted model, feature information, encoded data,
        predictions, per-sample confidence, the label encoder, the metrics
        dict and the TF-IDF vectorizer (``None`` when not used).

    Raises:
        ValueError: For an unsupported ``task_type``, when all regression
            targets are NaN, or when no test rows remain after filtering
            unseen classes.
    """
    vectorizer = None
    label_encoder = None
    confidence = None
    numerical_features = None

    # Clean copies rather than the caller's frames.
    X_train = X_train.copy()
    X_test = X_test.copy()

    # --- Preprocessing ---
    for col in X_train.columns:
        if X_train[col].apply(lambda x: isinstance(x, list)).any():
            X_train[col] = X_train[col].apply(lambda x: str(x) if isinstance(x, list) else x)
            X_test[col] = X_test[col].apply(lambda x: str(x) if isinstance(x, list) else x)

    # Apply .str.lower() only to object columns with string values
    for col in X_train.select_dtypes(include='object').columns:
        if X_train[col].apply(lambda x: isinstance(x, str)).all():
            X_train[col] = X_train[col].str.lower()
            X_test[col] = X_test[col].str.lower()

    # Apply lambda to handle list-type targets
    y_train = y_train.apply(lambda x: x[0] if isinstance(x, list) else x)
    y_test = y_test.apply(lambda x: x[0] if isinstance(x, list) else x)

    # --- Task-specific Handling ---
    if task_type == 'regression':
        y_train = pd.to_numeric(y_train, errors='coerce')
        y_test = pd.to_numeric(y_test, errors='coerce')

        if y_train.isna().any() or y_test.isna().any():
            print("⚠️ Warning: Some target values are NaN.")
        if y_train.isna().all() or y_test.isna().all():
            raise ValueError("🚫 All target values are NaN.")

        # Estimators and metrics reject NaN targets, so drop those rows
        # instead of failing later in fit()/r2_score().
        train_ok = y_train.notna().to_numpy()
        test_ok = y_test.notna().to_numpy()
        if not train_ok.all() or not test_ok.all():
            print("⚠️ Rows with NaN targets are excluded from training and evaluation.")
            X_train, y_train = X_train[train_ok], y_train[train_ok]
            X_test, y_test = X_test[test_ok], y_test[test_ok]

        X_train_encoded = pd.get_dummies(X_train)
        X_test_encoded = pd.get_dummies(X_test)
        # Keep exactly the training columns; categories unseen in training are
        # dropped from the test matrix and missing ones are filled with 0.
        X_train_encoded, X_test_encoded = X_train_encoded.align(X_test_encoded, join='left', axis=1, fill_value=0)

        numerical_features = list(X_train.select_dtypes(exclude='object').columns)

    elif task_type == 'classification':
        label_encoder = LabelEncoder()
        y_train = label_encoder.fit_transform(y_train)

        # The encoder cannot transform classes it has not seen, so test rows
        # with such classes are removed.
        y_test_mask = y_test.isin(label_encoder.classes_)
        X_test = X_test[y_test_mask]
        y_test = y_test[y_test_mask]

        if X_test.empty or y_test.empty:
            raise ValueError("🚫 After filtering, X_test or y_test is empty. Cannot evaluate model.")

        y_test = label_encoder.transform(y_test)

        # Text classification is recognised by its fixed schema: the only
        # feature column is 'sentence' and the target column is 'tag'. In that
        # case the sentences are TF-IDF vectorised.
        if len(X_train.columns) == 1 and X_train.columns[0] == 'sentence' and target_column == 'tag':
            vectorizer = TfidfVectorizer()
            X_train_encoded = vectorizer.fit_transform(X_train['sentence'])
            X_test_encoded = vectorizer.transform(X_test['sentence'])

        # otherwise we use get_dummies for categorical features
        else:
            X_train_encoded = pd.get_dummies(X_train)
            X_test_encoded = pd.get_dummies(X_test)
            X_train_encoded, X_test_encoded = X_train_encoded.align(X_test_encoded, join='left', axis=1, fill_value=0)

    else:
        raise ValueError(f"Unsupported task_type: {task_type}")

    # --- Model Training ---
    model.fit(X_train_encoded, y_train)
    y_pred = model.predict(X_test_encoded)

    # --- Metrics ---
    if task_type == 'regression':
        r2 = r2_score(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        rmse = math.sqrt(mse)
        epsilon = 1e-10
        # MAPE is undefined for zero targets, so those samples are left out.
        y_true = np.asarray(y_test, dtype=float)
        nonzero_mask = y_true != 0
        if np.any(nonzero_mask):
            mape = np.mean(np.abs((y_true[nonzero_mask] - y_pred[nonzero_mask]) / (y_true[nonzero_mask] + epsilon))) * 100
        else:
            mape = float('nan')

        print(f"📦 Model ({model.__class__.__name__})")
        print(f"   ➤ R² Score: {r2:.4f}")
        print(f"   ➤ MSE: {mse:.2f}")
        print(f"   ➤ RMSE: {rmse:.2f}")
        print(f"   ➤ MAPE: {mape:.2f}%")
        metrics = {"r2": r2, "mse": mse, "rmse": rmse, "mape": mape}

    elif task_type == 'classification':
        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='weighted')
        precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
        recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)

        # Class probabilities are computed once and reused for both ROC-AUC
        # and the confidence scores.
        probabilities = model.predict_proba(X_test_encoded) if hasattr(model, "predict_proba") else None
        roc_auc = None
        if probabilities is not None:
            try:
                if probabilities.shape[1] == 2:
                    # Binary targets need the 1-D score of the positive
                    # (greater-label) class, not the full two-column matrix.
                    roc_auc = roc_auc_score(y_test, probabilities[:, 1])
                else:
                    roc_auc = roc_auc_score(y_test, probabilities, multi_class='ovr')
            except ValueError:
                roc_auc = None  # e.g. a class is missing from y_test
            confidence = np.max(probabilities, axis=1)
        cm = confusion_matrix(y_test, y_pred)

        print(f"📦 Model ({model.__class__.__name__})")
        print(f"   ➤ Accuracy: {acc:.2f}")
        print(f"   ➤ F1 Score: {f1:.2f}")
        print(f"   ➤ Precision: {precision:.2f}")
        print(f"   ➤ Recall: {recall:.2f}")
        print(f"   ➤ ROC-AUC: {roc_auc:.2f}" if roc_auc is not None else "   ➤ ROC-AUC: N/A")
        print(f"   ➤ Confidence: {np.mean(confidence):.2f}" if confidence is not None else "   ➤ Confidence: N/A")
        print("   ➤ Confusion Matrix:")
        print(cm)
        metrics = {
            "accuracy": acc,
            "f1": f1,
            "precision": precision,
            "recall": recall,
            "roc_auc": roc_auc,
            "confusion_matrix": cm,
            "confidence": confidence
        }

    print("\n📦 Predictions")
    print(y_pred)

    print("\n📦 Encoded y-test")
    print(y_test)

    # Column names of the encoded training matrix: TF-IDF vocabulary terms
    # when a vectorizer was used, dummy-encoded column names otherwise.
    if vectorizer is not None:
        encoded_columns = vectorizer.get_feature_names_out().tolist()
    else:
        encoded_columns = X_train_encoded.columns.tolist()

    return {
        # --- Model Info ---
        "model": model,
        "description": f"{model.__class__.__name__}, predicting {target_column} based on {', '.join(X_train.columns)}",
        "predictor": target_column,

        # --- Feature Info ---
        "features": list(X_train.columns),
        "X_train_columns": encoded_columns,
        "numerical_features": numerical_features,

        # --- Encoded & Raw Data (optional for inspection or downstream use) ---
        "X_train_encoded": X_train_encoded,
        "X_test_encoded": X_test_encoded,
        "X_test": X_test,
        "y_train": y_train,
        "y_test": y_test,

        # --- Predictions ---
        "y_pred": y_pred,
        "confidence": confidence,

        # --- Label Handling ---
        "label_encoder": label_encoder,

        # --- Evaluation Metrics ---
        "metrics": metrics,

        # --- Return the vectorizer for text classification ---
        "vectorizer": vectorizer if task_type == 'classification' else None
    }

#### 7.2 Model Selection GUI Definition

<table>
<tr>
<td colspan="4" style="font-weight: bold; text-align: center; font-size: 2vw; line-height: 0.15vh">Model Selection Justification</td>
</tr>
<tr>
<td style="font-weight: bold; text-align: center;">Name</td>
<td style="font-weight: bold; text-align: center;">Description</td>
<td style="font-weight: bold; text-align: center;">Example of Usage</td>
<td style="font-weight: bold; text-align: center;">Supervised/Unsupervised</td>
</tr>
<tr>
<td style="text-align: center;">Linear Regression</td>
<td style="text-align: justify;">Used to predict numbers based on other factors.</td>
<td style="text-align: justify;">For example, predicting the "Minimum Cost per Day" based on things like District, Vehicle, Travel Time, and Season.</td>
<td style="text-align: center;">Supervised</td>
</tr>
<tr>
<td style="text-align: center;">Decision Tree</td>
<td style="text-align: justify;">A model that makes decisions by asking yes/no questions about the data.</td>
<td style="text-align: justify;">If the District is "Khagrachhari" and the Tourist Spots include "Risang Jhorna", then the Season will be "Winter".</td>
<td style="text-align: center;">Supervised</td>
</tr>
<tr>
<td style="text-align: center;">K-Nearest Neighbors (KNN)</td>
<td style="text-align: justify;">Finds the most similar examples from past data and uses them to make predictions.</td>
<td style="text-align: justify;">To recommend tourist spots that are similar to "Risang Jhorna", based on nearby options in the data.</td>
<td style="text-align: center;">Supervised</td>
</tr>
<tr>
<td style="text-align: center;">Random Forest</td>
<td style="text-align: justify;">Uses many decision trees together to make a better prediction.</td>
<td style="text-align: justify;">For example, one tree might say "Visit Risang Jhorna" because the District is "Khagrachhari" and the Season is "Winter", while another tree might say "Visit Risang Jhorna" based on different factors like the Vehicle being "Car".</td>
<td style="text-align: center;">Supervised</td>
</tr>
</table>

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import is_classifier
from sklearn.base import is_regressor
from math import sqrt

def get_model(model_name, X_train=None):
    """
    Create the estimator that corresponds to a display name.

    For the two K-Nearest Neighbors models the neighbor count is the integer
    square root of the number of training rows (at least 1), or 5 when no
    training data is passed; the chosen count is printed.

    Parameters:
    - model_name (str): Display name of the model.
    - X_train (optional): Training features, only used to size KNN.

    Returns:
    - model: Newly instantiated, unfitted estimator.

    Raises:
    - ValueError: If ``model_name`` is not one of the known names.
    """
    # Factories are wrapped in lambdas so an estimator is only built for the
    # name that actually matches.
    fixed_estimators = (
        ("Linear Regression", lambda: LinearRegression()),
        ("Decision Tree Regressor", lambda: DecisionTreeRegressor()),
        ("Decision Tree Classifier", lambda: DecisionTreeClassifier()),
        ("Random Forest Regressor", lambda: RandomForestRegressor()),
        ("Random Forest Classifier", lambda: RandomForestClassifier()),
    )
    neighbor_estimators = (
        ("K-Nearest Neighbors Regressor", lambda count: KNeighborsRegressor(n_neighbors=count)),
        ("K-Nearest Neighbors Classifier", lambda count: KNeighborsClassifier(n_neighbors=count)),
    )

    for known_name, build in fixed_estimators:
        if model_name == known_name:
            return build()

    for known_name, build in neighbor_estimators:
        if model_name == known_name:
            if X_train is not None:
                k = max(1, int(sqrt(len(X_train))))
            else:
                k = 5
            print(f"Using {k} neighbors for KNN")
            return build(k)

    raise ValueError(f"🚫 Unknown model name: {model_name}")

def select_model_gui(features, label, feature_names=None, label_name=None):
    """
    Show a Tk window for choosing a model that suits the label's type.

    Classifiers are offered when the label Series holds text or categories
    (object, category or string dtype); regressors are offered otherwise.
    The selected features and label are displayed for clarity.

    Parameters:
    - features: List of feature names, a DataFrame (its columns are shown),
      or None to fall back to ``feature_names``.
    - label (pandas.Series): Label values; only its dtype and name are used.
    - feature_names (optional): Names shown when ``features`` is None.
    - label_name (optional): Overrides ``label.name`` in the display.

    Returns:
    - str or None: Display name of the clicked model, or None when the
      window was closed without a selection.
    """

    selected_model = None

    def select_model(model):
        nonlocal selected_model
        selected_model = model
        print(f"\n✅ Selected model: {model}")
        window.quit()
        window.destroy()

    # --- Label type analysis ---
    # is_string_dtype additionally covers pandas' dedicated string dtypes,
    # which compare unequal to 'object'.
    label_is_categorical = (
        label.dtype == 'object'
        or label.dtype.name == 'category'
        or pd.api.types.is_string_dtype(label)
    )

    # --- GUI setup ---
    window = tk.Tk()
    window.title("Model Selector")
    window.geometry("650x600+50+50")

    tk.Label(window, text="🤖 Click to Select a Machine Learning Model", font=("Arial", 14, "bold")).pack(pady=10)

    # --- Display selected features and label ---
    if isinstance(features, list):
        shown_features = features
    elif features is not None:
        shown_features = features.columns
    else:
        shown_features = feature_names or []
    # str() keeps non-string column names (e.g. integers) from breaking join().
    feature_text = ", ".join(str(name) for name in shown_features)

    label_text = label_name or label.name or "Label"

    tk.Label(window, text=f"🎯 Label: {label_text}", font=("Arial", 11), wraplength=600, justify="left", fg="dark green").pack(pady=5)
    tk.Label(window, text=f"📌 Features: {feature_text}", font=("Arial", 11), wraplength=600, justify="left", fg="blue").pack(pady=10)

    # --- Model options ---
    if label_is_categorical:
        models = [
            "Decision Tree Classifier",
            "K-Nearest Neighbors Classifier",
            "Random Forest Classifier"
        ]
    else:
        models = [
            "Linear Regression",
            "Decision Tree Regressor",
            "K-Nearest Neighbors Regressor",
            "Random Forest Regressor"
        ]

    for model_name in models:
        # name=model_name binds the current value; a plain closure would make
        # every button report the last model in the list.
        button = tk.Button(window, text=model_name, width=45, pady=10,
                           command=lambda name=model_name: select_model(name))
        button.pack(pady=5)

    window.mainloop()
    return selected_model


#### 7.3 Model Selection & Training (Main Operation)

In [None]:
# Assuming split_sets and feature_label_list are already defined
# For every split: ask the user for a model, train it and collect the result
# dict returned by train_and_evaluate_model() in trained_models.
trained_models = []

for idx, (X_train, X_test, y_train, y_test, feature_names) in enumerate(split_sets, start=1):
    print(f"\n🔄 Training split {idx}/{len(split_sets)}")

    # Flatten list-type targets if present
    y_train = y_train.apply(lambda x: x[0] if isinstance(x, list) else x)
    y_test = y_test.apply(lambda x: x[0] if isinstance(x, list) else x)

    # If data is still numpy arrays, convert to DataFrame
    if isinstance(X_train, np.ndarray):
        X_train = pd.DataFrame(X_train, columns=feature_names)
        X_test = pd.DataFrame(X_test, columns=feature_names)

    # Try converting to numeric to check if it's suitable for regression
    y_train_numeric = pd.to_numeric(y_train, errors='coerce')
    y_test_numeric = pd.to_numeric(y_test, errors='coerce')

    # NOTE(review): a single convertible value is enough to treat the label as
    # numeric here; confirm that this is the intended threshold.
    label_is_numeric = not y_train_numeric.isna().all()

    # Choose the appropriate label for GUI display and training
    label_for_gui = y_train_numeric if label_is_numeric else y_train

    # Display model selection GUI
    selected_model_name = select_model_gui(features=X_train.columns.tolist(), label=label_for_gui)

    if selected_model_name:
        model = get_model(selected_model_name, X_train)        
        # The task type follows from the estimator that was picked.
        task_type = "classification" if is_classifier(model) else "regression"
        result = train_and_evaluate_model(
            model,
            X_train,
            X_test,
            y_train_numeric if task_type == "regression" else y_train,
            y_test_numeric if task_type == "regression" else y_test,
            task_type=task_type,
            target_column=y_train.name
        )
        trained_models.append(result)
        print(f"\n✅ Model {selected_model_name} trained and evaluated successfully!\n")
    else:
        print("\n🚫 No model was selected for this split.")
        
# NOTE(review): y_train here is whatever the last loop iteration left behind;
# if split_sets is empty the name may not be defined at all.
print(y_train.value_counts())

#### 7.4 Visualization of Models

In [None]:
%pip install matplotlib seaborn wordcloud

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from collections import Counter
import numpy as np
import re
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import pandas as pd

def _collect_test_sentences(result):
    """Return the test sentences stored in *result* as a list of strings.

    Uses the "sentence" column of ``result["X_test"]`` when that is a
    DataFrame containing it; otherwise joins each row of
    ``result["features_matrix"]``. Returns an empty list when neither exists.
    """
    X_test = result.get("X_test")
    if isinstance(X_test, pd.DataFrame) and "sentence" in X_test.columns:
        return X_test["sentence"].astype(str).tolist()

    features_matrix = result.get("features_matrix")
    if features_matrix is None:
        return []
    return [" ".join(map(str, row)) for row in features_matrix]


def _plot_text_classification(result, y_pred, label_encoder, title):
    """Plot a word cloud of the input sentences and the predicted tag counts."""
    feature_name, label_name = "sentence", "tag"
    print(f"Detected text classification task (feature='{feature_name}', label='{label_name}') - Generating WordCloud...")

    # Decode predicted labels
    if label_encoder is not None:
        y_pred_decoded = label_encoder.inverse_transform(y_pred)
    else:
        y_pred_decoded = y_pred

    # WordCloud from input sentence content: alphabetic words of 3+ letters,
    # lower-cased, with English stop words removed
    text_blob = " ".join(_collect_test_sentences(result))
    words = re.findall(r'\b[a-zA-Z]{3,}\b', text_blob.lower())
    word_freq = Counter(word for word in words if word not in ENGLISH_STOP_WORDS)

    if word_freq:
        wordcloud = WordCloud(width=800, height=400, background_color='white',
                              colormap='viridis', max_words=100).generate_from_frequencies(word_freq)

        plt.figure(figsize=(12, 6))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        plt.title(f"WordCloud of Input Sentences - {title}")
        plt.tight_layout()
        plt.show()
    else:
        # generate_from_frequencies cannot draw from an empty mapping, so
        # skip the cloud instead of aborting the remaining plots.
        print(f"No words available for a WordCloud - skipping ({title}).")

    # Bar plot: overall predicted tag distribution
    tag_dist = Counter(y_pred_decoded)
    tag_df = pd.DataFrame(tag_dist.items(), columns=["Tag", "Count"]).sort_values(by="Count", ascending=False)

    plt.figure(figsize=(10, 5))
    sns.barplot(data=tag_df, x="Tag", y="Count", palette="crest")
    plt.title(f"Distribution of Predicted Tags - {title}")
    plt.xlabel("Tag")
    plt.ylabel("Count")
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()


def _plot_confusion_matrix(y_test, y_pred, label_encoder, title):
    """Plot a confusion-matrix heatmap whose ticks match the matrix rows."""
    from sklearn.utils.multiclass import unique_labels

    # The matrix only covers labels that occur in y_test or y_pred, so the
    # tick labels must be derived from that same set; otherwise they shift
    # whenever a class is missing from this split.
    present_labels = unique_labels(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred, labels=present_labels)

    if label_encoder is not None:
        # assumes y_test / y_pred hold encoder-transformed labels, as the
        # text-classification branch does for y_pred - TODO confirm
        class_labels = label_encoder.inverse_transform(present_labels)
    else:
        class_labels = [str(label) for label in present_labels]

    plt.figure(figsize=(14, 10))
    sns.heatmap(cm, xticklabels=class_labels, yticklabels=class_labels,
                annot=True, fmt='d', cmap="YlGnBu", cbar=False)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title(f'Confusion Matrix - {title} Classifier')
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.show()


def _plot_regression_diagnostics(y_test, y_pred, title):
    """Plot predicted-vs-actual, residual and error-distribution charts."""
    residuals = np.array(y_test) - np.array(y_pred)

    # Predicted vs Actual
    plt.figure(figsize=(10, 6))
    sns.scatterplot(x=y_test, y=y_pred, alpha=0.7)
    plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], 'r--', label='Perfect Prediction')
    plt.xlabel("True Values")
    plt.ylabel("Predicted Values")
    plt.title(f"Predicted vs Actual - {title} Regressor")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()

    # Residual Plot
    plt.figure(figsize=(10, 6))
    sns.scatterplot(x=y_pred, y=residuals, alpha=0.6)
    plt.axhline(0, color='red', linestyle='--')
    plt.xlabel("Predicted Values")
    plt.ylabel("Residuals (True - Predicted)")
    plt.title(f"Residual Plot - {title} Regressor")
    plt.grid(True)
    plt.tight_layout()
    plt.show()

    # Error Distribution
    plt.figure(figsize=(10, 6))
    sns.histplot(residuals, kde=True, bins=30)
    plt.title(f"Distribution of Errors - {title} Regressor")
    plt.xlabel("Prediction Error")
    plt.grid(True)
    plt.tight_layout()
    plt.show()


def visualize_model_results(trained_models):
    """Draw diagnostic plots for every result dict in *trained_models*.

    Each result must provide "model", "y_test" and "y_pred"; "predictor",
    "label_encoder", "features", "X_test" and "features_matrix" are read
    when present. Per result, one of three plot sets is produced:

    * classifier whose only feature is "sentence": word cloud of the test
      sentences plus a bar chart of predicted tags;
    * any other classifier: confusion-matrix heatmap;
    * everything else (treated as a regressor): predicted-vs-actual,
      residual and error-distribution plots.

    A model counts as a classifier when it exposes ``predict_proba`` or
    ``classes_``. Returns None; the function only displays figures.
    """
    for result in trained_models:
        model = result["model"]
        y_test = result["y_test"]
        y_pred = result["y_pred"]

        title = result.get("predictor", "Unknown Model")
        label_encoder = result.get("label_encoder", None)

        model_is_classifier = hasattr(model, "predict_proba") or hasattr(model, "classes_")
        if not model_is_classifier:
            _plot_regression_diagnostics(y_test, y_pred, title)
            continue

        features = result.get("features")
        is_text_classification = (
            isinstance(features, list)
            and len(features) == 1
            and features[0].lower() == "sentence"
        )

        if is_text_classification:
            _plot_text_classification(result, y_pred, label_encoder, title)
        else:
            # Structured Classification -> Confusion Matrix
            _plot_confusion_matrix(y_test, y_pred, label_encoder, title)

# Draw the diagnostic plots for every result collected in trained_models
visualize_model_results(trained_models)

#### 7.5 Prediction on Unseen Data

In [None]:
from sklearn.base import ClassifierMixin, RegressorMixin

# --- Prediction Function --- 
def predict_unseen_data(model_index, input_data):
    """Predict a label for one unseen sample with a stored model.

    Args:
        model_index: Index of the result dict to use in ``trained_models``.
        input_data: Raw values for one sample, passed as a single row to
            ``pd.DataFrame`` with the model's original feature names as
            columns (``predict`` supplies a feature-name -> value dict).

    Returns:
        ``(label, confidence_percentage)`` on success, where the confidence
        is rounded to two decimals. ``(None, None)`` when preprocessing or
        prediction fails; the error is printed and shown in a message box.

    Raises:
        IndexError / KeyError: if ``model_index`` is out of range or the
            result dict lacks "model", "features" or "X_train_columns"
            (these lookups happen before the guarded section).
    """
    model_info = trained_models[model_index]
    model = model_info["model"]
    original_features = model_info["features"]
    X_train_encoded_columns = model_info["X_train_columns"]

    try:
        input_df = pd.DataFrame([input_data], columns=original_features)

        # Handle categorical encoding
        categorical_features = [col for col in input_df.columns if input_df[col].dtype == object]
        if categorical_features:
            if (len(categorical_features) == 1 and categorical_features[0] == "sentence" and model_info.get("predictor") == "tag"):
                # If the input is a text classification task, we need to vectorize the text data
                vectorizer = model_info.get("vectorizer")  # Retrieve the fitted vectorizer from the model
                if vectorizer is None:
                    raise ValueError("TF-IDF Vectorizer is not fitted or saved during training.")
                input_encoded = vectorizer.transform(input_df["sentence"]).toarray()  # Convert to dense array
                input_df_encoded = pd.DataFrame(input_encoded, columns=X_train_encoded_columns)
            else:
                input_encoded = pd.get_dummies(input_df, columns=categorical_features)
                input_df_encoded = input_encoded.copy()
        else:
            input_df_encoded = input_df.copy()

        # Match the training column layout: columns absent from the input
        # are added as 0, columns unknown to the model are dropped.
        input_aligned = input_df_encoded.reindex(columns=X_train_encoded_columns, fill_value=0)

        if input_aligned.isnull().values.any() or input_aligned.empty:
            raise ValueError("Input data is empty after preprocessing. Please check the input fields.")

        prediction = model.predict(input_aligned)[0]

        confidence = None
        label = None

        # --- Handle Classifiers ---
        if isinstance(model, ClassifierMixin):
            if hasattr(model, "predict_proba"):
                # Confidence is the probability assigned to the predicted class
                proba = model.predict_proba(input_aligned)[0]
                predicted_class_idx = model.classes_.tolist().index(prediction)
                confidence = proba[predicted_class_idx]
            elif hasattr(model, "kneighbors"):
                distances, _ = model.kneighbors(input_aligned)
                max_distance = np.max(distances)
                confidence = 1.0 - (max_distance / (max_distance + 1e-5))
            else:
                confidence = 0.0  # fallback

            label_encoder = model_info.get("label_encoder")
            if label_encoder is not None:
                label = label_encoder.inverse_transform([prediction])[0]
            else:
                label = str(prediction)

        # --- Handle Regressors ---
        elif isinstance(model, RegressorMixin):
            label = float(prediction)

            # Calculate a pseudo-confidence based on training target spread
            y_train = model_info.get("y_train")
            if y_train is not None and len(y_train) > 1:
                y_min, y_max = np.min(y_train), np.max(y_train)
                if y_max != y_min:
                    normalized_prediction = (prediction - y_min) / (y_max - y_min)
                    confidence = 1.0 - abs(0.5 - normalized_prediction) * 2  # closer to center is more confident
                    confidence = max(0.0, min(1.0, confidence))
                else:
                    confidence = 0.0
            else:
                confidence = 0.0

        else:
            raise ValueError("Unsupported model type.")

        confidence_percentage = round(confidence * 100, 2)
        return label, confidence_percentage

    except Exception as e:
        # Boundary handler for the GUI: report the problem to the user
        # instead of letting the exception escape into the Tk callback.
        error_msg = f"❌ Error while predicting:\n{e}"
        print(error_msg)
        messagebox.showerror("Prediction Error", error_msg)
        return None, None

# --- Tkinter GUI Functions ---
def update_fields_for_model(model_index):
    """Rebuild the feature input form for the model at *model_index*.

    Clears every widget in ``feature_frame`` and the ``input_entries`` list,
    updates ``model_info_label`` with the model's predictor and features,
    adds one labelled entry per feature, and stores the index in
    ``model_var``. Any failure is reported through an error message box.
    """
    try:
        # Remove the form belonging to the previously chosen model
        for child in feature_frame.winfo_children():
            child.destroy()
        input_entries.clear()

        chosen = trained_models[model_index]
        predictor_name = chosen["predictor"]
        feature_names = chosen["features"]

        summary = f"Model to Predict: {predictor_name} (based on {', '.join(feature_names)})"
        model_info_label.config(text=summary)

        # One caption + entry pair per feature, in feature order
        for name in feature_names:
            caption = tk.Label(feature_frame, text=name)
            caption.pack(anchor='w')
            field = tk.Entry(feature_frame, width=30)
            field.pack(anchor='w')
            input_entries.append(field)

        # Remember the selection so predict() knows which model to use
        model_var.set(str(model_index))

    except Exception as err:
        messagebox.showerror("Error", f"An error occurred while updating fields:\n{err}")

def predict():
    """Read the feature entries, run the selected model and show the result.

    The selected model index is read from ``model_var``. Each entry value is
    stripped and converted: values containing "." are parsed as float, the
    rest as int, and anything that fails to parse is kept as a lower-cased
    string. The prediction (or a failure note) is written to ``output_text``.
    Warnings are shown when no model is selected or a field is empty; any
    other failure is reported through an error message box.
    """
    try:
        selection = model_var.get()
        if not selection:
            # model_var stays empty until a model button has been clicked;
            # int('') would otherwise surface as a cryptic "Invalid input".
            messagebox.showwarning("No Model Selected", "Please select a model before predicting.")
            return

        model_index = int(selection)
        input_data = {}

        for feature, entry in zip(trained_models[model_index]["features"], input_entries):
            value = entry.get().strip()
            if value == '':
                messagebox.showwarning("Missing Input", "Please fill all feature fields before predicting.")
                return
            try:
                if '.' in value:
                    input_data[feature] = float(value)
                else:
                    input_data[feature] = int(value)
            except ValueError:
                # Not a number: treat it as a categorical/text value
                input_data[feature] = value.lower()

        prediction, confidence = predict_unseen_data(model_index, input_data)

        output_text.delete("1.0", tk.END)
        if prediction is not None:
            result_msg = f"✅ Prediction: {prediction}\n"
            result_msg += f"🔍 Confidence: {confidence}%"
            output_text.insert(tk.END, result_msg)
        else:
            output_text.insert(tk.END, "❌ Prediction failed. See error messages.")

    except Exception as e:
        messagebox.showerror("Input Error", f"Invalid input:\n{e}")

def predict_with_unseen_input_gui():
    """Open the prediction window and run its event loop until it is closed.

    Builds one selection button per entry in ``trained_models``, an (initially
    empty) feature form, a Predict button and a result text box. The widgets
    the callbacks need are published as module-level globals: ``model_var``,
    ``feature_frame``, ``input_entries``, ``output_text`` and
    ``model_info_label``.
    """
    global model_var, feature_frame, input_entries, output_text, model_info_label

    root = tk.Tk()
    root.title("Predict Unseen Data")

    def close_window():
        # Stop the event loop first, then tear the window down
        root.quit()
        root.destroy()

    root.protocol("WM_DELETE_WINDOW", close_window)

    model_var = tk.StringVar()

    # Headline describing the currently selected model
    model_info_label = tk.Label(
        root,
        text="",
        font=("Arial", 10, "bold"),
        wraplength=400,
        justify="left",
    )
    model_info_label.pack(pady=5)

    button_panel = tk.Frame(root)
    button_panel.pack(pady=5)

    # One button per trained model; the default argument pins the index so
    # each button selects its own model rather than the last loop value
    for position, info in enumerate(trained_models):
        caption = "\n".join((
            f"Model: {info['model'].__class__.__name__}",
            f"Predictor: {info['predictor']}",
            f"Features: {', '.join(info['features'])}",
        ))
        selector = tk.Button(
            button_panel,
            text=caption,
            command=lambda idx=position: update_fields_for_model(idx),
            width=40,
            height=5,
            anchor="w",
            justify="left",
            wraplength=350,
        )
        selector.pack(pady=3)

    # Filled with entry widgets once a model has been selected
    feature_frame = tk.Frame(root)
    feature_frame.pack(pady=10)

    input_entries = []

    predict_button = tk.Button(root, text="🚀 Predict", command=predict)
    predict_button.pack(pady=5)

    output_text = tk.Text(root, height=6, width=60)
    output_text.pack(pady=10)

    root.mainloop()

# 🚀 Launch GUI (this call blocks until the Tk main loop of the window exits)
predict_with_unseen_input_gui()

### Step 8: Export Trained Model(s)

In [None]:
import joblib

# Destination file shared by all export runs
model_file = "models.pkl"

# Start from the previously saved list when the file exists, else from scratch
all_models = joblib.load(model_file) if os.path.exists(model_file) else []

# Append this session's results to whatever was already stored
all_models.extend(trained_models)

# Persist the combined list, then report how many entries it now holds
joblib.dump(all_models, model_file)
total_saved = len(all_models)

print(f"✅ Models saved successfully! Total models now: {total_saved}")

(Optional): Load trained models from `models.pkl`

In [None]:
import joblib
import os

# File path
model_file = "models.pkl"

# Check if models.pkl exists
if os.path.exists(model_file):
    # Load existing models
    # NOTE: joblib.load unpickles the file, which can execute arbitrary code;
    # only load a models.pkl that comes from a trusted source.
    existing_models = joblib.load(model_file)

    # should show: <class 'list'>
    print(type(existing_models))

    # should show: <class 'dict'>
    # (skipped for an empty list, which has no first element to inspect)
    if existing_models:
        print(type(existing_models[0]))

    print(f"Number of Models : {len(existing_models)}")

### - END