In [2]:
import pandas as pd
import os

# Load the file (works for both Excel and CSV)
file_path = 'anxiety_dataset.xlsx'  # Replace with your file path
# For Excel
if file_path.endswith('.xlsx') or file_path.endswith('.xls'):
    df = pd.read_excel(file_path)
# For CSV
elif file_path.endswith('.csv'):
    df = pd.read_csv(file_path)
else:
    raise ValueError("File format not supported. Please use Excel (.xlsx, .xls) or CSV (.csv).")

# Print original columns and shape
print("Original columns:", df.columns.tolist())
print("Original shape:", df.shape)

# Create a list of columns to remove
columns_to_remove = []

# Check if 'Unnamed: 0' column exists
if 'Unnamed: 0' in df.columns:
    columns_to_remove.append('Unnamed: 0')

# Check if 'text' column exists
if 'text' in df.columns:
    columns_to_remove.append('text')

# Remove the specified columns if they exist
if columns_to_remove:
    df = df.drop(columns=columns_to_remove)
    print(f"Removed columns: {columns_to_remove}")
else:
    print("None of the specified columns found in the dataset.")

# Print remaining columns and shape
print("Remaining columns:", df.columns.tolist())
print("New shape:", df.shape)

# Display the first few rows to verify
print("\nFirst 5 rows after removing columns:")
print(df.head())

# Save the updated dataset
output_file = 'updated_' + os.path.basename(file_path)
if file_path.endswith('.xlsx') or file_path.endswith('.xls'):
    df.to_excel(output_file, index=False)
else:  # CSV
    df.to_csv(output_file, index=False)

print(f"\nUpdated dataset saved to {output_file}")

Original columns: ['Unnamed: 0', 'text', 'anxiety', 'all_labels', 'levels', 'Urdu_text']
Original shape: (12000, 6)
Removed columns: ['Unnamed: 0', 'text']
Remaining columns: ['anxiety', 'all_labels', 'levels', 'Urdu_text']
New shape: (12000, 4)

First 5 rows after removing columns:
   anxiety all_labels      levels  \
0        0     Normal  No Anxiety   
1        0     Normal  No Anxiety   
2        0     Normal  No Anxiety   
3        0     Normal  No Anxiety   
4        0     Normal  No Anxiety   

                                           Urdu_text  
0           مجھے آپ کا ٹویٹر نہیں ملا، جواب خراب ہے۔  
1  سب کو شب بخیر اور جیرڈ، کبھی بھی کائٹ پف میں چ...  
2  ایپل میوزک فروخت ہو رہا ہے۔ 3 ماہ کی ایکٹیویشن...  
3  تقریب ابھی تک شروع نہیں ہوئی۔ محترمہ، صرف ذکر ...  
4             میں واقعی امید کرتا ہوں کہ سردی نہ ہو۔  

Updated dataset saved to updated_anxiety_dataset.xlsx


**updated code**







 code without hyperparameter tunning


In [None]:
"""
Urdu Anxiety Level Prediction Model - Training and Prediction
-------------------------------------------------------------
This script performs the essential steps to train and test an anxiety prediction model:
1. Data preprocessing and cleaning for Urdu text
2. Feature extraction using TF-IDF
3. Model training and evaluation
4. User input prediction for anxiety, all_labels, and levels
"""

import re
import csv
import pandas as pd
import numpy as np
import joblib
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from datetime import datetime

# NLP and ML libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
import lightgbm as lgb

# ===================================================================================================
# PART 1: DATA LOADING AND EXPLORATION
# ===================================================================================================
def load_data(filepath):
    """
    Load and explore the dataset
    """
    print("Starting to load data from:", filepath)

    try:
        # Load data - Excel doesn't need the encoding parameter
        df = pd.read_excel(filepath)

        # Basic dataset information
        print(f"Dataset shape: {df.shape}")
        print("\nColumn information:")
        print(df.info())

        # Check for missing values
        print("\nMissing values:")
        print(df.isnull().sum())

        # Target variable distribution
        print("\nDistribution of anxiety labels:")
        print(df['anxiety'].value_counts())
        print("\nDistribution of all_labels:")
        print(df['all_labels'].value_counts())
        print("\nDistribution of levels:")
        print(df['levels'].value_counts())

        return df

    except Exception as e:
        print(f"Error loading data: {str(e)}")
        print("Trying with engine='openpyxl'...")
        try:
            # Try with openpyxl engine
            df = pd.read_excel(filepath, engine='openpyxl')
            print("Successfully loaded with openpyxl engine!")

            # Basic dataset information
            print(f"Dataset shape: {df.shape}")
            print("\nColumn information:")
            print(df.info())

            # Check for missing values
            print("\nMissing values:")
            print(df.isnull().sum())

            # Target variable distribution
            print("\nDistribution of anxiety labels:")
            print(df['anxiety'].value_counts())
            print("\nDistribution of all_labels:")
            print(df['all_labels'].value_counts())
            print("\nDistribution of levels:")
            print(df['levels'].value_counts())

            return df

        except Exception as e2:
            print(f"Error loading with openpyxl: {str(e2)}")
            print("Please check your file path and ensure the file is a valid Excel file.")
            raise

# ===================================================================================================
# PART 2: TEXT PREPROCESSING FOR URDU
# ===================================================================================================
class UrduTextPreprocessor:
    def __init__(self):
        # Common Urdu stopwords
        self.urdu_stopwords = set([
            'کے', 'کا', 'کی', 'میں', 'سے', 'اور', 'ہے', 'کو', 'نے', 'پر', 'ہیں', 'کہ',
            'تھا', 'تھی', 'تھے', 'ہوں', 'ہوا', 'ہوئی', 'ہوئے', 'تو', 'اس', 'وہ', 'ان',
            'تھیں', 'یہ', 'رہا', 'رہی', 'رہے', 'ہوتا', 'ہوتی', 'ہوتے', 'گا', 'گی', 'گے',
            'کر', 'کرے', 'کرتا', 'کرتی', 'کرتے', 'کیا', 'ہوگا', 'ہوگی', 'ہوگے', 'ہوگیا',
            'ہوگئی', 'ہوگئے', 'ہوگئیں', 'بھی', 'جو', 'لیے', 'بہت', 'پھر', 'گیا', 'گئی',
            'گئے', 'گئیں', 'وغیرہ', 'والا', 'والی', 'والے', 'مگر', 'لیکن', 'جب', 'تب',
            'اب', 'اگر', 'تاکہ', 'جبکہ'
        ])

        # Basic normalization mapping for chat/slang
        self.normalization_map = {
            'kya': 'کیا',
            'hai': 'ہے',
            'nhi': 'نہیں',
            'mjy': 'مجھے',
            'ap': 'آپ',
            'aap': 'آپ',
            'tha': 'تھا',
            'ho': 'ہو',
            'yr': 'یار',
            'maine': 'میں نے',
            'mene': 'میں نے',
            # Add more as needed
        }

        # Simple lemmatization mapping (just a few examples, would need a proper Urdu lemmatizer)
        self.lemma_map = {
            'چلی': 'چل',
            'چلیں': 'چل',
            'چلا': 'چل',
            'جاتا': 'جا',
            'جاتی': 'جا',
            'جاتے': 'جا',
            'کرتا': 'کر',
            'کرتی': 'کر',
            'کرتے': 'کر',
            'کرنا': 'کر',
            'کرنی': 'کر',
            'کرنے': 'کر',
            # Add more as needed
        }

    def remove_urls(self, text):
        """Remove URLs from text"""
        url_pattern = re.compile(r'https?://\S+|www\.\S+')
        return url_pattern.sub(r'', text)

    def remove_emojis(self, text):
        """Remove emojis from text"""
        emoji_pattern = re.compile(
            "["
            u"\U0001F600-\U0001F64F"  # emoticons
            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
            u"\U0001F680-\U0001F6FF"  # transport & map symbols
            u"\U0001F700-\U0001F77F"  # alchemical symbols
            u"\U0001F780-\U0001F7FF"  # Geometric Shapes
            u"\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
            u"\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
            u"\U0001FA00-\U0001FA6F"  # Chess Symbols
            u"\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
            u"\U00002702-\U000027B0"  # Dingbats
            u"\U000024C2-\U0001F251"
            "]+", flags=re.UNICODE
        )
        return emoji_pattern.sub(r'', text)

    def remove_punctuation_and_numbers(self, text):
        """Remove punctuation and numbers"""
        # Remove Latin and Arabic/Urdu punctuation and numbers
        text = re.sub(r'[.,;:!\'\"()[\]{}@#$%^&*+_=|<>/?\d]+', ' ', text)
        text = re.sub(r'[،؛؟٬٫٪؍؎''‛""„‟]+', ' ', text)  # Urdu/Arabic punctuation
        text = re.sub(r'[۰-۹]+', ' ', text)  # Urdu/Persian numbers
        text = re.sub(r'[٠-٩]+', ' ', text)  # Arabic numbers
        return text

    def tokenize(self, text):
        """Tokenize Urdu text into words"""
        # Simple space-based tokenization, can be enhanced with a proper Urdu tokenizer
        return text.split()

    def normalize_text(self, text):
        """Normalize chat/slang Urdu"""
        words = text.split()
        normalized = []
        for word in words:
            word_lower = word.lower()
            if word_lower in self.normalization_map:
                normalized.append(self.normalization_map[word_lower])
            else:
                normalized.append(word)
        return ' '.join(normalized)

    def remove_stopwords(self, text):
        """Remove Urdu stopwords"""
        return ' '.join([word for word in text.split() if word not in self.urdu_stopwords])

    def lemmatize(self, text):
        """Simple lemmatization for Urdu based on mapping"""
        words = text.split()
        lemmatized = []
        for word in words:
            if word in self.lemma_map:
                lemmatized.append(self.lemma_map[word])
            else:
                lemmatized.append(word)
        return ' '.join(lemmatized)

    def preprocess(self, text):
        """Complete preprocessing pipeline for Urdu text"""
        if pd.isna(text):
            return ""

        text = str(text)
        text = self.remove_urls(text)
        text = self.remove_emojis(text)
        text = self.remove_punctuation_and_numbers(text)
        text = self.normalize_text(text)
        text = self.remove_stopwords(text)
        text = self.lemmatize(text)
        # Remove extra spaces
        text = re.sub(r'\s+', ' ', text).strip()
        return text

# ===================================================================================================
# PART 3: FEATURE ENGINEERING AND DATA PREPARATION
# ===================================================================================================
def prepare_data(df, preprocessor):
    """
    Prepare data for modeling:
    1. Preprocess text
    2. Encode labels
    3. Split data
    """
    print("Preparing data for modeling...")

    # Preprocess Urdu text
    print("Preprocessing Urdu text...")
    df['cleaned_text'] = df['Urdu_text'].apply(preprocessor.preprocess)

    # Check if we have empty strings after preprocessing
    empty_rows = df[df['cleaned_text'] == ''].shape[0]
    print(f"Rows with empty text after preprocessing: {empty_rows}")

    # Remove rows with empty text if any
    if empty_rows > 0:
        df = df[df['cleaned_text'] != ''].reset_index(drop=True)
        print(f"Removed {empty_rows} rows with empty text.")

    # Create TF-IDF features
    print("Creating TF-IDF features...")
    tfidf_vectorizer = TfidfVectorizer(
        min_df=2,             # Minimum document frequency
        max_df=0.95,          # Maximum document frequency
        ngram_range=(1, 2),   # Use unigrams and bigrams
        max_features=5000     # Limit features to 5000
    )
    X = tfidf_vectorizer.fit_transform(df['cleaned_text'])
    print(f"TF-IDF features shape: {X.shape}")

    # Encode labels
    print("Encoding labels...")
    # For binary classification (anxiety)
    y_binary = df['anxiety'].values

    # For multi-class classification (all_labels and levels)
    print("Checking class frequencies in all_labels...")
    all_labels_counts = df['all_labels'].value_counts()
    print(all_labels_counts)

    # Filter out rare classes with only 1 instance for all_labels
    rare_classes = all_labels_counts[all_labels_counts == 1].index.tolist()
    if rare_classes:
        print(f"Found rare classes with only 1 instance: {rare_classes}")
        print("Filtering out rare classes for all_labels before encoding")
        df_filtered_all_labels = df[~df['all_labels'].isin(rare_classes)].copy()
        print(f"Removed {len(df) - len(df_filtered_all_labels)} rows with rare classes")
    else:
        df_filtered_all_labels = df.copy()

    label_encoder_all_labels = LabelEncoder()
    y_all_labels = label_encoder_all_labels.fit_transform(df_filtered_all_labels['all_labels'])

    print("Checking class frequencies in levels...")
    levels_counts = df['levels'].value_counts()
    print(levels_counts)

    # Filter out rare classes with only 1 instance for levels
    rare_levels = levels_counts[levels_counts == 1].index.tolist()
    if rare_levels:
        print(f"Found rare levels with only 1 instance: {rare_levels}")
        print("Filtering out rare levels before encoding")
        df_filtered_levels = df[~df['levels'].isin(rare_levels)].copy()
        print(f"Removed {len(df) - len(df_filtered_levels)} rows with rare levels")
    else:
        df_filtered_levels = df.copy()

    label_encoder_levels = LabelEncoder()
    y_levels = label_encoder_levels.fit_transform(df_filtered_levels['levels'])

    print(f"Binary target distribution: {Counter(y_binary)}")
    print(f"All labels target distribution: {Counter(y_all_labels)}")
    print(f"Levels target distribution: {Counter(y_levels)}")

    # Split data: 70% training, 20% testing, 10% validation for binary classification
    print("\nSplitting data for binary classification...")
    X_temp, X_test, y_binary_temp, y_binary_test = train_test_split(
        X, y_binary, test_size=0.2, random_state=42, stratify=y_binary
    )

    X_train, X_val, y_binary_train, y_binary_val = train_test_split(
        X_temp, y_binary_temp, test_size=0.125, random_state=42, stratify=y_binary_temp
    )

    # For all_labels classification, use the filtered dataset
    print("\nSplitting data for all_labels classification...")
    if len(df) != len(df_filtered_all_labels):
        # Use indices to extract the proper subset of X for all_labels
        all_labels_indices = df_filtered_all_labels.index
        X_all_labels = X[all_labels_indices]

        X_all_labels_temp, X_all_labels_test, y_all_labels_temp, y_all_labels_test = train_test_split(
            X_all_labels, y_all_labels, test_size=0.2, random_state=42, stratify=y_all_labels
        )

        X_all_labels_train, X_all_labels_val, y_all_labels_train, y_all_labels_val = train_test_split(
            X_all_labels_temp, y_all_labels_temp, test_size=0.125, random_state=42, stratify=y_all_labels_temp
        )
    else:
        # If no filtering was done, use the same splits as binary
        X_all_labels_temp, X_all_labels_test, y_all_labels_temp, y_all_labels_test = train_test_split(
            X, y_all_labels, test_size=0.2, random_state=42, stratify=y_all_labels
        )

        X_all_labels_train, X_all_labels_val, y_all_labels_train, y_all_labels_val = train_test_split(
            X_all_labels_temp, y_all_labels_temp, test_size=0.125, random_state=42, stratify=y_all_labels_temp
        )

    # For levels classification, use the filtered dataset
    print("\nSplitting data for levels classification...")
    if len(df) != len(df_filtered_levels):
        # Use indices to extract the proper subset of X for levels
        levels_indices = df_filtered_levels.index
        X_levels = X[levels_indices]

        X_levels_temp, X_levels_test, y_levels_temp, y_levels_test = train_test_split(
            X_levels, y_levels, test_size=0.2, random_state=42, stratify=y_levels
        )

        X_levels_train, X_levels_val, y_levels_train, y_levels_val = train_test_split(
            X_levels_temp, y_levels_temp, test_size=0.125, random_state=42, stratify=y_levels_temp
        )
    else:
        # If no filtering was done, use the same splits as binary
        X_levels_temp, X_levels_test, y_levels_temp, y_levels_test = train_test_split(
            X, y_levels, test_size=0.2, random_state=42, stratify=y_levels
        )

        X_levels_train, X_levels_val, y_levels_train, y_levels_val = train_test_split(
            X_levels_temp, y_levels_temp, test_size=0.125, random_state=42, stratify=y_levels_temp
        )

    print(f"Binary - Train set size: {X_train.shape[0]}")
    print(f"Binary - Validation set size: {X_val.shape[0]}")
    print(f"Binary - Test set size: {X_test.shape[0]}")

    print(f"All Labels - Train set size: {X_all_labels_train.shape[0]}")
    print(f"All Labels - Validation set size: {X_all_labels_val.shape[0]}")
    print(f"All Labels - Test set size: {X_all_labels_test.shape[0]}")

    print(f"Levels - Train set size: {X_levels_train.shape[0]}")
    print(f"Levels - Validation set size: {X_levels_val.shape[0]}")
    print(f"Levels - Test set size: {X_levels_test.shape[0]}")

    targets = {
        'binary': {
            'train': y_binary_train,
            'val': y_binary_val,
            'test': y_binary_test
        },
        'all_labels': {
            'train': y_all_labels_train,
            'val': y_all_labels_val,
            'test': y_all_labels_test
        },
        'levels': {
            'train': y_levels_train,
            'val': y_levels_val,
            'test': y_levels_test
        }
    }

    encoders = {
        'tfidf': tfidf_vectorizer,
        'all_labels': label_encoder_all_labels,
        'levels': label_encoder_levels
    }

    data_splits = {
        'binary': {
            'X_train': X_train,
            'X_val': X_val,
            'X_test': X_test
        },
        'all_labels': {
            'X_train': X_all_labels_train,
            'X_val': X_all_labels_val,
            'X_test': X_all_labels_test
        },
        'levels': {
            'X_train': X_levels_train,
            'X_val': X_levels_val,
            'X_test': X_levels_test
        }
    }

    # Store original label values for interpretation
    original_labels = {
        'all_labels': df_filtered_all_labels['all_labels'].unique(),
        'levels': df_filtered_levels['levels'].unique()
    }

    return data_splits, targets, encoders, original_labels

# ===================================================================================================
# PART 4: MODEL TRAINING AND EVALUATION
# ===================================================================================================
def train_evaluate_models(data_splits, targets, target_type='binary'):
    """
    Train and evaluate multiple classifiers for the specified target type
    """
    print(f"\nTraining and evaluating models for {target_type} classification...")

    # Get the appropriate data splits for the target type
    X_train = data_splits[target_type]['X_train']
    X_val = data_splits[target_type]['X_val']
    X_test = data_splits[target_type]['X_test']

    y_train = targets[target_type]['train']
    y_val = targets[target_type]['val']
    y_test = targets[target_type]['test']

    # Define the models to be trained
    models = {
        'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
        'Multinomial NB': MultinomialNB(),
        'Support Vector Machine': SVC(kernel='linear', probability=True, random_state=42),
        'K-Nearest Neighbors': KNeighborsClassifier(n_neighbors=5),
        'XGBoost': xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
        'LightGBM': lgb.LGBMClassifier(random_state=42)
    }

    results = {}

    for name, model in models.items():
        print(f"\nTraining {name}...")
        model.fit(X_train, y_train)

        # Predict on training, validation, and test sets
        y_train_pred = model.predict(X_train)
        y_val_pred = model.predict(X_val)
        y_test_pred = model.predict(X_test)

        # Calculate accuracy on all sets
        train_accuracy = accuracy_score(y_train, y_train_pred)
        val_accuracy = accuracy_score(y_val, y_val_pred)
        test_accuracy = accuracy_score(y_test, y_test_pred)

        # Calculate F1 score for test set
        test_f1 = f1_score(y_test, y_test_pred, average='weighted')

        print(f"{name} Results:")
        print(f"Training Accuracy: {train_accuracy:.4f}")
        print(f"Validation Accuracy: {val_accuracy:.4f}")
        print(f"Test Accuracy: {test_accuracy:.4f}")
        print(f"Test F1 Score (weighted): {test_f1:.4f}")

        # Print confusion matrix for test set
        print("\nConfusion Matrix (Test Set):")
        cm = confusion_matrix(y_test, y_test_pred)
        print(cm)

        # Generate classification report for test set
        print("\nClassification Report (Test Set):")
        cr = classification_report(y_test, y_test_pred)
        print(cr)

        # Store results
        results[name] = {
            'model': model,
            'train_accuracy': train_accuracy,
            'val_accuracy': val_accuracy,
            'test_accuracy': test_accuracy,
            'test_f1': test_f1
        }

    # Identify the best models based on validation accuracy
    best_models = sorted(results.items(), key=lambda x: x[1]['val_accuracy'], reverse=True)
    print("\nModels ranked by validation accuracy:")
    for i, (name, result) in enumerate(best_models):
        print(f"{i+1}. {name}: {result['val_accuracy']:.4f}")

    # Return the results dictionary
    return results

# ===================================================================================================
# PART 5: MODEL SAVING
# ===================================================================================================
def save_best_model(results, encoders, target_type='binary'):
    """
    Save the best model and required encoders
    """
    print("\nSaving the best model and encoders...")

    # Choose the best model based on validation accuracy
    best_model_name = max(results.items(), key=lambda x: x[1]['val_accuracy'])[0]
    best_model = results[best_model_name]['model']
    best_model_accuracy = results[best_model_name]['val_accuracy']

    print(f"Best model: {best_model_name} with validation accuracy: {best_model_accuracy:.4f}")

    # Save model and encoders
    model_filename = f'model_{target_type}.pkl'
    joblib.dump(best_model, model_filename)

    tfidf_filename = f'tfidf_vectorizer_{target_type}.pkl'
    joblib.dump(encoders['tfidf'], tfidf_filename)

    if target_type == 'all_labels':
        encoder_filename = f'label_encoder_all_labels.pkl'
        joblib.dump(encoders['all_labels'], encoder_filename)
    elif target_type == 'levels':
        encoder_filename = f'label_encoder_levels.pkl'
        joblib.dump(encoders['levels'], encoder_filename)

    # Save model info
    model_info = {
        'model_name': best_model_name,
        'target_type': target_type,
        'accuracy': best_model_accuracy,
        'saved_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    }

    info_filename = f'model_info_{target_type}.json'
    with open(info_filename, 'w') as f:
        import json
        json.dump(model_info, f, indent=4)

    print(f"Model and encoders for {target_type} saved successfully!")

    return best_model, encoders

# ===================================================================================================
# PART 6: PREDICTION FUNCTION
# ===================================================================================================
def predict_anxiety(text, preprocessor, models, encoders, original_labels):
    """
    Predict anxiety, all_labels, and levels for a given text input
    """
    print("\n" + "="*80)
    print("PREDICTION RESULTS FOR USER INPUT TEXT")
    print("="*80)

    # Preprocess the input text
    cleaned_text = preprocessor.preprocess(text)
    print(f"\nOriginal Text: {text}")
    print(f"Preprocessed Text: {cleaned_text}")

    # Vectorize the cleaned text using the saved TF-IDF vectorizers
    X_binary = encoders['tfidf'].transform([cleaned_text])

    # Make predictions using the saved models
    # 1. Binary anxiety prediction (0 or 1)
    binary_model = models['binary']
    binary_prediction = binary_model.predict(X_binary)[0]
    binary_proba = binary_model.predict_proba(X_binary)[0]

    print("\n1. ANXIETY PREDICTION (Binary)")
    print("------------------------------")
    print(f"Prediction: {'Anxious' if binary_prediction == 1 else 'Not Anxious'}")
    print(f"Confidence: {max(binary_proba)*100:.2f}%")

    # 2. All_labels prediction
    all_labels_model = models['all_labels']
    all_labels_prediction = all_labels_model.predict(X_binary)[0]
    all_labels_proba = all_labels_model.predict_proba(X_binary)[0]

    # Convert encoded prediction back to original label
    original_all_labels = encoders['all_labels'].inverse_transform([all_labels_prediction])[0]

    print("\n2. ALL LABELS PREDICTION")
    print("------------------------------")
    print(f"Prediction: {original_all_labels}")
    print(f"Confidence: {max(all_labels_proba)*100:.2f}%")

    # 3. Levels prediction
    levels_model = models['levels']
    levels_prediction = levels_model.predict(X_binary)[0]
    levels_proba = levels_model.predict_proba(X_binary)[0]

    # Convert encoded prediction back to original label
    original_levels = encoders['levels'].inverse_transform([levels_prediction])[0]

    print("\n3. LEVELS PREDICTION")
    print("------------------------------")
    print(f"Prediction: {original_levels}")
    print(f"Confidence: {max(levels_proba)*100:.2f}%")

    print("\n" + "="*80)

    return {
        'anxiety': {'prediction': 'Anxious' if binary_prediction == 1 else 'Not Anxious', 'confidence': max(binary_proba)*100},
        'all_labels': {'prediction': original_all_labels, 'confidence': max(all_labels_proba)*100},
        'levels': {'prediction': original_levels, 'confidence': max(levels_proba)*100}
    }

# ===================================================================================================
# PART 7: MAIN FUNCTION TO RUN THE PIPELINE
# ===================================================================================================
# ===================================================================================================
# PART 7: MAIN FUNCTION TO RUN THE PIPELINE
# ===================================================================================================
def main(filepath):
    """
    Run the end-to-end pipeline for all three target types and enable prediction
    """
    print("Starting Urdu Anxiety Prediction Pipeline...")

    # 1. Load and explore data
    df = load_data(filepath)

    # 2. Create preprocessor
    preprocessor = UrduTextPreprocessor()

    # 3. Prepare data for all target types
    data_splits, targets, encoders, original_labels = prepare_data(df, preprocessor)

    # 4. Train and evaluate models for each target type
    binary_results = train_evaluate_models(data_splits, targets, target_type='binary')
    all_labels_results = train_evaluate_models(data_splits, targets, target_type='all_labels')
    levels_results = train_evaluate_models(data_splits, targets, target_type='levels')

    # 5. Save best models for each target type
    binary_best_model, _ = save_best_model(binary_results, encoders, target_type='binary')
    all_labels_best_model, _ = save_best_model(all_labels_results, encoders, target_type='all_labels')
    levels_best_model, _ = save_best_model(levels_results, encoders, target_type='levels')

    # Organize models and encoders for prediction
    best_models = {
        'binary': binary_best_model,
        'all_labels': all_labels_best_model,
        'levels': levels_best_model
    }

    print("\nAll models trained successfully!")

    # 6. User input prediction
    while True:
        print("\n" + "="*80)
        print("URDU ANXIETY PREDICTION SYSTEM")
        print("="*80)
        print("\nEnter Urdu text to predict anxiety level (or type 'exit' to quit):")
        user_text = input(">> ")

        if user_text.lower() == 'exit':
            print("Exiting prediction system.")
            break

        # Make predictions using all three models
        predictions = predict_anxiety(user_text, preprocessor, best_models, encoders, original_labels)

        # Ask if user wants to continue
        continue_choice = input("\nDo you want to make another prediction? (y/n): ")
        if continue_choice.lower() != 'y':
            print("Thank you for using the Urdu Anxiety Prediction System!")
            break

    return best_models, encoders

# ===================================================================================================
# EXECUTE THE PIPELINE
# ===================================================================================================
if __name__ == "__main__":
    # Specify your dataset file path here
    filepath = "updated_anxiety_dataset.xlsx"  # CHANGE THIS TO YOUR ACTUAL DATASET FILE PATH

    # Run the pipeline for all target types and enable prediction
    best_models, prediction_encoders = main(filepath)


Starting Urdu Anxiety Prediction Pipeline...
Starting to load data from: updated_anxiety_dataset.xlsx
Dataset shape: (12000, 4)

Column information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   anxiety     12000 non-null  int64 
 1   all_labels  12000 non-null  object
 2   levels      12000 non-null  object
 3   Urdu_text   12000 non-null  object
dtypes: int64(1), object(3)
memory usage: 375.1+ KB
None

Missing values:
anxiety       0
all_labels    0
levels        0
Urdu_text     0
dtype: int64

Distribution of anxiety labels:
anxiety
1    7270
0    4730
Name: count, dtype: int64

Distribution of all_labels:
all_labels
Normal                 4730
['agoraphobia']        2578
['panic']              1513
['socialanxiety']      1293
['general']            1177
['phobia']              367
['selectivemutism']     341
['phobia'                 1
N

Parameters: { "use_label_encoder" } are not used.



XGBoost Results:
Training Accuracy: 0.9144
Validation Accuracy: 0.8342
Test Accuracy: 0.7983
Test F1 Score (weighted): 0.7982

Confusion Matrix (Test Set):
[[ 701  245]
 [ 239 1215]]

Classification Report (Test Set):
              precision    recall  f1-score   support

           0       0.75      0.74      0.74       946
           1       0.83      0.84      0.83      1454

    accuracy                           0.80      2400
   macro avg       0.79      0.79      0.79      2400
weighted avg       0.80      0.80      0.80      2400


Training LightGBM...
[LightGBM] [Info] Number of positive: 5089, number of negative: 3311




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.107949 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 50647
[LightGBM] [Info] Number of data points in the train set: 8400, number of used features: 1742
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.605833 -> initscore=0.429831
[LightGBM] [Info] Start training from score 0.429831




LightGBM Results:
Training Accuracy: 0.9050
Validation Accuracy: 0.8242
Test Accuracy: 0.8142
Test F1 Score (weighted): 0.8142

Confusion Matrix (Test Set):
[[ 725  221]
 [ 225 1229]]

Classification Report (Test Set):
              precision    recall  f1-score   support

           0       0.76      0.77      0.76       946
           1       0.85      0.85      0.85      1454

    accuracy                           0.81      2400
   macro avg       0.81      0.81      0.81      2400
weighted avg       0.81      0.81      0.81      2400


Models ranked by validation accuracy:
1. XGBoost: 0.8342
2. Random Forest: 0.8300
3. LightGBM: 0.8242
4. Support Vector Machine: 0.8233
5. Multinomial NB: 0.7942
6. K-Nearest Neighbors: 0.3958

Training and evaluating models for all_labels classification...

Training Random Forest...
Random Forest Results:
Training Accuracy: 0.9995
Validation Accuracy: 0.5833
Test Accuracy: 0.5825
Test F1 Score (weighted): 0.4964

Confusion Matrix (Test Set):
[[887 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Support Vector Machine Results:
Training Accuracy: 0.7668
Validation Accuracy: 0.6000
Test Accuracy: 0.5938
Test F1 Score (weighted): 0.5349

Confusion Matrix (Test Set):
[[857  56   7  12   0   0  14]
 [118 332  10  42   5   1   8]
 [103  98  18   4   0   0  12]
 [ 27  82   5 189   0   0   0]
 [ 16  41   5   3   4   0   4]
 [ 26  32   2   0   1   3   4]
 [167  49  14   3   2   2  22]]

Classification Report (Test Set):
              precision    recall  f1-score   support

           0       0.65      0.91      0.76       946
           1       0.48      0.64      0.55       516
           2       0.30      0.08      0.12       235
           3       0.75      0.62      0.68       303
           4       0.33      0.05      0.09        73
           5       0.50      0.04      0.08        68
           6       0.34      0.08      0.14       259

    accuracy                           0.59      2400
   macro avg       0.48      0.35      0.35      2400
weighted avg       0.55      0.59 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Parameters: { "use_label_encoder" } are not used.



XGBoost Results:
Training Accuracy: 0.8808
Validation Accuracy: 0.5925
Test Accuracy: 0.6004
Test F1 Score (weighted): 0.5550

Confusion Matrix (Test Set):
[[841  48  18  13   0   2  24]
 [102 321  16  49   7   8  13]
 [ 92  94  33   5   1   2   8]
 [ 26  67   4 204   1   0   1]
 [ 11  41   4   8   7   0   2]
 [ 19  32   4   1   0   9   3]
 [150  51  22   4   2   4  26]]

Classification Report (Test Set):
              precision    recall  f1-score   support

           0       0.68      0.89      0.77       946
           1       0.49      0.62      0.55       516
           2       0.33      0.14      0.20       235
           3       0.72      0.67      0.70       303
           4       0.39      0.10      0.15        73
           5       0.36      0.13      0.19        68
           6       0.34      0.10      0.15       259

    accuracy                           0.60      2400
   macro avg       0.47      0.38      0.39      2400
weighted avg       0.55      0.60      0.55      



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.102166 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 50735
[LightGBM] [Info] Number of data points in the train set: 8399, number of used features: 1732
[LightGBM] [Info] Start training from score -0.930862
[LightGBM] [Info] Start training from score -1.538106
[LightGBM] [Info] Start training from score -2.321697
[LightGBM] [Info] Start training from score -2.070788
[LightGBM] [Info] Start training from score -3.486792
[LightGBM] [Info] Start training from score -3.559404
[LightGBM] [Info] Start training from score -2.227933




LightGBM Results:
Training Accuracy: 0.9333
Validation Accuracy: 0.5983
Test Accuracy: 0.5946
Test F1 Score (weighted): 0.5548

Confusion Matrix (Test Set):
[[821  51  14  16   0   1  43]
 [ 97 314  21  51   9   8  16]
 [ 89  90  28  11   2   1  14]
 [ 17  63   3 212   1   0   7]
 [ 14  36   7   6   4   1   5]
 [ 16  29   4   3   0  12   4]
 [141  45  24   5   3   5  36]]

Classification Report (Test Set):
              precision    recall  f1-score   support

           0       0.69      0.87      0.77       946
           1       0.50      0.61      0.55       516
           2       0.28      0.12      0.17       235
           3       0.70      0.70      0.70       303
           4       0.21      0.05      0.09        73
           5       0.43      0.18      0.25        68
           6       0.29      0.14      0.19       259

    accuracy                           0.59      2400
   macro avg       0.44      0.38      0.39      2400
weighted avg       0.54      0.59      0.55     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Parameters: { "use_label_encoder" } are not used.



XGBoost Results:
Training Accuracy: 0.8780
Validation Accuracy: 0.6125
Test Accuracy: 0.5904
Test F1 Score (weighted): 0.5585

Confusion Matrix (Test Set):
[[ 20  54 137   6  41]
 [ 19  93 111  19 136]
 [ 26  39 823  12  46]
 [  3  19  26 198  57]
 [ 10  79  98  45 283]]

Classification Report (Test Set):
              precision    recall  f1-score   support

           0       0.26      0.08      0.12       258
           1       0.33      0.25      0.28       378
           2       0.69      0.87      0.77       946
           3       0.71      0.65      0.68       303
           4       0.50      0.55      0.53       515

    accuracy                           0.59      2400
   macro avg       0.50      0.48      0.47      2400
weighted avg       0.55      0.59      0.56      2400


Training LightGBM...




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.100359 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 49631
[LightGBM] [Info] Number of data points in the train set: 8400, number of used features: 1714
[LightGBM] [Info] Start training from score -2.226948
[LightGBM] [Info] Start training from score -1.849843
[LightGBM] [Info] Start training from score -0.930981
[LightGBM] [Info] Start training from score -2.070907
[LightGBM] [Info] Start training from score -1.538780




LightGBM Results:
Training Accuracy: 0.9013
Validation Accuracy: 0.6000
Test Accuracy: 0.5904
Test F1 Score (weighted): 0.5693

Confusion Matrix (Test Set):
[[ 36  49 121   8  44]
 [ 21 111  86  21 139]
 [ 34  42 799  17  54]
 [  3  20  27 191  62]
 [ 14  91  77  53 280]]

Classification Report (Test Set):
              precision    recall  f1-score   support

           0       0.33      0.14      0.20       258
           1       0.35      0.29      0.32       378
           2       0.72      0.84      0.78       946
           3       0.66      0.63      0.64       303
           4       0.48      0.54      0.51       515

    accuracy                           0.59      2400
   macro avg       0.51      0.49      0.49      2400
weighted avg       0.56      0.59      0.57      2400


Models ranked by validation accuracy:
1. XGBoost: 0.6125
2. Support Vector Machine: 0.6117
3. LightGBM: 0.6000
4. Multinomial NB: 0.5883
5. Random Forest: 0.5833
6. K-Nearest Neighbors: 0.3942

Saving t