In [None]:
import re

import numpy as np
import pandas as pd
from keras.layers import Dense, Dropout, Input
from keras.models import Sequential
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_predict, train_test_split
from sklearn.svm import SVC
from sklearn.utils import resample

# Preprocessing function
def preprocess_text(text):
    """Strip punctuation from *text* and lowercase it.

    Every character that is neither a word character (``\\w``) nor
    whitespace (``\\s``) is deleted. On ``str`` patterns ``\\w`` is
    Unicode-aware, so Urdu letters and digits are kept while punctuation
    such as the Arabic comma is removed.

    Args:
        text: The raw document. ``None`` and float NaN (how pandas reports
            an empty spreadsheet cell) are treated as an empty document;
            any other non-string value is converted with ``str()`` first.

    Returns:
        The cleaned string.
    """
    # NaN is the only float that is not equal to itself; checking it this way
    # avoids raising TypeError from re.sub on an empty cell.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # NOTE(review): combining diacritics are not matched by \w, so they are
    # stripped here as well - confirm that is the intended normalisation.
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation

    # Lowercasing only affects cased scripts (e.g. embedded Latin text).
    return text.lower()

# Load larger and diverse Urdu news dataset
# Replace this with the actual file path of your larger dataset
data = pd.read_excel("/content/news.xlsx", engine="openpyxl")  # Use the appropriate engine for Excel files

# Drop rows that lack either the text or the label. An empty 'news' cell is a
# float NaN that the regex-based preprocessing cannot handle, and an empty
# 'label' cell would be encoded as category code -1 below, i.e. silently
# treated as one more class.
data = data.dropna(subset=['news', 'label']).reset_index(drop=True)

# Preprocess the 'news' column using the preprocessing function.
# astype(str) guards against cells that Excel stored as numbers.
data['news'] = data['news'].astype(str).apply(preprocess_text)

# Splitting the data
X = data['news']
y = data['label']

# Encode target labels as integers
y = y.astype('category').cat.codes  # Convert categories to numeric labels

# Split dataset into train and test sets (80/20, fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert text to feature vectors using TF-IDF.
# The vocabulary and IDF weights are learned from the training split only and
# then applied unchanged to the test split.
vectorizer = TfidfVectorizer(max_features=5000)  # Adjust max_features as needed
X_train_tfidf = vectorizer.fit_transform(X_train).toarray()
X_test_tfidf = vectorizer.transform(X_test).toarray()

# Base models
rf = RandomForestClassifier(n_estimators=100, random_state=42)
lr = LogisticRegression(max_iter=1000, random_state=42)
svm = SVC(probability=True, random_state=42)  # probability=True enables predict_proba

# Generate the meta-model's training features from OUT-OF-FOLD predictions.
# Predicting on the very rows a base model was fitted on yields
# over-confident probabilities (a random forest is near-perfect on its own
# training data), so the meta-model would learn from inputs that look nothing
# like what it receives at test time. cross_val_predict fits clones of each
# estimator on k-1 folds and predicts the held-out fold, so every training row
# gets a probability from a model that never saw it.
# This costs roughly five extra fits per base model.
rf_preds = cross_val_predict(rf, X_train_tfidf, y_train, cv=5, method='predict_proba')
lr_preds = cross_val_predict(lr, X_train_tfidf, y_train, cv=5, method='predict_proba')
svm_preds = cross_val_predict(svm, X_train_tfidf, y_train, cv=5, method='predict_proba')

# Train base models on the full training split; these fitted instances are
# the ones used to produce test-time predictions.
rf.fit(X_train_tfidf, y_train)
lr.fit(X_train_tfidf, y_train)
svm.fit(X_train_tfidf, y_train)

# Combine predictions as input for meta-model.
# Column order (rf, lr, svm) must match the order used for the test features.
stacked_features = np.hstack((rf_preds, lr_preds, svm_preds))

# Define the meta-model: a small feed-forward (fully connected) network.
# It has no recurrent layers; its input is the concatenated class
# probabilities produced by the base models.
meta_model = Sequential()
# Declare the input shape with an explicit Input layer rather than the
# deprecated input_dim= keyword on the first Dense layer.
meta_model.add(Input(shape=(stacked_features.shape[1],)))
meta_model.add(Dense(128, activation='relu'))
meta_model.add(Dropout(0.5))
meta_model.add(Dense(64, activation='relu'))
meta_model.add(Dropout(0.5))
meta_model.add(Dense(len(y.unique()), activation='softmax'))  # One output unit per class

# Labels are integer codes, hence the sparse variant of the loss.
meta_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the meta-model
meta_model.fit(stacked_features, y_train, epochs=10, batch_size=32, verbose=1)

# Build the meta-model's test input: each fitted base model scores the test
# matrix, and the three probability blocks are concatenated column-wise in
# rf, lr, svm order.
rf_test_preds, lr_test_preds, svm_test_preds = (
    base_model.predict_proba(X_test_tfidf) for base_model in (rf, lr, svm)
)
stacked_test_features = np.hstack((rf_test_preds, lr_test_preds, svm_test_preds))

# The meta-model returns one probability row per sample; the predicted class
# is the column with the highest probability.
meta_test_probabilities = meta_model.predict(stacked_test_features)
meta_test_preds = meta_test_probabilities.argmax(axis=1)

# Performance Metrics
def calculate_metrics(y_true, y_pred):
    """Score predictions against the true labels.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.

    Returns:
        A 4-tuple ``(accuracy, precision, recall, f1)``. Precision, recall
        and F1 are computed with ``average='weighted'``.
    """
    weighted_scorers = (precision_score, recall_score, f1_score)
    accuracy = accuracy_score(y_true, y_pred)
    precision, recall, f1 = (
        scorer(y_true, y_pred, average='weighted') for scorer in weighted_scorers
    )
    return accuracy, precision, recall, f1

# Confidence Interval function using Bootstrap
def bootstrap_confidence_interval(y_true, y_pred, n_iterations=100, alpha=0.05, random_state=42):
    """Estimate percentile-bootstrap confidence intervals for the metrics.

    Paired (label, prediction) rows are drawn with replacement
    ``n_iterations`` times. Each replicate is scored with
    ``calculate_metrics`` and the interval bounds are the ``alpha/2`` and
    ``1 - alpha/2`` percentiles of the replicate scores.

    The predictions are resampled rather than recomputed: a fitted model
    gives the same prediction for a given row every time, so re-running the
    models on each replicate would only add cost.

    Args:
        y_true: 1-D array-like of ground-truth labels.
        y_pred: 1-D array-like of predicted labels, positionally aligned
            with ``y_true``.
        n_iterations: Number of bootstrap replicates (at least 1).
        alpha: Significance level; the interval covers ``1 - alpha``.
        random_state: Seed for the single random generator shared by all
            replicates, so results are repeatable while each replicate is
            still a different resample.

    Returns:
        A 4-tuple ``(accuracy_ci, precision_ci, recall_ci, f1_ci)``; each
        element is a length-2 array ``[lower, upper]``.

    Raises:
        ValueError: If the inputs are empty or of different lengths, or if
            ``n_iterations`` is less than 1.
    """
    # Convert to arrays so positional indexing works even when y_true is a
    # pandas Series carrying a non-contiguous index.
    labels = np.asarray(y_true)
    predictions = np.asarray(y_pred)

    n_samples = labels.shape[0]
    if n_samples != predictions.shape[0]:
        raise ValueError("y_true and y_pred must have the same length")
    if n_samples == 0:
        raise ValueError("cannot bootstrap an empty sample")
    if n_iterations < 1:
        raise ValueError("n_iterations must be at least 1")

    # Seed once, outside the loop. Re-seeding on every iteration would make
    # every replicate identical and collapse the interval to a single point.
    rng = np.random.default_rng(random_state)

    replicate_scores = []
    for _ in range(n_iterations):
        # Resample row positions with replacement
        picked = rng.integers(0, n_samples, size=n_samples)
        replicate_scores.append(calculate_metrics(labels[picked], predictions[picked]))

    # Calculate the confidence intervals for each metric
    percentiles = [100 * alpha / 2, 100 * (1 - alpha / 2)]
    accuracy_ci, precision_ci, recall_ci, f1_ci = (
        np.percentile(metric_values, percentiles)
        for metric_values in zip(*replicate_scores)
    )

    return accuracy_ci, precision_ci, recall_ci, f1_ci

# Bootstrap the confidence intervals for the held-out predictions
accuracy_ci, precision_ci, recall_ci, f1_ci = bootstrap_confidence_interval(y_test, meta_test_preds)

# Point estimates on the full test split (weighted precision/recall/F1)
test_accuracy, test_precision, test_recall, test_f1 = calculate_metrics(y_test, meta_test_preds)

# Print results
print(
    f"Accuracy: {test_accuracy}",
    f"Precision: {test_precision}",
    f"Recall: {test_recall}",
    f"F1-Score: {test_f1}",
    sep="\n",
)

# Print Confidence Intervals
print(
    f"Confidence Interval for Accuracy: {accuracy_ci}",
    f"Confidence Interval for Precision: {precision_ci}",
    f"Confidence Interval for Recall: {recall_ci}",
    f"Confidence Interval for F1-Score: {f1_ci}",
    sep="\n",
)


Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.9402 - loss: 0.2734
Epoch 2/10
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9987 - loss: 0.0078
Epoch 3/10
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9994 - loss: 0.0039
Epoch 4/10
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9986 - loss: 0.0038
Epoch 5/10
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9995 - loss: 0.0015
Epoch 6/10
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9997 - loss: 0.0040
Epoch 7/10
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 1.0000 - loss: 5.5411e-04
Epoch 8/10
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9994 - loss: 0.0027
Epoch 9/10
[1m103/103[0m [32m━━━━━━━━━━━━━━━