In [1]:
# 📌 Cell 2: Import Libraries & Load Data
import numpy as np
import pandas as pd
import time
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import ADASYN, SMOTE
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.feature_selection import SelectFromModel
from scipy.sparse import csr_matrix
import joblib
import optuna
from sentence_transformers import SentenceTransformer

# Load preprocessed dataset
merged_df = pd.read_csv("Multiclass_dataset/cleaned_merged_dataset.csv")

# Combine Headline + Body text
merged_df["combined_text"] = merged_df["Headline"] + " " + merged_df["articleBody"]

# ✅ Manual Label Mapping (Ensures Correct Alignment)
label_mapping = {"agree": 0, "disagree": 1, "discuss": 2, "unrelated": 3}
inverse_label_mapping = {v: k for k, v in label_mapping.items()}
merged_df["Stance"] = merged_df["Stance"].map(label_mapping)


In [2]:
# 📌 Cell 3: Feature Extraction with TF-IDF
tfidf = TfidfVectorizer(stop_words="english", max_features=10000, ngram_range=(1, 3))
X = tfidf.fit_transform(merged_df["combined_text"])
y = merged_df["Stance"]


In [3]:
# ✅ Train-Test Split using TF-IDF transformed data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# ✅ Apply ADASYN to TF-IDF transformed features
adasyn = ADASYN(sampling_strategy="not majority", random_state=42)
X_train_balanced, y_train_balanced = adasyn.fit_resample(X_train, y_train)

# ✅ Convert X_train_balanced back to a sparse matrix
X_train_balanced = csr_matrix(X_train_balanced)

# ✅ Check new class distribution
print(pd.Series(y_train_balanced).value_counts())

Stance
0    29293
3    29011
1    29011
2    28750
Name: count, dtype: int64


In [4]:
# 📌 Cell 5: Train Optimized Random Forest
start_time = time.time()
rf_model = RandomForestClassifier(
    n_estimators=300, max_depth=20, min_samples_split=10, min_samples_leaf=2,
    max_features="sqrt", class_weight="balanced", random_state=42, n_jobs=-1
)
rf_model.fit(X_train_balanced, y_train_balanced)
train_time = time.time() - start_time
print(f"⏳ Random Forest Training Time: {train_time:.2f} seconds")

# ✅ Feature Selection
feature_selector = SelectFromModel(rf_model, threshold="mean")
X_train_selected = feature_selector.transform(X_train_balanced)
X_test_selected = feature_selector.transform(X_test)

# ✅ Train Again with Selected Features
rf_model.fit(X_train_selected, y_train_balanced)

# ✅ Predict on Test Set
y_pred_rf = rf_model.predict(X_test_selected)

# ✅ Convert predictions back to labels
y_pred_labels_rf = pd.Series(y_pred_rf).map(inverse_label_mapping)
y_test_labels = pd.Series(y_test).map(inverse_label_mapping)

# ✅ Evaluate RF Model
print(f"✅ Random Forest Test Accuracy: {accuracy_score(y_test_labels, y_pred_labels_rf):.4f}")
print(classification_report(y_test_labels, y_pred_labels_rf, target_names=list(label_mapping.keys()), zero_division=0))


⏳ Random Forest Training Time: 104.76 seconds
✅ Random Forest Test Accuracy: 0.8690
              precision    recall  f1-score   support

       agree       0.49      0.70      0.57       729
    disagree       0.35      0.48      0.41       166
     discuss       0.81      0.74      0.78      1766
   unrelated       0.96      0.93      0.94      7253

    accuracy                           0.87      9914
   macro avg       0.65      0.71      0.67      9914
weighted avg       0.89      0.87      0.88      9914



In [5]:
from sklearn.model_selection import train_test_split

# ✅ Create a Validation Set from the Training Data
X_train_final, X_val, y_train_final, y_val = train_test_split(
    X_train_balanced, y_train_balanced, test_size=0.1, random_state=42, stratify=y_train_balanced
)

print(f"Training Size: {X_train_final.shape}, Validation Size: {X_val.shape}")


Training Size: (104458, 10000), Validation Size: (11607, 10000)


In [14]:
import time
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# ✅ Convert Data to Sparse Matrix (Saves Memory)
X_train_balanced = csr_matrix(X_train_balanced)
X_test = csr_matrix(X_test)

# ✅ Use a Smaller Training Set (Reduce Memory Usage)
X_train_final, X_val, y_train_final, y_val = train_test_split(
    X_train_balanced[:15000], y_train_balanced[:15000],  # ✅ Use 15,000 samples
    test_size=0.1, random_state=42, stratify=y_train_balanced[:15000]
)

# ✅ Define Optimized XGBoost Model
xgb_model = XGBClassifier(
    n_estimators=250,  # ✅ Reduce trees
    learning_rate=0.09506484283779507,
    max_depth=8,  # ✅ Reduce depth for efficiency
    min_child_weight=3,
    gamma=0.1777302416003415,
    subsample=0.657855522236526,
    colsample_bytree=0.8988814540637224,
    eval_metric="mlogloss",
    early_stopping_rounds=10,  # ✅ Reduce stopping rounds
    tree_method="hist",  # ✅ Use memory-efficient method
    random_state=42
)

# ✅ Train XGBoost with Early Stopping
start_time = time.time()
xgb_model.fit(
    X_train_final, y_train_final,
    eval_set=[(X_val, y_val)],  
    verbose=10
)
train_time = time.time() - start_time
print(f"⏳ XGBoost Training Time: {train_time:.2f} seconds")

# ✅ Predict on Test Set
y_pred_xgb = xgb_model.predict(X_test)

# ✅ Evaluate XGBoost Model
print(f"✅ XGBoost Test Accuracy: {accuracy_score(y_test, y_pred_xgb):.4f}")
print(classification_report(y_test, y_pred_xgb, target_names=list(label_mapping.keys()), zero_division=0))


[0]	validation_0-mlogloss:1.29133
[10]	validation_0-mlogloss:0.82508
[20]	validation_0-mlogloss:0.66323
[30]	validation_0-mlogloss:0.58893
[40]	validation_0-mlogloss:0.54351
[50]	validation_0-mlogloss:0.51110
[60]	validation_0-mlogloss:0.48622
[70]	validation_0-mlogloss:0.46897
[80]	validation_0-mlogloss:0.45427
[90]	validation_0-mlogloss:0.44164
[100]	validation_0-mlogloss:0.42915
[110]	validation_0-mlogloss:0.41682
[120]	validation_0-mlogloss:0.40658
[130]	validation_0-mlogloss:0.39848
[140]	validation_0-mlogloss:0.39081
[150]	validation_0-mlogloss:0.38408
[160]	validation_0-mlogloss:0.37792
[170]	validation_0-mlogloss:0.37299
[180]	validation_0-mlogloss:0.36719
[190]	validation_0-mlogloss:0.36220
[200]	validation_0-mlogloss:0.35853
[210]	validation_0-mlogloss:0.35519
[220]	validation_0-mlogloss:0.35137
[230]	validation_0-mlogloss:0.34799
[240]	validation_0-mlogloss:0.34425
[249]	validation_0-mlogloss:0.33989
⏳ XGBoost Training Time: 337.34 seconds
✅ XGBoost Test Accuracy: 0.8841
   

In [19]:
import time
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# ✅ Step 1: Define XGBoost Model (Without Early Stopping for Stacking)
xgb_stacking_model = XGBClassifier(
    n_estimators=150, learning_rate=0.09506484283779507, max_depth=8, min_child_weight=3,
    gamma=0.1777302416003415, subsample=0.657855522236526, colsample_bytree=0.8988814540637224, eval_metric="mlogloss",
    tree_method="hist", random_state=42  # ❌ Removed early_stopping_rounds
)

# ✅ Step 2: Train XGBoost Separately for Debugging
print("🔍 Debug: Training XGBoost Separately to Verify Functionality...")
xgb_stacking_model.fit(X_train_balanced, y_train_balanced)
xgb_preds = xgb_stacking_model.predict(X_test)
print(f"✅ XGBoost Standalone Accuracy: {accuracy_score(y_test, xgb_preds):.4f}")

# ✅ Step 3: Train Random Forest Separately for Debugging
rf_model = RandomForestClassifier(
    n_estimators=200, max_depth=20, min_samples_split=10, min_samples_leaf=2,
    max_features="sqrt", class_weight="balanced", random_state=42, n_jobs=-1
)
print("🔍 Debug: Training Random Forest Separately to Verify Functionality...")
rf_model.fit(X_train_balanced, y_train_balanced)
rf_preds = rf_model.predict(X_test)
print(f"✅ Random Forest Standalone Accuracy: {accuracy_score(y_test, rf_preds):.4f}")

# ✅ Step 4: Define Stacking Classifier (XGBoost + Random Forest + Logistic Regression)
stacked_model = StackingClassifier(
    estimators=[('rf', rf_model), ('xgb', xgb_stacking_model)],  
    final_estimator=LogisticRegression()
)

# ✅ Step 5: Train Stacking Model
print("\n🚀 Training Stacking Model...")
start_time = time.time()
stacked_model.fit(X_train_balanced, y_train_balanced)
train_time = time.time() - start_time
print(f"✅ Stacking Model Training Completed in {train_time:.2f} seconds")

# ✅ Step 6: Generate Predictions
print("🔍 Generating Predictions...")
stacked_preds = stacked_model.predict(X_test)

# ✅ Step 7: Debug: Check if Predictions Exist
print(f"✅ Sample Predictions: {stacked_preds[:10]}")

# ✅ Step 8: Evaluate Stacking Model
print(f"✅ Stacked Model Accuracy: {accuracy_score(y_test, stacked_preds):.4f}")
print(classification_report(y_test, stacked_preds, target_names=list(label_mapping.keys()), zero_division=0))


🔍 Debug: Training XGBoost Separately to Verify Functionality...
✅ XGBoost Standalone Accuracy: 0.8680
🔍 Debug: Training Random Forest Separately to Verify Functionality...
✅ Random Forest Standalone Accuracy: 0.8695

🚀 Training Stacking Model...
✅ Stacking Model Training Completed in 4512.01 seconds
🔍 Generating Predictions...
✅ Sample Predictions: [3 3 3 3 3 0 0 2 3 3]
✅ Stacked Model Accuracy: 0.8932
              precision    recall  f1-score   support

       agree       0.58      0.74      0.65       729
    disagree       0.47      0.52      0.49       166
     discuss       0.79      0.83      0.81      1766
   unrelated       0.97      0.93      0.95      7253

    accuracy                           0.89      9914
   macro avg       0.70      0.76      0.73      9914
weighted avg       0.90      0.89      0.90      9914



In [8]:
import optuna
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# ✅ Define a smaller dataset for tuning (before the objective function)
X_train_small = X_train_balanced[:5000]
y_train_small = y_train_balanced[:5000]

# ✅ Define the Optuna objective function
def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 500, step=50),  # ✅ Reduce max estimators
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1, log=True),  # ✅ Use suggest_float instead of suggest_loguniform
        "max_depth": trial.suggest_int("max_depth", 3, 8),  # ✅ Reduce max depth
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),  # ✅ Prevent overfitting
        "gamma": trial.suggest_float("gamma", 0.1, 3.0, log=True),  # ✅ Use suggest_float
        "subsample": trial.suggest_float("subsample", 0.5, 0.9),  # ✅ Use suggest_float
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 0.9),  # ✅ Use suggest_float
        "tree_method": "hist",
        "random_state": 42
    }

    xgb_opt = XGBClassifier(**params)

    # ✅ Train on the smaller dataset
    xgb_opt.fit(X_train_small, y_train_small)
    preds = xgb_opt.predict(X_test)

    return accuracy_score(y_test, preds)

# ✅ Now run Optuna study
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=10)  # ✅ Run 10 trials

# ✅ Print the best parameters
print("Best XGBoost Params:", study.best_params)


[I 2025-03-12 14:02:14,794] A new study created in memory with name: no-name-bbb30dab-6fe9-4ece-b307-d6696643b011
[I 2025-03-12 14:03:29,528] Trial 0 finished with value: 0.8134960661690539 and parameters: {'n_estimators': 150, 'learning_rate': 0.09506484283779507, 'max_depth': 8, 'min_child_weight': 3, 'gamma': 0.1777302416003415, 'subsample': 0.657855522236526, 'colsample_bytree': 0.8988814540637224}. Best is trial 0 with value: 0.8134960661690539.
[I 2025-03-12 14:05:56,968] Trial 1 finished with value: 0.7846479725640508 and parameters: {'n_estimators': 350, 'learning_rate': 0.018268542820248947, 'max_depth': 7, 'min_child_weight': 3, 'gamma': 0.39204648292339, 'subsample': 0.6971486521971272, 'colsample_bytree': 0.6813839198453491}. Best is trial 0 with value: 0.8134960661690539.
[I 2025-03-12 14:07:00,539] Trial 2 finished with value: 0.7779907201936656 and parameters: {'n_estimators': 450, 'learning_rate': 0.03912647863722377, 'max_depth': 3, 'min_child_weight': 7, 'gamma': 1.12

Best XGBoost Params: {'n_estimators': 150, 'learning_rate': 0.09506484283779507, 'max_depth': 8, 'min_child_weight': 3, 'gamma': 0.1777302416003415, 'subsample': 0.657855522236526, 'colsample_bytree': 0.8988814540637224}


In [30]:
# 📌 Cell 9: Save & Load Models
joblib.dump(xgb_model, "xgboost_model.pkl")
joblib.dump(rf_model, "random_forest_model.pkl")
joblib.dump(stacked_model, "stacked_model.pkl")
joblib.dump(tfidf, "tfidf_vectorizer.pkl")

# ✅ Load & Use Model Later
loaded_model = joblib.load("xgboost_model.pkl")
predictions = loaded_model.predict(X_test)


['tfidf_vectorizer.pkl']

In [18]:
import gc

# ✅ Force Garbage Collection
gc.collect()


2874

In [24]:
import pandas as pd

# Load the datasets
test_stances = pd.read_csv("test_stances_unlabeled.csv")
test_bodies = pd.read_csv("test_bodies.csv")

# Merge on "Body ID"
test_merged = test_stances.merge(test_bodies, on="Body ID")

# Display the first few rows
test_merged.head()

test_merged.to_csv("merged_test_data.csv", index=False)


In [27]:
import pandas as pd

# 📌 Load the merged test dataset
file_path = "Multiclass_dataset/merged_test_data.csv"  # Ensure the correct path
try:
    merged_test_data = pd.read_csv(file_path)
    print("✅ File loaded successfully!")
except FileNotFoundError:
    raise ValueError(f"❌ Error: File '{file_path}' not found!")

# 📌 Step 1: Check Required Columns
required_columns = {"Headline", "Body ID", "articleBody"}
missing_columns = required_columns - set(merged_test_data.columns)

if missing_columns:
    raise ValueError(f"❌ Missing columns in dataset: {missing_columns}")
else:
    print("✅ All required columns are present.")

# 📌 Step 2: Check for Missing Values
missing_values = merged_test_data.isnull().sum()
if missing_values.any():
    print("⚠️ Warning: Missing values found in dataset!")
    print(missing_values)
else:
    print("✅ No missing values found.")

# 📌 Step 3: Ensure "combined_text" Exists (If Needed)
if "combined_text" not in merged_test_data.columns:
    print("🔄 Creating 'combined_text' column...")
    merged_test_data["combined_text"] = merged_test_data["Headline"] + " " + merged_test_data["articleBody"]
    print("✅ 'combined_text' column created successfully!")

# 📌 Step 4: Display First Few Rows for Manual Inspection
print("📝 First few rows of the dataset:")
print(merged_test_data.head())

# ✅ Save the Validated File (If Needed)
validated_file = "Multiclass_dataset/validated_merged_test_data.csv"
merged_test_data.to_csv(validated_file, index=False)
print(f"✅ Validated dataset saved as '{validated_file}'")


✅ File loaded successfully!
✅ All required columns are present.
✅ No missing values found.
🔄 Creating 'combined_text' column...
✅ 'combined_text' column created successfully!
📝 First few rows of the dataset:
                                            Headline  Body ID  \
0  Ferguson riots: Pregnant woman loses eye after...     2008   
1  Crazy Conservatives Are Sure a Gitmo Detainee ...     1550   
2  A Russian Guy Says His Justin Bieber Ringtone ...        2   
3  Zombie Cat: Buried Kitty Believed Dead, Meows ...     1793   
4  Argentina's President Adopts Boy to End Werewo...       37   

                                         articleBody  \
0  A RESPECTED senior French police officer inves...   
1  Dave Morin's social networking company Path is...   
2  A bereaved Afghan mother took revenge on the T...   
3  Hewlett-Packard is officially splitting in two...   
4  An airline passenger headed to Dallas was remo...   

                                       combined_text  
0  Fergus

In [65]:
import pandas as pd
import numpy as np
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# 📌 Load the trained models and vectorizer
stacked_model = joblib.load("stacked_model.pkl")  
tfidf = joblib.load("tfidf_vectorizer.pkl")  

print("✅ Models and vectorizer loaded successfully!")


✅ Models and vectorizer loaded successfully!


In [71]:
# 📌 Load the merged test dataset
file_path = "Multiclass_dataset/validated_merged_test_data.csv"
test_data = pd.read_csv(file_path)

# ✅ Ensure "combined_text" exists
if "combined_text" not in test_data.columns:
    test_data["combined_text"] = test_data["Headline"] + " " + test_data["articleBody"]

# ✅ Check for missing values
if test_data.isnull().sum().any():
    print("⚠️ Warning: Missing values detected!")
    test_data.fillna("", inplace=True)  # Fill missing values

print("✅ Test data loaded and validated!")


✅ Test data loaded and validated!


In [72]:
# ✅ Apply the same TF-IDF transformation used in training
X_test_final = tfidf.transform(test_data["combined_text"])

# 📌 Predict using the Stacking Model
predictions = stacked_model.predict(X_test_final)

# ✅ Ensure inverse label mapping exists
label_mapping = {"agree": 0, "disagree": 1, "discuss": 2, "unrelated": 3}
inverse_label_mapping = {v: k for k, v in label_mapping.items()}

# 📌 Convert numeric predictions back to category labels
predicted_labels = pd.Series(predictions).map(inverse_label_mapping)

# 📌 Save predictions to CSV
test_data["Predicted_Stance"] = predicted_labels
output_file = "test_predictions.csv"
test_data[["Body ID", "Headline","articleBody" , "Predicted_Stance"]].to_csv(output_file, index=False)

print(f"✅ Predictions saved to {output_file}")


✅ Predictions saved to test_predictions.csv


In [73]:
if "Stance" in test_data.columns:
    y_true = test_data["Stance"].map(label_mapping)  # Convert labels to numbers
    accuracy = accuracy_score(y_true, predictions)

    print(f"✅ Model Accuracy on Test Set: {accuracy:.4f}")
    print(classification_report(y_true, predictions, target_names=label_mapping.keys(), zero_division=0))

else:
    print("⚠️ No ground truth labels available for accuracy evaluation.")
    
    # ✅ Manual Evaluation on First 10 Predictions
    print("\n🔍 Manually Checking First 10 Predictions:")
    print(test_data[["Body ID", "Headline", "Predicted_Stance"]].head(10))

    # ✅ Define manually labeled ground truth for first 10 samples
    manual_labels = {
        0: "agree",
        1: "disagree",
        2: "discuss",
        3: "unrelated",
        4: "agree",
        5: "disagree",
        6: "unrelated",
        7: "discuss",
        8: "agree",
        9: "unrelated",
    }

    # Extract first 10 predictions
    y_pred_sample = test_data["Predicted_Stance"].head(10).tolist()

    # Extract manually labeled ground truth
    y_true_sample = [manual_labels[i] for i in range(10)]

    # Compute accuracy for first 10 samples
    sample_accuracy = accuracy_score(y_true_sample, y_pred_sample)

    print(f"\n✅ Accuracy on first 10 manually labeled samples: {sample_accuracy:.2f}")


⚠️ No ground truth labels available for accuracy evaluation.

🔍 Manually Checking First 10 Predictions:
   Body ID                                           Headline Predicted_Stance
0     2008  Ferguson riots: Pregnant woman loses eye after...            agree
1     1550  Crazy Conservatives Are Sure a Gitmo Detainee ...        unrelated
2        2  A Russian Guy Says His Justin Bieber Ringtone ...            agree
3     1793  Zombie Cat: Buried Kitty Believed Dead, Meows ...          discuss
4       37  Argentina's President Adopts Boy to End Werewo...          discuss
5     2353     Next-generation Apple iPhones' features leaked        unrelated
6      192  Saudi national airline may introduce gender se...            agree
7     2482  'Zombie Cat' Claws Way Out Of Grave And Into O...            agree
8      250     ISIS might be harvesting organs, Iraq tells UN          discuss
9       85  Woman has surgery to get third breast: The thr...          discuss

✅ Accuracy on first 10 man

In [64]:
import pandas as pd

# 📌 Load the dataset
file_path = "validated_merged_test_data.csv"  # Update this with your actual file path
df = pd.read_csv(file_path)

# 📌 Remove duplicate Body ID rows, keeping only the first occurrence
df_cleaned = df.drop_duplicates(subset=["Body ID"], keep="first")

# 📌 Save the cleaned dataset
output_file = "cleaned_validated_test_data.csv"
df_cleaned.to_csv(output_file, index=False)

print(f"✅ Cleaned dataset saved as {output_file}")


✅ Cleaned dataset saved as cleaned_validated_test_data.csv
