Import the libraries.

In [4]:
# Import Required Libraries
from pathlib import Path

import joblib  # For saving models
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Notebook-wide presentation defaults:
# show every DataFrame column instead of eliding the middle ones,
# and draw all figures with the seaborn dark-grid look.
pd.options.display.max_columns = None
plt.style.use('seaborn-v0_8-darkgrid')

In [5]:
# Read the pre-cleaned corpus and discard any row whose text is missing,
# so every remaining row can be vectorized.
df = pd.read_csv("../dataset/unified_cleaned_dataset.csv").dropna(subset=['clean_text'])

# Corpus size overall and per language code ('hi' = Hindi, 'en' = English)
is_hindi = df['language'] == 'hi'
is_english = df['language'] == 'en'
print(f"Total samples: {len(df)}")
print(f"Hindi samples: {is_hindi.sum()}")
print(f"English samples: {is_english.sum()}")
print(f"\nDataset shape: {df.shape}")
print("\nFirst few rows:")

# Last expression of the cell: rendered as a rich table by Jupyter
df.head()

Total samples: 81963
Hindi samples: 17123
English samples: 64840

Dataset shape: (81963, 3)

First few rows:


Unnamed: 0,clean_text,label,language
0,मोदी के शासन के दौरान गंगा गंगा नदी नरेन्द्र म...,1,hi
1,यह खबर आने से पहले छवि क्रेडिट जस्टिन सुलिवान ...,1,hi
2,गुलाब गेंद वाल डे नाइट टेस्ट मैच कप्ता विराट क...,0,hi
3,उत्तर कोरिया रॉकेट प्रक्षेपण योजनाएं 71 0 15 0...,1,hi
4,राष्ट्रपति डोनाल्ड ट्रम्प और प्रथम महिला मेलान...,0,hi


In [6]:
# Class balance of the target column: absolute counts first, then shares in percent
label_column = df['label']

print("\nLabel Distribution:")
print(label_column.value_counts())

print("\nLabel distribution (%):")
print(label_column.value_counts(normalize=True).mul(100))


Label Distribution:
label
0    51382
1    30581
Name: count, dtype: int64

Label distribution (%):
label
0    62.689262
1    37.310738
Name: proportion, dtype: float64


In [8]:
# Features are the cleaned text; the target is the label column
X, y = df['clean_text'], df['label']

# 80/20 hold-out split. Stratifying on y keeps the label proportions the same
# in both partitions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print(f"Training set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")

Training set size: 65570
Testing set size: 16393


In [None]:
# TF-IDF Vectorization
print("=" * 60)
print("TF-IDF VECTORIZATION")
print("=" * 60)

# Token pattern: maximal runs of 2+ characters that are either regex word
# characters or code points from the Devanagari block.
#
# Why not the scikit-learn default (r"(?u)\b\w\w+\b")? Python's `\w` only
# matches alphanumerics and underscore; it does NOT match combining marks such
# as Hindi vowel signs (matras), anusvara or virama. With the default pattern a
# Hindi word is cut at every matra and the resulting one-character pieces are
# discarded, so most Hindi vocabulary never reaches the model.
#
# The danda / double danda punctuation marks (U+0964, U+0965) are deliberately
# left out of the character class. For text without Devanagari the pattern
# yields exactly the same tokens as the default.
HINDI_AWARE_TOKEN_PATTERN = r"(?u)[\w\u0900-\u0963\u0966-\u097F]{2,}"

# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(
    max_features=20000,      # Keep the 20k terms with the highest corpus frequency
    ngram_range=(1, 2),      # Use unigrams and bigrams
    min_df=5,                # Ignore terms appearing in < 5 documents
    max_df=0.8,              # Ignore terms appearing in > 80% of documents
    stop_words=None,         # No stop words (dataset has Hindi + English)
    token_pattern=HINDI_AWARE_TOKEN_PATTERN,
)

# Fit on training data only (so no test-set vocabulary or IDF statistics leak
# into the model), then apply the fitted transform to both sets
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Share of zero entries in the training matrix
n_train_rows, n_features = X_train_tfidf.shape
train_sparsity = 1 - (X_train_tfidf.nnz / (n_train_rows * n_features))

# Display vectorization results
print(f"\nTraining set shape: {X_train_tfidf.shape}")
print(f"Test set shape: {X_test_tfidf.shape}")
print(f"Number of features extracted: {n_features}")
print(f"Training set sparsity: {train_sparsity:.2%}")
print("\n✅ TF-IDF vectorization complete!")

In [None]:
# Train Logistic Regression Model
print("\n" + "=" * 60)
print("LOGISTIC REGRESSION MODEL TRAINING")
print("=" * 60)

# Initialize Logistic Regression
# NOTE: n_jobs is intentionally not set. In LogisticRegression it only
# parallelises over classes in one-vs-rest multiclass fits; the target here is
# binary (labels 0/1), so there is a single fit and n_jobs=-1 would only add
# joblib dispatch overhead without changing the result.
lr_model = LogisticRegression(
    max_iter=1000,           # Upper bound on solver iterations
    random_state=42,         # For reproducibility
    C=1.0,                   # Inverse regularization strength (smaller = stronger penalty)
    class_weight='balanced'  # Weight classes inversely to their frequency
)

# Train the model
print("\nTraining Logistic Regression...")
lr_model.fit(X_train_tfidf, y_train)
print("✅ Model training complete!")

# Make predictions on the held-out test set:
# hard class labels plus the per-class probabilities
y_pred_lr = lr_model.predict(X_test_tfidf)
y_pred_proba_lr = lr_model.predict_proba(X_test_tfidf)

# Evaluate the model
print("\n" + "=" * 60)
print("LOGISTIC REGRESSION EVALUATION RESULTS")
print("=" * 60)

# Calculate accuracy (fraction of test samples whose predicted label is correct)
accuracy = accuracy_score(y_test, y_pred_lr)
print(f"\n📊 Overall Accuracy: {accuracy:.4f} ({accuracy:.2%})")

# Classification Report: per-class precision / recall / F1.
# target_names are applied in sorted label order, i.e. 0 -> Real, 1 -> Fake.
print("\n" + "-" * 60)
print("CLASSIFICATION REPORT:")
print("-" * 60)
print(classification_report(y_test, y_pred_lr, target_names=['Real (0)', 'Fake (1)'], digits=4))

# Confusion Matrix: rows are true labels, columns are predicted labels
cm = confusion_matrix(y_test, y_pred_lr)
print("-" * 60)
print("CONFUSION MATRIX:")
print("-" * 60)
print(cm)

# Draw the confusion matrix as an annotated heatmap on an explicit Axes
class_names = ['Real', 'Fake']
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,                   # write the count inside every cell
    fmt='d',                      # counts are integers
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    cbar_kws={'label': 'Count'},
    ax=ax,
)
ax.set_title('Logistic Regression - Confusion Matrix', fontsize=14, fontweight='bold')
ax.set_ylabel('True Label', fontsize=12)
ax.set_xlabel('Predicted Label', fontsize=12)
fig.tight_layout()
plt.show()

# Save model and vectorizer
# Create the output directory first: joblib.dump does not create missing
# parent directories, and failing here would throw away the trained model.
MODEL_DIR = Path('../models')
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# The fitted vectorizer is saved next to the model because inference must apply
# the exact same vocabulary and IDF weights that were used for training.
joblib.dump(lr_model, MODEL_DIR / 'logistic_regression_tfidf.pkl')
joblib.dump(tfidf, MODEL_DIR / 'tfidf_vectorizer.pkl')
print("\n✅ Model and vectorizer saved successfully!")