In [None]:
import pandas as pd
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

import pickle

# Read the labelled corpus and discard any row that is missing either its
# domain label or its text body.
df = pd.read_csv("synthetic_summarization_dataset_3000.csv").dropna(
    subset=["domain", "text"]
)

# Features are the raw texts, targets are the domain labels; both are
# coerced to plain strings.
X, y = (df[column].astype(str) for column in ("text", "domain"))

# Hold out 20% of the rows for evaluation. The split is stratified on the
# label and seeded so that repeated runs produce the same partition.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# Domain classifier: TF-IDF over unigrams and bigrams (vocabulary capped at
# 50,000 features) feeding a multinomial Naive Bayes model. `fit` returns
# the pipeline itself, so the fitted estimator is bound directly.
domain_clf = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer(ngram_range=(1, 2), max_features=50000)),
        ("clf", MultinomialNB()),
    ]
).fit(X_train, y_train)

# Score the held-out rows and print per-class precision / recall / F1.
# NOTE(review): the recorded run reports 1.00 for every class; worth
# confirming the train and test partitions do not share (near-)duplicate
# texts before trusting these numbers.
y_pred = domain_clf.predict(X_test)
print(classification_report(y_test, y_pred))

# Extractive summarizer (TextRank), created with its default settings.
summarizer = TextRankSummarizer()

# Bundle the fitted classifier and the summarizer under fixed keys so that
# both can be restored from a single pickle file.
nlp_model = {"domain_classifier": domain_clf, "summarizer": summarizer}

# Serialize the bundle to disk; the file is opened in binary mode as
# pickle requires.
with open("nlp_multidim.pkl", "wb") as f:
    pickle.Pickler(f).dump(nlp_model)

print("✅ Model saved as nlp_multidim.pkl")


              precision    recall  f1-score   support

    business       1.00      1.00      1.00        60
     culture       1.00      1.00      1.00        63
   education       1.00      1.00      1.00        55
 environment       1.00      1.00      1.00        63
     finance       1.00      1.00      1.00        63
      health       1.00      1.00      1.00        58
     science       1.00      1.00      1.00        56
     society       1.00      1.00      1.00        61
      sports       1.00      1.00      1.00        58
  technology       1.00      1.00      1.00        63

    accuracy                           1.00       600
   macro avg       1.00      1.00      1.00       600
weighted avg       1.00      1.00      1.00       600

✅ Model saved as nlp_multidim.pkl
