In [2]:
! pip install spacy transformers datasets scikit-learn rouge-score nltk spacy-transformers
! python -m spacy download en_core_web_sm
! python -m spacy download en_core_web_trf



Collecting en-core-web-sm==3.8.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


  _torch_pytree._register_pytree_node(


Collecting en-core-web-trf==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.8.0/en_core_web_trf-3.8.0-py3-none-any.whl (457.4 MB)
     ---------------------------------------- 0.0/457.4 MB ? eta -:--:--
     --------------------------------------- 3.1/457.4 MB 16.8 MB/s eta 0:00:28
      -------------------------------------- 6.8/457.4 MB 16.1 MB/s eta 0:00:28
      ------------------------------------- 10.7/457.4 MB 16.8 MB/s eta 0:00:27
     - ------------------------------------ 14.7/457.4 MB 17.8 MB/s eta 0:00:25
     - ------------------------------------ 18.6/457.4 MB 17.5 MB/s eta 0:00:26
     - ------------------------------------ 22.3/457.4 MB 17.6 MB/s eta 0:00:25
     -- ----------------------------------- 26.2/457.4 MB 17.7 MB/s eta 0:00:25
     -- ----------------------------------- 30.7/457.4 MB 18.0 MB/s eta 0:00:24
     -- ----------------------------------- 35.4/457.4 MB 18.6 MB/s eta 0:00:23
     --- ------------------

  _torch_pytree._register_pytree_node(


In [3]:
from datasets import load_dataset
from sklearn.model_selection import train_test_split

# Fetch the IMDb sentiment corpus
dataset = load_dataset("imdb")

# Hold out 20% of the IMDb "train" split for evaluation; the seed fixes the shuffle
train_texts, test_texts, train_labels, test_labels = train_test_split(
    dataset["train"]["text"],
    dataset["train"]["label"],
    test_size=0.2,
    random_state=42,
)


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import spacy

# Load the small English pipeline without the components this notebook never uses.
# The tagger has to stay enabled: the rule-based lemmatizer in en_core_web_sm works
# from the POS tags derived from it, and without them token.lemma_ degrades to the
# lower-cased surface form (spaCy reports this as warning W108).
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])

def preprocess_texts(texts, batch_size=50, n_process=4):
    """Normalise texts into space-joined, lower-cased lemmas.

    Every text is streamed through the module-level ``nlp`` pipeline; stop words
    and tokens that are not purely alphabetic are dropped.

    Args:
        texts: Iterable of strings to process.
        batch_size: Number of texts handed to ``nlp.pipe`` per batch.
        n_process: Number of worker processes used by ``nlp.pipe``. Pass 1 to
            stay in the current process (e.g. where multiprocessing is not usable).

    Returns:
        list[str]: One processed string per input text, in input order.
    """
    docs = nlp.pipe(texts, batch_size=batch_size, n_process=n_process)
    return [
        " ".join(
            token.lemma_.lower()
            for token in doc
            if not token.is_stop and token.is_alpha
        )
        for doc in docs
    ]

# Process the datasets.
# The processed strings are stored back under the same names, so from here on
# train_texts/test_texts hold preprocess_texts output rather than the raw reviews;
# re-running this cell feeds that output through preprocess_texts a second time.
train_texts = preprocess_texts(train_texts)
test_texts = preprocess_texts(test_texts)


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Bag-of-words features: the vocabulary is learned on the training texts only
# (capped at 1000 features) and then reused unchanged for the test texts
vectorizer = CountVectorizer(max_features=1000)
train_bow = vectorizer.fit_transform(train_texts)
test_bow = vectorizer.transform(test_texts)

# Fit logistic regression on the counts, then predict the held-out split
clf_bow = LogisticRegression(max_iter=1000).fit(train_bow, train_labels)
bow_predictions = clf_bow.predict(test_bow)

print("BoW Accuracy:", accuracy_score(test_labels, bow_predictions))


In [None]:
import gensim.downloader as api
import numpy as np

# Load pre-trained GloVe embeddings through gensim's downloader API.
# glove_vectors is used below both for membership tests (`word in glove_vectors`)
# and for vector lookup (`glove_vectors[word]`).
glove_vectors = api.load("glove-wiki-gigaword-50")

def embed_with_glove(texts):
    """Embed each text as the mean of the GloVe vectors of its known words.

    Texts are split on whitespace; words missing from the module-level
    ``glove_vectors`` are ignored.

    Args:
        texts: Iterable of strings.

    Returns:
        np.ndarray of shape ``(len(texts), glove_vectors.vector_size)``. A text
        with no known word is represented by an all-zero row.
    """
    # Take the width from the loaded model instead of hard-coding it, so a
    # different GloVe variant can be swapped in without editing this function.
    dim = glove_vectors.vector_size
    embeddings = []
    for text in texts:
        known = [glove_vectors[word] for word in text.split() if word in glove_vectors]
        if known:
            embeddings.append(np.mean(known, axis=0))
        else:
            # float32 zeros so a fallback row does not upcast the whole matrix
            # (assumes the loaded vectors are float32 — confirm for other models).
            embeddings.append(np.zeros(dim, dtype=np.float32))
    # reshape keeps the result 2-D even when `texts` is empty.
    return np.array(embeddings).reshape(-1, dim)

# Averaged GloVe features for both splits
train_glove = embed_with_glove(train_texts)
test_glove = embed_with_glove(test_texts)

# Fit logistic regression on the embeddings, then predict the held-out split
clf_glove = LogisticRegression(max_iter=1000).fit(train_glove, train_labels)
glove_predictions = clf_glove.predict(test_glove)

print("GloVe Accuracy:", accuracy_score(test_labels, glove_predictions))


In [None]:
nlp_trf = spacy.load("en_core_web_trf")


def _mean_pool_trf(doc):
    """Average the transformer's last-layer piece vectors of ``doc`` into one vector."""
    import numpy as np  # local import: do not depend on another cell having imported numpy

    trf_data = doc._.trf_data
    if hasattr(trf_data, "last_hidden_layer_state"):
        # Layout produced by spacy-curated-transformers: a Ragged whose
        # .data array has one row per word piece.
        pieces = trf_data.last_hidden_layer_state.data
    else:
        # NOTE(review): legacy spacy-transformers layout. tensors[0] is assumed to be
        # (n_spans, span_len, width) with padding rows included — confirm before
        # relying on this branch.
        hidden = trf_data.tensors[0]
        pieces = hidden.reshape(-1, hidden.shape[-1])
    if pieces.shape[0] == 0:
        # Nothing to average (e.g. an empty text): fall back to a zero vector.
        return np.zeros(pieces.shape[-1], dtype="float32")
    return pieces.mean(axis=0)


def embed_with_transformer(texts, batch_size=None):
    """Embed each text as the mean of its transformer output vectors.

    ``Doc.vector`` is not usable here: the transformer pipeline ships no static
    word vectors, so it comes back empty and a classifier fitted on it sees
    zero features. The contextual output stored in ``doc._.trf_data`` is pooled
    instead.

    Args:
        texts: Iterable of strings.
        batch_size: Batch size forwarded to ``nlp_trf.pipe``; ``None`` keeps the
            pipeline's default.

    Returns:
        list: One 1-D vector per input text, in input order.
    """
    # Only the transformer component is needed for embeddings; skipping the
    # tagger/parser/NER heads and batching via pipe() avoids per-text overhead.
    with nlp_trf.select_pipes(enable=["transformer"]):
        return [_mean_pool_trf(doc) for doc in nlp_trf.pipe(texts, batch_size=batch_size)]

# Transformer-derived features for both splits
train_transformer = embed_with_transformer(train_texts)
test_transformer = embed_with_transformer(test_texts)

# Fit logistic regression on the embeddings, then predict the held-out split
clf_transformer = LogisticRegression(max_iter=1000).fit(train_transformer, train_labels)
transformer_predictions = clf_transformer.predict(test_transformer)

print("Transformer-Based Accuracy:", accuracy_score(test_labels, transformer_predictions))


In [None]:
from transformers import pipeline

# Load summarization pipeline (Hugging Face `pipeline` with the
# facebook/bart-large-cnn checkpoint)
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Summarize a sample text
# The adjacent string literals below are concatenated into one paragraph.
sample_text = (
    "SpaceX designs, develops, and manufactures space launch vehicles, spacecraft, and satellite systems. "
    "Led by Elon Musk, SpaceX has launched a number of historic missions, including the first privately-funded "
    "craft to reach orbit and the first manned mission by a private company to the International Space Station."
)
# do_sample=False turns sampling off; min_length/max_length bound the length of
# the generated summary (presumably counted in tokens — confirm in the transformers docs).
summary = summarizer(sample_text, max_length=50, min_length=25, do_sample=False)
# The pipeline result is indexed as a list of dicts; the text sits under "summary_text".
print("Summary:", summary[0]["summary_text"])


In [None]:
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu

import re

from nltk.translate.bleu_score import SmoothingFunction

# Reference and generated summaries
reference_summary = "SpaceX develops space launch vehicles and spacecraft. It is led by Elon Musk and achieved historic milestones."
generated_summary = summary[0]["summary_text"]

# ROUGE Evaluation (the reference is passed first, the generated text second)
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
rouge_scores = scorer.score(reference_summary, generated_summary)
print("ROUGE Scores:", rouge_scores)

# BLEU Evaluation
# Tokenise on word characters and lower-case first: a plain str.split() leaves
# punctuation glued to words ("vehicles," != "vehicles"), which hides real overlaps.
reference_tokens = re.findall(r"\w+", reference_summary.lower())
generated_tokens = re.findall(r"\w+", generated_summary.lower())

# Sentence-level BLEU collapses to ~0 as soon as one n-gram order has no match,
# which is the norm for a single short summary; smoothing keeps the score informative.
bleu_score = sentence_bleu(
    [reference_tokens],
    generated_tokens,
    smoothing_function=SmoothingFunction().method1,
)
print("BLEU Score:", bleu_score)


In [None]:
import matplotlib.pyplot as plt

# Bar labels, in the same order as the prediction arrays scored below
approaches = ["BoW", "GloVe", "Transformer"]

# Held-out accuracy of each approach
accuracies = [
    accuracy_score(test_labels, predictions)
    for predictions in (bow_predictions, glove_predictions, transformer_predictions)
]

# One bar per approach
plt.bar(approaches, accuracies, color=["blue", "green", "orange"])
plt.xlabel("Approach")
plt.ylabel("Accuracy")
plt.title("Accuracy Comparison")
plt.show()
