In [None]:
!pip install transformers
!pip install datasets
!pip install contractions
!pip install accelerate
!pip install bitsandbytes



In [None]:
huggingface_dataset_name = "cnn_dailymail"

In [None]:
from datasets import load_dataset

dataset = load_dataset(huggingface_dataset_name, "3.0.0")
dataset

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})

In [None]:
sample = dataset["train"][1]
print(f"""Article (total length: {len(sample["article"])}):""")
print(sample["article"][:500])
print(f'\nSummary (length: {len(sample["highlights"])}):')
print(sample["highlights"])

Article (total length: 4051):
Editor's note: In our Behind the Scenes series, CNN correspondents share their experiences in covering news and analyze the stories behind the events. Here, Soledad O'Brien takes users inside a jail where many of the inmates are mentally ill. An inmate housed on the "forgotten floor," where many mentally ill inmates are housed in Miami before trial. MIAMI, Florida (CNN) -- The ninth floor of the Miami-Dade pretrial detention facility is dubbed the "forgotten floor." Here, inmates with the most s

Summary (length: 281):
Mentally ill inmates in Miami are housed on the "forgotten floor"
Judge Steven Leifman says most are there as a result of "avoidable felonies"
While CNN tours facility, patient shouts: "I am the son of the president"
Leifman says the system is unjust and he's fighting for change .


In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import re
import contractions
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

# Function for text preprocessing
class TextPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.stop_words = set(stopwords.words('english'))

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return [self.preprocess_text(text) for text in X]

    def preprocess_text(self, text):
        # Convert to lowercase
        text = text.lower()

        # Remove punctuation and digits
        text = text.translate(str.maketrans('', '', string.punctuation + string.digits))

        # Tokenization
        tokens = word_tokenize(text)

        # Remove stop words
        tokens = [word for word in tokens if word not in self.stop_words]

        # Join the tokens back into a string
        preprocessed_text = ' '.join(tokens)

        return preprocessed_text

# Function for regex cleaning
class RegexCleaner(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return [self.re_clean(text) for text in X]

    def re_clean(self, text):
        text = re.sub(r'https?:\/\/.*[\r\n]*', ' ', text, flags=re.MULTILINE)
        text = re.sub(r'\<a href', ' ', text)
        text = re.sub(r'&amp;', ' ', text)
        text = re.sub(r'[_\-;%()|+&=*%:#$@\[\]/]', ' ', text)
        text = re.sub(r'<br />', ' ', text)
        text = re.sub(r'\'', ' ', text)
        text = re.sub(r'\n',' ', text)
        text = re.sub(' est ',' ', text)
        text = re.sub(r'[?!]','.', text)
        return text

# Function to expand contractions
class ContractionsExpander(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return [self.expand_contractions(text) for text in X]

    def expand_contractions(self, text):
        return contractions.fix(text)

# Function to remove short sentences
class ShortSentencesRemover(BaseEstimator, TransformerMixin):
    def __init__(self, min_length=5):
        self.min_length = min_length

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return [self.remove_short_sentences(text) for text in X]

    def remove_short_sentences(self, text):
        sentences = text.split('.')
        cleaned_sentences = [sentence.strip()+'.' for sentence in sentences if len(sentence.split()) >= self.min_length]
        cleaned_text = ' '.join(cleaned_sentences)
        return cleaned_text

# Function to remove specified tags
class TagsRemover(BaseEstimator, TransformerMixin):
    def __init__(self, tags=['cnn', 'est']):
        self.tags = tags

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return [self.remove_tags(text) for text in X]

    def remove_tags(self, text):
        for tag in self.tags:
            tag_pos = text.find(tag)
            if tag_pos != -1 and tag_pos < len(text)//10:
                text = text[tag_pos + len(tag):]
        return text

# Combine the transformers into a preprocessing pipeline
preprocessing_pipeline = Pipeline([
    ('regex_cleaner', RegexCleaner()),
    ('contractions_expander', ContractionsExpander()),
    ('text_preprocessor', TextPreprocessor()),
    ('short_sentences_remover', ShortSentencesRemover()),
    ('tags_remover', TagsRemover())
])


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Apply preprocessing pipeline to the dataset
preprocessed_dataset = dataset.map(lambda example: {'article': preprocessing_pipeline.transform([example['article']])[0],
                                                    'highlights': preprocessing_pipeline.transform([example['highlights']])[0]})

In [None]:
preprocessed_dataset["train"][1]

{'article': ' correspondents share experiences covering news analyze stories behind events soledad brien takes users inside jail many inmates mentally ill inmate housed forgotten floor many mentally ill inmates housed miami trial miami florida cnn ninth floor miami dade pretrial detention facility dubbed forgotten floor inmates severe mental illnesses incarcerated ready appear court often face drug charges charges assaulting officer charges judge steven leifman says usually avoidable felonies says arrests often result confrontations police mentally ill people often told police arrive scene confrontation seems exacerbate illness become paranoid delusional less likely follow directions according leifman end ninth floor severely mentally disturbed getting real help jail toured jail leifman well known miami advocate justice mentally ill even though exactly welcomed open arms guards given permission shoot videotape tour floor go inside forgotten floor Â» first hard determine people prisoner

In [None]:
sample1 = preprocessed_dataset["train"][1]
print(f"""Article (total length: {len(sample1["article"])}):""")
print(sample1["article"][:500])
print(f'\nSummary (length: {len(sample1["highlights"])}):')
print(sample1["highlights"])

Article (total length: 2535):
 correspondents share experiences covering news analyze stories behind events soledad brien takes users inside jail many inmates mentally ill inmate housed forgotten floor many mentally ill inmates housed miami trial miami florida cnn ninth floor miami dade pretrial detention facility dubbed forgotten floor inmates severe mental illnesses incarcerated ready appear court often face drug charges charges assaulting officer charges judge steven leifman says usually avoidable felonies says arrests of

Summary (length: 193):
mentally ill inmates miami housed forgotten floor judge steven leifman says result avoidable felonies cnn tours facility patient shouts son president leifman says system unjust fighting change.


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import json

# Save each split of the DatasetDict separately
for split in preprocessed_dataset.keys():
    # Convert split to a list of dictionaries
    split_data_list = preprocessed_dataset[split].to_dict()

    # Define the file path for the split
    file_path = f'/content/drive/My Drive/capstone_preprocessedData/preprocessed_dataset_{split}.json'

    # Save split as a JSON file
    with open(file_path, 'w') as json_file:
        json.dump(split_data_list, json_file)

    print(f"Preprocessed {split} dataset saved successfully.")


Preprocessed train dataset saved successfully.
Preprocessed validation dataset saved successfully.
Preprocessed test dataset saved successfully.


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
import numpy as np
from datasets import load_dataset

# Load the dataset
dataset = load_dataset("cnn_dailymail", "3.0.0")

# Define custom transformer for TF-IDF feature extraction
class ImportantWordsExtractor(BaseEstimator, TransformerMixin):
    def __init__(self, max_features=10):
        self.tfidf_vectorizer = TfidfVectorizer(max_features=max_features)

    def fit(self, X, y=None):
        self.tfidf_vectorizer.fit(X)
        return self

    def transform(self, X):
        # Transform articles into TF-IDF features
        tfidf_matrix = self.tfidf_vectorizer.transform(X)
        # Get feature names (important words) from TF-IDF vectorizer
        feature_names = np.array(self.tfidf_vectorizer.get_feature_names_out())
        # Get indices of top TF-IDF features for each article
        top_feature_indices = tfidf_matrix.toarray().argsort(axis=1)[:, ::-1][:, :self.tfidf_vectorizer.max_features]
        # Extract top TF-IDF features (important words) for each article
        important_words = [feature_names[indices] for indices in top_feature_indices]
        return important_words

# Define a pipeline for extracting important words
important_words_pipeline_train = Pipeline([
    ('important_words_extractor', ImportantWordsExtractor())
])

# Fit the pipeline on the training dataset
print("Fitting important words extraction pipeline on the training dataset...")
important_words_pipeline_train.fit(preprocessed_dataset['train']['article'])
print("Pipeline fitting completed.")

# Apply important words extraction pipeline to the train dataset
print("Fitting and transforming training dataset...")
important_words_train = important_words_pipeline_train.fit_transform(preprocessed_dataset['train']['article'])
print("Done.")


Fitting important words extraction pipeline on the training dataset...
Pipeline fitting completed.
Fitting and transforming training dataset...
Done.


In [None]:
sample1 = preprocessed_dataset["train"][1]
print(f"""Article (total length: {len(sample1["article"])}):""")
print(sample1["article"][:500])
print(f'\nSummary (length: {len(sample1["highlights"])}):')
print(sample1["highlights"])

Article (total length: 2535):
 correspondents share experiences covering news analyze stories behind events soledad brien takes users inside jail many inmates mentally ill inmate housed forgotten floor many mentally ill inmates housed miami trial miami florida cnn ninth floor miami dade pretrial detention facility dubbed forgotten floor inmates severe mental illnesses incarcerated ready appear court often face drug charges charges assaulting officer charges judge steven leifman says usually avoidable felonies says arrests of

Summary (length: 193):
mentally ill inmates miami housed forgotten floor judge steven leifman says result avoidable felonies cnn tours facility patient shouts son president leifman says system unjust fighting change.


In [None]:
sample2 = preprocessed_dataset["train"][10]
print(f"""Article (total length: {len(sample2["article"])}):""")
print(sample2["article"][:500])
print(f'\nSummary (length: {len(sample2["highlights"])}):')
print(sample2["highlights"])

Article (total length: 3226):
 awaits crucial progress report iraq president bush try put twist comparisons war vietnam invoking historical lessons conflict argue pulling president bush pauses tuesday news conference north american leaders summit canada wednesday kansas city missouri bush tell members veterans foreign wars people argued real problem america presence would withdraw killing would end according speech excerpts released tuesday white house three decades later legitimate debate got vietnam war left bush say whate

Summary (length: 170):
president bush address veterans foreign wars wednesday bush say withdrawing vietnam emboldened today terrorists speech latest white house attempt try reframe debate iraq.


In [None]:
import random

# Define the size of the subset (e.g., 1000 articles)
subset_size = 1000

# Sample a subset of articles
random.seed(42)  # Set a seed for reproducibility
subset_indices = random.sample(range(len(preprocessed_dataset['train']['article'])), subset_size)
subset_articles = [preprocessed_dataset['train']['article'][i] for i in subset_indices]

# Tokenize each document in the subset
tokenized_data = [doc.split() for doc in subset_articles]

# Create dictionary and corpus
dictionary = corpora.Dictionary(tokenized_data)
corpus = [dictionary.doc2bow(doc) for doc in tokenized_data]

# Train LDA model
lda_model = models.LdaModel(corpus, num_topics=10, id2word=dictionary, passes=10)

# Print topics
pprint.pprint(lda_model.print_topics())

NameError: name 'corpora' is not defined

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class CustomPromptGenerator(BaseEstimator, TransformerMixin):
    def __init__(self, max_summary_length=50):
        self.max_summary_length = max_summary_length

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        custom_prompts = []
        for article, important_words in X:
            custom_prompt = self.generate_custom_prompt(article, important_words)
            custom_prompts.append(custom_prompt)
        return custom_prompts

    def generate_custom_prompt(self, article, important_words):
        # Define a template sentence structure for the prompt
        prompt_template = "Summarize the article using the following important words: {}. Limit your summary to {} words."

        # Combine the important words into a comma-separated string
        important_words_str = ", ".join(important_words)

        # Construct the custom prompt by inserting the important words and the summary length limit into the template
        custom_prompt = prompt_template.format(important_words_str, self.max_summary_length)

        return custom_prompt

# Create a pipeline for generating custom prompts
custom_prompt_pipeline = Pipeline([
    ('custom_prompt_generator', CustomPromptGenerator())
])

# Apply custom prompt generation pipeline to the preprocessed dataset
custom_prompts_train = custom_prompt_pipeline.transform(zip(preprocessed_dataset['train']['article'], important_words_train))

# Now, custom_prompts_train, custom_prompts_validation, and custom_prompts_test contain the generated custom prompts
custom_prompts_train[4]

'Summarize the article using the following important words: said, also, would, two, last, year, one, time, people, new. Limit your summary to 50 words.'

In [None]:
!pip install -q -U bitsandbytes
!pip install -q -U accelerate

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from sklearn.base import BaseEstimator, TransformerMixin
import bitsandbytes as bnb


class LLAMASummarizer(BaseEstimator, TransformerMixin):
    def __init__(self, model_id="NousResearch/Llama-2-7b-hf"):
        self.model_id = model_id
        self.bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16
        )
        self.model = AutoModelForCausalLM.from_pretrained(self.model_id, quantization_config=self.bnb_config, device_map="auto")
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_id)
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.tokenizer.padding_side = "right"

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        generated_summaries = []
        for custom_prompt in X:
            generated_summary = self.generate_summary(custom_prompt)
            generated_summaries.append(generated_summary)
        return generated_summaries

    def generate_summary(self, custom_prompt):
        inputs = self.tokenizer(custom_prompt, return_tensors='pt')
        output = self.model.generate(inputs["input_ids"], max_new_tokens=100)[0]
        decoded_output = self.tokenizer.decode(output, skip_special_tokens=True)
        return decoded_output

# Create a pipeline for LLAMA summarization
llama_summarizer_pipeline = Pipeline([
    ('custom_prompt_generator', CustomPromptGenerator()),
    ('llama_summarizer', LLAMASummarizer())
])

# Apply the LLAMA summarization pipeline to the preprocessed dataset
generated_summaries_train = llama_summarizer_pipeline.transform(zip(preprocessed_dataset['train']['article'], important_words_train))

# Now, generated_summaries_train, generated_summaries_validation, and generated_summaries_test contain the summaries generated by LLAMA-2 using the custom prompts


OSError: libcudart.so.11.0: cannot open shared object file: No such file or directory

TRYING TO RUN LLAMA2

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

model_id =  "NousResearch/Llama-2-7b-hf"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map="auto")

tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

ImportError: Using `bitsandbytes` 8-bit quantization requires Accelerate: `pip install accelerate` and the latest version of bitsandbytes: `pip install -i https://pypi.org/simple/ bitsandbytes`

ZERO SHOT PROMPT TRY

In [None]:
index = 2

dialogue = preprocessed_dataset['train']['article'][index]
summary = preprocessed_dataset['train']['highlights'][index]

prompt = f"""
Summarize the following conversation.

### Input:
{dialogue}

### Summary:
"""

inputs = tokenizer(prompt, return_tensors='pt')
output = tokenizer.decode(
    model.generate(
        inputs["input_ids"],
        max_new_tokens=100,
    )[0],
    skip_special_tokens=True
)

dash_line = '-'.join('' for x in range(100))
print(dash_line)
print(f'INPUT PROMPT:\n{prompt}')
print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{summary}\n')
print(dash_line)
print(f'MODEL GENERATION - ZERO SHOT:\n{output}')
