# Natural Language Processing 

## Load and Explore the Dataset

In [60]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer


from nltk.tokenize import sent_tokenize

In [61]:
import nltk

from nltk.corpus import gutenberg

# Download the Gutenberg corpus (only needed once)
nltk.download('gutenberg')

# Keep the whole novel as a single string and preview its first 300 characters
raw = gutenberg.raw('carroll-alice.txt')
print(raw[:300])

[Alice's Adventures in Wonderland by Lewis Carroll 1865]

CHAPTER I. Down the Rabbit-Hole

Alice was beginning to get very tired of sitting by her sister on the
bank, and of having nothing to do: once or twice she had peeped into the
book her sister was reading, but it had no pictures or conversatio


[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


## Text Tokenization

In [62]:
# Write a function to count sentences
def count_sentences(text):
    """Approximate the number of sentences in `text`.

    Every occurrence of '.', '!' or '?' is counted as one sentence ending.
    """
    terminators = '.!?'
    return sum(1 for symbol in text if symbol in terminators)

# Note: count_sentences counts every '.', '!' and '?' character, so this
# figure is only an approximation of the real number of sentences.
sent_count = count_sentences(raw)
print(f'There are {sent_count} sentences in this book')

There are 1640 sentences in this book


In [63]:
nltk.download('punkt')

# Tokenize the raw text into word-level tokens (not sentences)
from nltk.tokenize import word_tokenize

# Preview the first 40 tokens
token_words = word_tokenize(raw)
print(token_words[:40])


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


['[', 'Alice', "'s", 'Adventures', 'in', 'Wonderland', 'by', 'Lewis', 'Carroll', '1865', ']', 'CHAPTER', 'I', '.', 'Down', 'the', 'Rabbit-Hole', 'Alice', 'was', 'beginning', 'to', 'get', 'very', 'tired', 'of', 'sitting', 'by', 'her', 'sister', 'on', 'the', 'bank', ',', 'and', 'of', 'having', 'nothing', 'to', 'do', ':']


In [64]:
import re
## Type solution here ##
# I decided to keep new lines
# Define a function to split text into chapters based on some pattern

def split_into_chapters(text):
    """Cut `text` at every newline-prefixed 'CHAPTER ' heading.

    Returns the pieces in document order with surrounding whitespace stripped;
    pieces that are empty after stripping are discarded. The 'CHAPTER' keyword
    itself is consumed by the split, so each piece starts right after it.
    """
    pieces = re.split(r'\nCHAPTER +', text)
    stripped_pieces = (piece.strip() for piece in pieces)
    return [piece for piece in stripped_pieces if piece]

# Break the full text into its chapter-level pieces
chapters = split_into_chapters(raw)

# Key each piece by its 1-based position in the split result
chapter_dict = dict(enumerate(chapters, start=1))

# Report how many pieces were found
print(f"Number of chapters: {len(chapter_dict)}")




Number of chapters: 12


In [65]:
## Provided this Cell  ##

import pandas as pd
import numpy as np
import csv
import re
import os

# Directory used as the Hugging Face home (model cache). An HF_HOME value that
# is already set in the environment takes precedence, so the notebook is not
# tied to one user's local directory layout; when it is not set, the original
# location is used exactly as before.
MODEL_PATH = os.environ.get('HF_HOME', '/Users/yerdenovagulnaz/Downloads/EP_models/')
os.environ['HF_HOME'] = MODEL_PATH  # must be set before transformers is imported

import transformers
from transformers import pipeline

# Silence UserWarning messages
from warnings import simplefilter
simplefilter(action='ignore', category=UserWarning)

print(f'transformers version= {transformers.__version__}')

transformers version= 4.42.4


In [66]:
# Load the question-answering tokenizer and model directly
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

tokenizer_qa = AutoTokenizer.from_pretrained("deepset/roberta-base-squad2")
model_qa = AutoModelForQuestionAnswering.from_pretrained("deepset/roberta-base-squad2")

# Use a pipeline as a high-level helper
from transformers import pipeline

# Run on the first GPU when CUDA is available and fall back to the CPU (-1)
# otherwise, instead of failing on machines without a GPU.
qa_device = 0 if torch.cuda.is_available() else -1
qa_pipeline = pipeline("question-answering", model=model_qa, tokenizer=tokenizer_qa, device=qa_device)

## LLM - Masked - Question-Answer

In [67]:
from transformers import BertTokenizer, BertForMaskedLM
import torch

# The tokenizer and the masked-LM model are loaded from the same checkpoint
bert_checkpoint = 'bert-base-uncased'
tokenizer_m = BertTokenizer.from_pretrained(bert_checkpoint)
model_m = BertForMaskedLM.from_pretrained(bert_checkpoint)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [68]:
# Sentence in which the word to predict is replaced by the mask token
text = "Alice in [MASK]."
input_ids = tokenizer_m.encode(text, return_tensors="pt")

# Position(s) of the mask token along the second axis of the id tensor
is_mask = input_ids == tokenizer_m.mask_token_id
mask_token_index = is_mask.nonzero(as_tuple=True)[1]

# Forward pass without gradient tracking; keep only the logits at the mask
with torch.no_grad():
    output = model_m(input_ids)
predictions = output.logits[0, mask_token_index]

# The highest-scoring vocabulary id is the prediction for the masked token
predicted_index = predictions.argmax().item()
predicted_token = tokenizer_m.decode([predicted_index])

print(f"The most probable word after 'Alice in' is: {predicted_token}")


The most probable word after 'Alice in' is: wonderland


I am using a masked language model (MLM) approach, which seems to be the most reasonable for this task. The approach is to mask the word that needs to be predicted and then use the model to predict it.

The answer is correct, even without fine-tuning the model.

## Large Language Model - Question-Answer

In [69]:
# Write a function to split the corpus into chunks that will be suitable for LLM
def split_into_chunks(text, chunk_size, overlap_size):
    """Split `text` into overlapping chunks of whitespace-separated words.

    Args:
        text: The text to split; words are obtained with ``str.split()``.
        chunk_size: Maximum number of words per chunk (must be positive).
        overlap_size: Number of words shared by consecutive chunks
            (must satisfy ``0 <= overlap_size < chunk_size``).

    Returns:
        A list of chunk strings with words joined by single spaces. Empty or
        whitespace-only text yields an empty list.

    Raises:
        ValueError: If the sizes would make the window stand still, move
            backwards, or skip words.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap_size < chunk_size:
        raise ValueError("overlap_size must satisfy 0 <= overlap_size < chunk_size")

    words = text.split()
    step = chunk_size - overlap_size
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(' '.join(words[start: start + chunk_size]))
        # Stop once a window reaches the last word: any later window would
        # contain only words that are already part of this chunk.
        if start + chunk_size >= len(words):
            break
    return chunks

In [70]:
from transformers import RobertaTokenizer, RobertaForQuestionAnswering, pipeline

# Window configuration: words per chunk and words shared by neighbouring chunks
chunk_size, overlap_size = 500, 20

# Cut the whole book into overlapping word windows
chunks = split_into_chunks(raw, chunk_size=chunk_size, overlap_size=overlap_size)

# Function to find the best answer from all the chunks
def get_best_answer(question, chunks, qa=None):
    """Return the highest-scoring answer to `question` across all `chunks`.

    Args:
        question: The question to ask.
        chunks: Iterable of context strings; the model is queried once per chunk.
        qa: Optional callable invoked as ``qa(question=..., context=...)`` that
            returns a mapping with 'answer' and 'score' keys. Defaults to the
            notebook's ``qa_pipeline``, so existing two-argument calls behave
            as before while another model can be used without redefining this
            function.

    Returns:
        The 'answer' of the result with the highest 'score' (the earliest chunk
        wins ties), or "No answer found" when `chunks` is empty.
    """
    if qa is None:
        qa = qa_pipeline  # looked up at call time

    results = [qa(question=question, context=chunk) for chunk in chunks]
    if not results:
        return "No answer found"

    # max() keeps the first maximal element, matching a first-index lookup
    best = max(results, key=lambda result: result['score'])
    return best['answer']

# Questions put to the model about the story
questions = [
    "What is Rabbit's catchphrase?",
    "What makes Alice shrink?",
    "What is the Queen's full name?",
    "What game does Alice play?",
]

# Map every question to the best answer found across all chunks
answers = {}
for question in questions:
    answers[question] = get_best_answer(question, chunks)

# Show each question together with its answer
for question, answer in answers.items():
    print(f"Q: {question}\nA: {answer}\n")


Q: What is Rabbit's catchphrase?
A: UNimportant

Q: What makes Alice shrink?
A: the fan she was holding

Q: What is the Queen's full name?
A: The Queen of Hearts

Q: What game does Alice play?
A: croquet



In [71]:
# Pattern intended to capture the White Rabbit's dialogue: the word "Rabbit",
# then a speech verb, then the nearest following quoted string. Matching is
# non-greedy, case-insensitive and may span line breaks.
pattern = re.compile(
    r'\bRabbit\b.*?(?:said|cried|exclaimed|asked|muttered|remarked|called|replied|added|begged|whispered)\b.*?(?:"[^"]*"|\'[^\']*\')',
    re.IGNORECASE | re.DOTALL
)

# Find all matches in the raw text
matches = pattern.findall(raw)

# Join all matches into a single string with each dialogue on a new line
dialogues = "\n".join(matches)

# Split the text into chunks with overlap
chunks_r = split_into_chunks(dialogues, chunk_size, overlap_size)

# Ask once and print exactly the question that was sent to the model, so the
# printed label cannot differ from the actual query.
rabbit_question = "What is Rabbit's saying?"
answer = get_best_answer(rabbit_question, chunks_r)
print(f"Q: {rabbit_question}\nA: {answer}\n")

# The answer is still not correct: restricting the context to a more specific
# dataset did not improve it.


Q: What is Rabbit's catchphrase?
A: 'Silence in the court!'



In [72]:
# Collect every sentence of the raw text that contains "shrink" and use those
# sentences as contexts, to try to improve the answer to the shrinking question.
sentences = sent_tokenize(raw)
shrink_sentences = [sentence for sentence in sentences if 'shrink' in sentence.lower()]

# Print exactly the question that is sent to the model
shrink_question = "What causes Alice to shrink?"
answer = get_best_answer(shrink_question, shrink_sentences)
print(f"Q: {shrink_question}\nA: {answer}\n")
# The answer did not improve

Q: What makes Alice shrink??
A: the fan she was holding



In [73]:
# Load a second question-answering checkpoint directly
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-cased-distilled-squad")
model = AutoModelForQuestionAnswering.from_pretrained("distilbert/distilbert-base-cased-distilled-squad")

# Use the first GPU when CUDA is available, otherwise the CPU (-1), so the
# cell does not fail on machines without a GPU.
qa_device_2 = 0 if torch.cuda.is_available() else -1
qa_pipeline_2 = pipeline("question-answering", model=model, tokenizer=tokenizer, device=qa_device_2)

In [74]:
from transformers import RobertaTokenizer, RobertaForQuestionAnswering, pipeline

# Window configuration: words per chunk and words shared by neighbouring chunks
chunk_size, overlap_size = 500, 20

# Cut the whole book into overlapping word windows
chunks = split_into_chunks(raw, chunk_size=chunk_size, overlap_size=overlap_size)

# Function to find the best answer from all the chunks
def get_best_answer(question, chunks, qa=None):
    """Return the highest-scoring answer to `question` across all `chunks`.

    Args:
        question: The question to ask.
        chunks: Iterable of context strings; the model is queried once per chunk.
        qa: Optional callable invoked as ``qa(question=..., context=...)`` that
            returns a mapping with 'answer' and 'score' keys. Defaults to the
            notebook's ``qa_pipeline_2``, so existing two-argument calls behave
            as before while another model can be used without redefining this
            function.

    Returns:
        The 'answer' of the result with the highest 'score' (the earliest chunk
        wins ties), or "No answer found" when `chunks` is empty.
    """
    if qa is None:
        qa = qa_pipeline_2  # looked up at call time

    results = [qa(question=question, context=chunk) for chunk in chunks]
    if not results:
        return "No answer found"

    # max() keeps the first maximal element, matching a first-index lookup
    best = max(results, key=lambda result: result['score'])
    return best['answer']

# Questions put to the model about the story
questions = [
    "What is Rabbit's catchphrase?",
    "What makes Alice shrink?",
    "What is the Queen's full name?",
    "What game does Alice play?",
]

# Map every question to the best answer found across all chunks
answers = {}
for question in questions:
    answers[question] = get_best_answer(question, chunks)

# Show each question together with its answer
for question, answer in answers.items():
    print(f"Q: {question}\nA: {answer}\n")


Q: What is Rabbit's catchphrase?
A: 'Your Majesty must cross-examine THIS witness

Q: What makes Alice shrink?
A: sneezing

Q: What is the Queen's full name?
A: Alice

Q: What game does Alice play?
A: croquet



## LLM - Question-Answer

In [75]:
## Type solution here ##
# Ask the authorship question against the first piece of the chapter split
author_question = "Who wrote Alice's Adventures in Wonderland?"
answer = qa_pipeline(author_question, chapters[0])
print(f"Q: {author_question}\nA: {answer['answer']}\n")

# A chunk of an article from wikipedia
context = "Johnny Depp as Tarrant Hightopp / Mad Hatter:[8] Wasikowska said that the characters 'both feel like outsiders and feel alone in their separate worlds, and have a special bond and friendship.'[9][10] Burton explained that Depp 'tried to find a grounding to the character … as opposed to just being mad.'[11] Burton also said that '[i]n a lot of versions it's a very one-note kind of character and you know [Depp's] goal was to try and bring out a human side to the strangeness of the character.'[11] The orange hair is an allusion to the mercury poisoning suffered by hatters who used mercury to cure felt; Depp believes that the character 'was poisoned … and it was coming out through his hair, through his fingernails and eyes'.[12] Depp and Burton decided that the Hatter's clothes, skin, hair, personality and accent would change throughout the film to reflect his emotions.[13] In an interview with Depp, the character was paralleled to 'a mood ring, [as] his emotions are very close to the surface'.[14] The Hatter is 'made up of different people and their extreme sides', with a gentle voice much like the character's creator Lewis Carroll reflecting the lighter personality and with a Scottish Glaswegian accent (which Depp modeled after Gregor Fisher's Rab C. Nesbitt character) reflecting a darker, more dangerous personality.[15] Illusionary dancer David 'Elsewhere' Bernal doubled for Depp during the 'Futterwacken' sequence near the end of the film.[16]"
# Ask about the film adaptation, using the excerpt stored in `context`
hatter_question = "Who played the Mad Hatter in Tim Burton's film version of Alice's Adventures in Wonderland?"
answer_1 = qa_pipeline(hatter_question, context)
print(f"Q: {hatter_question}\nA: {answer_1['answer']}\n")



Q: Who wrote Alice's Adventures in Wonderland?
A: Lewis Carroll

Q: Who played the Mad Hatter in Tim Burton's film version of Alice's Adventures in Wonderland?
A: Johnny Depp



## Sentiment Analysis

In [76]:
## Type solution here ##

# Function to split text into chunks of a given size with whole sentences
def split_into_chunks_2(text, max_length=512, tokenizer=None):
    """Group whole sentences of `text` into chunks that fit a token budget.

    Args:
        text: The text to split; sentences are produced by ``sent_tokenize``.
        max_length: Maximum number of token ids, as counted by
            ``tokenizer.encode``, that a multi-sentence chunk may reach.
        tokenizer: Object exposing ``encode(text, truncation=False)``. It is
            required even though it is declared with a ``None`` default.

    Returns:
        A list of chunk strings, each made of consecutive sentences joined by
        single spaces. A single sentence that alone exceeds `max_length` is
        never split and is returned as its own (oversized) chunk.

    Raises:
        ValueError: If no tokenizer is supplied.
    """
    if tokenizer is None:
        raise ValueError("split_into_chunks_2 requires a tokenizer")

    chunks = []
    chunk = ""

    for sentence in sent_tokenize(text):
        # Insert the separator only between sentences, so that a chunk never
        # starts with a stray leading space.
        candidate = f"{chunk} {sentence}" if chunk else sentence
        if len(tokenizer.encode(candidate, truncation=False)) > max_length:
            # The sentence does not fit: close the current chunk (if any) and
            # start a new one with this sentence.
            if chunk:
                chunks.append(chunk)
            chunk = sentence
        else:
            chunk = candidate

    if chunk:
        chunks.append(chunk)

    return chunks

In [77]:
from collections import Counter

# Load the sentiment analysis model on the first GPU when CUDA is available,
# falling back to the CPU (-1) instead of failing on machines without a GPU
import torch

sentiment_device = 0 if torch.cuda.is_available() else -1
sentiment_pipeline = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment-latest", device=sentiment_device)

# Classify every chapter chunk by chunk, then majority-vote the chapter label
for chapter_number, chapter_text in chapter_dict.items():
    # Sentence-aligned pieces sized against a 512-token budget
    chunks = split_into_chunks_2(
        chapter_text,
        max_length=512,
        tokenizer=sentiment_pipeline.tokenizer,
    )

    # One predicted label per chunk, in reading order
    chunk_sentiments = []
    for i, chunk in enumerate(chunks):
        result = sentiment_pipeline(chunk)
        sentiment_label = result[0]['label']
        chunk_sentiments.append(sentiment_label)
        print(f"  Chunk {i+1}: Sentiment: {sentiment_label}, Score: {result[0]['score']:.4f}")

    # Hard voting: the most frequent chunk label becomes the chapter label
    sentiment_counter = Counter(chunk_sentiments)
    most_common_sentiment = sentiment_counter.most_common(1)[0][0]

    print(f"Chapter {chapter_number} overall sentiment: {most_common_sentiment}")
    print()  # blank line between chapters

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  Chunk 1: Sentiment: neutral, Score: 0.8965
Chapter 1 overall sentiment: neutral

  Chunk 1: Sentiment: neutral, Score: 0.6451
  Chunk 2: Sentiment: neutral, Score: 0.8079
  Chunk 3: Sentiment: neutral, Score: 0.6030
  Chunk 4: Sentiment: neutral, Score: 0.5376
  Chunk 5: Sentiment: neutral, Score: 0.5231
  Chunk 6: Sentiment: negative, Score: 0.5252
  Chunk 7: Sentiment: neutral, Score: 0.7869
Chapter 2 overall sentiment: neutral

  Chunk 1: Sentiment: neutral, Score: 0.5201
  Chunk 2: Sentiment: neutral, Score: 0.7270
  Chunk 3: Sentiment: negative, Score: 0.5061
  Chunk 4: Sentiment: neutral, Score: 0.7401
  Chunk 5: Sentiment: neutral, Score: 0.6866
  Chunk 6: Sentiment: negative, Score: 0.5637
Chapter 3 overall sentiment: neutral

  Chunk 1: Sentiment: neutral, Score: 0.8181
  Chunk 2: Sentiment: neutral, Score: 0.8285
  Chunk 3: Sentiment: neutral, Score: 0.7266
  Chunk 4: Sentiment: negative, Score: 0.4905
  Chunk 5: Sentiment: neutral, Score: 0.6108
  Chunk 6: Sentiment: neutr

## Summarization

In [78]:
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM

# Load the summarization model and tokenizer
import torch

tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")

# Create a summarization pipeline on the first GPU when CUDA is available,
# otherwise on the CPU (-1), so the cell also runs without a GPU
summarizer_device = 0 if torch.cuda.is_available() else -1
summarizer = pipeline("summarization", model=model, max_length=100, tokenizer=tokenizer, device=summarizer_device)

In [79]:
# Split the whole book into sentence-aligned chunks sized against 1024 tokens
chunks = split_into_chunks_2(raw, max_length=1024, tokenizer=tokenizer)

# Summarize the chunks one at a time, preserving their order
results = []
for chunk in chunks:
    results.append(summarizer(chunk))

# Show every summary under its 1-based chunk number
for i, result in enumerate(results):
    print(f"Chunk {i+1}:")
    print(result[0]['summary_text'])  # summary text of the first returned entry
    print()


Chunk 1:
Alice fell down a rabbit-hole after a White Rabbit with pink eyes. She had never before seen a rabbit with either a waistcoat-pocket, or a watch to take out of it. 'I wonder how many miles I've fallen by this time?' she said aloud. 'How funny it'll seem among the people that I walk with their heads downward!'

Chunk 2:
Alice was walking hand in hand with Dinah, and saying to her very earnestly, 'Now, Dinah,. tell me the truth: did you ever eat a bat?' When suddenly, thump! thumps! down she came upon a heap of sticks and dry leaves, and the fall was over. Alice was not a bit hurt, and she jumped up on to her feet in a moment. But it was all dark overhead; before her was another long passage, and

Chunk 3:
Alice was very fond of pretending to be two people. She generally gave herself very good advice, (though she very seldom followed it) and sometimes scolded herself so severely as to bring tears into her eyes. She once tried to box her own ears for having                cheated