<a href="https://colab.research.google.com/github/Mani-snillingur/text-sum/blob/dev1/Session_14_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Date: 23/12/2023

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

In [None]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# text preprocessing
def text_preprocessing(text):
  words = word_tokenize(text.lower())
  stop_words = set(stopwords.words('english'))
  words = [word for word in words if word.isalnum() and word not in stop_words]

  stemmer = PorterStemmer()
  words = [stemmer.stem(word) for word in words]

  return words

In [None]:
# text = "This is NLP session & Text-preprocessing is an essential step."
text = "This is NLP session & Text-preprocessing is an essential step. This is NLP session & Text-preprocessing is an essential step."
processed_text = text_preprocessing(text)

print("Original Text: ", text)
print("Processed Text: ", processed_text)

Original Text:  This is NLP session & Text-preprocessing is an essential step. This is NLP session & Text-preprocessing is an essential step.
Processed Text:  ['nlp', 'session', 'essenti', 'step', 'nlp', 'session', 'essenti', 'step']


In [None]:
from nltk.probability import FreqDist

In [None]:
fdist = FreqDist(processed_text)
print(fdist)

<FreqDist with 4 samples and 8 outcomes>


In [None]:
# Noise Removal
import re


def noise_remove(text):
  cln_txt = re.sub(r'[^a-zA-Z\s]', '', text)
  cln_txt = ' '.join(cln_txt.split())

  return cln_txt


text = "This is a @ Simple Sentence with some number 12345!"

cl_txt = noise_remove(text)
print(cl_txt)

This is a Simple Sentence with some number


In [None]:
# Lexicon Normalization
# 1. Stemming

from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

word = "Playing"
stem = stemmer.stem(word)

print(stem)

play


In [None]:
# 2. Lemmatization
nltk.download('punkt')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

word = "better"
lemma = lemmatizer.lemmatize(word, pos='a')

print(lemma)

good


In [None]:
# Object Standardization
from datetime import datetime

def date_stand(text):
    date_pattern = re.compile(r'\b\d{1,4}[-/]\d{1,2}[-/]\d{1,4}\b')
    match_date = date_pattern.findall(text)
    stander_txt = text

    for i in match_date:
      try:
        date_obj = datetime.strptime(i, '%Y-%m-%d')
        date_stand2 = date_obj.strftime('%Y-%m-%d')
        stander_txt =  stander_txt.replace(i, date_stand2)

      except ValueError:
        pass

    return stander_txt

text = "date is 2023/12/23, tommorow 24-12-2023, yeasterday 2023-12-22"

date_text = date_stand(text)

print(date_text)



date is 2023/12/23, tommorow 24-12-2023, yeasterday 2023-12-22


# Date: 24/12/2023

In [1]:
# Text to Features (Feature Engineering) -> Bag-of-Words (BoW)

from sklearn.feature_extraction.text import CountVectorizer

documents = ["This is a sample sentence.", "This sentence is the second part."]

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(documents)
feature_names = vectorizer.get_feature_names_out()

print("Bag of Words matrix:")
print(X.toarray())
print("Feature Names:", feature_names)

Bag of Words matrix:
[[1 0 1 0 1 0 1]
 [1 1 0 1 1 1 1]]
Feature Names: ['is' 'part' 'sample' 'second' 'sentence' 'the' 'this']


In [2]:
# Term Frequency-Inverse Document Frequency

from sklearn.feature_extraction.text import TfidfVectorizer

documents = ["This is a sample sentence.", "This sentence is the second part."]
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(documents)

feature_names = vectorizer.get_feature_names_out()

print("TF-IDF Matrix:")
print(X.toarray())
print("Feature Names:", feature_names)

TF-IDF Matrix:
[[0.44832087 0.         0.63009934 0.         0.44832087 0.
  0.44832087]
 [0.33471228 0.47042643 0.         0.47042643 0.33471228 0.47042643
  0.33471228]]
Feature Names: ['is' 'part' 'sample' 'second' 'sentence' 'the' 'this']


In [3]:
# Word Embeddings -> models -> Word2Vec, GloVe

import gensim.downloader as api

try:
    word2vec_model = api.load('word2vec-google-news-300')
except (FileNotFoundError, api.DownloadError):
    print("Downloading...")
    word2vec_model = api.load('word2vec-google-news-300')

# Specify the local path and filename to save the Word2Vec model
local_path = "/content/drive/MyDrive/Colab Notebooks/"
model_filename = "word2vec_google_news.model"
full_path = local_path + model_filename

# Save the Word2Vec model to the specified local path
word2vec_model.save(full_path)

print(f"Word2Vec model saved to {full_path}")

[====----------------------------------------------] 9.6% 159.3/1662.8MB downloaded

AttributeError: ignored

In [None]:
loaded_model = Word2Vec.load(full_path)

In [None]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

sentence = "Word embeddings are powerful tools for NLP."
tokens = word_tokenize(sentence)

word_embeddings = [loaded_model[word] for word in tokens if word in loaded_model]

print("Word Embeddings:")
print(word_embeddings)

In [4]:
# Syntactical Parsing

import spacy

nlp = spacy.load("en_core_web_sm")
sentence = "The quick brown fox jumps over the lazy dog."
doc = nlp(sentence)

for token in doc:
    print(f"{token.text} --> {token.dep_} --> {token.head.text}")

The --> det --> fox
quick --> amod --> fox
brown --> amod --> fox
fox --> nsubj --> jumps
jumps --> ROOT --> jumps
over --> prep --> jumps
the --> det --> dog
lazy --> amod --> dog
dog --> pobj --> over
. --> punct --> jumps


In [5]:
# Dependency Grammer
import spacy

nlp = spacy.load("en_core_web_sm")
sentence = "The cat sat on the mat."

doc = nlp(sentence)

for token in doc:
    print(f"{token.text} --{token.dep_}--> {token.head.text}")

The --det--> cat
cat --nsubj--> sat
sat --ROOT--> sat
on --prep--> sat
the --det--> mat
mat --pobj--> on
. --punct--> sat


In [6]:
# POS Tagging
import spacy

nlp = spacy.load("en_core_web_sm")

sentence = "The quick brown fox jumps over the lazy dog."

doc = nlp(sentence)

for token in doc:
    print(f"{token.text}: {token.pos_}")

The: DET
quick: ADJ
brown: ADJ
fox: NOUN
jumps: VERB
over: ADP
the: DET
lazy: ADJ
dog: NOUN
.: PUNCT


In [7]:
# Entity Parsing/ NER
import spacy

nlp = spacy.load("en_core_web_sm")

text = "Apple Inc. was founded by Steve Jobs in Cupertino. It became a major tech company."

doc = nlp(text)

for ent in doc.ents:
    print(f"{ent.text}: {ent.label_}")

Apple Inc.: ORG
Steve Jobs: PERSON
Cupertino: GPE


In [8]:
# Topic Modelling
from gensim import corpora
from gensim.models import LdaModel
from gensim.parsing.preprocessing import remove_stopwords

documents = [
    "Natural language processing is a subfield of artificial intelligence.",
    "Topic modeling helps in discovering hidden topics within a collection of documents.",
    "Latent Dirichlet Allocation is a popular algorithm for topic modeling.",
    "Text data preprocessing involves tasks like tokenization and removing stop words."
]


preprocessed_documents = [remove_stopwords(doc.lower()) for doc in documents]
tokenized_documents = [doc.split() for doc in preprocessed_documents]
dictionary = corpora.Dictionary(tokenized_documents)
corpus = [dictionary.doc2bow(doc) for doc in tokenized_documents]
lda_model = LdaModel(corpus, num_topics=2, id2word=dictionary, passes=10)

topics = lda_model.print_topics(num_words=3)
for topic in topics:
    print(topic)

(0, '0.052*"topic" + 0.052*"documents." + 0.052*"helps"')
(1, '0.047*"tasks" + 0.047*"text" + 0.047*"like"')


In [9]:
# N-Grams
def generate_ngrams(text, n):
    """
    Generate n-grams from a given text.
    """
    words = text.split()
    ngrams = zip(*[words[i:] for i in range(n)])
    return [' '.join(ngram) for ngram in ngrams]

example_text = "This is an example sentence for n-gram generation."

bi_grams = generate_ngrams(example_text, 2)
print("Bi-grams:")
print(bi_grams)


tri_grams = generate_ngrams(example_text, 3)
print("\nTri-grams:")
print(tri_grams)

Bi-grams:
['This is', 'is an', 'an example', 'example sentence', 'sentence for', 'for n-gram', 'n-gram generation.']

Tri-grams:
['This is an', 'is an example', 'an example sentence', 'example sentence for', 'sentence for n-gram', 'for n-gram generation.']


# Text Classification

In [10]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [11]:
newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))

In [12]:
X_train, X_test, y_train, y_test = train_test_split(newsgroups.data, newsgroups.target, test_size=0.2, random_state=42)

vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

In [13]:
classifier = MultinomialNB()
classifier.fit(X_train_vectorized, y_train)

In [14]:
predictions = classifier.predict(X_test_vectorized)

accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.62


In [15]:
print("Classification Report:")
print(classification_report(y_test, predictions, target_names=newsgroups.target_names))

Classification Report:
                          precision    recall  f1-score   support

             alt.atheism       0.61      0.25      0.36       151
           comp.graphics       0.48      0.75      0.58       202
 comp.os.ms-windows.misc       0.73      0.04      0.08       195
comp.sys.ibm.pc.hardware       0.53      0.73      0.62       183
   comp.sys.mac.hardware       0.86      0.58      0.69       205
          comp.windows.x       0.68      0.80      0.74       215
            misc.forsale       0.88      0.53      0.66       193
               rec.autos       0.87      0.63      0.73       196
         rec.motorcycles       0.49      0.58      0.53       168
      rec.sport.baseball       0.99      0.67      0.80       211
        rec.sport.hockey       0.92      0.80      0.86       198
               sci.crypt       0.59      0.77      0.67       201
         sci.electronics       0.84      0.49      0.62       202
                 sci.med       0.82      0.75      0

# Text Matching

In [16]:
def jaccard_similarity(str1, str2):
  set1 = set(str1.split())
  set2 = set(str2.split())
  intersection = len(set1.intersection(set2))
  union = len(set1.union(set2))
  return intersection / union if union != 0 else 0

text1 = "The quick brown fox"
text2 = "A quick brown dog"

similarity_score = jaccard_similarity(text1, text2)

print(f"Jaccard Similarity: {similarity_score:.2f}")

Jaccard Similarity: 0.33


In [17]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

text1 = "The quick brown fox"
text2 = "A quick brown dog"

vectorizer = CountVectorizer()
X = vectorizer.fit_transform([text1, text2])

cosine_sim = cosine_similarity(X)

print(f"Cosine Similarity: {cosine_sim[0, 1]:.2f}")

Cosine Similarity: 0.58


In [18]:
# Levenshtein distance
!pip install distance

Collecting distance
  Downloading Distance-0.1.3.tar.gz (180 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m180.3/180.3 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: distance
  Building wheel for distance (setup.py) ... [?25l[?25hdone
  Created wheel for distance: filename=Distance-0.1.3-py3-none-any.whl size=16258 sha256=8d7652f3656d312e16e7bc33228c44947786446041fdd2bb5ef420a2ca6f3890
  Stored in directory: /root/.cache/pip/wheels/e8/bb/de/f71bf63559ea9a921059a5405806f7ff6ed612a9231c4a9309
Successfully built distance
Installing collected packages: distance
Successfully installed distance-0.1.3


In [19]:
import distance

text1 = "The quick brown fox"
text2 = "A quick brown dog"

levenshtein_distance = distance.levenshtein(text1, text2)

print(f"Levenshtein Distance: {levenshtein_distance}")

Levenshtein Distance: 5


# Flexible String Matching

In [20]:
!pip install fuzzywuzzy

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0


In [21]:
from fuzzywuzzy import fuzz

text1 = "Google LLC."
text2 = "Google LLC."
text3 = "Apple Inc."

similarity_score_1_2 = fuzz.ratio(text1, text2)
similarity_score_1_3 = fuzz.ratio(text1, text3)

print(f"Similarity between '{text1}' and '{text2}': {similarity_score_1_2}%")
print(f"Similarity between '{text1}' and '{text3}': {similarity_score_1_3}%")

Similarity between 'Google LLC.' and 'Google LLC.': 100%
Similarity between 'Google LLC.' and 'Apple Inc.': 38%




# Text summarization

In [22]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from collections import Counter

In [23]:
nlp = spacy.load("en_core_web_sm")

In [24]:
def preprocess_text(text):
    doc = nlp(text)
    return ' '.join(token.text.lower() for token in doc if token.text.lower() not in STOP_WORDS and not token.is_punct)



def extractive_summarization(text, num_sentences=5):

    preprocessed_text = preprocess_text(text)
    tokens = nlp(preprocessed_text)
    word_freq = Counter(tokens)
    sentence_scores = {sentence: sum(word_freq[word] for word in sentence) for sentence in tokens.sents}
    summary_sentences = sorted(sentence_scores, key=sentence_scores.get, reverse=True)[:num_sentences]
    summary = ' '.join(str(sentence) for sentence in sorted(summary_sentences, key=lambda x: x.start))

    return summary

In [25]:
input_text = """
A supercomputer is a computer with a high level of performance as compared to a general-purpose computer.
The performance of a supercomputer is commonly measured in floating-point operations per second (FLOPS) instead of million instructions per second (MIPS).
Since 2017, supercomputers have existed which can perform over 1017 FLOPS (a hundred quadrillion FLOPS, 100 petaFLOPS or 100 PFLOPS).
[3] For comparison, a desktop computer has performance in the range of hundreds of gigaFLOPS (1011) to tens of teraFLOPS (1013).
[4][5] Since November 2017, all of the world's fastest 500 supercomputers run on Linux-based operating systems.
[6] Additional research is being conducted in the United States, the European Union, Taiwan, Japan, and China to build faster, more powerful and technologically superior exascale supercomputers.

Supercomputers play an important role in the field of computational science, and are used for a wide range of computationally intensive tasks in various fields, including quantum mechanics, weather forecasting, climate research, oil and gas exploration, molecular modeling (computing the structures and properties of chemical compounds, biological macromolecules, polymers, and crystals), and physical simulations (such as simulations of the early moments of the universe, airplane and spacecraft aerodynamics, the detonation of nuclear weapons, and nuclear fusion). They have been essential in the field of cryptanalysis.[8]

Supercomputers were introduced in the 1960s, and for several decades the fastest was made by Seymour Cray at Control Data Corporation (CDC), Cray Research and subsequent companies bearing his name or monogram. The first such machines were highly tuned conventional designs that ran more quickly than their more general-purpose contemporaries. Through the decade, increasing amounts of parallelism were added, with one to four processors being typical. In the 1970s, vector processors operating on large arrays of data came to dominate. A notable example is the highly successful Cray-1 of 1976. Vector computers remained the dominant design into the 1990s. From then until today, massively parallel supercomputers with tens of thousands of off-the-shelf processors became the norm.[9][10]
"""

summary = extractive_summarization(input_text)

print("Length of the original content: ", len(input_text))
print("\nSummary:")
print(summary)
print("Length of the summary content: ",len(summary))

Length of the original content:  2251

Summary:

 supercomputer computer high level performance compared general purpose computer 
 performance supercomputer commonly measured floating point operations second flops instead million instructions second mips 
 2017 supercomputers existed perform 1017 flops quadrillion flops 100 petaflops 100 pflops 
 3 comparison desktop computer performance range hundreds gigaflops 1011 tens teraflops 1013 
 4][5 november 2017 world fastest 500 supercomputers run linux based operating systems 
 6 additional research conducted united states european union taiwan japan china build faster powerful technologically superior exascale supercomputers 

 supercomputers play important role field computational science wide range computationally intensive tasks fields including quantum mechanics weather forecasting climate research oil gas exploration molecular modeling computing structures properties chemical compounds biological macromolecules polymers crystals ph

In [26]:
input_text = """
A supercomputer is a computer with a high level of performance as compared to a general-purpose computer.
The performance of a supercomputer is commonly measured in floating-point operations per second (FLOPS) instead of million instructions per second (MIPS).
Since 2017, supercomputers have existed which can perform over 1017 FLOPS (a hundred quadrillion FLOPS, 100 petaFLOPS or 100 PFLOPS).
[3] For comparison, a desktop computer has performance in the range of hundreds of gigaFLOPS (1011) to tens of teraFLOPS (1013).
[4][5] Since November 2017, all of the world's fastest 500 supercomputers run on Linux-based operating systems.
[6] Additional research is being conducted in the United States, the European Union, Taiwan, Japan, and China to build faster, more powerful and technologically superior exascale supercomputers.

Supercomputers play an important role in the field of computational science, and are used for a wide range of computationally intensive tasks in various fields, including quantum mechanics, weather forecasting, climate research, oil and gas exploration, molecular modeling (computing the structures and properties of chemical compounds, biological macromolecules, polymers, and crystals), and physical simulations (such as simulations of the early moments of the universe, airplane and spacecraft aerodynamics, the detonation of nuclear weapons, and nuclear fusion). They have been essential in the field of cryptanalysis.[8]
"""

summary = extractive_summarization(input_text)

print("Length of the original content: ", len(input_text))
print("\nSummary:")
print(summary)
print("Length of the summary content: ",len(summary))

Length of the original content:  1461

Summary:

 supercomputer computer high level performance compared general purpose computer 
 performance supercomputer commonly measured floating point operations second flops instead million instructions second mips 
 2017 supercomputers existed perform 1017 flops quadrillion flops 100 petaflops 100 pflops 
 3 comparison desktop computer performance range hundreds gigaflops 1011 tens teraflops 1013 
 4][5 november 2017 world fastest 500 supercomputers run linux based operating systems 
 6 additional research conducted united states european union taiwan japan china build faster powerful technologically superior exascale supercomputers 

 supercomputers play important role field computational science wide range computationally intensive tasks fields including quantum mechanics weather forecasting climate research oil gas exploration molecular modeling computing structures properties chemical compounds biological macromolecules polymers crystals ph