# Prerequisites Lab 2

### Reading from json data

In [1]:
import json
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from collections import Counter

Download necessary nltk resources

In [3]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [17]:
def extract_titles(json_data):
    '''
    Extracts book titles from the given JSON data
    '''
    book_titles=[]
    for book in json_data:
        book_titles.append(book['title'])
    return book_titles

In [27]:
def preprocess_text(text):
    '''
    Pre-processes the text: tokenization, stop word removal, stemming and lemmatization
    '''
    tokens=[word.lower() for word in word_tokenize(text)]
    print(f"Tokenized words:{tokens}\n")
    stop_words=set(stopwords.words('english'))
    filtered_tokens=[word for word in tokens if word.isalnum() and word not in stop_words]
    print(f"After stop word removal:{filtered_tokens}\n")
    stemmer=PorterStemmer()
    stemmed_tokens=[stemmer.stem(word) for word in filtered_tokens]
    print(f"After stemming:{stemmed_tokens}\n")
    lemmatizer=WordNetLemmatizer()
    lemmatized_tokens=[lemmatizer.lemmatize(word) for word in stemmed_tokens]
    print(f"After Lemmatization:{lemmatized_tokens}\n")
    return lemmatized_tokens

In [53]:
def calculate_word_probabilites(tokens):
    '''
        Calculates word frequencies and probabilites
    '''
    total_words=len(tokens)
    word_counts=Counter(tokens)
    word_probabilites={word:count/total_words for word,count in word_counts.items()}
    return word_counts,word_probabilites

In [54]:
json_file='books.json'
try:
    with open(json_file,'r') as file:
        data=json.load(file)
except Exception as e:
    print(f"Error reading JSON file : {e}")
for i in data:
    print(i)

{'author': 'Chinua Achebe', 'country': 'Nigeria', 'imageLink': 'images/things-fall-apart.jpg', 'language': 'English', 'link': 'https://en.wikipedia.org/wiki/Things_Fall_Apart\n', 'pages': 209, 'title': 'Things Fall Apart', 'year': 1958}
{'author': 'Hans Christian Andersen', 'country': 'Denmark', 'imageLink': 'images/fairy-tales.jpg', 'language': 'Danish', 'link': 'https://en.wikipedia.org/wiki/Fairy_Tales_Told_for_Children._First_Collection.\n', 'pages': 784, 'title': 'Fairy tales', 'year': 1836}
{'author': 'Dante Alighieri', 'country': 'Italy', 'imageLink': 'images/the-divine-comedy.jpg', 'language': 'Italian', 'link': 'https://en.wikipedia.org/wiki/Divine_Comedy\n', 'pages': 928, 'title': 'The Divine Comedy', 'year': 1315}
{'author': 'Unknown', 'country': 'Sumer and Akkadian Empire', 'imageLink': 'images/the-epic-of-gilgamesh.jpg', 'language': 'Akkadian', 'link': 'https://en.wikipedia.org/wiki/Epic_of_Gilgamesh\n', 'pages': 160, 'title': 'The Epic Of Gilgamesh', 'year': -1700}
{'auth

In [55]:
book_titles=extract_titles(data)
print(f"Extracted book titles:{book_titles}")

Extracted book titles:['Things Fall Apart', 'Fairy tales', 'The Divine Comedy', 'The Epic Of Gilgamesh', 'The Book Of Job', 'One Thousand and One Nights', "Njál's Saga", 'Pride and Prejudice', 'Le Père Goriot', 'Molloy, Malone Dies, The Unnamable, the trilogy', 'The Decameron', 'Ficciones', 'Wuthering Heights', 'The Stranger', 'Poems', 'Journey to the End of the Night', 'Don Quijote De La Mancha', 'The Canterbury Tales', 'Stories', 'Nostromo', 'Great Expectations', 'Jacques the Fatalist', 'Berlin Alexanderplatz', 'Crime and Punishment', 'The Idiot', 'The Possessed', 'The Brothers Karamazov', 'Middlemarch', 'Invisible Man', 'Medea', 'Absalom, Absalom!', 'The Sound and the Fury', 'Madame Bovary', 'Sentimental Education', 'Gypsy Ballads', 'One Hundred Years of Solitude', 'Love in the Time of Cholera', 'Faust', 'Dead Souls', 'The Tin Drum', 'The Devil to Pay in the Backlands', 'Hunger', 'The Old Man and the Sea', 'Iliad', 'Odyssey', "A Doll's House", 'Ulysses', 'Stories', 'The Trial', 'The

In [58]:
#combining all titles intpo a single string for preprocessing
combined_text=' '.join(book_titles)
#preprocess the text
tokens=preprocess_text(combined_text)
#calculate word counts and probabilites
word_counts, word_probabilities = calculate_word_probabilites(tokens)
print("\nWord counts\n")
print(word_counts)

print('\nWord Probabilites\n')
for word,prob in word_probabilities.items():
    print(f"{word}:{prob:.4f}")

Tokenized words:['things', 'fall', 'apart', 'fairy', 'tales', 'the', 'divine', 'comedy', 'the', 'epic', 'of', 'gilgamesh', 'the', 'book', 'of', 'job', 'one', 'thousand', 'and', 'one', 'nights', 'njál', "'s", 'saga', 'pride', 'and', 'prejudice', 'le', 'père', 'goriot', 'molloy', ',', 'malone', 'dies', ',', 'the', 'unnamable', ',', 'the', 'trilogy', 'the', 'decameron', 'ficciones', 'wuthering', 'heights', 'the', 'stranger', 'poems', 'journey', 'to', 'the', 'end', 'of', 'the', 'night', 'don', 'quijote', 'de', 'la', 'mancha', 'the', 'canterbury', 'tales', 'stories', 'nostromo', 'great', 'expectations', 'jacques', 'the', 'fatalist', 'berlin', 'alexanderplatz', 'crime', 'and', 'punishment', 'the', 'idiot', 'the', 'possessed', 'the', 'brothers', 'karamazov', 'middlemarch', 'invisible', 'man', 'medea', 'absalom', ',', 'absalom', '!', 'the', 'sound', 'and', 'the', 'fury', 'madame', 'bovary', 'sentimental', 'education', 'gypsy', 'ballads', 'one', 'hundred', 'years', 'of', 'solitude', 'love', 'in

### Reading from csv

In [61]:
import pandas as pd
import string

In [63]:
def preprocess_text(text):
    '''
    Preprocess the text: remove punctuation, tokenize, remove stop words, stemming and lemmatization
    '''
    print("\n--Preprocessing text")

    #removing punctuations
    text_no_punct = text.translate(str.maketrans("","",string.punctuation))
    print("Text after removing punctuation (First 200 chars):")
    print(text_no_punct[:200])
    print('\n')

    #tokenize text
    tokens=[word.lower() for word in word_tokenize(text_no_punct)]
    print("tokenized words (first 11) :")
    print(tokens[:11])
    print("\n")

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
    print("After Stop Words Removal (First 10):")
    print(filtered_tokens[:10])
    print("\n")

    # Stemming
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
    print("After Stemming (First 10):")
    print(stemmed_tokens[:10])
    print("\n")

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in stemmed_tokens]
    print("After Lemmatization (First 10):")
    print(lemmatized_tokens[:10])
    print("\n")

    return lemmatized_tokens

In [78]:
def calculate_word_probabilities(tokens):
    '''
    Calculates word frequencies and probabilities.
    '''
    print("\n Calculating word frequencies and probabilites\n")
    total_words=len(tokens)
    word_counts=Counter(tokens)
    word_probabilites={word:count/total_words for word,count in word_counts.items()}
    print("Word Counts (First 10):")
    print(dict(list(word_counts.items())[:10]))
    print("\n")
    
    print("Word Probabilities (First 10):")
    for word, prob in list(word_probabilities.items())[:10]:
        print(f"{word}: {prob:.4f}")
    print("\n")

    return word_counts, word_probabilities

In [79]:
csv_file="unstructured.csv" #kaggle dataset for tweets

In [80]:
try:
    df = pd.read_csv(csv_file, encoding='ISO-8859-1')
    #getting only tweets from the dataset
    df = df.iloc[:,4]
except Exception as e:
    print(f"Error reading CSV file: {e}")
    
print("\n--- Loaded Dataset ---\n")
print(df.head())
print("\n")


--- Loaded Dataset ---

0    @AppleSupport causing the reply to be disregar...
1    @105835 Your business means a lot to us. Pleas...
2    @76328 I really hope you all change but I'm su...
3    @105836 LiveChat is online at the moment - htt...
4    @VirginTrains see attached error message. I've...
Name: text, dtype: object




In [81]:
# Combine all messages for preprocessing
combined_text = ' '.join(df)
    
# Preprocess the text
tokens = preprocess_text(combined_text)
    
# Calculate word counts and probabilities
calculate_word_probabilities(tokens)


--Preprocessing text
Text after removing punctuation (First 200 chars):
AppleSupport causing the reply to be disregarded and the tapped notification under the keyboard is openedð¡ð¡ð¡ 105835 Your business means a lot to us Please DM your name zip code and additional


tokenized words (first 11) :
['applesupport', 'causing', 'the', 'reply', 'to', 'be', 'disregarded', 'and', 'the', 'tapped', 'notification']


After Stop Words Removal (First 10):
['applesupport', 'causing', 'reply', 'disregarded', 'tapped', 'notification', 'keyboard', '105835', 'business', 'means']


After Stemming (First 10):
['applesupport', 'caus', 'repli', 'disregard', 'tap', 'notif', 'keyboard', '105835', 'busi', 'mean']


After Lemmatization (First 10):
['applesupport', 'caus', 'repli', 'disregard', 'tap', 'notif', 'keyboard', '105835', 'busi', 'mean']



 Calculating word frequencies and probabilites

Word Counts (First 10):
{'applesupport': 11, 'caus': 3, 'repli': 4, 'disregard': 1, 'tap': 1, 'notif': 1, 'k

(Counter({'u': 25,
          'help': 21,
          'dm': 19,
          'thank': 14,
          'httpstcogdrqu22ypt': 12,
          'applesupport': 11,
          'plea': 11,
          'updat': 10,
          'use': 10,
          'get': 10,
          'tri': 9,
          'phone': 9,
          'version': 9,
          'hi': 9,
          'app': 9,
          'ive': 8,
          'look': 8,
          'let': 8,
          'tesco': 8,
          'time': 7,
          'devic': 7,
          'send': 7,
          'io': 7,
          '76099': 7,
          'spotifycar': 7,
          'call': 6,
          'back': 6,
          'amp': 6,
          'gt': 6,
          'happi': 6,
          'know': 6,
          'wed': 6,
          'issu': 6,
          'name': 5,
          'week': 5,
          'happen': 5,
          'ay': 5,
          'log': 5,
          'well': 5,
          'sorri': 5,
          'batteri': 5,
          'repli': 4,
          'hope': 4,
          '105836': 4,
          '3': 4,
          'work': 4,
  

### Reading Text

In [90]:
import re
from nltk import pos_tag

In [82]:
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [83]:
# Initialize objects
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [84]:
def preprocess_text(text):
    text=re.sub(r'\d+','',text.lower())
    text=text.translate(str.maketrans('','',string.punctuation)).strip()
    tokens=word_tokenize(text)
    tokens=[word for word in tokens if word not in stop_words]
    stemmed_tokens=[stemmer.stem(word) for word in tokens]
    lemmatized_tokens=[lemmatizer.lemmatize(word) for word in tokens]
    pos_tags=pos_tag(tokens)
    return {
        "original_text": text,
        "tokens": tokens,
        "stemmed_tokens": stemmed_tokens,
        "lemmatized_tokens": lemmatized_tokens,
        "pos_tags": pos_tags
    }

In [85]:
input_file="Chelsea.txt"

In [86]:
with open(input_file, 'r') as file:
    lines = file.readlines()

In [91]:
# Preprocess each line
results = []
for line in lines:
    processed = preprocess_text(line)
    results.append(processed)

# Display results
for i, result in enumerate(results):
    print(f"Review {i+1}:")
    print(f"Original Text: {lines[i].strip()}")
    print(f"Lowercased, No Digits/Punctuations: {result['original_text']}")
    print(f"Tokens: {result['tokens']}")
    print(f"Stemmed Tokens: {result['stemmed_tokens']}")
    print(f"Lemmatized Tokens: {result['lemmatized_tokens']}")
    print(f"POS Tags: {result['pos_tags']}")
    print("\n" + "-"*50 + "\n")

Review 1:
Original Text: Chelsea Football Club is a professional football club based in Fulham, West London, England. Named after neighbouring area Chelsea, they compete in the Premier League, the top tier of English football. Founded in 1905, the team play their home games at Stamford Bridge.[4] The club won their first major honour, the League championship, in 1955. They won the FA Cup for the first time in 1970, won their first European honour, the Cup Winners' Cup, in 1971, and became the third English club to win the Club World Cup in 2022.
Lowercased, No Digits/Punctuations: chelsea football club is a professional football club based in fulham west london england named after neighbouring area chelsea they compete in the premier league the top tier of english football founded in  the team play their home games at stamford bridge the club won their first major honour the league championship in  they won the fa cup for the first time in  won their first european honour the cup winne

### Trying Web Scrapped Text

In [94]:
import requests

url = 'https://www.troyhunt.com/the-773-million-record-collection-1-data-reach/'
res = requests.get(url)
html_page = res.content

In [96]:
from bs4 import BeautifulSoup

soup = BeautifulSoup(html_page, 'html.parser')

In [97]:
text = soup.find_all(text=True)

  text = soup.find_all(text=True)


In [112]:
output = ''
blacklist = [
    '[document]',
    'noscript',
    'header',
    'html',
    'meta',
    'head', 
    'input',
    'script',
    # there may be more elements you don't want, such as "style", etc.
]

for t in text:
    if t.parent.name not in blacklist:
        output += '{} '.format(t)

print(output)

Troy Hunt: The 773 Million Record "Collection #1" Data Breach @font-face {font-family:Vollkorn;font-style:normal;font-weight:400;src:url(/cf-fonts/s/vollkorn/5.0.18/greek/400/normal.woff2);unicode-range:U+0370-03FF;font-display:swap;}@font-face {font-family:Vollkorn;font-style:normal;font-weight:400;src:url(/cf-fonts/s/vollkorn/5.0.18/latin-ext/400/normal.woff2);unicode-range:U+0100-02AF,U+0304,U+0308,U+0329,U+1E00-1E9F,U+1EF2-1EFF,U+2020,U+20A0-20AB,U+20AD-20CF,U+2113,U+2C60-2C7F,U+A720-A7FF;font-display:swap;}@font-face {font-family:Vollkorn;font-style:normal;font-weight:400;src:url(/cf-fonts/s/vollkorn/5.0.18/cyrillic/400/normal.woff2);unicode-range:U+0301,U+0400-045F,U+0490-0491,U+04B0-04B1,U+2116;font-display:swap;}@font-face {font-family:Vollkorn;font-style:normal;font-weight:400;src:url(/cf-fonts/s/vollkorn/5.0.18/latin/400/normal.woff2);unicode-range:U+0000-00FF,U+0131,U+0152-0153,U+02BB-02BC,U+02C6,U+02DA,U+02DC,U+0304,U+0308,U+0329,U+2000-206F,U+2074,U+20AC,U+2122,U+2191,U+21

In [113]:
output=output[10000:20000]

In [114]:
output

'is not perfect nor does it need to be. It\'ll be 99.x% perfect though and that x% has very little bearing on the practical use of this data. And yes, they\'re all now in Pwned Passwords, more on that soon. That\'s the numbers, let\'s move onto where the data has actually come from. Data Origins Last week, multiple people reached out and directed me to a large collection of files on the popular cloud service, MEGA (the data has since been removed from the service). The collection totalled over 12,000 separate files and more than 87GB of data. One of my contacts pointed me to a popular hacking forum where the data was being socialised, complete with the following image: kg-card-begin: markdown \n kg-card-end: markdown As you can see at the top left of the image, the root folder is called "Collection #1" hence the name I\'ve given this breach. The expanded folders and file listing give you a bit of a sense of the nature of the data (I\'ll come back to the word "combo" later), and as you 

In [124]:
result=preprocess_text(output)
print(result)

{'original_text': 'is not perfect nor does it need to be itll be x perfect though and that x has very little bearing on the practical use of this data and yes theyre all now in pwned passwords more on that soon thats the numbers lets move onto where the data has actually come from data origins last week multiple people reached out and directed me to a large collection of files on the popular cloud service mega the data has since been removed from the service the collection totalled over  separate files and more than gb of data one of my contacts pointed me to a popular hacking forum where the data was being socialised complete with the following image kgcardbegin markdown \n kgcardend markdown as you can see at the top left of the image the root folder is called collection  hence the name ive given this breach the expanded folders and file listing give you a bit of a sense of the nature of the data ill come back to the word combo later and as you can see its allegedly from many differe

In [126]:
print(f"Original Text: {result['original_text'].strip()}")
print(f"Lowercased, No Digits/Punctuations: {result['original_text']}")
print(f"Tokens: {result['tokens']}")
print(f"Stemmed Tokens: {result['stemmed_tokens']}")
print(f"Lemmatized Tokens: {result['lemmatized_tokens']}")
print(f"POS Tags: {result['pos_tags']}")
print("\n" + "-"*50 + "\n")

Original Text: is not perfect nor does it need to be itll be x perfect though and that x has very little bearing on the practical use of this data and yes theyre all now in pwned passwords more on that soon thats the numbers lets move onto where the data has actually come from data origins last week multiple people reached out and directed me to a large collection of files on the popular cloud service mega the data has since been removed from the service the collection totalled over  separate files and more than gb of data one of my contacts pointed me to a popular hacking forum where the data was being socialised complete with the following image kgcardbegin markdown 
 kgcardend markdown as you can see at the top left of the image the root folder is called collection  hence the name ive given this breach the expanded folders and file listing give you a bit of a sense of the nature of the data ill come back to the word combo later and as you can see its allegedly from many different so

#### END OF NOTEBOOK #### 