## Utility for fetching data from a given URL and removing its mark-up

In [60]:
import nbconvert
import requests
from bs4 import BeautifulSoup

def get_clean_text_from_url(url: str, timeout: float = 10.0) -> str:
    """Fetch a web page and return its text with the HTML mark-up removed.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on an
            unresponsive host.

    Returns:
        The page text, with the individual text fragments joined by newlines
        and leading/trailing whitespace stripped. If the request fails
        (connection error, timeout, HTTP error status), a string of the form
        ``"An error occurred: <details>"`` is returned instead of raising.
    """
    try:
        # Only the network call can raise RequestException, so keep the
        # try body limited to it.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # turn 4xx/5xx responses into exceptions
    except requests.exceptions.RequestException as e:
        return f"An error occurred: {e}"

    # Parse the HTML and keep only the text, one fragment per line.
    content = BeautifulSoup(response.content, 'html.parser')
    clean_content = content.get_text(separator='\n')

    return clean_content.strip()

# Example usage: download the Django project home page and print the text
# returned by get_clean_text_from_url.
url = 'https://www.djangoproject.com/'
clean_content = get_clean_text_from_url(url)
print(clean_content)


The web framework for perfectionists with deadlines | Django




















Django


The web framework for perfectionists with deadlines.






Toggle theme (current theme: auto)


Toggle theme (current theme: light)


Toggle theme (current theme: dark)


Toggle Light / Dark / Auto color theme




























Menu










Overview






Download






Documentation






News






Community






Code






Issues






About






♥ Donate








Toggle theme (current theme: auto)


Toggle theme (current theme: light)


Toggle theme (current theme: dark)


Toggle Light / Dark / Auto color theme








































Until October 6, 2024, get PyCharm at 30% off. All money goes to the DSF!








Django




Django makes it easier to build better web apps more quickly and with less code.






Get started with Django
















Meet Django



      Django is a high-level Python web framework that encourages rapid development and clean, pragma

Compute the Type-to-Token Ratio (TTR) of the text fetched from the input URL

In [57]:
import requests
from bs4 import BeautifulSoup
import re

def get_clean_text_from_url(url: str, timeout: float = 10.0) -> str:
    """Fetch a web page and return its text with the HTML mark-up removed.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on an
            unresponsive host.

    Returns:
        The page text, with the individual text fragments joined by single
        spaces and leading/trailing whitespace stripped. If the request fails
        (connection error, timeout, HTTP error status), a string of the form
        ``"An error occurred: <details>"`` is returned instead of raising.
    """
    try:
        # Only the network call can raise RequestException, so keep the
        # try body limited to it.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # turn 4xx/5xx responses into exceptions
    except requests.exceptions.RequestException as e:
        return f"An error occurred: {e}"

    # Parse the HTML and keep only the text, fragments separated by spaces.
    soup = BeautifulSoup(response.content, 'html.parser')
    clean_text = soup.get_text(separator=' ')

    return clean_text.strip()

def compute_ttr(text: str) -> "tuple[float, int, int]":
    """Compute the Type-to-Token Ratio (TTR) of a text.

    The text is lower-cased and split into runs of word characters, so
    "The" and "the" count as the same type and punctuation is ignored.

    Args:
        text: The text to analyse.

    Returns:
        A ``(ttr, total_tokens, total_types)`` tuple, where ``total_tokens``
        is the number of words, ``total_types`` the number of distinct words
        and ``ttr`` their ratio. For a text without any words the ratio is
        ``0.0`` rather than a division-by-zero error.
    """
    # Tokenize the text (split into lower-cased words)
    words = re.findall(r'\b\w+\b', text.lower())

    total_tokens = len(words)
    total_types = len(set(words))

    # Guard against empty input; keep the ratio a float in every case.
    ttr = total_types / total_tokens if total_tokens else 0.0

    return ttr, total_tokens, total_types

# Example usage: compute the TTR statistics for the Django project home page.
url = 'https://www.djangoproject.com/'
clean_text = get_clean_text_from_url(url)
ttr, total_tokens, total_types = compute_ttr(clean_text)

# Report the ratio together with the two counts returned alongside it.
print(f"Type-to-Token Ratio (TTR): {ttr}")
print(f"Total Tokens: {total_tokens}")
print(f"Total Types: {total_types}")


Type-to-Token Ratio (TTR): 0.4462025316455696
Total Tokens: 632
Total Types: 282


Using regular expressions, identify the set of words satisfying the following conditions:
(a) Ending with 'ise'
(b) Containing the letter 'z'
(c) Containing the sequence "en"
(d) Having all lowercase letters except for an initial capital (i.e. titlecase)

In [58]:
import re
def find_words_with_conditions(text: str):
    """Sort the words of *text* into four regex-defined groups.

    The text is split into runs of word characters and every word is checked
    (case-sensitively) against each pattern; a word may land in several
    groups, and repeated words are kept in order of appearance.

    Returns:
        A tuple of four lists:
        words ending in 'ise', words containing 'z', words containing 'en',
        and words made of one capital letter followed by one or more
        lowercase letters.
    """
    ends_with_ise = re.compile(r'ise$')
    has_z = re.compile(r'z')
    has_en = re.compile(r'en')
    is_titlecase = re.compile(r'^[A-Z][a-z]+$')

    ise_hits, z_hits, en_hits, title_hits = [], [], [], []

    # One pass over the tokens, testing each against all four patterns.
    for token in re.findall(r'\b\w+\b', text):
        if ends_with_ise.search(token):
            ise_hits.append(token)
        if has_z.search(token):
            z_hits.append(token)
        if has_en.search(token):
            en_hits.append(token)
        if is_titlecase.match(token):
            title_hits.append(token)

    return ise_hits, z_hits, en_hits, title_hits

# Example usage: run the four regex filters over the Django project home page.
url = 'https://www.djangoproject.com/'
clean_text = get_clean_text_from_url(url)
ise_words, z_words, en_words, titlecase_words = find_words_with_conditions(clean_text)

# Print each group of matching words under its condition.
print("Words ending with 'ise':", ise_words)
print("Words containing the letter 'z':", z_words)
print("Words containing the sequence 'en':", en_words)
print("Words in titlecase:", titlecase_words)


Words ending with 'ise': ['merchandise']
Words containing the letter 'z': ['specialized', 'internationalization', 'Organization']
Words containing the sequence 'en': ['current', 'current', 'current', 'Menu', 'Documentation', 'current', 'current', 'current', 'encourages', 'development', 'experienced', 'development', 'reinvent', 'open', 'announcements', 'sending', 'sent', 'development', 'frequently', 'sending', 'sent', 'documentation', 'documentation', 'Henschel', 'development', 'open', 'encore', 'Development', 'currently', 'Statement', 'Benevity']
Words in titlecase: ['The', 'Django', 'Django', 'The', 'Toggle', 'Toggle', 'Toggle', 'Toggle', 'Light', 'Dark', 'Auto', 'Menu', 'Overview', 'Download', 'Documentation', 'News', 'Community', 'Code', 'Issues', 'About', 'Donate', 'Toggle', 'Toggle', 'Toggle', 'Toggle', 'Light', 'Dark', 'Auto', 'Until', 'October', 'All', 'Django', 'Django', 'Get', 'Django', 'Meet', 'Django', 'Django', 'Python', 'Built', 'It', 'Ridiculously', 'Django', 'Reassuringl

Print the top-10 words by frequency in each of the following POS categories: noun, verb, adjective

In [43]:
!pip install nltk # If the NLTK library isn't installed
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
import requests
from bs4 import BeautifulSoup
from collections import Counter

def get_clean_text_from_url(url: str, timeout: float = 10.0) -> "str | None":
    """Fetch a web page and return its text with the HTML mark-up removed.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on an
            unresponsive host.

    Returns:
        The page text, with the individual text fragments joined by single
        spaces and leading/trailing whitespace stripped, or ``None`` if the
        request fails (connection error, timeout, HTTP error status).
    """
    try:
        # Only the network call can raise RequestException, so keep the
        # try body limited to it.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # turn 4xx/5xx responses into exceptions
    except requests.exceptions.RequestException:
        # Signal failure with None; callers are expected to check for it.
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    clean_text = soup.get_text(separator=' ')

    return clean_text.strip()

def get_top_words_by_pos(text: str):
    """Return the ten most frequent nouns, verbs and adjectives in *text*.

    The text is tokenized and POS-tagged with NLTK; tokens are grouped by the
    prefix of their tag ('NN' for nouns, 'VB' for verbs, 'JJ' for
    adjectives) and counted case-sensitively.

    Args:
        text: The text to analyse, or ``None`` when no text is available.

    Returns:
        A ``(noun_freq, verb_freq, adjective_freq)`` tuple. Each element is a
        list of up to ten ``(word, count)`` pairs, most frequent first. When
        *text* is ``None`` a message is printed and three empty lists are
        returned, so callers can always unpack three values.
    """
    if text is None:
        print("No text available for analysis.")
        # A bare `return` here would make `a, b, c = get_top_words_by_pos(...)`
        # fail with a TypeError; hand back empty results instead.
        return [], [], []

    # Tokenize the text and tag each token with its part of speech
    words = nltk.word_tokenize(text)
    pos_tags = nltk.pos_tag(words)

    # Count the words of each POS category
    nouns = Counter(word for word, pos in pos_tags if pos.startswith('NN'))
    verbs = Counter(word for word, pos in pos_tags if pos.startswith('VB'))
    adjectives = Counter(word for word, pos in pos_tags if pos.startswith('JJ'))

    return nouns.most_common(10), verbs.most_common(10), adjectives.most_common(10)

# Example usage: analyse the text of the Django project home page.
url = 'https://www.djangoproject.com/'
clean_text = get_clean_text_from_url(url)
# Get top words for each POS category
noun_freq, verb_freq, adjective_freq = get_top_words_by_pos(clean_text)

# Print the results as lists of (word, count) pairs
print("Top 10 Nouns:", noun_freq)
print("Top 10 Verbs:", verb_freq)
print("Top 10 Adjectives:", adjective_freq)



[nltk_data]     C:\Users\jayap\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\jayap\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!

ERROR: Invalid requirement: '#': Expected package name at the start of dependency specifier
    #
    ^


Top 10 Nouns: [('Django', 51), ('theme', 14), ('Toggle', 8), ('Foundation', 8), ('Software', 7), ('web', 5), ('Get', 5), ('/', 4), ('development', 4), ('Support', 4)]
Top 10 Verbs: [('makes', 2), ('started', 2), ('is', 2), ('takes', 2), ('subscribe', 2), ('sending', 2), ('following', 2), ('be', 2), ('sent', 2), ('Contributing', 2)]
Top 10 Adjectives: [('current', 6), ('more', 3), ('open', 2), ('other', 2), ('More', 2), ('easier', 1), ('better', 1), ('less', 1), ('high-level', 1), ('rapid', 1)]


A prepositional phrase is defined as a group of words that begins with a preposition and ends with a noun, pronoun, or noun phrase (Det Noun, Det Adj Noun, etc.). Examples: "down the stairs", "above the table". From the text content scraped from the URL, find all prepositional phrases.

In [56]:
import nltk
from nltk import pos_tag, word_tokenize
from nltk.chunk import RegexpParser

# Ensure the NLTK resources needed by this cell are available locally
# (presumably the data behind word_tokenize and pos_tag — confirm the
# resource names against the installed NLTK version).
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

def get_text_from_url(url: str, timeout: float = 10.0) -> str:
    """Fetch a web page and return its text with the HTML mark-up removed.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on an
            unresponsive host.

    Returns:
        The page text, with the individual text fragments joined by newlines
        and leading/trailing whitespace stripped. If the request fails
        (connection error, timeout, HTTP error status), a string of the form
        ``"An error occurred: <details>"`` is returned instead of raising.
    """
    try:
        # Only the network call can raise RequestException, so keep the
        # try body limited to it.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # turn 4xx/5xx responses into exceptions
    except requests.exceptions.RequestException as e:
        return f"An error occurred: {e}"

    # Parse the HTML and keep only the text, one fragment per line.
    content = BeautifulSoup(response.content, 'html.parser')
    clean_content = content.get_text(separator='\n')

    return clean_content.strip()

def extract_prepositional_phrases(text):
    """Find prepositional phrases in *text* using an NLTK chunk grammar.

    The text is tokenized and POS-tagged, then chunked with a grammar that
    matches a preposition (``IN``), an optional determiner (``DT``) and one
    or more nouns, each optionally preceded by an adjective (``JJ``).

    Args:
        text: The text to analyse; may be ``None`` or empty.

    Returns:
        A list of the matched phrases, each as a space-joined string of its
        tokens. ``None`` or empty input yields an empty list, so the return
        type is the same for every input.
    """
    if not text:  # None or empty string: nothing to analyse
        return []

    # Tokenize and POS tagging
    tokens = word_tokenize(text)
    tagged_tokens = pos_tag(tokens)

    # Define a grammar for prepositional phrases
    grammar = r"""
      PP: {<IN> <DT>? (<JJ>? <NN.*>)+}  # Prepositional Phrase
    """

    # Chunk the tagged tokens according to the grammar
    parser = RegexpParser(grammar)
    parsed_tree = parser.parse(tagged_tokens)

    # Keep only the PP chunks, flattened back into plain strings
    prepositional_phrases = [' '.join(word for word, pos in subtree.leaves())
                             for subtree in parsed_tree.subtrees() if subtree.label() == 'PP']

    return prepositional_phrases

# Example usage: extract prepositional phrases from the Django project home page.
url = 'https://www.djangoproject.com/'
url_content = get_text_from_url(url)
prepositional_phrases = extract_prepositional_phrases(url_content)

print("Prepositional Phrases:", prepositional_phrases)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jayap\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\jayap\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Prepositional Phrases: ['for perfectionists', 'with deadlines | Django Django', 'for perfectionists', 'with deadlines', 'About ♥ Donate Toggle theme', 'Until October', 'with Django Meet Django Django', 'by experienced developers', 'of the hassle', 'of web development', 'from concept', 'on the web leverage Django ’', 'about Django Stay', 'in the loop Subscribe', 'with everything', 'in the Django community', 'with Django', 'of Django', 'of Directors', 'by Thibaud Colas', 'on September', 'off PyCharm', 'by Thibaud Colas', 'on September', 'guide Write', 'with Django', 'of Django Object-relational mapper Automatic admin interface Robust template system Quick internationalization Explore', 'Inside the Django community Get Help', 'with other Django users Django Discord Server Join', 'on the Django Forum', 'About the Foundation', 'about the DSF Django Links Learn', 'About Django Getting', 'with Django Team Organization Django Software Foundation Code', 'of Conduct Diversity Statement Get Invol