In [1]:

import streamlit as st
import pandas as pd
import torch

In [5]:
text ="""
Urbanization, a transformative global phenomenon, represents the increasing concentration of human populations in densely populated areas known as cities. This intricate process, driven by a complex interplay of socio-economic, political, and environmental forces, has profoundly reshaped human societies and the landscapes they inhabit. Understanding the nuances of urbanization is crucial for navigating the challenges and harnessing the opportunities it presents in our rapidly evolving world.

The engines of urbanization are multifaceted. Economic opportunities stand as a primary draw, with cities often serving as hubs for industry, commerce, and innovation, offering a wider array of jobs and higher earning potential compared to rural areas. This economic magnetism pulls individuals and families in search of improved livelihoods. Social factors also play a significant role. Cities frequently offer greater access to education, healthcare, cultural amenities, and diverse social networks, enhancing the quality of life for many. The allure of a vibrant social scene and the promise of upward mobility further fuel urban migration."""

In [6]:
#count the characters in the text
text_char = len(text)

In [7]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
#from collections import Counter


In [8]:
#load the spacy model
nlp = spacy.load("en_core_web_sm")

In [9]:
#process the text with spacy
doc = nlp(text)

In [10]:
# Create a list of tokens by iterating through each token in the 'doc' object.
# For each token, convert it to lowercase using token.text.lower().
# Apply a series of filters to include only relevant tokens:
    # Exclude punctuation using 'not token.is_punct'.
    # Exclude whitespace characters using 'not token.is_space'.
    # Exclude stop words (common words like 'the', 'a', 'is') using 'not token.is_stop'.
    # Explicitly exclude newline characters ('\n').
    # Explicitly exclude tab characters ('\t').
    # Explicitly exclude carriage return characters ('\r').
# The result is a list named 'tokens' containing the cleaned and lowercased textual content of the 'doc' object.
tokens = [token.text.lower() for token in doc
          if not token.is_punct  # Remove punctuation
          and not token.is_space  # Remove whitespace
          and not token.is_stop   # Remove stop words
          and token.text not in ["\n", "\t", "\r"]] # Remove newline/tab/carriage return

In [11]:
tokens

['urbanization',
 'transformative',
 'global',
 'phenomenon',
 'represents',
 'increasing',
 'concentration',
 'human',
 'populations',
 'densely',
 'populated',
 'areas',
 'known',
 'cities',
 'intricate',
 'process',
 'driven',
 'complex',
 'interplay',
 'socio',
 'economic',
 'political',
 'environmental',
 'forces',
 'profoundly',
 'reshaped',
 'human',
 'societies',
 'landscapes',
 'inhabit',
 'understanding',
 'nuances',
 'urbanization',
 'crucial',
 'navigating',
 'challenges',
 'harnessing',
 'opportunities',
 'presents',
 'rapidly',
 'evolving',
 'world',
 'engines',
 'urbanization',
 'multifaceted',
 'economic',
 'opportunities',
 'stand',
 'primary',
 'draw',
 'cities',
 'serving',
 'hubs',
 'industry',
 'commerce',
 'innovation',
 'offering',
 'wider',
 'array',
 'jobs',
 'higher',
 'earning',
 'potential',
 'compared',
 'rural',
 'areas',
 'economic',
 'magnetism',
 'pulls',
 'individuals',
 'families',
 'search',
 'improved',
 'livelihoods',
 'social',
 'factors',
 'play'

In [12]:
#alternative way to create tokens lists
tokens1 = []  # Initialize an empty list to store the processed tokens.
stopwords = list(STOP_WORDS)  # Get the default stop words from spaCy and convert them to a list for efficient 'in' checking.
allowed_pos = ['ADJ', 'PROPN', 'NOUN', 'VERB', 'ADV']  # Define a list of allowed Part-of-Speech (POS) tags that we want to keep (Adjective, Proper Noun, Noun, Verb, Adverb).

for token in doc:  # Iterate through each individual token that has been processed by the spaCy language model.
    if token.text in stopwords or token.text in punctuation:
        continue  # If the current token's text is found within our list of stop words OR if it's a punctuation mark, we skip this token and move to the next one.
    if token.pos_ in allowed_pos:
        tokens1.append(token.text.lower())  # If the current token's Part-of-Speech tag is present in our 'allowed_pos' list, we convert the token's text to lowercase and append it to our 'tokens1' list.



In [13]:
tokens1

['urbanization',
 'transformative',
 'global',
 'phenomenon',
 'represents',
 'increasing',
 'concentration',
 'human',
 'populations',
 'densely',
 'populated',
 'areas',
 'known',
 'cities',
 'intricate',
 'process',
 'driven',
 'complex',
 'interplay',
 'socio',
 'economic',
 'political',
 'environmental',
 'forces',
 'profoundly',
 'reshaped',
 'human',
 'societies',
 'landscapes',
 'inhabit',
 'understanding',
 'nuances',
 'urbanization',
 'crucial',
 'navigating',
 'challenges',
 'harnessing',
 'opportunities',
 'presents',
 'rapidly',
 'evolving',
 'world',
 'engines',
 'urbanization',
 'multifaceted',
 'economic',
 'opportunities',
 'stand',
 'primary',
 'draw',
 'cities',
 'serving',
 'hubs',
 'industry',
 'commerce',
 'innovation',
 'offering',
 'wider',
 'array',
 'jobs',
 'higher',
 'earning',
 'potential',
 'compared',
 'rural',
 'areas',
 'economic',
 'magnetism',
 'pulls',
 'individuals',
 'families',
 'search',
 'improved',
 'livelihoods',
 'social',
 'factors',
 'play'

In [14]:
from collections import Counter

In [15]:
word_freq = Counter(tokens1) # Create a Counter object to count the frequency of each token in the 'tokens1' list.

In [16]:
word_freq

Counter({'urbanization': 3,
         'cities': 3,
         'economic': 3,
         'social': 3,
         'human': 2,
         'areas': 2,
         'opportunities': 2,
         'transformative': 1,
         'global': 1,
         'phenomenon': 1,
         'represents': 1,
         'increasing': 1,
         'concentration': 1,
         'populations': 1,
         'densely': 1,
         'populated': 1,
         'known': 1,
         'intricate': 1,
         'process': 1,
         'driven': 1,
         'complex': 1,
         'interplay': 1,
         'socio': 1,
         'political': 1,
         'environmental': 1,
         'forces': 1,
         'profoundly': 1,
         'reshaped': 1,
         'societies': 1,
         'landscapes': 1,
         'inhabit': 1,
         'understanding': 1,
         'nuances': 1,
         'crucial': 1,
         'navigating': 1,
         'challenges': 1,
         'harnessing': 1,
         'presents': 1,
         'rapidly': 1,
         'evolving': 1,
         'world

In [18]:
max_freq = max(word_freq.values()) # Find the maximum frequency value from the 'word_freq' Counter object.
max_freq

3

In [19]:
for word in word_freq.keys(): # Iterate through each unique word in the 'word_freq' Counter object.
    word_freq[word] = (word_freq[word] / max_freq) # Normalize the frequency of each word by dividing its count by the maximum frequency value.  

In [20]:
word_freq

Counter({'urbanization': 1.0,
         'cities': 1.0,
         'economic': 1.0,
         'social': 1.0,
         'human': 0.6666666666666666,
         'areas': 0.6666666666666666,
         'opportunities': 0.6666666666666666,
         'transformative': 0.3333333333333333,
         'global': 0.3333333333333333,
         'phenomenon': 0.3333333333333333,
         'represents': 0.3333333333333333,
         'increasing': 0.3333333333333333,
         'concentration': 0.3333333333333333,
         'populations': 0.3333333333333333,
         'densely': 0.3333333333333333,
         'populated': 0.3333333333333333,
         'known': 0.3333333333333333,
         'intricate': 0.3333333333333333,
         'process': 0.3333333333333333,
         'driven': 0.3333333333333333,
         'complex': 0.3333333333333333,
         'interplay': 0.3333333333333333,
         'socio': 0.3333333333333333,
         'political': 0.3333333333333333,
         'environmental': 0.3333333333333333,
         'forces': 0

In [21]:
sent_token = [sent.text for sent in doc.sents] # Create a list of sentences by iterating through each sentence in the processed document.
#sent_token
sent_scores = {} # Initialize an empty dictionary to store the scores of each sentence.
#for sent in doc.sents: # Iterate through each sentence in the processed document.
    #for word in sent: # For each word in the current sentence.
        #if word.text.lower() in word_freq.keys(): # Check if the lowercase version of the word is present in our 'word_freq' dictionary.
            #if sent in sent_scores.keys(): # If the current sentence is already present in our 'sent_scores' dictionary.
                #sent_scores[sent] += word_freq[word.text.lower()] # Increment the score of the current sentence by the frequency of the word.
           # else:
                #sent_scores[sent] = word_freq[word.text.lower()] # If the sentence is not already present, initialize its score with the frequency of the word.

for sent in sent_token: # Iterate through each sentence in the processed document.
  for word in sent.split(): # For each word in the current sentence.
    if word.lower() in word_freq.keys():
      if sent not in sent_scores.keys(): # If the current sentence is not already present in our 'sent_scores' dictionary.
        sent_scores[sent] = word_freq[word]# Initialize the score of the current sentence with the frequency of the word.
      else:
      
          sent_scores[sent] += word_freq[word] # Increment the score of the current sentence by the frequency of the word.

  print(word) # Print the word being processed.
  print(sent) # Print the sentence being processed. 


cities.

Urbanization, a transformative global phenomenon, represents the increasing concentration of human populations in densely populated areas known as cities.
inhabit.
This intricate process, driven by a complex interplay of socio-economic, political, and environmental forces, has profoundly reshaped human societies and the landscapes they inhabit.
world.
Understanding the nuances of urbanization is crucial for navigating the challenges and harnessing the opportunities it presents in our rapidly evolving world.


multifaceted.
The engines of urbanization are multifaceted.
areas.
Economic opportunities stand as a primary draw, with cities often serving as hubs for industry, commerce, and innovation, offering a wider array of jobs and higher earning potential compared to rural areas.
livelihoods.
This economic magnetism pulls individuals and families in search of improved livelihoods.
role.
Social factors also play a significant role.
many.
Cities frequently offer greater access to 

In [22]:
sent_scores # Display the final scores of each sentence.

{'\nUrbanization, a transformative global phenomenon, represents the increasing concentration of human populations in densely populated areas known as cities.': 4.333333333333333,
 'This intricate process, driven by a complex interplay of socio-economic, political, and environmental forces, has profoundly reshaped human societies and the landscapes they inhabit.': 3.6666666666666665,
 'Understanding the nuances of urbanization is crucial for navigating the challenges and harnessing the opportunities it presents in our rapidly evolving world.\n\n': 4.333333333333333,
 'The engines of urbanization are multifaceted.': 1.3333333333333333,
 'Economic opportunities stand as a primary draw, with cities often serving as hubs for industry, commerce, and innovation, offering a wider array of jobs and higher earning potential compared to rural areas.': 5.999999999999998,
 'This economic magnetism pulls individuals and families in search of improved livelihoods.': 3.0,
 'Social factors also play a

In [23]:
import pandas as pd
# Create a DataFrame from the 'sent_scores' dictionary, where the index is the sentence and the values are the scores.
#df = pd.DataFrame.from_dict(sent_scores, orient='index', columns=['Score']) # Create a DataFrame from the 'sent_scores' dictionary, where the index is the sentence and the values are the scores.
#df = df.sort_values(by='Score', ascending=False) # Sort the DataFrame by the 'Score' column in descending order.
pd.DataFrame(list(sent_scores.items()), columns=['Sentence', 'Score']).sort_values(by='Score', ascending=False) # Create a DataFrame from the 'sent_scores' dictionary, where the index is the sentence and the values are the scores.

Unnamed: 0,Sentence,Score
4,Economic opportunities stand as a primary draw...,6.0
0,"\nUrbanization, a transformative global phenom...",4.333333
2,Understanding the nuances of urbanization is c...,4.333333
7,Cities frequently offer greater access to educ...,4.0
8,The allure of a vibrant social scene and the p...,3.666667
1,"This intricate process, driven by a complex in...",3.666667
5,This economic magnetism pulls individuals and ...,3.0
3,The engines of urbanization are multifaceted.,1.333333
6,Social factors also play a significant role.,1.0


In [24]:
from heapq import nlargest # Import the 'nlargest' function from the 'heapq' module to find the largest elements in an iterable.
num_sentences = 3 # Define the number of sentences to extract for the summary.

n = nlargest(num_sentences, sent_scores, key=sent_scores.get) # Use the 'nlargest' function to find the top 'number of sentences' sentences based on their scores.
" ".join(n) # Join the selected sentences into a single string, separating them with a newline character.
# Display the final summary. 

'Economic opportunities stand as a primary draw, with cities often serving as hubs for industry, commerce, and innovation, offering a wider array of jobs and higher earning potential compared to rural areas. \nUrbanization, a transformative global phenomenon, represents the increasing concentration of human populations in densely populated areas known as cities. Understanding the nuances of urbanization is crucial for navigating the challenges and harnessing the opportunities it presents in our rapidly evolving world.\n\n'