In [4]:

text ="""
Globalization is a multifaceted process characterized by the increasing interconnectedness and interdependence of countries worldwide. This integration spans economic, social, cultural, political, and technological dimensions, driven by factors such as advancements in communication and transportation, the liberalization of trade and capital flows, and the spread of information. Essentially, globalization signifies a shift towards a more integrated global society where borders become less significant in various aspects of life.   

Economically, globalization manifests in the expansion of international trade, the rise of multinational corporations, and the interconnectedness of financial markets. This has led to increased competition, the specialization of production across different nations, and the potential for greater economic growth. Consumers often benefit from a wider variety of goods and services at potentially lower prices. However, this economic integration also presents challenges such as increased competition for domestic industries, potential job displacement in some sectors, and the risk of global economic crises spreading rapidly. """

In [6]:
#count the characters in the text
text_char = len(text)

In [8]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
#from collections import Counter


In [9]:
#load the spacy model
nlp = spacy.load("en_core_web_sm")

In [10]:
#process the text with spacy
doc = nlp(text)

In [None]:
# Create a list of tokens by iterating through each token in the 'doc' object.
# For each token, convert it to lowercase using token.text.lower().
# Apply a series of filters to include only relevant tokens:
    # Exclude punctuation using 'not token.is_punct'.
    # Exclude whitespace characters using 'not token.is_space'.
    # Exclude stop words (common words like 'the', 'a', 'is') using 'not token.is_stop'.
    # Explicitly exclude newline characters ('\n').
    # Explicitly exclude tab characters ('\t').
    # Explicitly exclude carriage return characters ('\r').
# The result is a list named 'tokens' containing the cleaned and lowercased textual content of the 'doc' object.
tokens = [token.text.lower() for token in doc
          if not token.is_punct  # Remove punctuation
          and not token.is_space  # Remove whitespace
          and not token.is_stop   # Remove stop words
          and token.text not in ["\n", "\t", "\r"]] # Remove newline/tab/carriage return

In [12]:
tokens

['globalization',
 'multifaceted',
 'process',
 'characterized',
 'increasing',
 'interconnectedness',
 'interdependence',
 'countries',
 'worldwide',
 'integration',
 'spans',
 'economic',
 'social',
 'cultural',
 'political',
 'technological',
 'dimensions',
 'driven',
 'factors',
 'advancements',
 'communication',
 'transportation',
 'liberalization',
 'trade',
 'capital',
 'flows',
 'spread',
 'information',
 'essentially',
 'globalization',
 'signifies',
 'shift',
 'integrated',
 'global',
 'society',
 'borders',
 'significant',
 'aspects',
 'life',
 'economically',
 'globalization',
 'manifests',
 'expansion',
 'international',
 'trade',
 'rise',
 'multinational',
 'corporations',
 'interconnectedness',
 'financial',
 'markets',
 'led',
 'increased',
 'competition',
 'specialization',
 'production',
 'different',
 'nations',
 'potential',
 'greater',
 'economic',
 'growth',
 'consumers',
 'benefit',
 'wider',
 'variety',
 'goods',
 'services',
 'potentially',
 'lower',
 'prices',

In [None]:
#alternative way to create tokens lists
tokens1 = []  # Initialize an empty list to store the processed tokens.
stopwords = list(STOP_WORDS)  # Get the default stop words from spaCy and convert them to a list for efficient 'in' checking.
allowed_pos = ['ADJ', 'PROPN', 'NOUN', 'VERB', 'ADV']  # Define a list of allowed Part-of-Speech (POS) tags that we want to keep (Adjective, Proper Noun, Noun, Verb, Adverb).

for token in doc:  # Iterate through each individual token that has been processed by the spaCy language model.
    if token.text in stopwords or token.text in punctuation:
        continue  # If the current token's text is found within our list of stop words OR if it's a punctuation mark, we skip this token and move to the next one.
    if token.pos_ in allowed_pos:
        tokens1.append(token.text.lower())  # If the current token's Part-of-Speech tag is present in our 'allowed_pos' list, we convert the token's text to lowercase and append it to our 'tokens1' list.



In [15]:
tokens1

['globalization',
 'multifaceted',
 'process',
 'characterized',
 'increasing',
 'interconnectedness',
 'interdependence',
 'countries',
 'worldwide',
 'integration',
 'spans',
 'economic',
 'social',
 'cultural',
 'political',
 'technological',
 'dimensions',
 'driven',
 'factors',
 'advancements',
 'communication',
 'transportation',
 'liberalization',
 'trade',
 'capital',
 'flows',
 'spread',
 'information',
 'essentially',
 'globalization',
 'signifies',
 'shift',
 'integrated',
 'global',
 'society',
 'borders',
 'significant',
 'aspects',
 'life',
 'economically',
 'globalization',
 'manifests',
 'expansion',
 'international',
 'trade',
 'rise',
 'multinational',
 'corporations',
 'interconnectedness',
 'financial',
 'markets',
 'led',
 'increased',
 'competition',
 'specialization',
 'production',
 'different',
 'nations',
 'potential',
 'greater',
 'economic',
 'growth',
 'consumers',
 'benefit',
 'wider',
 'variety',
 'goods',
 'services',
 'potentially',
 'lower',
 'prices',

In [17]:
from collections import Counter

In [18]:
word_freq = Counter(tokens1) # Create a Counter object to count the frequency of each token in the 'tokens1' list.

In [19]:
word_freq

Counter({'economic': 4,
         'globalization': 3,
         'interconnectedness': 2,
         'integration': 2,
         'trade': 2,
         'global': 2,
         'increased': 2,
         'competition': 2,
         'potential': 2,
         'multifaceted': 1,
         'process': 1,
         'characterized': 1,
         'increasing': 1,
         'interdependence': 1,
         'countries': 1,
         'worldwide': 1,
         'spans': 1,
         'social': 1,
         'cultural': 1,
         'political': 1,
         'technological': 1,
         'dimensions': 1,
         'driven': 1,
         'factors': 1,
         'advancements': 1,
         'communication': 1,
         'transportation': 1,
         'liberalization': 1,
         'capital': 1,
         'flows': 1,
         'spread': 1,
         'information': 1,
         'essentially': 1,
         'signifies': 1,
         'shift': 1,
         'integrated': 1,
         'society': 1,
         'borders': 1,
         'significant': 1,
     

In [20]:
max_freq = max(word_freq.values()) # Find the maximum frequency value from the 'word_freq' Counter object.
max_freq

4

In [21]:
for word in word_freq.keys(): # Iterate through each unique word in the 'word_freq' Counter object.
    word_freq[word] = (word_freq[word] / max_freq) # Normalize the frequency of each word by dividing its count by the maximum frequency value.  

In [22]:
word_freq

Counter({'economic': 1.0,
         'globalization': 0.75,
         'interconnectedness': 0.5,
         'integration': 0.5,
         'trade': 0.5,
         'global': 0.5,
         'increased': 0.5,
         'competition': 0.5,
         'potential': 0.5,
         'multifaceted': 0.25,
         'process': 0.25,
         'characterized': 0.25,
         'increasing': 0.25,
         'interdependence': 0.25,
         'countries': 0.25,
         'worldwide': 0.25,
         'spans': 0.25,
         'social': 0.25,
         'cultural': 0.25,
         'political': 0.25,
         'technological': 0.25,
         'dimensions': 0.25,
         'driven': 0.25,
         'factors': 0.25,
         'advancements': 0.25,
         'communication': 0.25,
         'transportation': 0.25,
         'liberalization': 0.25,
         'capital': 0.25,
         'flows': 0.25,
         'spread': 0.25,
         'information': 0.25,
         'essentially': 0.25,
         'signifies': 0.25,
         'shift': 0.25,
       

In [28]:
sent_token = [sent.text for sent in doc.sents] # Create a list of sentences by iterating through each sentence in the processed document.
#sent_token
sent_scores = {} # Initialize an empty dictionary to store the scores of each sentence.
#for sent in doc.sents: # Iterate through each sentence in the processed document.
    #for word in sent: # For each word in the current sentence.
        #if word.text.lower() in word_freq.keys(): # Check if the lowercase version of the word is present in our 'word_freq' dictionary.
            #if sent in sent_scores.keys(): # If the current sentence is already present in our 'sent_scores' dictionary.
                #sent_scores[sent] += word_freq[word.text.lower()] # Increment the score of the current sentence by the frequency of the word.
           # else:
                #sent_scores[sent] = word_freq[word.text.lower()] # If the sentence is not already present, initialize its score with the frequency of the word.

for sent in sent_token: # Iterate through each sentence in the processed document.
  for word in sent.split(): # For each word in the current sentence.
    if word.lower() in word_freq.keys():
      if sent not in sent_scores.keys(): # If the current sentence is not already present in our 'sent_scores' dictionary.
        sent_scores[sent] = word_freq[word]# Initialize the score of the current sentence with the frequency of the word.
      else:
      
          sent_scores[sent] += word_freq[word] # Increment the score of the current sentence by the frequency of the word.

  print(word) # Print the word being processed.
  print(sent) # Print the sentence being processed. 


worldwide.

Globalization is a multifaceted process characterized by the increasing interconnectedness and interdependence of countries worldwide.
information.
This integration spans economic, social, cultural, political, and technological dimensions, driven by factors such as advancements in communication and transportation, the liberalization of trade and capital flows, and the spread of information.
life.
Essentially, globalization signifies a shift towards a more integrated global society where borders become less significant in various aspects of life.   


markets.
Economically, globalization manifests in the expansion of international trade, the rise of multinational corporations, and the interconnectedness of financial markets.
growth.
This has led to increased competition, the specialization of production across different nations, and the potential for greater economic growth.
prices.
Consumers often benefit from a wider variety of goods and services at potentially lower price

In [29]:
sent_scores # Display the final scores of each sentence.

{'\nGlobalization is a multifaceted process characterized by the increasing interconnectedness and interdependence of countries worldwide.': 2.0,
 'This integration spans economic, social, cultural, political, and technological dimensions, driven by factors such as advancements in communication and transportation, the liberalization of trade and capital flows, and the spread of information.': 3.25,
 'Essentially, globalization signifies a shift towards a more integrated global society where borders become less significant in various aspects of life. \xa0 \n\n': 3.0,
 'Economically, globalization manifests in the expansion of international trade, the rise of multinational corporations, and the interconnectedness of financial markets.': 2.75,
 'This has led to increased competition, the specialization of production across different nations, and the potential for greater economic growth.': 3.25,
 'Consumers often benefit from a wider variety of goods and services at potentially lower pric

In [30]:
import pandas as pd
# Create a DataFrame from the 'sent_scores' dictionary, where the index is the sentence and the values are the scores.
#df = pd.DataFrame.from_dict(sent_scores, orient='index', columns=['Score']) # Create a DataFrame from the 'sent_scores' dictionary, where the index is the sentence and the values are the scores.
#df = df.sort_values(by='Score', ascending=False) # Sort the DataFrame by the 'Score' column in descending order.
pd.DataFrame(list(sent_scores.items()), columns=['Sentence', 'Score']).sort_values(by='Score', ascending=False) # Create a DataFrame from the 'sent_scores' dictionary, where the index is the sentence and the values are the scores.

Unnamed: 0,Sentence,Score
6,"However, this economic integration also presen...",6.5
1,"This integration spans economic, social, cultu...",3.25
4,"This has led to increased competition, the spe...",3.25
2,"Essentially, globalization signifies a shift t...",3.0
3,"Economically, globalization manifests in the e...",2.75
0,\nGlobalization is a multifaceted process char...,2.0
5,Consumers often benefit from a wider variety o...,1.75


In [32]:
from heapq import nlargest # Import the 'nlargest' function from the 'heapq' module to find the largest elements in an iterable.
num_sentences = 3 # Define the number of sentences to extract for the summary.

n = nlargest(num_sentences, sent_scores, key=sent_scores.get) # Use the 'nlargest' function to find the top 'number of sentences' sentences based on their scores.
" ".join(n) # Join the selected sentences into a single string, separating them with a newline character.
# Display the final summary. 

'However, this economic integration also presents challenges such as increased competition for domestic industries, potential job displacement in some sectors, and the risk of global economic crises spreading rapidly. This integration spans economic, social, cultural, political, and technological dimensions, driven by factors such as advancements in communication and transportation, the liberalization of trade and capital flows, and the spread of information. This has led to increased competition, the specialization of production across different nations, and the potential for greater economic growth.'