### Analyze Emotions

In [1]:
from textblob import TextBlob
from transformers import pipeline
from newsplease import NewsPlease
import re

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
emotions_class = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base", return_all_scores=True)
classes_class = pipeline("zero-shot-classification", model="MoritzLaurer/mDeBERTa-v3-base-mnli-xnli")



In [3]:
article = NewsPlease.from_url("https://www.npr.org/2023/12/22/1221230635/japan-alleged-political-corruption-ldp-slush-fund")

In [4]:
article.description

"Japan's governing Liberal Democratic Party replaced two of its top executives as part of a purge related to investigations into alleged political slush funds."

In [5]:
article_text = article.title + "\n" + article.maintext

In [6]:
# Splitting the text into sentences
sentences = re.split(r'(?<=[.!?])\s+', article_text)

# Creating a dictionary with the number of characters, words, and the content of each sentence
sentence_dict = []
for i, sentence in enumerate(sentences, 1):
    words = sentence.split()
    dict_append = {
        'chars': len(sentence), 
        'words': len(words), 
        'content': sentence,
        'emotions': {}
    }
    for item in emotions_class(sentence)[0]:
        dict_append['emotions'][item['label']] = round(item['score'] * 100, 2)

    sentence_dict.append(dict_append)

In [7]:
emotion_sum_char = 0
emotion_value = 0
emotions_percentage = {}

for emotion in ['anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise']:
    for sentence in sentence_dict:
        emotion_value += sentence['chars'] * sentence['emotions'][emotion]
        emotion_sum_char += sentence['chars']
    emotions_percentage[emotion] = emotion_value / emotion_sum_char
    emotion_value = 0
    emotion_sum_char = 0

In [8]:
emotions_percentage

{'anger': 12.074123116979731,
 'disgust': 25.135253859029195,
 'fear': 3.9958601450623035,
 'joy': 1.4329570392412125,
 'neutral': 49.124999070113454,
 'sadness': 7.03720104147294,
 'surprise': 1.2025813650734611}

In [9]:
sum_emotions = 0
for i in emotions_percentage.values():
    sum_emotions += i
sum_emotions

100.00297563697228

In [48]:
nltk.download('punkt')  # Required for the first time
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
sentences = tokenizer.tokenize(article_text)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\marco\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [59]:
for sentence in sentences:
    if len(sentence) > 430:
        print(sentence)
        print("--------")

Party bosses fall in Japan's worst political corruption scandal in decades
Party bosses fall in Japan's worst political corruption scandal in decades
Enlarge this image toggle caption Kyodo via Reuters Connect Kyodo via Reuters Connect
SEOUL, South Korea — Japan's governing Liberal Democratic Party (LDP) on Friday replaced two of its top executives, as part of a purge related to the worst corruption scandal to rock the country in three decades.
--------


In [35]:
import nltk

def split_into_sentences(text):
    nltk.download('punkt')  # Required for the first time
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    sentences = tokenizer.tokenize(text)
    return sentences

In [42]:
def balance_clusters(clusters):
    # Define a threshold for when to redistribute (e.g., if the last cluster is less than half the average size)
    average_size = sum(len(cluster) for cluster in clusters) / len(clusters)
    min_size = average_size / 2

    if len(clusters) > 1 and len(clusters[-1]) < min_size:
        # Attempt to redistribute
        last_cluster = clusters[-1].split()
        prev_cluster = clusters[-2].split()

        # While the last cluster is too short and the previous cluster has sentences to give
        while len(' '.join(last_cluster)) < min_size and prev_cluster:
            # Move the last sentence from the previous cluster to the beginning of the last cluster
            last_cluster.insert(0, prev_cluster.pop())

        # Update the clusters with the redistributed sentences
        clusters[-2] = ' '.join(prev_cluster)
        clusters[-1] = ' '.join(last_cluster)

    return clusters

In [43]:
def create_clusters(sentences):
    clusters = []
    current_cluster = ""

    for sentence in sentences:
        # Check if adding the next sentence would exceed the limit
        if len(current_cluster) + len(sentence) > 512:
            # If the current cluster is not empty, add it to clusters
            if current_cluster:
                clusters.append(current_cluster)
            # Start a new cluster with the current sentence
            current_cluster = sentence
        else:
            # Add a space if the cluster already has content
            if current_cluster:
                current_cluster += " "
            current_cluster += sentence

    # Don't forget to add the last cluster if it's not empty
    if current_cluster:
        clusters.append(current_cluster)

    return clusters

In [44]:
def process_text(article):
    sentences = split_into_sentences(article)
    initial_clusters = create_clusters(sentences)
    balanced_clusters = balance_clusters(initial_clusters)
    return balanced_clusters

In [45]:
clusters = process_text(article_text)

for i, cluster in enumerate(clusters, 1):
    print(f"Cluster {i}: {cluster[:50]}...")  # Prints the first 50 chars of each cluster

Cluster 1: Party bosses fall in Japan's worst political corru...
Cluster 2: The outgoing executives were in charge of policy a...
Cluster 3: "Japanese democracy's strength is going to be test...
Cluster 4: The LDP has only lost power twice in seven decades...
Cluster 5: Prosecutors, meanwhile, are looking into allegatio...
Cluster 6: In a statement cited by The Asahi Shimbun newspape...
Cluster 7: "This type of very clear sort of crime has been co...
Cluster 8: A poll by the Mainichi Shimbun newspaper found 79%...
Cluster 9: "The easiest way to understand factions is that th...
Cluster 10: But he says Japan must overhaul the selection of p...
Cluster 11: Critics accused Abe of trying to extend the retire...
Cluster 12: But Izumi believes the result of the scandal will ...
Cluster 13: They can't arrest lawmakers while parliament is in...


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\marco\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [39]:
article_text

'Party bosses fall in Japan\'s worst political corruption scandal in decades\nParty bosses fall in Japan\'s worst political corruption scandal in decades\nEnlarge this image toggle caption Kyodo via Reuters Connect Kyodo via Reuters Connect\nSEOUL, South Korea — Japan\'s governing Liberal Democratic Party (LDP) on Friday replaced two of its top executives, as part of a purge related to the worst corruption scandal to rock the country in three decades.\nThe outgoing executives were in charge of policy and parliamentary affairs. They belonged to an LDP faction — a sort of party within a party — formerly led by the late ex-Prime Minister Shinzo Abe.\nJapan\'s current prime minister, Fumio Kishida, has been sacking party chiefs and Cabinet members to save his administration, as prosecutors investigate allegations that LDP ministers and lawmakers violated political finance laws.\n"Japanese democracy\'s strength is going to be tested," says Hitoshi Tanaka, a former diplomat and special advis

In [10]:
blob = TextBlob(article_text)

blob.sentiment

Sentiment(polarity=-0.0023160173160173144, subjectivity=0.39767316017316023)

In [11]:
candidate_labels = ["Politics", "Economy", "Environment", 'Others']
classes_class(article_text, candidate_labels, multi_label=True)['scores'], classes_class(article.description, candidate_labels, multi_label=True)['scores']

([0.3441440463066101,
  0.21763548254966736,
  0.20591646432876587,
  0.13564704358577728],
 [0.8734523057937622,
  0.48352286219596863,
  0.15992116928100586,
  0.012142530642449856])

In [12]:
from sklearn.cluster import KMeans
import numpy as np

candidate_labels = ["Politics", "Economy", "Environment", 'Others']

output = classes_class(
    '''
    Machine Learning is a great job
    '''
    , candidate_labels, multi_label=True)

data_train = np.array(output['scores']).reshape(-1, 1)

kmeans = KMeans(n_clusters=2, n_init=10)

# Fit the model
kmeans.fit(data_train)

# Cluster labels
labels = kmeans.labels_

# Calculate the average value of each cluster
cluster_averages = [data_train[labels == i].mean() for i in range(2)]

# Identify the cluster with the higher average value
higher_avg_cluster = np.argmax(cluster_averages)

# Indices of elements in the higher data cluster
indices_higher_cluster = np.where(labels == higher_avg_cluster)[0]

print("Indices of elements in the cluster with higher data:", indices_higher_cluster)

Indices of elements in the cluster with higher data: [0]


In [119]:
output

{'sequence': '\n    Machine Learning is a great job\n    ',
 'labels': ['Others', 'Economy', 'Environment', 'Politics'],
 'scores': [0.020331770181655884,
  0.0007957927300594747,
  0.00040434286347590387,
  0.00037566403625532985]}

In [32]:
from sklearn.cluster import KMeans
import numpy as np

candidate_labels = ["Politics", "Economy", "Environment", 'Others']

output = classes_class(
    '''
    Machine Learning is going to boom the would economy, especially data science and the new natural language processing models
    '''
    , candidate_labels, multi_label=True)

data_train = np.array(output['scores']).reshape(-1, 1)

kmeans = KMeans(n_clusters=2, n_init=10)

# Fit the model
kmeans.fit(data_train)

# Cluster labels
labels = kmeans.labels_

# Calculate the average value of each cluster
cluster_averages = [data_train[labels == i].mean() for i in range(2)]
higher_avg_cluster = np.argmax(cluster_averages)
indices_higher_cluster = np.where(labels == higher_avg_cluster)[0]

index_low_cluster_start = indices_higher_cluster[-1] + 1
valid = []
MAX_SELECTED = 2
for i, label in enumerate(output['labels']):
    if i < index_low_cluster_start:
        if i < MAX_SELECTED:
            if label != "Others":
                valid.append(label)
            else:
                break

In [33]:
valid, output

(['Economy'],
 {'sequence': '\n    Machine Learning is going to boom the would economy, especially data science and the new natural language processing models\n    ',
  'labels': ['Economy', 'Others', 'Environment', 'Politics'],
  'scores': [0.5153793096542358,
   0.012050898745656013,
   0.00041549428715370595,
   0.00041548116132616997]})

In [120]:
output

{'sequence': '\n    Machine Learning is a great job\n    ',
 'labels': ['Others', 'Economy', 'Environment', 'Politics'],
 'scores': [0.020331770181655884,
  0.0007957927300594747,
  0.00040434286347590387,
  0.00037566403625532985]}

In [122]:
from sklearn.cluster import KMeans
import numpy as np

candidate_labels = ["Politics", "Economy", "Environment", 'Others']

output = classes_class(
    '''
    Machine Learning is a great job
    '''
    , candidate_labels, multi_label=True)

data_train = np.array(output['scores']).reshape(-1, 1)

kmeans = KMeans(n_clusters=2, n_init=10)

# Fit the model
kmeans.fit(data_train)

# Cluster labels
labels = kmeans.labels_

# Calculate the average value of each cluster
cluster_averages = [data_train[labels == i].mean() for i in range(2)]

# Identify the cluster with the higher average value
higher_avg_cluster = np.argmax(cluster_averages)

# Indices of elements in the higher data cluster
indices_higher_cluster = np.where(labels == higher_avg_cluster)[0]


labels = np.array(output['labels'])
scores = np.array(output['scores'])
others_index = np.where(output['labels'] == 'Others')
if labels[indices_higher_cluster][0] == "Others":
    print("No category")
else:
    for i in range(len(output['labels'])):
        if "Others" in labels[indices_higher_cluster]:
            if labels[np.argmax(scores[indices_higher_cluster])] == "Others":
                print("No category, right cluster but argmax is Others")

No category


  others_index = np.where(output['labels'] == 'Others')


In [41]:
classes_class(article.description, candidate_labels_2[:10], multi_label=True)['scores']

[0.9211400151252747,
 0.6830674409866333,
 0.2604702413082123,
 0.1896524429321289,
 0.16527467966079712,
 0.1205194815993309,
 0.09266605228185654,
 0.07585158944129944,
 0.017313361167907715,
 0.006902643013745546]

## Download models

In [54]:
emotions_class.model.save_pretrained('./models/emotions_classifier')
emotions_class.tokenizer.save_pretrained('./models/emotions_classifier')

classes_class.model.save_pretrained('./models/classes_classifier')
classes_class.tokenizer.save_pretrained('./models/classes_classifier')

('./models/classes_classifier\\tokenizer_config.json',
 './models/classes_classifier\\special_tokens_map.json',
 './models/classes_classifier\\spm.model',
 './models/classes_classifier\\added_tokens.json',
 './models/classes_classifier\\tokenizer.json')

In [55]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Load the emotion classification model
emotion_model_path = './models/emotions_classifier'

# Load the zero-shot classification model
classification_model_path = './models/classes_classifier'

In [58]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

emotion_tokenizer = AutoTokenizer.from_pretrained(emotion_model_path)
emotion_model = AutoModelForSequenceClassification.from_pretrained(emotion_model_path)
emotions_class = pipeline("text-classification", model=emotion_model, tokenizer=emotion_tokenizer, return_all_scores=True)

classification_tokenizer = AutoTokenizer.from_pretrained(classification_model_path)
classification_model = AutoModelForSequenceClassification.from_pretrained(classification_model_path)
classes_class = pipeline("zero-shot-classification", model=classification_model, tokenizer=classification_tokenizer)




In [86]:
article.date_download, article.date_publish

(datetime.datetime(2023, 12, 24, 17, 1, 20),
 datetime.datetime(2023, 12, 24, 6, 24, 9))

In [84]:
dir(article)[-20:]

['__subclasshook__',
 '__weakref__',
 'authors',
 'date_download',
 'date_modify',
 'date_publish',
 'description',
 'filename',
 'get_dict',
 'get_serializable_dict',
 'image_url',
 'language',
 'localpath',
 'maintext',
 'source_domain',
 'text',
 'title',
 'title_page',
 'title_rss',
 'url']

## NewsScraper

In [None]:
article = NewsPlease.from_url("https://www.npr.org/2023/12/22/1221230635/japan-alleged-political-corruption-ldp-slush-fund")

In [77]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, parse_qs

# URL of the Google News page you want to scrape
url = 'https://news.google.com/topstories?hl=en-US&gl=US&ceid=US:en'

# Headers to simulate a real user visit
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}

response = requests.get(url, headers=headers)

soup = BeautifulSoup(response.text, 'html.parser')

for link in soup.find_all('article'):
    a_tag = link.find('a', href=True)
    if a_tag and 'href' in a_tag.attrs:
        # Get the partial URL
        partial_url = a_tag.attrs['href']
        # Construct the full Google News URL
        google_news_url = f'https://news.google.com{partial_url}'

        response = requests.get(google_news_url, allow_redirects=True, timeout=10)
        
        # Get the final URL after redirection
        final_url = response.url
        
        print(final_url)

https://www.bbc.com/news/world-middle-east-67814475
https://www.wsj.com/world/middle-east/israeli-soldier-death-toll-grows-as-hamas-shifts-to-guerilla-attacks-dee2bc9b
https://www.timesofisrael.com/idf-says-8-more-soldiers-killed-amid-heavy-fighting-in-south-central-gaza-saturday/
https://www.nytimes.com/live/2023/12/24/world/israel-hamas-war-gaza-news
https://www.clickorlando.com/news/local/2023/12/24/victim-idd-person-of-interest-revealed-after-deadly-shooting-at-mall-in-ocala/
https://www.fox35orlando.com/news/active-shooter-reported-at-ocala-mall-police-say
https://www.usatoday.com/story/news/nation/2023/12/24/shooting-erupts-ocala-florida-mall/72024171007/
https://www.orlandosentinel.com/2023/12/23/a-man-is-killed-and-a-woman-injured-in-a-targeted-afternoon-shooting-at-a-florida-shopping-mall/
https://thehill.com/policy/defense/4375673-iranian-drone-struck-chemical-tanker-in-indian-ocean-pentagon/
https://apnews.com/article/bethlehem-christmas-israel-hamas-war-e408a3e48d18c69a7a0b

In [78]:
article = NewsPlease.from_url("https://www.bbc.com/news/world-middle-east-67814475")

In [82]:
article.description

'The Israeli PM says his troops will keep fighting - after one of their deadliest days of the conflict.'