In [1]:
# Tratamiento de los datos brutos de noticias

In [2]:
# Analysis window used to filter the financial dates further down:
# the start date is inclusive (>=) and the end date is exclusive (<).
fec_inicio, fec_fin = "2019-01-01", "2025-01-01"

# Rutas

In [3]:
# Base GCS path under which this notebook reads its inputs and writes its output.
path_datos = "gs://bucket-tfm-llc/datos"
# path_gkg = "gs://bucket-tfm-llc/gkg"

# Librerías

In [4]:
import pyspark.sql.functions as F
from pyspark.sql.types import *
from datetime import datetime, timedelta
from transformers import pipeline
import time

2025-06-15 09:10:52.765383: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [5]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as FT
from tqdm import tqdm
import pandas as pd

# Lectura de datos

In [6]:
# Load the raw news dataset: every parquet path matching datos_noticias_raw/*.
df_noticias_raw = spark.read.parquet(f"{path_datos}/datos_noticias_raw/*")

                                                                                

In [7]:
# Distinct dates present in the treated AMZN financial dataset, restricted to
# the half-open window [fec_inicio, fec_fin).
df_fec_financiera = (
    spark.read.parquet(f"{path_datos}/datos_financieros_amzn_trat")
    .select("date")
    .filter((F.col("date") >= fec_inicio) & (F.col("date") < fec_fin))
    .distinct()
)

In [8]:
# Sanity check: earliest and latest date kept after applying the window filter.
df_fec_financiera.select(F.min("date"), F.max("date")).show()



+-------------------+-------------------+
|          min(date)|          max(date)|
+-------------------+-------------------+
|2019-01-02 00:00:00|2024-12-31 00:00:00|
+-------------------+-------------------+



                                                                                

# Funciones

In [9]:
import re

def extract_page_title(var):
    """Return the text enclosed by the first ``<PAGE_TITLE>`` tag in ``var``.

    Args:
        var: String that may contain a ``<PAGE_TITLE>...</PAGE_TITLE>`` block,
            or ``None``.

    Returns:
        The title with surrounding whitespace stripped, or ``None`` when
        ``var`` is ``None`` or contains no ``<PAGE_TITLE>`` block.
    """
    if var is None:
        return None
    # The pattern is deliberately not anchored to the end of the string: the
    # title block may be followed by other content, and this keeps the helper
    # consistent with the Spark regexp_extract pattern used in this notebook.
    match = re.search(r"<PAGE_TITLE>(.*?)</PAGE_TITLE>", var)
    if match:
        return match.group(1).strip()
    return None

# Tratamiento inicial

In [10]:
# df_noticias_raw.show(n=2)

In [11]:
# Columns of the raw news table that the rest of the notebook works with.
vars_selected = [
    'GKGRECORDID',
    'DATE',
    'SourceCommonName',
    'DocumentIdentifier',
    'Themes',
    'Locations',
    'Persons',
    'Organizations',
    'V2Organizations',
    'V2Tone',
    'Extras',
]
# Preview the first two rows of the selected columns.
df_noticias_raw.select(*vars_selected).show(n=2)

[Stage 5:>                                                          (0 + 1) / 1]

+------------------+--------------+-------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|       GKGRECORDID|          DATE|   SourceCommonName|  DocumentIdentifier|              Themes|           Locations|             Persons|       Organizations|     V2Organizations|              V2Tone|              Extras|
+------------------+--------------+-------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|  20210202220000-7|20210202220000|channelnewsasia.com|https://www.chann...|TAX_FNCACT;TAX_FN...|                null|andy jassy;jeff b...| amazon web services|Amazon Web Servic...|0.980392156862745...|<PAGE_TITLE>Amazo...|
|20210202220000-169|20210202220000|businessinsider.com|https://www.busin...|TAX_FNCACT;TAX_FN...|3#Washi

                                                                                

In [12]:
# Build the parsed date columns and extract the page title:
df_noticias_fil = (
    df_noticias_raw.select(*vars_selected)
    # DATE starts with yyyyMMddHH (e.g. 20210202220000), so every part is a
    # fixed-position slice of it.
    .withColumn("year", F.substring("DATE", 1, 4))
    .withColumn("month", F.substring("DATE", 5, 2))
    .withColumn("day", F.substring("DATE", 7, 2))
    .withColumn("hour", F.substring("DATE", 9, 2))
    .withColumn(
        "date_trat",
        F.to_date(F.concat_ws("-", F.col("year"), F.col("month"), F.col("day")), "yyyy-MM-dd"),
    )
    .withColumn("page_title", F.regexp_extract(F.col("Extras"), "<PAGE_TITLE>(.*?)</PAGE_TITLE>", 1))
    # regexp_extract returns "" (not null) when the pattern does not match.
    # Turn those into null so that "no title" has a single representation and
    # isNull()/isNotNull() checks on page_title behave as expected.
    .withColumn("page_title", F.when(F.col("page_title") != "", F.col("page_title")))
)

In [13]:
# Number of news items without a usable title. A missing title can show up as
# null or as an empty string (F.regexp_extract returns "" when the pattern does
# not match), so both cases are counted.
df_noticias_fil.filter(F.col("page_title").isNull() | (F.col("page_title") == "")).count()

                                                                                

200

In [14]:
# News per year: number of items and number of distinct days with at least one item.
(
    df_noticias_fil
    .groupBy("year")
    .agg(
        F.count("*").alias("num_noticias"),
        F.countDistinct("date_trat").alias("num_dias"),
    )
    .orderBy("year")
    .show()
)



+----+------------+--------+
|year|num_noticias|num_dias|
+----+------------+--------+
|2019|        1311|     245|
|2020|         904|     234|
|2021|        1147|     236|
|2022|         970|     230|
|2023|        1125|     237|
|2024|        1047|     243|
+----+------------+--------+



                                                                                

In [15]:
# Compare the number of financial dates with the number of distinct dates
# that have at least one news item.
print("Nº fechas total: ", df_fec_financiera.count())
print("Nº fechas con noticias: ", df_noticias_fil.select("date_trat").distinct().count())

Nº fechas total:  1510




Nº fechas con noticias:  1425


                                                                                

# Análisis de sentimiento

## Tratamiento V2Tone
Hacemos el tratamiento del campo V2Tone para extraer la información contenida en él:

In [16]:
# V2Tone is a comma-separated string: split it once into a helper array,
# expose each position as its own typed column, then drop the helper.
df_noticias_trat = (
    df_noticias_fil
    .withColumn("V2Tone_aux", F.split(F.col("V2Tone"), ","))
    .withColumn("global_tone", F.col("V2Tone_aux").getItem(0).cast("float"))
    .withColumn("positive_score", F.col("V2Tone_aux").getItem(1).cast("float"))
    .withColumn("negative_score", F.col("V2Tone_aux").getItem(2).cast("float"))
    .withColumn("polarity", F.col("V2Tone_aux").getItem(3).cast("float"))
    .withColumn("activity_reference_density", F.col("V2Tone_aux").getItem(4).cast("float"))
    .withColumn("group_reference_density", F.col("V2Tone_aux").getItem(5).cast("float"))
    .withColumn("word_count", F.col("V2Tone_aux").getItem(6).cast("int"))
    .drop("V2Tone_aux")
)

In [17]:
# Summary statistics (count, mean, stddev, min, quartiles, max) of the main tone columns.
df_noticias_trat.select("global_tone","positive_score","negative_score","polarity").summary().show()



+-------+------------------+------------------+------------------+------------------+
|summary|       global_tone|    positive_score|    negative_score|          polarity|
+-------+------------------+------------------+------------------+------------------+
|  count|              6504|              6504|              6504|              6504|
|   mean|1.0784933317365608|3.1403073944302613|2.0618140547849046| 5.202121446401724|
| stddev|2.7994334925810302|1.8244021011446925|1.7368863030296444|2.2030901569579733|
|    min|       -12.1693125|               0.0|               0.0|               0.0|
|    25%|       -0.34602076|         2.0242915|          0.933126|         3.6713288|
|    50%|         1.1782032|         2.6143792|          1.421801|          4.805195|
|    75%|         2.5742574|         3.9308176|         2.7863777|           6.47482|
|    max|         12.755102|         13.265306|         13.756614|         16.857143|
+-------+------------------+------------------+-------

                                                                                

## FinBert - Títulos de las noticias
Aplicamos análisis de sentimiento a los títulos de las noticias:

In [48]:
# Collect the titles into pandas, together with the two key columns used later
# to join the scores back, skipping rows whose title is null.
df_titulos_noticias = (
    df_noticias_trat
    .select("GKGRECORDID", "DATE", "page_title")
    .where(F.col("page_title").isNotNull())
    .toPandas()
)

                                                                                

In [19]:
# Number of titles collected into pandas for scoring.
len(df_titulos_noticias)

6304

In [20]:
# Report the memory currently available on this machine, in MB.
import psutil
print(f"Memoria disponible: {psutil.virtual_memory().available / 1024**2:.2f} MB")
# 6800.21 

Memoria disponible: 5496.34 MB


In [39]:
# Load the FinBERT tokenizer and sequence-classification model by name.
model_name = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [40]:
# Inspect the label -> index mapping to know which output position belongs to each sentiment class.
model.config.label2id

{'positive': 0, 'negative': 1, 'neutral': 2}

In [45]:
# Function that returns the three FinBERT scores for one text
def get_finbert_scores(text):
    """Score a single text with FinBERT.

    Uses the module-level ``tokenizer`` and ``model`` loaded earlier in the
    notebook.

    Args:
        text: The text to score. Anything that is not a non-blank ``str``
            (``None``, NaN, numbers, lists, whitespace-only strings, ...) is
            treated as "nothing to score".

    Returns:
        A tuple ``(positive, negative, neutral)`` with the softmax
        probabilities as Python floats, or ``(None, None, None)`` when there
        is nothing to score.
    """
    # A single isinstance check covers None/NaN/pd.NA and also array-like
    # inputs, for which `pd.isnull(x) or ...` would raise an ambiguous
    # truth-value error instead of returning the empty result.
    if not isinstance(text, str) or not text.strip():
        return (None, None, None)
    inputs = tokenizer(text, return_tensors="pt", truncation=True)
    with torch.no_grad():  # inference only, no gradients needed
        logits = model(**inputs).logits
        probs = FT.softmax(logits, dim=1)
    # Positions follow the model's label2id mapping shown in the notebook:
    # {'positive': 0, 'negative': 1, 'neutral': 2}.
    positive_score, negative_score, neutral_score = probs[0, :3].tolist()
    return (positive_score, negative_score, neutral_score)

In [46]:
# Quick check of the scoring function on a single sample headline.
get_finbert_scores("EXCLUSIVE Amazon starts testing UK staff for coronavirus variants")

(0.04278571158647537, 0.5052314400672913, 0.45198285579681396)

In [49]:
# Score every title with FinBERT (with a progress bar) and report the elapsed time.
tqdm.pandas()  # registers Series.progress_apply
inicio = time.time()
# get_finbert_scores returns plain tuples; they are expanded into columns once,
# instead of wrapping every row's result in its own pd.Series.
df_titulos_noticias[["finbert_pos", "finbert_neg", "finbert_neu"]] = pd.DataFrame(
    df_titulos_noticias["page_title"].progress_apply(get_finbert_scores).tolist(),
    index=df_titulos_noticias.index,
    columns=["finbert_pos", "finbert_neg", "finbert_neu"],
)
fin = time.time()
print(f"Tiempo de ejecución: {((fin - inicio)/60):.4f} minutos")

100%|██████████| 6304/6304 [07:54<00:00, 13.30it/s]

Tiempo de ejecución: 7.9010 mintuos





In [50]:
# Inspect the titles together with their FinBERT score columns.
df_titulos_noticias

Unnamed: 0,GKGRECORDID,DATE,page_title,finbert_pos,finbert_neg,finbert_neu
0,20210202220000-7,20210202220000,Amazon's Bezos to step down from CEO role in t...,0.018390,0.422639,0.558971
1,20210202220000-169,20210202220000,Jeff Bezos will step down as Amazon CEO later ...,0.015970,0.405611,0.578419
2,20210202220000-271,20210202220000,Jeff Bezos to step down as Amazon CEO after re...,0.017405,0.516179,0.466415
3,20210202220000-344,20210202220000,Jeff Bezos is stepping down as Amazon CEO,0.018289,0.429903,0.551808
4,20210202220000-438,20210202220000,"Jeff Bezos stepping down as Amazon's CEO, will...",0.020910,0.117228,0.861862
...,...,...,...,...,...,...
6299,20240122220000-1850,20240122220000,Funky Taurus Media - Music Photo Agency & P...,0.027807,0.025194,0.946999
6300,20240508100000-1390,20240508100000,Funky Taurus Media - Music Photo Agency & P...,0.027807,0.025194,0.946999
6301,20240402130000-1681,20240402130000,NETSOL Technologies Achieves AWS Foundational ...,0.863743,0.008171,0.128087
6302,20240905130000-316,20240905130000,Amazon congratulates itself for AI code that m...,0.425772,0.014616,0.559612


### Otra forma:

In [51]:
# # Descargamos el modelo finbert:
# sentiment_pipeline = pipeline("sentiment-analysis",
#                             model="ProsusAI/finbert",
#                             tokenizer="ProsusAI/finbert",
#                             device=-1)

In [52]:
# print(f"Memoria disponible: {psutil.virtual_memory().available / 1024**2:.2f} MB")

In [53]:
# # Aplicamos análisis de sentimiento a cada título
# inicio = time.time()
# df_titulos_noticias["sentiment"] = df_titulos_noticias["page_title"].apply(lambda x: sentiment_pipeline(x)[0])
# fin = time.time()

In [54]:
# # Extraemos la información
# df_titulos_noticias["label"] = df_titulos_noticias["sentiment"].apply(lambda x: x["label"])
# df_titulos_noticias["score"] = df_titulos_noticias["sentiment"].apply(lambda x: x["score"])

# Output

In [57]:
# Convert the scored pandas DataFrame back into a Spark DataFrame.
df_final_sentimientos = spark.createDataFrame(df_titulos_noticias)

In [58]:
# Preview the scored titles without truncating long strings.
df_final_sentimientos.show(truncate=False)

+-------------------+--------------+-----------------------------------------------------------------------------------------------------+--------------------+--------------------+--------------------+
|GKGRECORDID        |DATE          |page_title                                                                                           |finbert_pos         |finbert_neg         |finbert_neu         |
+-------------------+--------------+-----------------------------------------------------------------------------------------------------+--------------------+--------------------+--------------------+
|20210202220000-7   |20210202220000|Amazon's Bezos to step down from CEO role in third quarter                                           |0.01838993839919567 |0.4226389229297638  |0.5589711666107178  |
|20210202220000-169 |20210202220000|Jeff Bezos will step down as Amazon CEO later this year                                              |0.015970265492796898|0.40561074018478394 |0.5784190297

In [59]:
# Attach the FinBERT scores to the full news table. The left join keeps news
# items that have no score; page_title is dropped on the right side because
# the left table already carries it.
df_output = df_noticias_trat.join(
    df_final_sentimientos.drop("page_title"),
    on=["GKGRECORDID", "DATE"],
    how="left",
)

In [60]:
# Row count of the joined output; it should equal df_noticias_trat.count()
# if the left join did not duplicate any rows.
df_output.count()

                                                                                

6504

In [61]:
# Row count of the base news table, for comparison with the joined output.
df_noticias_trat.count()

                                                                                

6504

In [62]:
# Persist the news table with its sentiment columns as parquet, replacing any previous output.
df_output.write.parquet(
    f"{path_datos}/noticias_con_analisis_sentimiento",
    mode="overwrite",
)

25/06/15 09:50:59 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                