In [1]:
import time
import joblib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import (
    ConfusionMatrixDisplay, 
    precision_recall_curve, 
    roc_curve, 
    auc
)
from traditional_split_data import data_split

from preprocess_data_legacy_pipeline import load_data, prepare_date_for_model, prepare_text_classic, prepare_context_generic, add_count_proportions, process_subject, remove_unused_columns

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Usuario\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Usuario\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Usuario\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Fitted pipelines keyed by display name; only the XGBoost pipeline is loaded here.
# NOTE(review): joblib.load unpickles the file, so it must only be pointed at a
# trusted artifact. The output of this cell also shows an XGBoost warning that the
# model was serialised by an older XGBoost version; re-exporting it as that
# warning suggests (Booster.save_model) should remove it — TODO confirm.
models = {
'XGBoost'   : joblib.load('xgboost_pipeline.joblib')
}

# Full dataset; the rows scored later in the notebook are sampled from it.
df = load_data('liar2_no_null_data.csv')

configuration generated by an older version of XGBoost, please export the model by calling
`Booster.save_model` from that version first, then load it back in current version. See:

    https://xgboost.readthedocs.io/en/stable/tutorials/saving_model.html

for more details about differences between saving model and serializing.



In [3]:
from sklearn.model_selection import train_test_split

# train_test_split performs the stratified sampling: an integer test_size asks for
# that many rows, drawn proportionally to the 'label' distribution. Only that
# 15-row side of the split is kept; the other side is discarded.
_, df_stratified = train_test_split(
    df, test_size=15, stratify=df['label'], random_state=200900
)

print(df_stratified)

# The label column is dropped after the sample has been printed.
df_stratified = df_stratified.drop(columns='label')

df_stratified.head(15)

          id  label                                          statement  \
20368   1786      4  The addition of a 1 percent sales tax would co...   
8157   22312      0  The Michigan attorney general's race was "stol...   
18997  10823      5  Sen. Sanders did vote five times against the B...   
10984  15845      0  Right here (in West Virginia), they are buildi...   
13676  18420      1  There's no court session between now and the e...   
13196  19551      3  The proposal by President Joe Biden would put ...   
10770  13846      2  In the Alabama Senate runoff, "the candidate I...   
10741   3097      1  The government will "go out and buy my breast ...   
14969  22059      2  Lee County, Florida, wasn't "even in the cone"...   
17105   1057      3  Medicare has at least $80 billion worth of fra...   
17596  17115      1  Says the United Nations plans to "implant ever...   
21953  22925      1                    Israel is banning Christianity.   
10366   8372      3  In 2002, "Iran wa

Unnamed: 0,id,statement,date,subject,speaker,speaker_description,true_counts,mostly_true_counts,half_true_counts,mostly_false_counts,false_counts,pants_on_fire_counts,context,justification
20368,1786,The addition of a 1 percent sales tax would co...,"June 18, 2010",economy;energy;infrastructure;transportation;u...,hillsborough area regional transit,"Hillsborough Area Rapid Transit, or HART, is t...",0,1,0,0,0,0,a brochure,“The real number is probably somewhere in betw...
8157,22312,"The Michigan attorney general's race was ""stol...","November 9, 2022",elections;facebook fact-checks,instagram posts,"Posters on Instagram, where people share photo...",3,3,15,51,405,140,an Instagram post,"The site showed Nessel with 2,320,440 votes, a..."
18997,10823,Sen. Sanders did vote five times against the B...,"October 13, 2015",guns,hillary clinton,Hillary Clinton was the 2016 Democratic nomine...,72,76,70,43,31,9,the first Democratic debate,"Clinton said, ""Sen. Sanders did vote five time..."
10984,15845,"Right here (in West Virginia), they are buildi...","March 5, 2019",immigration;housing;race and ethnicity,facebook posts,Posters on Facebook and other social media net...,24,50,108,245,1410,570,posts on the Internet,"The 2017 video said that in West Virginia, ""th..."
13676,18420,There's no court session between now and the e...,"September 20, 2020",supreme court,joe biden,Joe Biden is the president of the United State...,25,64,65,52,55,7,Philadelphia,"Biden said, ""There's no court session between ..."
13196,19551,The proposal by President Joe Biden would put ...,"April 4, 2021",corporations;taxes,roy blunt,Roy Blunt is the junior U.S. senator from Miss...,5,2,4,3,3,1,"remarks on ""Fox News Sunday""",Blunt said Biden’s proposal would put the U.S....
10770,13846,"In the Alabama Senate runoff, ""the candidate I...","September 30, 2017",polls and public opinion,donald trump,Donald Trump is the former president of the Un...,36,85,120,194,360,179,a tweet,Trump tweeted that in the recent Alabama Senat...
10741,3097,"The government will ""go out and buy my breast ...","February 15, 2011",health care;women;taxes,michele bachmann,"Michele Bachmann, first elected to the U.S. Co...",5,4,6,8,23,16,"an interview on ""The Laura Ingraham Show""","It’s a catchy sound bite, but like a lot of so..."
14969,22059,"Lee County, Florida, wasn't ""even in the cone""...","October 2, 2022",environment,ron desantis,"Ron DeSantis announced on May 24, 2023 that he...",2,10,9,13,14,2,an interview with CNN,Most of Lee County was not within Hurricane Ia...
17105,1057,Medicare has at least $80 billion worth of fra...,"August 24, 2009",health care;medicare,tom coburn,Tom Coburn is a former U.S. Senator from Oklah...,0,2,3,2,3,0,Fox News Channel's On the Record With Greta Va...,"He added that Coburn ""doesn't know any more th..."


In [4]:
from dataclasses import dataclass
from datetime import datetime

@dataclass
class Noticia:
    """Record describing one news statement shown with its veracity score.

    Fields are plain values so that the instance dictionary can be turned
    directly into a DataFrame row / JSON object.
    """
    # Text of the statement.
    title: str
    # Where the statement was made (built from the dataset's 'context' column).
    source: str
    # Date as a 'YYYY-MM-DD' string: the notebook formats it with strftime before
    # constructing the record, so this is a str, not a datetime object.
    date: str
    # Integer veracity score; created as 0 and filled in after prediction.
    veracity: int
    # Image and article URLs; created empty and filled in later.
    image: str
    url: str
    # Explanation text; empty by default.
    explanation: str = ""

In [5]:
# Build one Noticia per sampled row, in the row order of df_stratified.
noticias = []


for _, data in df_stratified.iterrows():

    # errors='coerce' turns an unparseable date into NaT, and calling strftime on
    # NaT raises. Handle that case explicitly and store an empty string, which is
    # the placeholder the other not-yet-known fields use.
    parsed = pd.to_datetime(data['date'], format='%B %d, %Y', errors='coerce')
    fecha = "" if pd.isna(parsed) else parsed.strftime('%Y-%m-%d')

    noticia = Noticia(
        title=data['statement'],
        # Sentence-case the context, e.g. 'an Instagram post' -> 'An instagram post'.
        source=data['context'].lower().capitalize(),
        date=fecha,
        # Placeholders: veracity, image, url and explanation are filled in later.
        veracity=0,
        image="",
        url="",
        explanation=""
    )
    noticias.append(noticia)

print(noticias)

[Noticia(title='The addition of a 1 percent sales tax would cost a typical household in Hillsborough County approximately $12 a month.', source='A brochure', date='2010-06-18', veracity=0, image='', url='', explanation=''), Noticia(title='The Michigan attorney general\'s race was "stolen in the middle of the night from Matt DePerno.', source='An instagram post', date='2022-11-09', veracity=0, image='', url='', explanation=''), Noticia(title='Sen. Sanders did vote five times against the Brady bill.', source='The first democratic debate', date='2015-10-13', veracity=0, image='', url='', explanation=''), Noticia(title='Right here (in West Virginia), they are building new houses … so that 321 Syrian Muslims can move down into this neighborhood in two months. … (A funeral home) is to be torn down in two months, and a mosque is going to built there.', source='Posts on the internet', date='2019-03-05', veracity=0, image='', url='', explanation=''), Noticia(title="There's no court session betw

In [6]:
# Run the legacy preprocessing helpers in their original order; each helper
# receives the frame returned by the previous one.
df_stratified = (
    df_stratified
    .pipe(prepare_date_for_model)
    .pipe(prepare_text_classic, ['statement','speaker_description','justification'])
    .pipe(prepare_context_generic)
    .pipe(add_count_proportions)
    .pipe(process_subject)
    .pipe(remove_unused_columns)
)

df_stratified.head(20)

Unnamed: 0,subject,year,month,day,dayofweek,clean_statement,statement_len_chars,statement_len_words,clean_speaker_description,speaker_description_len_chars,...,ctx_social_media,ctx_verbal_event,ctx_location,total_counts,true_prop,mostly_true_prop,half_true_prop,mostly_false_prop,false_prop,pants_on_fire_prop
20368,economy energy infrastructure transportation u...,2010,6,18,4,addition percent sale tax would cost typical h...,94,12,hillsborough area rapid transit hart agency op...,83,...,False,False,False,1,0.0,1.0,0.0,0.0,0.0,0.0
8157,elections facebook_fact_checks,2022,11,9,2,michigan attorney general race stolen middle n...,63,9,poster instagram people share photo graphic so...,64,...,True,False,False,617,0.004862,0.004862,0.024311,0.082658,0.656402,0.226904
18997,guns,2015,10,13,1,sen sander vote five time brady bill,36,7,hillary clinton democratic nominee president s...,340,...,False,True,False,301,0.239203,0.252492,0.232558,0.142857,0.10299,0.0299
10984,immigration housing race_and_ethnicity,2019,3,5,1,right west virginia building new house syrian ...,127,20,poster facebook social medium network,37,...,True,False,False,2407,0.009971,0.020773,0.044869,0.101786,0.585791,0.236809
13676,supreme_court,2020,9,20,6,there court session end election,32,5,joe biden president united state democrat bide...,389,...,False,False,False,268,0.093284,0.238806,0.242537,0.19403,0.205224,0.026119
13196,corporations taxes,2021,4,4,6,proposal president joe biden would put u world...,73,12,roy blunt junior u senator missouri formerly u...,122,...,False,True,False,18,0.277778,0.111111,0.222222,0.166667,0.166667,0.055556
10770,polls_and_public_opinion,2017,9,30,5,alabama senate runoff candidate endorsed luthe...,83,11,donald trump former president united state ele...,232,...,True,False,False,974,0.036961,0.087269,0.123203,0.199179,0.36961,0.183778
10741,health_care women taxes,2011,2,15,1,government go buy breast pump baby,34,6,michele bachmann first elected u congress repr...,302,...,False,True,False,62,0.080645,0.064516,0.096774,0.129032,0.370968,0.258065
14969,environment,2022,10,2,6,lee county florida wasnt even cone hurricane i...,62,10,ron desantis announced may running president r...,338,...,False,True,False,50,0.04,0.2,0.18,0.26,0.28,0.04
17105,health_care medicare,2009,8,24,0,medicare least billion worth fraud year thats ...,101,16,tom coburn former u senator oklahoma physician,46,...,False,False,False,10,0.0,0.2,0.3,0.2,0.3,0.0


In [7]:
# Predicted class index for every sampled row, using the whole pipeline.
y_pred = models['XGBoost'].predict(df_stratified)

# Lookup table: predicted class index -> base score.
mapping = np.array([0, 20, 40, 50, 75, 100])

# Look up the base score of each prediction, subtract 15 points and floor the
# result at 0 (no upper bound is applied).
raw = mapping[y_pred] - 15
mapped = np.clip(raw, a_min=0, a_max=None)

print(mapped)

[60  5 85  5 85 35  5  5 25 60  0  5  5 60 85]


In [31]:
import pandas as pd

# Dynamically turn pipeline feature names into human-readable descriptions
def humanize_feature(name):
    """Return an English description for a prefixed feature name.

    The name is matched against a set of known prefixes (``nums__``,
    ``cats__`` and the ``<field>__tfidf_word__`` / ``<field>__tfidf_char__``
    families). The text after the first matching prefix is inserted into that
    prefix's template.

    Parameters
    ----------
    name : str
        Feature name, e.g. ``'nums__statement_len_chars'`` or
        ``'stmt__tfidf_char__tax'``.

    Returns
    -------
    str
        The description. Names with no known prefix are returned with their
        underscores replaced by spaces.
    """

    def _describe_numeric(m):
        # Suffix decides the wording; the suffix itself is stripped from the label.
        if m.endswith('_prop'):
            return f"proportion of {m[:-len('_prop')].replace('_', ' ')} statements"
        # These two checks used to be written as `m == m.endswith(...)`, which
        # compares a str with a bool and is never true, so every length feature
        # fell through to the generic description below.
        if m.endswith('_len_chars'):
            return f"length of {m[:-len('_len_chars')].replace('_', ' ')} in characters"
        if m.endswith('_len_words'):
            return f"length of {m[:-len('_len_words')].replace('_', ' ')} in words"
        # NOTE(review): every other numeric feature gets this fixed text; confirm
        # it is the right wording for all remaining `nums__` columns.
        return "total number of words"

    # Prefix -> template mapping; the first prefix that matches wins.
    prefix_map = {
        'nums__': _describe_numeric,
        'justf__tfidf_word__': lambda m: f'the word token "{m}" in the justification',
        'justf__tfidf_char__': lambda m: f'the character token "{m}" in the justification',
        'spkdesc__tfidf_word__': lambda m: f'the word token "{m}" in the speaker description',
        'spkdesc__tfidf_char__': lambda m: f'the character token "{m}" in the speaker description',
        'stmt__tfidf_word__': lambda m: f'the word token "{m}" in the statement',
        'stmt__tfidf_char__': lambda m: f'the character token "{m}" in the statement',
        'subj__tfidf_char__': lambda m: f'the character token "{m}" in the subject',
        'subj__tfidf_word__': lambda m: f'the word token "{m}" in the subject',
        'cats__': lambda m: f"category '{m.replace('_', ' ')}'"
    }
    for prefix, template in prefix_map.items():
        if name.startswith(prefix):
            metric = name[len(prefix):]
            return template(metric)
    # Generic fallback for names without a known prefix
    return name.replace('__', ' ').replace('_', ' ')

In [65]:
import numpy as np
import shap
import pandas as pd

# Only X_train is used below (as SHAP background data); the other splits are
# unpacked because data_split() returns all six.
X_train, X_val, X_test, y_train, y_val, y_test = data_split()

vectorizer = models['XGBoost'].named_steps['pre']
clf        = models['XGBoost'].named_steps['clf']

# Transform and cache X_train as a dense 2D float array for the explainer.
# NOTE(review): this densifies the whole transformed training set and uses all of
# it as background data; a random subsample may be enough and much cheaper — confirm.
X_train_vec = vectorizer.transform(X_train)
X_train_arr = X_train_vec.toarray().astype(float)

explainer = shap.TreeExplainer(
    clf,
    data=X_train_arr,
    feature_perturbation='interventional'
)

# Display name of each class, indexed by predicted class index.
labels = ['Pants on fire', 'False', 'Barely-true', 'Half-true', 'Mostly-true', 'True']

# The transformer's output feature names are the same for every row, so resolve
# them once instead of once per iteration.
feature_names = vectorizer.get_feature_names_out()

# One list of explanation sentences per row of df_stratified, in row order.
all_frases = []

for pos in range(len(df_stratified)):
    # Vectorise one row at a time; iloc[[pos]] keeps it as a one-row frame and
    # selects by position, so it does not depend on the index labels being unique.
    X_test_vec = vectorizer.transform(df_stratified.iloc[[pos]])
    X_test_arr = X_test_vec.toarray().astype(float)

    # Class predicted by the classifier for exactly the array that SHAP explains.
    # NOTE(review): the pipeline-level prediction is made on the transformer's raw
    # output, while this one uses the dense float copy; confirm both always agree.
    pred_class = clf.predict(X_test_arr)[0]

    # Compute SHAP values for that one sample.
    sv = explainer.shap_values(X_test_arr, check_additivity=False)
    # Assumes sv is a single array of shape (1, n_features, n_classes) — TODO
    # confirm for the installed shap version.
    vals = sv[0, :, pred_class]

    # Build (feature_name, shap_value) pairs and sort by absolute impact.
    shap_contribs = sorted(
        zip(feature_names, vals),
        key=lambda pair: abs(pair[1]),
        reverse=True,
    )

    # Humanize the top-5 and store the English sentences. The class named in the
    # sentence is pred_class, i.e. the class whose SHAP values were sliced above;
    # it used to be looked up from another cell's predictions through a manual
    # counter, which could name a different class than the one explained.
    frases = []
    for feat, val in shap_contribs[:5]:
        desc  = humanize_feature(feat)
        signo = "positively" if val>0 else "negatively"
        frases.append(
            f'The presence of "{desc}" influence {signo} the probability of the {labels[pred_class]} class'
        )

    all_frases.append(frases)


In [66]:
# Display the generated explanation sentences (one inner list per sampled row).
all_frases

[['The presence of "proportion of mostly false statements" influence negatively the probability of the Mostly-true class',
  'The presence of "the character token "ill" in the speaker description" influence positively the probability of the Mostly-true class',
  'The presence of "the character token "tax" in the statement" influence positively the probability of the Mostly-true class',
  'The presence of "the character token "ica" in the justification" influence negatively the probability of the Mostly-true class',
  'The presence of "the character token " sta" in the justification" influence negatively the probability of the Mostly-true class'],
 ['The presence of "proportion of mostly false statements" influence positively the probability of the False class',
  'The presence of "the character token "ica" in the justification" influence negatively the probability of the False class',
  'The presence of "the character token " sta" in the justification" influence negatively the probabil

In [62]:
# Hard-coded source URL for each sampled statement. The list is assigned
# positionally to df_noticias['url'], so it needs one entry per sampled row, in
# the same order as the rows of df_stratified.
# NOTE(review): changing the sample (size or random_state) would silently
# misalign these entries with the statements — confirm before re-sampling.
urlList = [
'http://www.gohart.org/whytransit/rapid_transit_solutions.pdf',
'https://www.instagram.com/p/CkuwF1BuNWI/',
"https://www.politifact.com/factchecks/2020/mar/09/joe-biden/did-bernie-sanders-vote-against-brady-bill-five-ti/",
"https://www.facebook.com/jason.j.britt/videos/1332903936746386/",
"https://www.factcheck.org/2020/09/bidens-false-and-exaggerated-supreme-court-claims/",
"https://www.youtube.com/watch?v=9xp15zEcWUE&ab_channel=RoyBlunt",
"https://x.com/realDonaldTrump/status/914269704440737792",
"https://www.tampabay.com/archive/2011/02/18/michele-bachmann-says-the-government-will-buy-you-a-breast-pump-for-your-baby/",
"https://www.washingtonpost.com/politics/2022/10/03/ian-evacuation-lee-county-desantis-fema/",
"http://www.foxnews.com/story/0,2933,542384,00.html",
"https://www.m2sys.com/blog/biometric-resources/the-un-plans-to-implant-everyone-with-a-biometric-id-by-2030/",
"https://www.theguardian.com/world/2023/apr/13/christians-are-in-danger-under-israeli-government-says-holy-land-patriarch",
"https://www.breitbart.com/the-media/2014/01/28/nbc-andrea-mitchell-iran-an-american-ally-in-war-on-terror-until-bush-blew-it/",
"https://www.nytimes.com/2011/07/24/opinion/sunday/24sun4.html?_r=1",
"https://www.governor.ny.gov/news/video-rush-transcript-governor-hochul-guest-cnn-news-central"
]

# Hard-coded illustration image URL for each sampled statement; assigned
# positionally to df_noticias['image'], so the same ordering constraint applies.
imageList = [
'https://www.floridaforboomers.com/wp-content/uploads/2023/05/On-the-Water-in-Naples-Florida-1024x576.jpg',
'https://pbs.twimg.com/media/FhIKeSKXEAAQvNW?format=jpg&name=medium',
"https://static01.nyt.com/images/2020/02/25/us/politics/25breakout-sanders-guns/25breakout-sanders-guns-videoSixteenByNineJumbo1600.jpg",
"https://www.100daysinappalachia.com/wp-content/uploads/2019/03/0331_Screen-Shot-2019-03-18-at-8.13.48-AM-1024x538.png",
"https://ncnewsline.com/wp-content/uploads/2024/11/Riggs-Griffin-900.jpg",
"https://i.ytimg.com/vi/_5IsG7l7IXM/maxresdefault.jpg",
"https://static01.nyt.com/images/2017/12/14/us/politics/14dc-factcheck/merlin_127757144_06d0575b-61ae-4f75-b951-8388f11c5dac-articleLarge.jpg?quality=75&auto=webp&disable=upscale",
"https://cdnph.upi.com/sv/ph/og/upi/19971300229673/2011/1/406e4f78f16a30639665a2f710288a2e/v1.5/Bachmann-says-media-overplays-her-gaffes.jpg",
"https://www.tallahassee.com/gcdn/presto/2022/10/05/PTAL/90e56b62-4abc-46ce-9435-641256ec35aa-Hurricane_Cone_at_11.jpg?width=897&height=736&fit=crop&format=pjpg&auto=webp",
"https://www.gao.gov/assets/extracts/a5436ee7d20cf3ec8e741f872e346440/rId17_image3.png",
"https://www.un.org/sites/un2.un.org/files/world-leaders-adopt-pivotal-un-pact-for-the-future.jpg",
"https://images.csmonitor.com/csm/2023/04/0414%20OJLMCHRISTIANS%20crosses.jpg?alias=standard_900x600",
"https://media.breitbart.com/media/cdn/mediaserver/Breitbart/Big-Journalism/2012/MSNBC/Andrea%20Mitchell.jpg",
"https://i.ytimg.com/vi/K0UXnARjd3g/sddefault.jpg",
"https://static.politifact.com/CACHE/images/politifact/photos/HochulSuffolk050525/4780ba46f548ba4c7585329b3ae45d78.jpg"
]

In [67]:
# One row per Noticia (from its instance dictionary), then overwrite the
# placeholder columns with the computed values. Every sequence is matched to the
# rows purely by position.
df_noticias = pd.DataFrame([vars(item) for item in noticias]).assign(
    veracity=mapped,
    explanation=all_frases,
    image=imageList,
    url=urlList,
)

# Preview the assembled table.
df_noticias.head(20)

Unnamed: 0,title,source,date,veracity,image,url,explanation
0,The addition of a 1 percent sales tax would co...,A brochure,2010-06-18,60,https://www.floridaforboomers.com/wp-content/u...,http://www.gohart.org/whytransit/rapid_transit...,"[The presence of ""proportion of mostly false s..."
1,"The Michigan attorney general's race was ""stol...",An instagram post,2022-11-09,5,https://pbs.twimg.com/media/FhIKeSKXEAAQvNW?fo...,https://www.instagram.com/p/CkuwF1BuNWI/,"[The presence of ""proportion of mostly false s..."
2,Sen. Sanders did vote five times against the B...,The first democratic debate,2015-10-13,85,https://static01.nyt.com/images/2020/02/25/us/...,https://www.politifact.com/factchecks/2020/mar...,"[The presence of ""proportion of mostly false s..."
3,"Right here (in West Virginia), they are buildi...",Posts on the internet,2019-03-05,5,https://www.100daysinappalachia.com/wp-content...,https://www.facebook.com/jason.j.britt/videos/...,"[The presence of ""proportion of mostly false s..."
4,There's no court session between now and the e...,Philadelphia,2020-09-20,85,https://ncnewsline.com/wp-content/uploads/2024...,https://www.factcheck.org/2020/09/bidens-false...,"[The presence of ""proportion of mostly false s..."
5,The proposal by President Joe Biden would put ...,"Remarks on ""fox news sunday""",2021-04-04,35,https://i.ytimg.com/vi/_5IsG7l7IXM/maxresdefau...,https://www.youtube.com/watch?v=9xp15zEcWUE&ab...,"[The presence of ""proportion of mostly false s..."
6,"In the Alabama Senate runoff, ""the candidate I...",A tweet,2017-09-30,5,https://static01.nyt.com/images/2017/12/14/us/...,https://x.com/realDonaldTrump/status/914269704...,"[The presence of ""proportion of mostly false s..."
7,"The government will ""go out and buy my breast ...","An interview on ""the laura ingraham show""",2011-02-15,5,https://cdnph.upi.com/sv/ph/og/upi/19971300229...,https://www.tampabay.com/archive/2011/02/18/mi...,"[The presence of ""proportion of mostly false s..."
8,"Lee County, Florida, wasn't ""even in the cone""...",An interview with cnn,2022-10-02,25,https://www.tallahassee.com/gcdn/presto/2022/1...,https://www.washingtonpost.com/politics/2022/1...,"[The presence of ""proportion of mostly false s..."
9,Medicare has at least $80 billion worth of fra...,Fox news channel's on the record with greta va...,2009-08-24,60,https://www.gao.gov/assets/extracts/a5436ee7d2...,"http://www.foxnews.com/story/0,2933,542384,00....","[The presence of ""proportion of mostly false s..."


In [68]:
import json

# Serialise the table as a JSON array with one object per row; non-ASCII
# characters are written as-is instead of being escaped.
json_str = df_noticias.to_json(orient='records', force_ascii=False, date_format='iso')

print(json_str)

# Persist the same payload as UTF-8 text.
with open('noticias.json', mode='w', encoding='utf-8') as f:
    f.write(json_str)

[{"title":"The addition of a 1 percent sales tax would cost a typical household in Hillsborough County approximately $12 a month.","source":"A brochure","date":"2010-06-18","veracity":60,"image":"https:\/\/www.floridaforboomers.com\/wp-content\/uploads\/2023\/05\/On-the-Water-in-Naples-Florida-1024x576.jpg","url":"http:\/\/www.gohart.org\/whytransit\/rapid_transit_solutions.pdf","explanation":["The presence of \"proportion of mostly false statements\" influence negatively the probability of the Mostly-true class","The presence of \"the character token \"ill\" in the speaker description\" influence positively the probability of the Mostly-true class","The presence of \"the character token \"tax\" in the statement\" influence positively the probability of the Mostly-true class","The presence of \"the character token \"ica\" in the justification\" influence negatively the probability of the Mostly-true class","The presence of \"the character token \" sta\" in the justification\" influence