In [1]:
import pickle
import pandas as pd
import os
import matplotlib.pyplot as plt
import tldextract

In [6]:
col_names_mentions = ["GlobalEventID","EventTimeDate","MentionTimeDate","MentionType",
        "MentionSourceName","MentionIdentifier","SentenceID","Actor1CharOffset","Actor2CharOffset",
        "ActionCharOffset","InRawText","Confidence","MentionDocLen","MentionDocTone",
        "SRCLC", "ENG"] # mention columns

col_names_events = ["GlobalEventID","Day","MonthYear","Year","FractionDate",
                            "Actor1Code","Actor1Name","Actor1CountryCode","Actor1KnownGroupCode","Actor1EthnicCode",
                            "Actor1Religion1Code","Actor1Religion2Code","Actor1Type1Code","Actor1Type2Code","Actor1Type3Code",
                            "Actor2Code","Actor2Name","Actor2CountryCode","Actor2KnownGroupCode","Actor2EthnicCode",
                            "Actor2Religion1Code","Actor2Religion2Code","Actor2Type1Code","Actor2Type2Code","Actor2Type3Code",
                            "IsRootEvent","EventCode","EventBaseCode","EventRootCode","QuadClass",
                            "GoldsteinScale","NumMentions","NumSources","NumArticles","AvgTone",
                            "Actor1Geo_Type","Actor1Geo_Fullname","Actor1Geo_CountryCode","Actor1Geo_ADM1Code","Actor1Geo_ADM2Code",
                            "Actor1Geo_Lat","Actor1Geo_Long","Actor1Geo_FeatureID","Actor2Geo_Type","Actor2Geo_Fullname",
                            "Actor2Geo_CountryCode","Actor2Geo_ADM1Code","Actor2Geo_ADM2Code","Actor2Geo_Lat","Actor2Geo_Long",
                            "Actor2Geo_FeatureID","ActionGeo_Type","ActionGeo_Fullname","ActionGeo_CountryCode","ActionGeo_ADM1Code",
                            "ActionGeo_ADM2Code","ActionGeo_Lat","ActionGeo_Long","ActionGeo_FeatureID","DATEADDED",
                            "SOURCEURL"] # event columns

with open("urls_to_download_fr.pkl", 'rb') as file: # label of articles (0 if fake or not present in MBFS)
    df_article = pickle.load(file)
    
with open("labeled sources.pkl", 'rb') as file: # label of sources (0 if fake or not present in MBFS)
    df_sources = pickle.load(file)
    
with open("mixt_labeled_sources.pkl", 'rb') as file: # label of sources (nan if not present in MBFS)
    df_sources_mixte = pickle.load(file)
    
with open("mixt_labeled_articles.pkl", 'rb') as file: # label of articles (nan if not present in MBFS)
    df_articles_mixte = pickle.load(file)

MBFS = pd.read_csv("../mediabiasfactcheck/mediabiasfactcheck.csv")

df_test = pd.read_csv("gdelt_data/20231031214500.mentions.CSV", delimiter='\t', names=col_names_mentions)

df_test_event = pd.read_csv("gdelt_data_event/20231001000000.export.CSV", delimiter='\t', names=col_names_events)


In [8]:
pd.set_option('display.max_columns', None)

In [None]:
#TODO

# Data Analysis
    # number of articles by source and compare this stat with the fact that they could be fake
    # count the number of labelized elements in each relevant label dataset
    # number of csv mentions/events in october 2023
    # number of csv events in october 2023
    # number of articles/sources in total
    # number of nans in each columns
    # look at sources type in mention and evaluate impact
    
    
# Data Processing
    # remove irrelevant columns/nans?
    # put them in the correct format for graph


## Data Analysis

In [5]:
df_sources_mixte["is_fake"].value_counts(dropna = False) /len(df_sources_mixte["is_fake"])

is_fake
NaN    0.920925
0.0    0.076651
1.0    0.002423
Name: count, dtype: float64

In [7]:
df_articles_mixte["is_fake"].value_counts(dropna = False)/len(df_articles_mixte["is_fake"])

is_fake
NaN    0.744008
0.0    0.248798
1.0    0.007194
Name: count, dtype: float64

In [27]:
MBFS.isna().mean()

url                    0.043817
name                   0.000269
image_pseudoscience    0.900000
image_factual          0.081720
image_conspiracy       0.908333
image_bias             0.149462
freedom_rank           0.593817
country                0.546237
dtype: float64

In [56]:
df_sources_mixte[df_sources_mixte["is_fake"] == 1.0].head()

Unnamed: 0,links,is_fake
15,infiniteunknown.net,1.0
54,sott.net,1.0
102,dailymail.co.uk,1.0
131,wordpress.com,1.0
163,oann.com,1.0


In [104]:
df_articles_mixte[df_articles_mixte["links"].str.contains("oann.com")]

Unnamed: 0,links,is_fake
341,https://www.oann.com/newsroom/death-toll-rises...,1.0
688,https://www.oann.com/newsroom/man-accused-of-k...,1.0
29234,https://www.oann.com/newsroom/dem-rep-jamaal-b...,1.0
29364,https://www.oann.com/newsroom/girl-9-disappear...,1.0
36554,https://www.oann.com/newsroom/breonna-taylors-...,1.0
...,...,...
1908385,https://www.oann.com/newsroom/israeli-military...,1.0
1908605,https://www.oann.com/business/us-seeks-to-bloc...,1.0
1925791,https://www.oann.com/newsroom/person-of-intere...,1.0
1926135,https://www.oann.com/newsroom/fbi-chief-christ...,1.0


In [97]:
MBFS = MBFS.dropna(subset=['url'])
MBFS[MBFS["url"].str.contains("oann.com")]


Unnamed: 0,url,name,image_pseudoscience,image_factual,image_conspiracy,image_bias,freedom_rank,country
468,https://www.oann.com,One America News Network (OAN),,MBFCLow.png,,right.?.?.?.png,,


In [3]:
csv_count = 0
total = 0
nan = 0

for root, _, filenames in os.walk("gdelt_data"):
    for filename in filenames:
        if filename.endswith(".CSV"):
            csv_count += 1
            temp = pd.read_csv(os.path.join(root, filename), delimiter='\t', names=col_names_mentions)
            nan += temp.isna().mean()
            total+=len(temp)

print("csv_count for mentions: "+str(csv_count))
print("total number of lines for mentions: "+str(total))
print("percentage of nan for each column: "+str(nan/csv_count))

csv_count for mentions: 2974
total number of lines for mentions: 14485980
percentage of nan for each column: GlobalEventID        0.000000e+00
EventTimeDate        0.000000e+00
MentionTimeDate      0.000000e+00
MentionType          0.000000e+00
MentionSourceName    8.460116e-07
MentionIdentifier    0.000000e+00
SentenceID           0.000000e+00
Actor1CharOffset     0.000000e+00
Actor2CharOffset     0.000000e+00
ActionCharOffset     0.000000e+00
InRawText            0.000000e+00
Confidence           0.000000e+00
MentionDocLen        0.000000e+00
MentionDocTone       0.000000e+00
SRCLC                1.000000e+00
ENG                  1.000000e+00
dtype: float64


In [43]:
csv_count = 0
total = 0
nan = 0

for root, _, filenames in os.walk("gdelt_data_event"):
    for filename in filenames:
        if filename.endswith(".CSV"):
            csv_count += 1
            temp = pd.read_csv(os.path.join(root, filename), delimiter='\t', names=col_names_events) 
            total+=len(temp)
            nan += temp.isna().mean()


print("csv_count for events: "+str(csv_count))
print("total number of lines for events: "+str(total))
print("percentage of nan for each column: "+str(nan/csv_count))

csv_count for events: 2974
total number of lines for events: 4606957
percentage of nan for each column: GlobalEventID          0.000000
Day                    0.000000
MonthYear              0.000000
Year                   0.000000
FractionDate           0.000000
                         ...   
ActionGeo_Lat          0.024224
ActionGeo_Long         0.024117
ActionGeo_FeatureID    0.023964
DATEADDED              0.000000
SOURCEURL              0.000000
Length: 61, dtype: float64


In [4]:
value_count = 0
csv_count = 0

for root, _, filenames in os.walk("gdelt_data"):
    for filename in filenames:
        if filename.endswith(".CSV"):
            csv_count += 1
            temp = pd.read_csv(os.path.join(root, filename), delimiter='\t', names=col_names_mentions)
            value_count += temp["MentionType"].value_counts()
            if len(temp["MentionType"].value_counts())>1:
                print("omg")

            
print(value_count)


MentionType
1    14485980
Name: count, dtype: int64


## PreProcessing -> put in graph format