In [1]:
import pandas as pd

# Load Liar dataset
liar_cols = ["id", "label", "statement"]
liar_path_test = "test.tsv"
liar_df_test = pd.read_csv(liar_path_test, sep='\t', header=None, usecols=[0, 1, 2], names=liar_cols)
liar_path_train = "train.tsv"
liar_df_train = pd.read_csv(liar_path_train, sep='\t', header=None, usecols=[0, 1, 2], names=liar_cols)
liar_path_valid = "valid.tsv"
liar_df_valid = pd.read_csv(liar_path_valid, sep='\t', header=None, usecols=[0, 1, 2], names=liar_cols)

In [2]:
# Load FakeNewsNet dataset
#fnn_cols = ["id","news_url", "title", "tweet_id"]
fnn_cols = ["id","statement",]

fnn_path_gossip_fake = "gossipcop_fake.csv"
fnn_df_gossip_fake = pd.read_csv(fnn_path_gossip_fake, usecols=[0, 2], names=fnn_cols, header = 0)
fnn_df_gossip_fake["label"] = "false"

fnn_path_gossip_real = "gossipcop_real.csv"
fnn_df_gossip_real = pd.read_csv(fnn_path_gossip_real, usecols=[0,2], names=fnn_cols,  header = 0)
fnn_df_gossip_real["label"] = "true"

fnn_path_polit_fake = "politifact_fake.csv"
fnn_df_polit_fake = pd.read_csv(fnn_path_polit_fake, usecols=[0, 2], names=fnn_cols,  header = 0)
fnn_df_polit_fake["label"] = "false"

fnn_path_polit_real = "politifact_real.csv"
fnn_df_polit_real = pd.read_csv(fnn_path_polit_real, usecols=[0, 2], names=fnn_cols,  header = 0)
fnn_df_polit_real["label"] = "true"
combined_liar = pd.concat([liar_df_test, liar_df_train, liar_df_valid])
# Removing rows with missing values
combined_liar.dropna(inplace=True)
# Converting labels to numerical values
#I hate this! We lose so much information!
combined_liar['label'] = combined_liar['label'].map({'pants-fire': 0, 'false': 0, 'barely-true': 0, 'half-true': 1, 'mostly-true': 1, 'true': 1})


In [3]:
combined_fnn  = pd.concat([fnn_df_gossip_fake, fnn_df_gossip_real, fnn_df_polit_fake, fnn_df_polit_real])
# Removing rows with missing values
combined_fnn.dropna(inplace=True)
# Converting labels to numerical values
combined_fnn['label'] = combined_fnn['label'].map({'false': 0, 'true': 1})

In [4]:
combined_liar_fnn =pd.concat([combined_liar[["statement","label"]], combined_fnn])

In [5]:
import string
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords

# Define a function to preprocess text data
def preprocess_text(text):
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation+'-–‘’“”' ))
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = nltk.word_tokenize(text)
    filtered_tokens = [word for word in tokens if word not in stop_words]
    text = ' '.join(filtered_tokens)
    
    return text

# Preprocess the text data
combined_liar_fnn['statement'] = combined_liar_fnn['statement'].apply(preprocess_text)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [6]:
import mlxtend
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
import pandas as pd

# Example dataset of news articles with keywords
dataset = combined_liar_fnn
dataset = dataset[['statement','label']]

In [7]:
#%%%%%%%%%%% All data %%%%%%%%%%%
# Figs. 2 and 3
All = dataset

All_s = All[['statement']]

# Step 1: Load sentences
sentences_All = All_s['statement'].tolist()
# Preprocessing steps here, e.g., tokenization, stopword removal, etc.

# Step 2: Convert sentences to transactions
transactions_All = []
for sentence in sentences_All:
    transaction_All = sentence.split()  # Split sentence into items (words)
    transactions_All.append(transaction_All)

# Step 3: Create transaction dataset
te_All = TransactionEncoder()
te_ary_All = te_All.fit_transform(transactions_All)
transaction_df_All = pd.DataFrame(te_ary_All, columns=te_All.columns_)

In [8]:
# Support guide calculation
sentences = transactions_All
word1 = "kardashian"
word2 = "kim"

count = 0
for sentence in sentences:
    if word1 in sentence and word2 in sentence:
        count += 1

print(f"The words '{word1}' and '{word2}' appear together in {count} sentences.")


The words 'kardashian' and 'kim' appear together in 419 sentences.


In [9]:
print(transaction_df_All.shape)

(30554, 25958)


In [10]:
#%% Step 4: Apply Apriori algorithm
frequent_itemsets_All = apriori(transaction_df_All, min_support=0.004, use_colnames=True)

In [11]:
print(frequent_itemsets_All.shape)

(501, 2)


In [12]:
rules_All = association_rules(frequent_itemsets_All, metric="confidence", min_threshold=0.9)

In [None]:
selected_columns = ['antecedents', 'consequents', 'support', 'confidence']
print(rules_All.loc[:, selected_columns])

In [14]:
antecedents_set_All = set(rules_All['antecedents'].tolist())
consequents_set_All = set(rules_All['consequents'].tolist())
combined_set_All = antecedents_set_All | consequents_set_All

words_All = [list(word_set_All)[0] for word_set_All in combined_set_All]
unique_words_All = set(words_All)
print(unique_words_All)

{'carpet', 'new', 'pitt', 'theroux', 'gomez', 'taylor', 'gwen', 'stefani', 'states', 'jolie', 'brad', 'katie', 'swift', 'united', 'justin', 'aniston', 'blake', 'bieber', 'markle', 'york', 'jennifer', 'selena', 'prince', 'meghan', 'holmes', 'red', 'shelton', 'lopez', 'angelina'}


In [15]:
# Sort the rules based on support in descending order
sorted_rules = rules_All.sort_values('confidence', ascending=False)

# Print the top 10 rules with highest support
print(sorted_rules.head(20))

                antecedents       consequents  antecedent support  \
12                   (york)             (new)            0.004615   
10                (theroux)          (justin)            0.004058   
22         (stefani, blake)            (gwen)            0.004353   
20          (gwen, shelton)           (blake)            0.004026   
24          (gomez, justin)          (selena)            0.005237   
4                    (pitt)            (brad)            0.012568   
16         (pitt, angelina)            (brad)            0.005924   
3                 (shelton)           (blake)            0.005728   
27         (markle, prince)          (meghan)            0.005531   
6                   (gomez)          (selena)            0.010506   
7                 (stefani)            (gwen)            0.005204   
11                 (markle)          (meghan)            0.010146   
25          (markle, harry)          (meghan)            0.005073   
32  (markle, harry, prince)       

In [None]:
# Set max_rows to display all the results
pd.options.display.max_rows = len(rules_All)

# Print the association rules dataframe
print(rules_All)

In [None]:
#%%%%%%%%%%% Get only real news %%%%%%%%%%%
# Fig. 4 and Table 4
# Example dataset of news articles with keywords
dataset = combined_liar_fnn
dataset = dataset[['statement','label']]

onlyreal_l = dataset[dataset['label'] == 1]
onlyreal_s = onlyreal_l[['statement']]

# Step 1: Load sentences
sentences = onlyreal_s['statement'].tolist()
# Preprocessing steps here, e.g., tokenization, stopword removal, etc.

# Step 2: Convert sentences to transactions
transactions = []
for sentence in sentences:
    transaction = sentence.split()  # Split sentence into items (words)
    transactions.append(transaction)

# Step 3: Create transaction dataset
te = TransactionEncoder()
te_ary = te.fit_transform(transactions)
transaction_df = pd.DataFrame(te_ary, columns=te.columns_)

In [None]:
print(transaction_df.shape)

(24575, 23389)


In [None]:
#%% Step 4: Apply Apriori algorithm
frequent_itemsets = apriori(transaction_df, min_support=0.004, use_colnames=True)
# 0.004 ~ occur at least 100 (exacly 98.3) times out of a total of 24575 transactions

In [None]:
print(frequent_itemsets.shape)

(421, 2)


In [None]:
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.9)
selected_columns = ['antecedents', 'consequents', 'support', 'confidence']
print(rules.loc[:, selected_columns])

                antecedents       consequents   support  confidence
0                  (carpet)             (red)  0.008057    0.994975
1                   (gomez)          (selena)  0.006307    0.993590
2              (housewives)            (real)  0.004924    0.952756
3                  (markle)          (meghan)  0.009237    0.991266
4                    (york)             (new)  0.005005    1.000000
5                  (united)          (states)  0.009766    0.952381
6                   (swift)          (taylor)  0.006633    0.964497
7           (harry, markle)          (meghan)  0.004720    1.000000
8           (harry, markle)          (prince)  0.004639    0.982759
9           (harry, meghan)          (prince)  0.005738    0.940000
10         (markle, prince)          (meghan)  0.005249    0.992308
11  (harry, markle, meghan)          (prince)  0.004639    0.982759
12  (harry, markle, prince)          (meghan)  0.004639    1.000000
13          (harry, markle)  (meghan, prince)  0

In [None]:
# Set max_rows to display all the results
pd.options.display.max_rows = len(rules)

# Print the association rules dataframe
print(rules)

In [None]:
antecedents_set = set(rules['antecedents'].tolist())
consequents_set = set(rules['consequents'].tolist())
combined_set = antecedents_set | consequents_set

words = [list(word_set)[0] for word_set in combined_set]
unique_words = set(words)
print(unique_words)

{'united', 'gomez', 'real', 'harry', 'prince', 'taylor', 'carpet', 'selena', 'meghan', 'new', 'states', 'swift', 'york', 'red', 'markle', 'housewives'}


In [None]:
# Sort the rules based on support in descending order
sorted_rules = rules.sort_values('confidence', ascending=False)

# Print the top 10 rules with highest support
print(sorted_rules.head(20))

In [None]:
#%%%%%%%%%%% Get only fake news %%%%%%%%%%%
# Fig. 5

onlyfake_l = dataset[dataset['label'] == 0]
onlyfake_n = onlyfake_l[['statement']]

# Step 1: Load sentences
sentences_n = onlyfake_n['statement'].tolist()
# Preprocessing steps here, e.g., tokenization, stopword removal, etc.

# Step 2: Convert sentences to transactions
transactions_n = []
for sentence_n in sentences_n:
    transaction_n = sentence_n.split()  # Split sentence into items (words)
    transactions_n.append(transaction_n)

# Step 3: Create transaction dataset
te_n = TransactionEncoder()
te_ary_n = te_n.fit_transform(transactions_n)
transaction_df_n = pd.DataFrame(te_ary_n, columns=te_n.columns_)

In [None]:
frequent_itemsets_n = apriori(transaction_df_n, min_support=0.002, use_colnames=True)

In [None]:
print(frequent_itemsets_n.shape)

(1576, 2)


In [None]:
rules_n = association_rules(frequent_itemsets_n, metric="confidence", min_threshold=0.9)
selected_columns = ['antecedents', 'consequents', 'support', 'confidence']
print(rules_n.loc[:, selected_columns])

In [None]:
# Set max_rows to display all the results
pd.options.display.max_rows = len(rules_n)

# Print the association rules dataframe
print(rules_n)

In [None]:
antecedents_set_n = set(rules_n['antecedents'].tolist())
consequents_set_n = set(rules_n['consequents'].tolist())
combined_set_n = antecedents_set_n | consequents_set_n

words_n = [list(word_set_n)[0] for word_set_n in combined_set_n]
unique_words_n = set(words_n)
print(unique_words_n)

{'nicole', 'jennifer', 'garner', 'gwen', 'scott', 'justin', 'jenner', 'kate', 'disick', 'health', 'ben', 'kendall', 'bieber', 'jamie', 'travis', 'pitt', 'stefani', 'selena', 'affleck', 'kidman', 'gomez', 'cyrus', 'holmes', 'swift', 'katie', 'harry', 'perry', 'kanye', 'meghan', 'robert', 'supreme', 'court', 'stewart', 'care', 'angelina', 'middleton', 'kim', 'united', 'lopez', 'jolie', 'kristen', 'aniston', 'states', 'walker', 'blake', 'foxx', 'west', 'prince', 'rodriguez', 'katy', 'brad', 'markle', 'theroux', 'shelton', 'pattinson', 'law', 'miley', 'kylie', 'taylor', 'kris', 'alex'}


In [None]:
# Sort the rules based on support in descending order
sorted_rules = rules_n.sort_values('confidence', ascending=False)

# Print the top 10 rules with highest support
print(sorted_rules.head(20))