In [1]:
import pandas as pd
import mlxtend
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
# Never truncate long cell values (e.g. multi-word itemsets) when frames are printed.
pd.options.display.max_colwidth = None

# Path to the dataset CSV; being relative, it is resolved against the working directory.
dataset_filepath = '../../dataset/dataset.csv'
dataset = pd.read_csv(filepath_or_buffer=dataset_filepath)


In [2]:
# Figs. 2 and 3
# Build the one-hot transaction matrix from every statement in the dataset.
All = dataset
All_s = All[['statement']]

# Step 1: pull the statements out as a plain Python list.
# No further preprocessing (stopword removal, etc.) is applied in this cell.
sentences_All = All_s['statement'].tolist()

# Step 2: one transaction per statement, its items being the whitespace-separated tokens.
# NOTE(review): str() would turn a missing statement (NaN) into the token 'nan' —
# confirm the 'statement' column has no nulls.
transactions_All = [str(sentence).split() for sentence in sentences_All]

# Step 3: one-hot encode the transactions (one boolean column per distinct token).
te_All = TransactionEncoder()
te_ary_All = te_All.fit(transactions_All).transform(transactions_All)
transaction_df_All = pd.DataFrame(data=te_ary_All, columns=te_All.columns_)

In [3]:
# Support sanity check: count the transactions that contain both words.
# Dividing this count by the number of transactions gives the pair's support.
sentences = transactions_All
word1 = "kardashian"
word2 = "kim"

# Each transaction is a list of tokens, so `in` is an exact-token membership test.
count = sum(1 for tokens in sentences if word1 in tokens and word2 in tokens)

print(f"The words '{word1}' and '{word2}' appear together in {count} sentences.")


The words 'kardashian' and 'kim' appear together in 507 sentences.


In [4]:
# Dimensions of the one-hot matrix: (number of transactions, number of distinct tokens).
print(transaction_df_All.shape)

(35987, 27884)


In [5]:
# Mine the itemsets that occur in at least 0.4% of all transactions.
# use_colnames=True keeps the words themselves (not column indices) in the result.
frequent_itemsets_All = apriori(transaction_df_All, min_support=0.004, use_colnames=True)

In [6]:
# Shape of the result frame; the first entry is the number of frequent itemsets found.
print(frequent_itemsets_All.shape)

(482, 2)


In [7]:
# Derive association rules from the frequent itemsets, keeping those with confidence >= 0.9.
rules_All = association_rules(frequent_itemsets_All, metric="confidence", min_threshold=0.9)

In [8]:
# Show only the headline columns of every rule mined from the full dataset.
selected_columns = ['antecedents', 'consequents', 'support', 'confidence']
print(rules_All[selected_columns])

                antecedents       consequents   support  confidence
0                   (jolie)        (angelina)  0.007892    0.965986
1                 (aniston)        (jennifer)  0.007864    0.965870
2                  (bieber)          (justin)  0.007419    0.978022
3                 (shelton)           (blake)  0.005307    0.989637
4                    (pitt)            (brad)  0.010976    0.994962
5                  (carpet)             (red)  0.005780    0.985782
6                  (disick)           (scott)  0.004085    0.960784
7                   (gomez)          (selena)  0.010198    0.991892
8                 (stefani)            (gwen)  0.004752    0.994186
9                   (lopez)        (jennifer)  0.004641    0.954286
10                   (katy)           (perry)  0.004113    0.902439
11                 (markle)          (meghan)  0.010754    0.992308
12                   (york)             (new)  0.004585    1.000000
13                 (united)          (states)  0

In [9]:
# Collect every distinct word that occurs on either side of any rule.
# Each antecedent/consequent is a frozenset of words, so the sets below hold
# the distinct itemsets, not yet the individual words.
antecedents_set_All = set(rules_All['antecedents'].tolist())
consequents_set_All = set(rules_All['consequents'].tolist())
combined_set_All = antecedents_set_All | consequents_set_All

# Flatten every itemset completely. Taking only list(itemset)[0] kept a single,
# arbitrarily chosen member of multi-word itemsets such as (harry, markle), so
# words could be dropped and the result could differ from run to run.
words_All = [word for word_set_All in combined_set_All for word in word_set_All]
unique_words_All = set(words_All)
print(unique_words_All)

{'united', 'stefani', 'meghan', 'blake', 'york', 'gwen', 'new', 'selena', 'gomez', 'shelton', 'jennifer', 'red', 'brad', 'carpet', 'swift', 'kim', 'justin', 'bieber', 'prince', 'states', 'disick', 'jolie', 'taylor', 'pitt', 'west', 'lopez', 'markle', 'perry', 'angelina', 'scott', 'katy', 'harry', 'aniston'}


In [10]:
# Sort the rules by confidence in descending order
sorted_rules = rules_All.sort_values('confidence', ascending=False)

# Print the 20 rules with the highest confidence
print(sorted_rules.head(20))

                antecedents       consequents  antecedent support   
21          (justin, gomez)          (selena)            0.004807  \
12                   (york)             (new)            0.004585   
4                    (pitt)            (brad)            0.011032   
22          (harry, markle)          (meghan)            0.005419   
31  (harry, prince, markle)          (meghan)            0.005280   
16         (pitt, angelina)            (brad)            0.005224   
8                 (stefani)            (gwen)            0.004780   
20            (pitt, jolie)            (brad)            0.004252   
18            (pitt, jolie)        (angelina)            0.004252   
27  (pitt, angelina, jolie)            (brad)            0.004224   
26      (pitt, brad, jolie)        (angelina)            0.004224   
11                 (markle)          (meghan)            0.010837   
7                   (gomez)          (selena)            0.010281   
25         (prince, markle)       

In [11]:
# Print the complete association-rules dataframe.
# The row limit is lifted only for this print: assigning
# pd.options.display.max_rows globally would persist for the rest of the
# kernel session and silently change how later cells display their frames.
with pd.option_context('display.max_rows', None):
    print(rules_All)


                antecedents       consequents  antecedent support   
0                   (jolie)        (angelina)            0.008170  \
1                 (aniston)        (jennifer)            0.008142   
2                  (bieber)          (justin)            0.007586   
3                 (shelton)           (blake)            0.005363   
4                    (pitt)            (brad)            0.011032   
5                  (carpet)             (red)            0.005863   
6                  (disick)           (scott)            0.004252   
7                   (gomez)          (selena)            0.010281   
8                 (stefani)            (gwen)            0.004780   
9                   (lopez)        (jennifer)            0.004863   
10                   (katy)           (perry)            0.004557   
11                 (markle)          (meghan)            0.010837   
12                   (york)             (new)            0.004585   
13                 (united)       

In [12]:
# Fig. 4 and Table 4
# Build the one-hot transaction matrix from the label == 1 statements only
# (the "real" subset going by the variable names — TODO confirm the label encoding).
onlyreal_l = dataset.loc[dataset['label'] == 1]
onlyreal_s = onlyreal_l[['statement']]

# Step 1: pull the statements out as a plain Python list.
# No further preprocessing (stopword removal, etc.) is applied in this cell.
sentences = onlyreal_s['statement'].tolist()

# Step 2: one transaction per statement, its items being the whitespace-separated tokens.
# NOTE(review): str() would turn a missing statement (NaN) into the token 'nan' —
# confirm the 'statement' column has no nulls.
transactions = [str(statement).split() for statement in sentences]

# Step 3: one-hot encode the transactions (one boolean column per distinct token).
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
transaction_df = pd.DataFrame(data=te_ary, columns=te.columns_)

In [13]:
# Dimensions of the one-hot matrix: (number of transactions, number of distinct tokens).
print(transaction_df.shape)

(24575, 23390)


In [14]:
# Mine the itemsets that occur in at least 0.4% of the transactions.
# use_colnames=True keeps the words themselves (not column indices) in the result.
frequent_itemsets = apriori(transaction_df, min_support=0.004, use_colnames=True)
# With 24575 transactions, min_support=0.004 corresponds to 0.004 * 24575 = 98.3,
# i.e. an itemset must appear in roughly 100 (at least 99) transactions.

In [15]:
# Shape of the result frame; the first entry is the number of frequent itemsets found.
print(frequent_itemsets.shape)

(421, 2)


In [16]:
# Rules with confidence >= 0.9 derived from the frequent itemsets, shown with
# only their headline columns.
rules = association_rules(frequent_itemsets, min_threshold=0.9, metric="confidence")
selected_columns = ['antecedents', 'consequents', 'support', 'confidence']
print(rules[selected_columns])

                antecedents       consequents   support  confidence
0                  (carpet)             (red)  0.008057    0.994975
1                   (gomez)          (selena)  0.006307    0.993590
2              (housewives)            (real)  0.004924    0.952756
3                  (markle)          (meghan)  0.009237    0.991266
4                    (york)             (new)  0.005005    1.000000
5                  (united)          (states)  0.009766    0.952381
6                   (swift)          (taylor)  0.006633    0.964497
7           (harry, markle)          (meghan)  0.004720    1.000000
8           (harry, markle)          (prince)  0.004639    0.982759
9           (harry, meghan)          (prince)  0.005738    0.940000
10         (prince, markle)          (meghan)  0.005249    0.992308
11  (harry, meghan, markle)          (prince)  0.004639    0.982759
12  (harry, prince, markle)          (meghan)  0.004639    1.000000
13          (harry, markle)  (meghan, prince)  0

In [17]:
# Print the complete association-rules dataframe.
# The row limit is lifted only for this print: assigning
# pd.options.display.max_rows = len(rules) globally would persist for the rest
# of the kernel session and truncate any longer frame printed afterwards
# (for example a later head(20) when len(rules) is smaller than 20).
with pd.option_context('display.max_rows', None):
    print(rules)

                antecedents       consequents  antecedent support   
0                  (carpet)             (red)            0.008098  \
1                   (gomez)          (selena)            0.006348   
2              (housewives)            (real)            0.005168   
3                  (markle)          (meghan)            0.009318   
4                    (york)             (new)            0.005005   
5                  (united)          (states)            0.010254   
6                   (swift)          (taylor)            0.006877   
7           (harry, markle)          (meghan)            0.004720   
8           (harry, markle)          (prince)            0.004720   
9           (harry, meghan)          (prince)            0.006104   
10         (prince, markle)          (meghan)            0.005290   
11  (harry, meghan, markle)          (prince)            0.004720   
12  (harry, prince, markle)          (meghan)            0.004639   
13          (harry, markle)  (megh

In [18]:
# Collect every distinct word that occurs on either side of any rule.
# Each antecedent/consequent is a frozenset of words, so the sets below hold
# the distinct itemsets, not yet the individual words.
antecedents_set = set(rules['antecedents'].tolist())
consequents_set = set(rules['consequents'].tolist())
combined_set = antecedents_set | consequents_set

# Flatten every itemset completely. Taking only list(itemset)[0] kept a single,
# arbitrarily chosen member of multi-word itemsets such as (harry, markle), so
# words could be dropped and the result could differ from run to run.
words = [word for word_set in combined_set for word in word_set]
unique_words = set(words)
print(unique_words)

{'york', 'meghan', 'prince', 'housewives', 'new', 'states', 'united', 'selena', 'taylor', 'real', 'gomez', 'red', 'carpet', 'harry', 'swift', 'markle'}


In [19]:
# Sort the rules by confidence in descending order
sorted_rules = rules.sort_values('confidence', ascending=False)

# Print the 20 rules with the highest confidence
print(sorted_rules.head(20))


                antecedents       consequents  antecedent support   
4                    (york)             (new)            0.005005  \
7           (harry, markle)          (meghan)            0.004720   
12  (harry, prince, markle)          (meghan)            0.004639   
0                  (carpet)             (red)            0.008098   
1                   (gomez)          (selena)            0.006348   
10         (prince, markle)          (meghan)            0.005290   
3                  (markle)          (meghan)            0.009318   
8           (harry, markle)          (prince)            0.004720   
11  (harry, meghan, markle)          (prince)            0.004720   
13          (harry, markle)  (meghan, prince)            0.004720   
6                   (swift)          (taylor)            0.006877   
2              (housewives)            (real)            0.005168   
5                  (united)          (states)            0.010254   
9           (harry, meghan)       

In [20]:
# Fig. 5
# Build the one-hot transaction matrix from the label == 0 statements only
# (the "fake" subset going by the variable names — TODO confirm the label encoding).
onlyfake_l = dataset.loc[dataset['label'] == 0]
onlyfake_n = onlyfake_l[['statement']]

# Step 1: pull the statements out as a plain Python list.
# No further preprocessing (stopword removal, etc.) is applied in this cell.
sentences_n = onlyfake_n['statement'].tolist()

# Step 2: one transaction per statement, its items being the whitespace-separated tokens.
# NOTE(review): str() would turn a missing statement (NaN) into the token 'nan' —
# confirm the 'statement' column has no nulls.
transactions_n = [str(statement).split() for statement in sentences_n]

# Step 3: one-hot encode the transactions (one boolean column per distinct token).
te_n = TransactionEncoder()
te_ary_n = te_n.fit(transactions_n).transform(transactions_n)
transaction_df_n = pd.DataFrame(data=te_ary_n, columns=te_n.columns_)

In [22]:
# Mine the itemsets that occur in at least 0.2% of the transactions.
# use_colnames=True keeps the words themselves (not column indices) in the result.
# NOTE(review): this threshold is 0.002, whereas the other two apriori calls in
# this notebook use 0.004 — confirm the lower value is intentional.
frequent_itemsets_n = apriori(transaction_df_n, min_support=0.002, use_colnames=True)

In [23]:
# Rules with confidence >= 0.9 derived from the frequent itemsets.
rules_n = association_rules(frequent_itemsets_n, min_threshold=0.9, metric="confidence")

# Save the full rule table (every metric column) to disk, then print only the
# headline columns.
rules_n.to_csv('fake_news_rules.csv', index=False)
selected_columns = ['antecedents', 'consequents', 'support', 'confidence']
print(rules_n[selected_columns])

                          antecedents                 consequents   support   
0                           (affleck)                       (ben)  0.007624  \
1                         (rodriguez)                      (alex)  0.004907   
2                             (jolie)                  (angelina)  0.020592   
3                            (jolies)                  (angelina)  0.003593   
4                           (aniston)                  (jennifer)  0.020680   
..                                ...                         ...       ...   
480  (jennifer, brad, jolie, aniston)            (pitt, angelina)  0.002103   
481      (pitt, brad, jolie, aniston)        (jennifer, angelina)  0.002103   
482           (jennifer, jolie, pitt)   (brad, angelina, aniston)  0.002103   
483            (pitt, jolie, aniston)  (jennifer, brad, angelina)  0.002103   
484            (brad, jolie, aniston)  (jennifer, angelina, pitt)  0.002103   

     confidence  
0      0.966667  
1      0.982456

In [24]:
# Collect every distinct word that occurs on either side of any rule.
# Each antecedent/consequent is a frozenset of words, so the sets below hold
# the distinct itemsets, not yet the individual words.
antecedents_set_n = set(rules_n['antecedents'].tolist())
consequents_set_n = set(rules_n['consequents'].tolist())
combined_set_n = antecedents_set_n | consequents_set_n

# Flatten every itemset completely. Taking only list(itemset)[0] kept a single,
# arbitrarily chosen member of multi-word itemsets such as (pitt, jolie, aniston),
# so words could be dropped and the result could differ from run to run.
words_n = [word for word_set_n in combined_set_n for word in word_set_n]
unique_words_n = set(words_n)
print(unique_words_n)

{'lady', 'kendall', 'robbie', 'holmes', 'jenner', 'travis', 'blake', 'kanye', 'obamas', 'care', 'markles', 'plan', 'gomez', 'health', 'swift', 'government', 'justin', 'lawrence', 'pratt', 'garner', 'wedding', 'miranda', 'ellen', 'markle', 'richie', 'royal', 'kids', 'weeknd', 'chris', 'walker', 'court', 'planned', 'bendjima', 'kourtney', 'styles', 'neri', 'united', 'foxx', 'degeneres', 'khloe', 'biebers', 'spears', 'divorce', 'miley', 'rubio', 'split', 'affleck', 'selena', 'stewart', 'hemsworth', 'pattinson', 'kidman', 'jamie', 'kim', 'sofia', 'jersey', 'bieber', 'parenthood', 'jolies', 'mariah', 'supreme', 'pitt', 'perry', 'de', 'rossi', 'gaga', 'aniston', 'cyrus', 'affordable', 'portia', 'kristen', 'meghan', 'law', 'wests', 'york', 'kylie', 'younes', 'jennifer', 'cruise', 'brad', 'kardashian', 'keith', 'prince', 'states', 'jolie', 'taylor', 'kate', 'marco', 'lambert', 'west', 'act', 'alex', 'britney', 'angelina', 'custody', 'thompson', 'scott', 'pitts', 'katy', 'harry', 'theroux', 'ca

In [26]:
# Sort the rules by confidence in descending order
sorted_rules = rules_n.sort_values('confidence', ascending=False)

# Print the 20 rules with the highest confidence
print(sorted_rules.head(20))

                 antecedents   consequents  antecedent support   
378    (holmes, jamie, foxx)       (katie)            0.006835  \
339      (portia, degeneres)   (ellen, de)            0.002278   
166                  (rossi)  (portia, de)            0.002541   
167      (portia, degeneres)       (ellen)            0.002278   
168       (degeneres, rossi)       (ellen)            0.002191   
..                       ...           ...                 ...   
338   (ellen, degeneres, de)      (portia)            0.002278   
203         (harry, wedding)      (prince)            0.002454   
181           (ellen, rossi)      (portia)            0.002454   
337  (portia, de, degeneres)       (ellen)            0.002278   
183           (holmes, foxx)       (jamie)            0.006835   

     consequent support   support  confidence        lift  leverage   
378            0.011129  0.006835         1.0   89.858268  0.006759  \
339            0.002541  0.002278         1.0  393.517241  0.0022