# User Clustering

This notebook is used to cluster users based on some vector representation. Additionally, it can calculate clusters based on temporal splits.

To run the calculation cells, the notebook requires a pandas dataframe of the following structure:

| user_id | fn_news_spreader | pb_factor | factual_factor | vector1 | vector2 | cluster |
|-|-|-|-|-|-|-|
| USER_ID1 | 0 | 1.21 | 2.34 | [0.2,...,-0.1] | [0.6,...,0.7] | 1 |
| USER_ID2 | 1 | -0.34 | 1.79 | [0.7,...,0.5] | [-0.1,...,0.0] | 0 |
| ... | ... | ... | ... | ... | ... | ... |

Where `fn_news_spreader` is a binary variable, `pb_factor` (the political bias) is a float in [-3, 3], and each vector column holds a normalized vector of some dimension. Note that the evaluation cell below reads the binary label from a column named `fake_news_spreader`. The dataframe without vectors and without clustering can be found [in this google drive](https://drive.google.com/file/d/1FbkQn2d9LiJ54ZrxiLBAYJd_rCAtn0QM/view?usp=sharing). There are some cells below showing how you can add your own vectors.

The clustering will append an additional column to the dataframe containing the cluster label. The function _inter_similarity(dataframe)_ reports a score based on inter-cluster similarity. The whole clustering gets a score based on the weighted mean of its clusters.

In [None]:
import pandas as pd
import numpy as np
import pickle

from sklearn.cluster import KMeans
from tqdm import tqdm

In [None]:
# Load the user frame (the first CSV column is used as the index) and replace
# every missing value with 0 in one chained expression.
df = pd.read_csv('../data/blank_user_frame.csv', index_col=0).fillna(0)
df

## Adding named entity vectors

In [None]:
with open('../data/named_entities.pickel', 'rb') as f:
    user_entities = pickle.load(f)

# Entity labels, in the order they appear in each user's vector.
named_entities = [
    'ORG', 'PERSON', 'DATE', 'GPE', 'CARDINAL', 'NORP',
    'PERCENT', 'MONEY', 'ORDINAL', 'WORK_OF_ART', 'LOC',
    'TIME', 'LAW', 'PRODUCT', 'FAC', 'EVENT', 'QUANTITY',
    'LANGUAGE',
]

vectors = []
for user in tqdm(df['user_id'], total=len(df)):
    counts = user_entities[user]['all']

    # Normalise by the sum of ALL values stored for the user, not only the
    # labels listed above.
    total = sum(counts.values())

    # One slot per label; labels the user never produced stay at 0.
    vectors.append(np.array([
        counts[label] / total if label in counts else 0
        for label in named_entities
    ]))

df['named_entities'] = vectors

## Adding Linguistic Features

In [None]:
with open('../data/linguistic_features.pickel', 'rb') as f:
    ling_features = pickle.load(f)

# Feature tags, in the order they appear in each user's vector.
features = [
    'DET', 'NOUN', 'SCONJ', 'AUX', 'PART', 'VERB', 'PRON', 'ADJ', 'PUNCT',
    'ADP', 'PROPN', 'NUM', 'CCONJ', 'ADV', 'SPACE', 'SYM', 'INTJ', 'X',
]

vectors = []
for user in tqdm(df['user_id'], total=len(df)):
    tag_counts = ling_features[user]['all']

    # Normalise by the sum of ALL values stored for the user, not only the
    # tags listed above.
    total = sum(tag_counts.values())

    # One slot per tag; tags the user never produced stay at 0.
    vectors.append(np.array([
        tag_counts[tag] / total if tag in tag_counts else 0
        for tag in features
    ]))

df['linguistic_features'] = vectors

## Adding sBert embeddings - by Ezzeddine

In [None]:
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open('../data/user_embedding_basic_sbert.p', 'rb') as f:
    embeddings = pickle.load(f)

# Take the embedding size from the data instead of hard-coding it, so the
# zero fallback always has the same length as the real vectors. 768 is kept
# as the default when there is nothing to inspect.
# Assumes `embeddings` is a dict mapping user_id -> 1-D vector -- TODO confirm.
if embeddings:
    embedding_dim = len(next(iter(embeddings.values())))
else:
    embedding_dim = 768

vectors = []
for user in tqdm(df['user_id'], total=len(df)):

    if user in embeddings:
        vectors.append(np.array(embeddings[user]))
    else:
        # Users without an embedding get a zero ndarray, so every entry of
        # the column has the same type and length.
        vectors.append(np.zeros(embedding_dim))

df['sBert'] = vectors

## Clustering evaluation

In [None]:
def inter_similarity(df):
    """Print and return a label-purity score for the clustering stored in df.

    For every distinct value of ``df['cluster']`` the number of fake-news
    spreaders is taken from ``df['fake_news_spreader']`` and compared with the
    cluster size to find the majority class. Each cluster is scored as
    ``2 * majority_share - 1`` (0% for an even split, 100% when only one
    class is present, assuming 0/1 labels); the overall score is the sum of
    these values weighted by ``cluster_size / len(df)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``cluster`` column and a ``fake_news_spreader`` column.

    Returns
    -------
    float
        The overall score as a fraction (the printed percentage / 100).
        ``0`` is returned for a frame without rows.
    """
    N = len(df)
    labels = df['cluster'].unique()

    fn_similarity = 0

    # One entry per cluster: (size, score in percent, 1 if fn-majority else 0)
    stats = []

    for c in labels:
        temp = df[df['cluster'] == c]

        fn_amount = sum(temp['fake_news_spreader'])
        cluster_size = len(temp)

        # A tie counts as "more real-news than fake-news spreaders".
        fn = 1 if fn_amount > 0.5 * cluster_size else 0
        majority = fn_amount if fn else cluster_size - fn_amount

        # Size-weighted contribution of this cluster to the overall score.
        score = 2 * (cluster_size / N) * (majority / cluster_size) - (cluster_size / N)
        fn_similarity += score

        # Undo the size weighting to get the cluster's own score in percent.
        stats.append((cluster_size, score * (N / cluster_size) * 100, fn))

    print('Inter-Cluster similarity in fn_news_spreader is {0:.2f}%'.format(fn_similarity*100),
          'with {} clusters'.format(len(labels)))

    # Cluster sizes in ascending order, each coloured by its score.
    # `colored` is not defined in this cell; from the call it is expected to
    # take (r, g, b, text) and return a string -- TODO confirm.
    s = ''.join(
        colored(round(255 - stat[1] * 2.55), round(stat[1] * 2.55), 0, str(stat[0])) + ' '
        for stat in sorted(stats)
    )
    print('Size distribution:', s)

    # Best/worst are chosen by score. Sorting the raw tuples would order them
    # by cluster size and report the smallest/largest cluster instead.
    # Skipped when there are no clusters (empty frame).
    if stats:
        best = max(stats, key=lambda stat: stat[1])
        print('Best | Size:', best[0], 'Score:', '{0:.2f}%'.format(best[1]), 'Fn?', 'Yes' if best[2] == 1 else 'No')

        worst = min(stats, key=lambda stat: stat[1])
        print('Worst | Size:', worst[0], 'Score:', '{0:.2f}%'.format(worst[1]), 'Fn?', 'Yes' if worst[2] == 1 else 'No')

    print('')

    return fn_similarity
    

In [None]:
# Random assignment, a baseline
from random import randint

for i in range(2, 9):
    # randint includes both end points, so labels are drawn from [0, i - 1]
    # to get i clusters -- the same counts the KMeans cells use.
    df['cluster'] = [randint(0, i - 1) for _ in range(len(df))]
    inter_similarity(df)

In [None]:
# KMeans on the named-entity vectors, scored for 2 to 8 clusters

entity_vectors = list(df['named_entities'])
for k in range(2, 9):
    clustering = KMeans(n_clusters=k, random_state=0)
    clustering.fit(entity_vectors)
    df['cluster'] = clustering.labels_
    inter_similarity(df)

In [None]:
# KMeans on the linguistic-feature vectors, scored for 2 to 8 clusters

feature_vectors = list(df['linguistic_features'])
for k in range(2, 9):
    clustering = KMeans(n_clusters=k, random_state=0)
    clustering.fit(feature_vectors)
    df['cluster'] = clustering.labels_
    inter_similarity(df)

In [None]:
# Based on linguistic features and named_entities

# The combined matrix does not depend on the number of clusters, so it is
# built once: each row is the user's named-entity vector followed by the
# linguistic-feature vector.
combined_vectors = [
    list(entity_vec) + list(feature_vec)
    for entity_vec, feature_vec in zip(df['named_entities'], df['linguistic_features'])
]

for i in range(2, 9):
    clustering = KMeans(n_clusters=i, random_state=0).fit(combined_vectors)

    df['cluster'] = clustering.labels_
    inter_similarity(df)

In [None]:
# KMeans on the sBert embeddings, scored for 2 to 8 clusters

sbert_vectors = list(df['sBert'])
for k in range(2, 9):
    clustering = KMeans(n_clusters=k, random_state=0)
    clustering.fit(sbert_vectors)
    df['cluster'] = clustering.labels_
    inter_similarity(df)

In [None]:
# KMeans on pol_bias and fact_factor, scored for 2 to 19 clusters
# NOTE(review): the two columns are picked by position (3 and 4) -- confirm
# that these positions hold the intended columns in the loaded frame.

for k in range(2, 20):
    # Selected inside the loop on purpose: df['cluster'] is rewritten on
    # every pass, so the selection is re-read each time like the rest of df.
    selected = df.iloc[:, [3, 4]].to_numpy()

    clustering = KMeans(n_clusters=k, random_state=0)
    clustering.fit(list(selected))
    df['cluster'] = clustering.labels_
    inter_similarity(df)