# User Clustering

This notebook is used to cluster users based on some vector representation. Additionally, it can calculate clusters based on temporal splits.

To run the calculation cell, the notebook requires a pandas dataframe of the following structure:

| user_id | fn_news_spreader | pb_factor | factual_factor | vector1 | vector2 | cluster
|-|-|-|-|-|-|-|
USER_ID1| 0 | 1.21 | 2.34 | [0.2,...,-0.1] | [0.6,...,0.7] | 1
USER_ID2| 1 | -0.34 | 1.79 | [0.7,...,0.5] | [-0.1,...,0.0] | 0
... | ... | ... | ... | ... | ... | ...

Where fn_news_spreader is a binary variable, pb_factor is a float in the range [-3, 3], and each vector is a normalized vector of some dimension. The dataframe without vectors and without clustering can be found [in this google drive](https://drive.google.com/file/d/1FbkQn2d9LiJ54ZrxiLBAYJd_rCAtn0QM/view?usp=sharing). There are some cells below showing how you can add your own vectors.

The clustering will append an additional column to the dataframe containing the cluster label. The function _inter_similarity(frame, col)_ will return a score based on inter cluster similarity. The whole clustering gets a score based on the weighted mean of its clusters.

In [None]:
import pandas as pd
import numpy as np
import pickle
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from tqdm import tqdm
from glob import glob

In [None]:
# Load the blank user frame and replace every missing value with 0;
# the trailing bare expression displays the frame as the cell output.
df = pd.read_csv('../data/blank_user_frame.csv').fillna(0)
df

## Adding named entity vectors

In [None]:
# NOTE: pickle can execute code while loading; only open files you trust.
with open('../data/named_entities.pickel', 'rb') as f:
    user_entities = pickle.load(f)

# Entity labels; their order fixes the meaning of each vector dimension.
named_entities = [
    'ORG', 'PERSON', 'DATE', 'GPE', 'CARDINAL', 'NORP',
    'PERCENT', 'MONEY', 'ORDINAL', 'WORK_OF_ART', 'LOC',
    'TIME', 'LAW', 'PRODUCT', 'FAC', 'EVENT', 'QUANTITY',
    'LANGUAGE',
]

vectors = []
for user in tqdm(df['user_id'], total=len(df)):

    ents = user_entities[user]['all']

    # Normalisation constant: the total count over everything the user has,
    # including labels that are not listed in named_entities.
    N = sum(ents.values())

    # Labels the user never produced keep the default value 0.
    vector = [ents[entity]/N if entity in ents else 0
              for entity in named_entities]
    vectors.append(np.array(vector))

df['named_entities'] = vectors

## Adding Linguistic Features

In [None]:
# NOTE: pickle can execute code while loading; only open files you trust.
with open('../data/linguistic_features.pickel', 'rb') as f:
    ling_features = pickle.load(f)

# Feature tags; their order fixes the meaning of each vector dimension.
features = [
    'DET', 'NOUN', 'SCONJ', 'AUX', 'PART', 'VERB', 'PRON', 'ADJ', 'PUNCT',
    'ADP', 'PROPN', 'NUM', 'CCONJ', 'ADV', 'SPACE', 'SYM', 'INTJ', 'X',
]

vectors = []
for user in tqdm(df['user_id'], total=len(df)):

    feat = ling_features[user]['all']

    # Normalisation constant: the total count over everything the user has,
    # including tags that are not listed in features.
    N = sum(feat.values())

    # Tags the user never produced keep the default value 0.
    vector = [feat[tag]/N if tag in feat else 0 for tag in features]
    vectors.append(np.array(vector))

df['linguistic_features'] = vectors

## Adding sBert embeddings - by Ezzeddine

In [None]:
# NOTE: pickle can execute code while loading; only open files you trust.
with open('../data/user_embedding_basic_sbert.p', 'rb') as f:
    embeddings = pickle.load(f)

# First pass: look up every user; None marks users without an embedding.
vectors = [np.array(embeddings[user]) if user in embeddings else None
           for user in tqdm(df['user_id'], total=len(df))]

# The placeholder width follows the loaded embeddings instead of a fixed
# constant, so a model with another output size still yields a rectangular
# matrix. 768 is only the fallback when no user at all was found.
dim = next((len(v) for v in vectors if v is not None), 768)

# Second pass: users without an embedding get an all-zero ndarray, so the
# column holds one type throughout.
vectors = [np.zeros(dim) if v is None else v for v in vectors]

df['sBert'] = vectors

## Clustering evaluation

In [None]:
def format_stats(stats):
    """Render one cluster summary as an ANSI-coloured ``label: size`` string.

    ``stats`` is indexed as ``(size, score, flag, label)``. The score is
    scaled by 2.55 onto a red-to-green ramp (it is expected to lie in
    0..100, so that 100 maps to full green and 0 to full red). A flag equal
    to 1 appends attribute 4 to the colour sequence, anything else appends
    attribute 2. The string ends with a sequence that sets the colour back
    to ``0;0;0;0``.
    """
    size, label = stats[0], stats[3]

    green_level = stats[1]*2.55
    red = round(255 - green_level)
    green = round(green_level)
    attribute = 4 if stats[2] == 1 else 2

    escape = "\033[38;2;"
    colour_on = escape + ";".join(str(part) for part in (red, green, 0, attribute)) + "m"
    colour_off = escape + "0;0;0;0m"
    return colour_on + str(label) + ": " + str(size) + colour_off

def inter_similarity(frame, col):
    """Score how homogeneous the clusters in ``frame[col]`` are.

    When ``frame`` has a ``mask`` column, only the rows where it is true are
    scored. For every distinct label in ``col`` the function takes

    * the share of the majority class according to the ``rebalanced``
      column (its sum is treated as the number of fake-news spreaders), and
    * the variance of ``pb_factor`` and of ``factual_factor``,

    and combines them over all clusters, weighted by relative cluster size.
    A summary is printed as a side effect.

    Returns ``(fn_similarity, pb_var, fact_var, stats)`` where ``stats`` is
    a list of ``(cluster_size, majority_share_percent, fn_flag, label)``
    tuples sorted by ascending majority share; ``fn_flag`` is 1 when
    fake-news spreaders form the strict majority, else 0.
    """
    view = frame[frame['mask']] if 'mask' in frame.columns else frame
    total = len(view)

    fn_similatity = 0
    pb_var = 0
    fact_var = 0
    stats = []

    labels = view[col].unique()
    for label in labels:

        members = view[view[col] == label]
        size = len(members)
        weight = size/total

        pb_var += weight * np.var(members['pb_factor'])
        fact_var += weight * np.var(members['factual_factor'])

        # The 'rebalanced' column serves as the fake-news flag here.
        fn_amount = sum(members['rebalanced'])

        # Strictly more fn spreaders than rn spreaders -> fn cluster;
        # ties count as an rn cluster.
        is_fn_cluster = fn_amount > 0.5*size
        majority = fn_amount if is_fn_cluster else size - fn_amount
        score = weight * (majority/size)
        fn = 1 if is_fn_cluster else 0

        fn_similatity += score

        # Undo the size weighting to store the plain majority share in percent.
        stats.append((size, score*(total/size)*100, fn, label))

    print('fn_news_spreader: {0:.2f}%'.format(fn_similatity*100),
          'with {} clusters'.format(len(labels)))

    print('political bias variance: {:.3f}'.format(pb_var))
    print('factual factor variance: {:.3f}'.format(fact_var))

    stats.sort(key=lambda entry: entry[1])

    print('Size distribution:',
          ''.join(format_stats(entry) + ' ' for entry in stats))

    print('')

    return fn_similatity, pb_var, fact_var, stats

In [None]:
# Random, a baseline
from random import randint

# Assign every user a uniformly random label and score the result, for
# 2..8 clusters. The labels are not seeded, so the scores vary per run.
for i in range(2, 9):
    # randint includes BOTH end points, so the upper bound has to be i - 1
    # to obtain exactly i distinct labels (0 .. i-1), matching the column
    # name and the KMeans runs this baseline is compared against.
    df['RDcluster' + str(i)] = [randint(0, i - 1) for _ in range(len(df))]
    inter_similarity(df, 'RDcluster' + str(i))

In [None]:
# DBSCAN
from sklearn.neighbors import NearestNeighbors

# Sorted curve of each user's mean distance to its 3 nearest neighbours in
# named-entity space (presumably used to eyeball a DBSCAN eps - confirm).
points = list(df['named_entities'])
nbrs = NearestNeighbors(n_neighbors=3).fit(points)
distances, indices = nbrs.kneighbors(points)

y = sorted(distances.mean(axis=1))
plt.plot(range(len(y)), y)
plt.ylim([0, 0.2])

In [None]:
# Based on named_entities

# KMeans for 2..8 clusters on the named-entity vectors; each labelling is
# stored in its own 'NEcluster<k>' column and scored right away.
ne_matrix = list(df['named_entities'])
for k in range(2, 9):
    column = 'NEcluster' + str(k)
    df[column] = KMeans(n_clusters=k, random_state=0).fit(ne_matrix).labels_
    inter_similarity(df, column)

In [None]:
# Based on linguistic features

# KMeans for 2..8 clusters on the linguistic-feature vectors; each labelling
# is stored in its own 'lingcluster<k>' column and scored right away.
ling_matrix = list(df['linguistic_features'])
for k in range(2, 9):
    column = 'lingcluster' + str(k)
    df[column] = KMeans(n_clusters=k, random_state=0).fit(ling_matrix).labels_
    inter_similarity(df, column)

In [None]:
# Based on linguistic features and named_entities

# Concatenate both vectors per user once; the matrix does not depend on the
# number of clusters, so it is built outside the loop.
combined = [list(ne) + list(ling)
            for ne, ling in zip(df['named_entities'], df['linguistic_features'])]

for i in range(2, 9):
    clustering = KMeans(n_clusters=i, random_state=0).fit(combined)

    # The 'cluster' column is overwritten on every pass; only the last
    # labelling (8 clusters) remains in the frame afterwards.
    df['cluster'] = clustering.labels_
    # inter_similarity needs the name of the label column to score.
    inter_similarity(df, 'cluster')

In [None]:
# Run everything
#
# Clusters the users with KMeans for 2..11 clusters on every available
# representation and collects the scores in `logs`:
#   logs[<name>[<n users covered>]][k] = {'fn', 'pb', 'ff', 'stats'}
# Each labelling is also written to the frame as column <log name> + str(k).

logs = {}

_K_VALUES = range(2, 12)
_SBERT_PREFIX = 'user_embedding_'


def _load_pickle(path):
    """Return the object pickled at *path*."""
    # NOTE: pickle can execute code while loading; only open files you trust.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


def _attach_embedding(frame, column, embeddings, fallback_dim):
    """Store one vector per user in frame[column]; return the covered count.

    Users missing from *embeddings* get an all-zero vector and are flagged
    False in frame['mask'], which inter_similarity uses to skip them.
    The zero vector is as wide as the loaded embeddings; *fallback_dim* is
    only used when not a single user was found.
    """
    found = [np.array(embeddings[user]) if user in embeddings else None
             for user in tqdm(frame['user_id'], total=len(frame))]
    dim = next((len(vec) for vec in found if vec is not None), fallback_dim)

    frame[column] = [np.zeros(dim) if vec is None else vec for vec in found]
    frame['mask'] = [vec is not None for vec in found]
    return sum(frame['mask'])


def _evaluate_kmeans(frame, feature_column, name, k_values=_K_VALUES):
    """Cluster frame[feature_column] for every k and return {k: scores}."""
    # NOTE(review): KMeans is fit on all rows, including the zero vectors of
    # uncovered users; only the scoring honours frame['mask'].
    features = list(frame[feature_column])
    results = {}
    for k in k_values:
        label_column = name + str(k)
        frame[label_column] = KMeans(n_clusters=k, random_state=0).fit(features).labels_
        fn_sim, pb_var, fact_var, stats = inter_similarity(frame, label_column)
        results[k] = {'fn': fn_sim, 'pb': pb_var, 'ff': fact_var, 'stats': stats}
    return results


# Based on sBert
# Only files that carry the expected prefix are picked up (any other *.p
# file would break the name extraction); sorted for a reproducible order.
for path in sorted(glob('../data/' + _SBERT_PREFIX + '*.p')):

    covered = _attach_embedding(df, 'sBert', _load_pickle(path), 768)

    variant = path[path.index(_SBERT_PREFIX) + len(_SBERT_PREFIX):-len('.p')]
    name = 'sBert_' + variant + '[' + str(covered) + ']'

    print('File:', path)
    logs[name] = _evaluate_kmeans(df, 'sBert', name)

# User2Vec
user2vec_path = '../data/user2vec.pkl'
covered = _attach_embedding(df, 'user2vec', _load_pickle(user2vec_path), 200)

name = 'user2vec[' + str(covered) + ']'
print('File:', user2vec_path)
logs[name] = _evaluate_kmeans(df, 'user2vec', name)

# Named Entities and ling features
# These vectors exist for every user (the log name carries len(df)), so the
# mask left behind by the last embedding must not filter the scoring.
df['mask'] = True
for column in ('named_entities', 'linguistic_features'):
    name = column + '[' + str(len(df)) + ']'
    logs[name] = _evaluate_kmeans(df, column, name)

## Visualizing differences of sBert fine tunings

In [None]:
# One table per metric: rows are embeddings, columns are cluster counts.
# NOTE(review): 'dicard' looks like a misspelling of 'discard'; it is kept
# unchanged because it has to match the embedding file names - confirm.
rows = [key for key in logs if 'dicard' not in key]
sizes = list(logs[rows[0]].keys())


def _metric_frame(metric):
    """Return the rows x sizes table of logs[row][size][metric]."""
    # Built in one go: DataFrame.append no longer exists in pandas >= 2.0,
    # and growing a frame row by row copies it on every step.
    return pd.DataFrame(
        [[logs[r][s][metric] for s in sizes] for r in rows],
        index=rows,
        columns=sizes,
    )


fn_df = _metric_frame('fn')
pb_df = _metric_frame('pb')
ff_df = _metric_frame('ff')

In [None]:
# Heatmap of the fn/rn majority score per embedding and cluster count,
# saved as a PDF.
ax = sns.heatmap(fn_df, linewidths=.5, cmap='turbo')
ax.set_xlabel('Number of clusters')
ax.set_ylabel('Embedding')
ax.set_title('Inter-Cluster similarity of fn and rn spreaders')

fig = ax.get_figure()
fig.savefig('../analysis/cluster_fn_sim_rebalanced_wo_disc.pdf', bbox_inches='tight')

In [None]:
# Heatmap of the weighted political-bias variance per embedding and cluster
# count, saved as a PDF.
ax = sns.heatmap(pb_df, linewidths=.5, cmap='turbo')
ax.set_xlabel('Number of clusters')
ax.set_ylabel('Embedding')
ax.set_title('Inter-Cluster political bias variance')

fig = ax.get_figure()
fig.savefig('../analysis/cluster_pol_bias_rebalanced_wo_disc.pdf', bbox_inches='tight')

In [None]:
# Heatmap of the weighted factual-factor variance per embedding and cluster
# count, saved as a PDF.
ax = sns.heatmap(ff_df, linewidths=.5, cmap='turbo')
ax.set_xlabel('Number of clusters')
ax.set_ylabel('Embedding')
ax.set_title('Inter-Cluster factual factor variance')

fig = ax.get_figure()
fig.savefig('../analysis/cluster_fact_fac_rebalanced_wo_disc.pdf', bbox_inches='tight')

In [None]:
# Display the collected evaluation results as the cell output.
logs

## How do topics influence the different clusterings?

In [None]:
# Display the user frame (including all added columns) as the cell output.
df

In [None]:
# Credit: https://stackoverflow.com/questions/46131572/making-a-non-overlapping-bubble-chart-in-matplotlib-circle-packing
class C():
    """Pack circles of given radii tightly around the origin.

    ``self.x`` holds one row per circle with the columns x, y and radius.
    The circles start on a square grid and ``minimize`` then tries random
    moves, keeping a move only when it lowers the energy and the moved
    circle overlaps no other circle.
    """

    def __init__(self,r):
        self.N = len(r)
        self.x = np.ones((self.N,3))
        self.x[:,2] = r

        # Grid spacing is the largest diameter, so grid neighbours never
        # overlap in the start layout.
        spacing = 2*self.x[:,2].max()
        side = np.ceil(np.sqrt(self.N))
        ticks = np.arange(0,side*spacing,spacing)
        grid_x,grid_y = np.meshgrid(ticks,ticks)
        self.x[:,0] = grid_x.flatten()[:self.N]
        self.x[:,1] = grid_y.flatten()[:self.N]
        # Centre the layout on the origin.
        self.x[:,:2] = self.x[:,:2] - np.mean(self.x[:,:2], axis=0)

        # Base size of a random move: the smallest radius.
        self.step = self.x[:,2].min()
        # Energy: sum over all circles of the 4th power of the distance
        # between the centre and the origin.
        self.p = lambda x,y: np.sum((x**2+y**2)**2)
        self.E = self.energy()
        # Number of consecutive rejected moves (+1); also shrinks the moves.
        self.iter = 1.

    def minimize(self):
        """Move circles at random until 500*N moves in a row were rejected."""
        limit = 500*self.N
        while self.iter < limit:
            for idx in range(self.N):
                shift = np.random.randn(2)*self.step/self.iter
                self.x[idx,:2] += shift
                candidate = self.energy()
                if candidate < self.E and self.isvalid(idx):
                    # Accepted: keep the move and reset the rejection count.
                    self.E = candidate
                    self.iter = 1.
                    continue
                # Rejected: undo the move.
                self.x[idx,:2] -= shift
                self.iter += 1.

    def energy(self):
        """Return the energy of the current layout."""
        return self.p(self.x[:,0], self.x[:,1])

    def distance(self,x1,x2):
        """Return the gap between two circle rows (negative when they overlap)."""
        dx = x1[0]-x2[0]
        dy = x1[1]-x2[1]
        return np.sqrt(dx**2+dy**2)-x1[2]-x2[2]

    def isvalid(self, i):
        """Return True when circle ``i`` overlaps none of the other circles."""
        own = self.x[i,:]
        return not any(self.distance(own, self.x[j,:]) < 0
                       for j in range(self.N) if j != i)

    def plot(self, ax):
        """Add one matplotlib circle patch per packed circle to ``ax``."""
        for row in self.x:
            ax.add_patch(plt.Circle(row[:2], row[2]))

In [None]:
def get_topic_dist(embedding, frame=None):
    """Count topics per cluster.

    Parameters
    ----------
    embedding : str
        Name of the column that holds the cluster labels. Labels are
        expected to be non-negative integers because they are used as row
        indices of the result.
    frame : DataFrame, optional
        Frame with the label column and a ``topic`` column. Defaults to the
        notebook-level ``df``.

    Returns
    -------
    list of list of int
        ``result[label][k]`` is the number of rows of cluster ``label``
        whose topic is the k-th entry of the fixed topic list below.
        Topics outside that list are ignored.
    """
    topics = ['general-political-debate', 'Vaccines', 'SARS-CoV-2',
              'womens-and-mens-rights', 'Gun-control', 'Climate-change',
              '5G', 'Abortion']

    if frame is None:
        frame = df

    labels = frame[embedding].unique()
    if len(labels) == 0:
        return []

    # Rows are addressed by label, so allocate up to the largest label;
    # with gaps in the labels (e.g. 0 and 2) one row per distinct label
    # would be too few.
    n_rows = max(len(labels), int(max(labels)) + 1)
    topic_dists = [[0]*len(topics) for _ in range(n_rows)]

    for label in labels:
        counts = frame.loc[frame[embedding] == label, 'topic'].value_counts()
        row = topic_dists[label]
        for ind, t in enumerate(topics):
            row[ind] = int(counts.get(t, 0))
    return topic_dists

def make_pie_plot(name, rating, fn, positions, topics):
    """Draw one topic pie per cluster and save the figure as a PDF.

    All per-cluster arguments must share the same order.

    Parameters
    ----------
    name : str
        Embedding name; used in the title and in the output file name.
    rating : sequence of float
        Per cluster, the opacity of its pie wedges.
    fn : sequence of bool
        Per cluster, True draws a black centre dot (fake-news), False a
        white one (real-news).
    positions : sequence of (x, y, radius)
        Per cluster, centre and radius of its pie.
    topics : sequence of sequence of number
        Per cluster, the wedge sizes in the order of the topic list below.

    The figure is written to ``cluster-topic-dist/<name><n_clusters>.pdf``
    and then shown.
    """
    import os

    _topics = ['general-political-debate', 'Vaccines', 'SARS-CoV-2',
               'womens-and-mens-rights', 'Gun-control', 'Climate-change',
               '5G', 'Abortion']
    _colors = ['orangered', 'lime', 'aqua', 'violet', 'gold', 'grey', 'blue', 'darkmagenta']

    n_clusters = len(positions)

    for i in range(n_clusters):
        plt.pie(topics[i], radius=positions[i][2]*0.95,
                center=positions[i][:2], colors=_colors, startangle=45,
                wedgeprops={'alpha': rating[i]})
        # White disc turns the pie into a ring ...
        plt.gca().add_patch(plt.Circle(radius=positions[i][2]*0.35,
                xy=positions[i][:2], color='w'))
        # ... and the centre dot encodes the fake-news / real-news majority.
        plt.gca().add_patch(plt.Circle(radius=positions[i][2]*0.15,
                xy=positions[i][:2], facecolor='black' if fn[i] else 'white', edgecolor='white' if fn[i] else 'black'))
    plt.title('Topic distributions in ' + name + '\nwith ' + str(n_clusters) + ' clusters')

    l = plt.legend(_topics + ['real-news', 'fake-news'], bbox_to_anchor=(1, 1))
    # Matplotlib 3.7 renamed legendHandles to legend_handles and 3.9 removed
    # the old attribute; support both.
    handles = getattr(l, 'legend_handles', None)
    if handles is None:
        handles = l.legendHandles

    # The legend handles inherit the look of the first patches drawn, so
    # they are restyled: topic colours first, then the two news markers.
    news_legend = [['white', 'black'], ['black', 'white']]
    for ind, handle in enumerate(handles):
        if ind < len(_colors):
            handle.set_facecolor(_colors[ind])
            handle.set_alpha(1)
        else:
            handle.set_facecolor(news_legend[ind - len(_colors)][0])
            handle.set_edgecolor(news_legend[ind - len(_colors)][1])
            handle.set_alpha(1)

    plt.autoscale()
    # The file name uses this plot's own cluster count rather than an
    # unrelated module-level variable; the folder is created on demand.
    out_dir = 'cluster-topic-dist'
    os.makedirs(out_dir, exist_ok=True)
    plt.savefig(out_dir + '/' + name + str(n_clusters) + '.pdf', bbox_inches='tight')
    plt.show()

In [None]:
# One packed-pie figure per embedding and cluster count.
for emb, by_size in logs.items():
    n = emb[:emb.index('[')]
    for size, result in by_size.items():
        # `stats` is sorted by score, NOT by cluster label; every list
        # below follows that order. Entries are (size, score %, flag, label).
        stats = result['stats']

        # Bound to the module-level name `sizes` on purpose: plotting code
        # may read it to derive the output file name.
        sizes = [entry[0] for entry in stats]
        rating = [entry[1]/100 for entry in stats]
        fn = [bool(entry[2]) for entry in stats]

        # Log-scaled radii, normalised so that they sum to 2.
        _s = np.log2(sizes)+1
        c = C(2*_s/np.sum(_s))
        c.minimize()

        # get_topic_dist returns rows addressed by cluster LABEL; reorder
        # them to the order of `stats` so that each pie is drawn with the
        # topics of the cluster whose size, rating and flag it shows.
        topics_by_label = get_topic_dist(emb + str(size))
        topics = [topics_by_label[entry[3]] for entry in stats]

        make_pie_plot(n, rating, fn, c.x, topics)