In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import networkx as nx
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from nltk import word_tokenize
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [2]:
df = pd.read_excel("facebook_vaccination_preprocessed.xlsx")
df = df.drop(["Unnamed: 0"],axis=1)

In [3]:
df

Unnamed: 0,Text,Type,Emotional context,Author,Subscribers,Target,Target Subscribers,Timestamp
0,Кейт Міддлтон услід за чоловіком вакцинувалася...,Post,0,Igor Stasyk,419,Трускавецька Міська Лікарня Friends,5721,109431
1,В мене все прекрасно. Не бачу причини для вакц...,Comment,1,Maryan Lviv,17,Ruslana Lyzhychko,123124,109420
2,ЯСЕН ПЕНЬ:\nВАКЦИН ОТ ОРЗ НЕТ И БЫТЬ НЕ ...,Post,0,Irina Marchenko,0,Irina Marchenko,0,109416
3,ДЕЦ МОЗУ розмістив звіт про безпеку covid-щепл...,Repost,0,Анюта Сажина,0,Анюта Сажина,0,109411
4,Андрій Ігнатов аналіз на антитіла беруть з кро...,Comment,-1,Юля Павлина,472,Ruslana Lyzhychko,123124,109393
...,...,...,...,...,...,...,...,...
152046,", според The ​​New York Times, друг акушер-гин...",Repost,0,Игорь Палладин,4226,Игорь Палладин,4226,3
152047,Национальный консультативный совет по иммуниза...,Comment,0,Natalia Lavrenova,6090,Los Solomas / Солом‘янський район,38051,3
152048,(first_aid)(stop_sign)(first_aid)(stop_sign)(f...,Comment,-1,Наталія Юськів,622,"Людоньки, порадьте!",362291,2
152049,Сергей ПерфильевМихаил Чаленко включите гугл.,Comment,0,Александр Мирошник,342,Будни Торецка/Украина,13654,0


## Exploring some case

In [4]:
df_case=df.loc[df["Author"]=="Anastasia Litvinova"]
df_case

Unnamed: 0,Text,Type,Emotional context,Author,Subscribers,Target,Target Subscribers,Timestamp
109435,"Ось цей 18-літній мажор, який ганяє по Києву н...",Доповнений репост,0,Anastasia Litvinova,4716,АНТИКВАРТАЛ,1858,15575
115856,"Ось цей 18-літній мажор, який ганяє по Києву н...",Доповнений репост,0,Anastasia Litvinova,4716,Твій МАЙДАН - До ПЕРЕМОГИ,12057,14030
117068,"Ось цей 18-літній мажор, який ганяє по Києву н...",Доповнений репост,0,Anastasia Litvinova,4716,МАЙДАН Є І БУДЕ ЗАВЖДИ .,2745,13738
117304,"Ось цей 18-літній мажор, який ганяє по Києву н...",Доповнений репост,0,Anastasia Litvinova,4716,Зелений віслюк,701,13683
118680,"Ось цей 18-літній мажор, який ганяє по Києву н...",Доповнений репост,0,Anastasia Litvinova,4716,"""УКРАЇНЦІ 25"" Полтавщина",628,12840
119751,"Ось цей 18-літній мажор, який ганяє по Києву н...",Доповнений репост,0,Anastasia Litvinova,4716,Кропивницький Online,384,12485
119904,"Ось цей 18-літній мажор, який ганяє по Києву н...",Доповнений репост,0,Anastasia Litvinova,4716,МОЇ БРОВАРИ,75014,12434
120749,"Ось цей 18-літній мажор, який ганяє по Києву н...",Доповнений репост,0,Anastasia Litvinova,4716,#Я_ПОРОХОБОТ (пишаюсь цим!),21241,12165
121062,"Ось цей 18-літній мажор, який ганяє по Києву н...",Доповнений репост,0,Anastasia Litvinova,4716,"Полтава,мій рідний край",1088,12029
121079,"Ось цей 18-літній мажор, який ганяє по Києву н...",Доповнений репост,0,Anastasia Litvinova,4716,Вибір Кременчука,671,12016


Features needed:
1. Activity frequency
2. Inactivity inertia
3. Out degree
4. Targets popularity
5. Katz centrality
6. Posts similarity
7. Self posting
8. Emotional context*
9. Commentary percentage

### Creating weighted directed graph



In [4]:
df_graph = df[["Author","Target"]].groupby(["Author","Target"]).size().reset_index(name="weight")

In [5]:
df_graph

Unnamed: 0,Author,Target,weight
0,"""Європейська Україна""","""Європейська Україна""",7
1,"""Європейська Україна""",Чернігів UA | Перлина Полісся,1
2,0512.com.ua - сайт города Николаева,0512.com.ua - сайт города Николаева,7
3,"0532.ua - сайт міста Полтави - новини, афіша, ...","0532.ua - сайт міста Полтави - новини, афіша, ...",7
4,"05366.com.ua - новости, афиша, справочник пред...","05366.com.ua - новости, афиша, справочник пред...",32
...,...,...,...
35245,татьяна кучукова,татьяна кучукова,9
35246,яков дядяев,яков дядяев,14
35247,ірина кондур,ірина кондур,6
35248,• АТН • Харьков • Новости •,• АТН • Харьков • Новости •,41


In [6]:
graph = nx.from_pandas_edgelist(df_graph,source="Author",target="Target",edge_attr=True,create_using=nx.DiGraph())

In [8]:
out_degrees = np.array(list(graph.out_degree()))

In [9]:
katz_centr = nx.katz_centrality(graph.reverse())

In [33]:
df_features = pd.DataFrame({"Author":out_degrees[:,0],"Out degree":out_degrees[:,1].astype(int),
                           "Katz centr":katz_centr.values()})
df_features = df_features.loc[df_features["Out degree"]>0]
df_features

Unnamed: 0,Author,Out degree,Katz centr
0,"""Європейська Україна""",2,0.007028
2,0512.com.ua - сайт города Николаева,1,0.006389
3,"0532.ua - сайт міста Полтави - новини, афіша, ...",1,0.006389
4,"05366.com.ua - новости, афиша, справочник пред...",1,0.006389
5,0564.ua - Сайт города Кривого Рога,2,0.007028
...,...,...,...
18902,сергей ахметов,1,0.006389
18903,татьяна кучукова,1,0.006389
18904,яков дядяев,1,0.006389
18905,ірина кондур,1,0.006389


In [11]:
def getActivityPattern(group,model):
    times = np.array(group[1]["Timestamp"])
    period = (times[0]-times[-1])+1
    frequency = len(times)/period
    inactivity = [[times[i-1]-times[i]] for i in range(1,len(times))]
    model.fit(inactivity)
    inertia = model.inertia_
    return frequency,inertia

In [12]:
authors = []
frequencies = []
inertias = []
model = KMeans(n_clusters=2)

groups = df[["Author","Timestamp"]].groupby(["Author"])
for group in tqdm(groups):
    authors.append(group[0])
    f,i = getActivityPattern(group,model)
    frequencies.append(f)
    inertias.append(i)

100%|████████████████████████████████████████████████████████████████████████████| 11058/11058 [06:17<00:00, 29.33it/s]


In [34]:
df_time = pd.DataFrame({"Author":authors,"Frequency":frequencies,"Inactivity_Inertia":inertias})
df_features = pd.merge(df_features,df_time,how="left",on=["Author"])

In [35]:
df_features.head()

Unnamed: 0,Author,Out degree,Katz centr,Frequency,Inactivity_Inertia
0,"""Європейська Україна""",2,0.007028,8.1e-05,184970700.0
1,0512.com.ua - сайт города Николаева,1,0.006389,7.6e-05,181463900.0
2,"0532.ua - сайт міста Полтави - новини, афіша, ...",1,0.006389,7.7e-05,169125800.0
3,"05366.com.ua - новости, афиша, справочник пред...",1,0.006389,0.000676,31125000.0
4,0564.ua - Сайт города Кривого Рога,2,0.007028,0.000429,78933940.0


In [36]:
df_features["Inactivity_Inertia"] = MinMaxScaler().fit_transform(df_features["Inactivity_Inertia"].values.reshape(-1,1))

In [37]:
df_features.head()

Unnamed: 0,Author,Out degree,Katz centr,Frequency,Inactivity_Inertia
0,"""Європейська Україна""",2,0.007028,8.1e-05,0.226993
1,0512.com.ua - сайт города Николаева,1,0.006389,7.6e-05,0.222689
2,"0532.ua - сайт міста Полтави - новини, афіша, ...",1,0.006389,7.7e-05,0.207548
3,"05366.com.ua - новости, афиша, справочник пред...",1,0.006389,0.000676,0.038196
4,0564.ua - Сайт города Кривого Рога,2,0.007028,0.000429,0.096866


In [17]:
# Posts similarity


def cosine(u, v):
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))

def docs_preprocess(texts):
    tokenized = []
    for text in tqdm(texts):
        tokenized.append(word_tokenize(str(text).lower()))
    
    return tokenized
    
def getSimilarity(data,model):
    vectors = []
    for text in data:
        preprocessed = word_tokenize(str(text).lower())
        vectors.append(model.infer_vector(preprocessed))
    similarities = []
    for i in range(len(vectors)):
        for j in range(i,len(vectors)):
            similarities.append(cosine(vectors[i],vectors[j]))
    return np.mean(similarities)


In [18]:
print("Training model...")
data = df["Text"]
texts = docs_preprocess(data)
print("Preprocessed")
tagged_data = [TaggedDocument(d, [i]) for i, d in enumerate(texts)]
print("Tagged")
modelDoc = Doc2Vec(tagged_data, vector_size = 20, window = 2, min_count = 1, epochs = 10)
print("Completed")

Training model...


100%|████████████████████████████████████████████████████████████████████████| 152051/152051 [01:13<00:00, 2059.12it/s]


Preprocessed
Tagged
Completed


In [19]:
authors = []
similarities = []
print("Compute similarities...")
groups = df[["Author","Text"]].groupby(["Author"])
for group in tqdm(groups):
    authors.append(group[0])
    s = getSimilarity(group[1]["Text"],modelDoc)
    similarities.append(s)

Compute similarities...


100%|████████████████████████████████████████████████████████████████████████████| 11058/11058 [04:19<00:00, 42.67it/s]


In [38]:
df_sim = pd.DataFrame({"Author":authors,"Posts similarity":similarities})
df_features = pd.merge(df_features,df_sim,how="left",on=["Author"])
df_features.head()

Unnamed: 0,Author,Out degree,Katz centr,Frequency,Inactivity_Inertia,Posts similarity
0,"""Європейська Україна""",2,0.007028,8.1e-05,0.226993,0.680074
1,0512.com.ua - сайт города Николаева,1,0.006389,7.6e-05,0.222689,0.534414
2,"0532.ua - сайт міста Полтави - новини, афіша, ...",1,0.006389,7.7e-05,0.207548,0.595434
3,"05366.com.ua - новости, афиша, справочник пред...",1,0.006389,0.000676,0.038196,0.318691
4,0564.ua - Сайт города Кривого Рога,2,0.007028,0.000429,0.096866,0.544309


In [21]:
# Targets mean number of subscribers
# Authors number of subscribers

authors = []
target_subscribers = []
subscribers = []
groups = df[["Author","Subscribers","Target Subscribers"]].groupby(["Author"])
for group in groups:
    authors.append(group[0])
    target_subscribers.append(np.mean(group[1]["Target Subscribers"])+1)
    subscribers.append(np.mean(group[1]["Subscribers"])+1)

In [39]:
df_subs = pd.DataFrame({"Author":authors,"Target Subscribers":np.log10(target_subscribers),"Subscribers":np.log10(subscribers)})
df_features = pd.merge(df_features,df_subs,how="left",on=["Author"])

In [40]:
df_features

Unnamed: 0,Author,Out degree,Katz centr,Frequency,Inactivity_Inertia,Posts similarity,Target Subscribers,Subscribers
0,"""Європейська Україна""",2,0.007028,0.000081,0.226993,0.680074,4.028149,3.569140
1,0512.com.ua - сайт города Николаева,1,0.006389,0.000076,0.222689,0.534414,4.196425,4.196425
2,"0532.ua - сайт міста Полтави - новини, афіша, ...",1,0.006389,0.000077,0.207548,0.595434,4.144419,4.144419
3,"05366.com.ua - новости, афиша, справочник пред...",1,0.006389,0.000676,0.038196,0.318691,3.679519,3.679519
4,0564.ua - Сайт города Кривого Рога,2,0.007028,0.000429,0.096866,0.544309,4.532544,4.539139
...,...,...,...,...,...,...,...,...
11053,сергей ахметов,1,0.006389,0.000212,0.155005,0.443194,3.414137,3.414137
11054,татьяна кучукова,1,0.006389,0.000137,0.123643,0.729746,1.892095,1.892095
11055,яков дядяев,1,0.006389,0.000229,0.069707,0.567366,2.271842,2.271842
11056,ірина кондур,1,0.006389,0.000209,0.018872,0.803657,0.000000,0.000000


In [41]:
# Mean Emotional context

authors = []
emotion = []
groups = df[["Author","Emotional context"]].groupby(["Author"])
for group in groups:
    authors.append(group[0])
    emotion.append(np.mean(group[1]["Emotional context"]))
    
df_emo = pd.DataFrame({"Author":authors,"Emotion":emotion})
df_features = pd.merge(df_features,df_emo,how="left",on=["Author"])
df_features.head()

Unnamed: 0,Author,Out degree,Katz centr,Frequency,Inactivity_Inertia,Posts similarity,Target Subscribers,Subscribers,Emotion
0,"""Європейська Україна""",2,0.007028,8.1e-05,0.226993,0.680074,4.028149,3.56914,0.375
1,0512.com.ua - сайт города Николаева,1,0.006389,7.6e-05,0.222689,0.534414,4.196425,4.196425,0.142857
2,"0532.ua - сайт міста Полтави - новини, афіша, ...",1,0.006389,7.7e-05,0.207548,0.595434,4.144419,4.144419,0.142857
3,"05366.com.ua - новости, афиша, справочник пред...",1,0.006389,0.000676,0.038196,0.318691,3.679519,3.679519,0.375
4,0564.ua - Сайт города Кривого Рога,2,0.007028,0.000429,0.096866,0.544309,4.532544,4.539139,0.217391


In [25]:
# Visiting coefficient

authors = []
selfposts = []
outposts = []
totals = []
groups = df[["Author","Target"]].groupby(["Author"])
for g in tqdm(groups):
    authors.append(g[0])
    total = len(g[1])
    self = 0
    out = 0
    for pair in g[1].iterrows():
        if pair[1]["Author"] == pair[1]["Target"]:
            self += 1 
        else:
            out += 1
    selfposts.append(self)
    outposts.append(out)
    totals.append(total)


100%|██████████████████████████████████████████████████████████████████████████| 11058/11058 [00:06<00:00, 1638.80it/s]


In [42]:
df_posts = pd.DataFrame({"Author":authors,"Total Posts":totals,"Selfposts":selfposts,"Outposts":outposts})
df_features = pd.merge(df_features,df_posts,how="left",on=["Author"])
df_features

Unnamed: 0,Author,Out degree,Katz centr,Frequency,Inactivity_Inertia,Posts similarity,Target Subscribers,Subscribers,Emotion,Total Posts,Selfposts,Outposts
0,"""Європейська Україна""",2,0.007028,0.000081,0.226993,0.680074,4.028149,3.569140,0.375000,8,7,1
1,0512.com.ua - сайт города Николаева,1,0.006389,0.000076,0.222689,0.534414,4.196425,4.196425,0.142857,7,7,0
2,"0532.ua - сайт міста Полтави - новини, афіша, ...",1,0.006389,0.000077,0.207548,0.595434,4.144419,4.144419,0.142857,7,7,0
3,"05366.com.ua - новости, афиша, справочник пред...",1,0.006389,0.000676,0.038196,0.318691,3.679519,3.679519,0.375000,32,32,0
4,0564.ua - Сайт города Кривого Рога,2,0.007028,0.000429,0.096866,0.544309,4.532544,4.539139,0.217391,23,22,1
...,...,...,...,...,...,...,...,...,...,...,...,...
11053,сергей ахметов,1,0.006389,0.000212,0.155005,0.443194,3.414137,3.414137,-0.260870,23,23,0
11054,татьяна кучукова,1,0.006389,0.000137,0.123643,0.729746,1.892095,1.892095,-0.444444,9,9,0
11055,яков дядяев,1,0.006389,0.000229,0.069707,0.567366,2.271842,2.271842,-0.071429,14,14,0
11056,ірина кондур,1,0.006389,0.000209,0.018872,0.803657,0.000000,0.000000,0.500000,6,6,0


In [44]:
df_features["Visiting coefficient"] = df_features["Outposts"]/(df_features["Outposts"]+df_features["Selfposts"])
df_features = df_features.drop(["Selfposts","Outposts"],axis=1)
df_features

Unnamed: 0,Author,Out degree,Katz centr,Frequency,Inactivity_Inertia,Posts similarity,Target Subscribers,Subscribers,Emotion,Total Posts,Commentary percentage,Visiting coefficient
0,"""Європейська Україна""",2,0.007028,0.000081,0.226993,0.680074,4.028149,3.569140,0.375000,8,0.000000,0.125000
1,0512.com.ua - сайт города Николаева,1,0.006389,0.000076,0.222689,0.534414,4.196425,4.196425,0.142857,7,0.000000,0.000000
2,"0532.ua - сайт міста Полтави - новини, афіша, ...",1,0.006389,0.000077,0.207548,0.595434,4.144419,4.144419,0.142857,7,0.142857,0.000000
3,"05366.com.ua - новости, афиша, справочник пред...",1,0.006389,0.000676,0.038196,0.318691,3.679519,3.679519,0.375000,32,0.000000,0.000000
4,0564.ua - Сайт города Кривого Рога,2,0.007028,0.000429,0.096866,0.544309,4.532544,4.539139,0.217391,23,0.000000,0.043478
...,...,...,...,...,...,...,...,...,...,...,...,...
11053,сергей ахметов,1,0.006389,0.000212,0.155005,0.443194,3.414137,3.414137,-0.260870,23,0.000000,0.000000
11054,татьяна кучукова,1,0.006389,0.000137,0.123643,0.729746,1.892095,1.892095,-0.444444,9,0.000000,0.000000
11055,яков дядяев,1,0.006389,0.000229,0.069707,0.567366,2.271842,2.271842,-0.071429,14,0.071429,0.000000
11056,ірина кондур,1,0.006389,0.000209,0.018872,0.803657,0.000000,0.000000,0.500000,6,0.000000,0.000000


In [31]:
# Commentary percentage
authors = []
commentary_perc = []
groups = df[["Author","Type"]].groupby(["Author"])
for group in groups:
    authors.append(group[0])
    total = len(group[1])
    com = len(group[1].loc[group[1]["Type"]=="Comment"])
    commentary_perc.append(com/total)


In [43]:
df_com = pd.DataFrame({"Author":authors,"Commentary percentage":commentary_perc})
df_features = pd.merge(df_features,df_com,how="left",on=["Author"])
df_features

Unnamed: 0,Author,Out degree,Katz centr,Frequency,Inactivity_Inertia,Posts similarity,Target Subscribers,Subscribers,Emotion,Total Posts,Selfposts,Outposts,Commentary percentage
0,"""Європейська Україна""",2,0.007028,0.000081,0.226993,0.680074,4.028149,3.569140,0.375000,8,7,1,0.000000
1,0512.com.ua - сайт города Николаева,1,0.006389,0.000076,0.222689,0.534414,4.196425,4.196425,0.142857,7,7,0,0.000000
2,"0532.ua - сайт міста Полтави - новини, афіша, ...",1,0.006389,0.000077,0.207548,0.595434,4.144419,4.144419,0.142857,7,7,0,0.142857
3,"05366.com.ua - новости, афиша, справочник пред...",1,0.006389,0.000676,0.038196,0.318691,3.679519,3.679519,0.375000,32,32,0,0.000000
4,0564.ua - Сайт города Кривого Рога,2,0.007028,0.000429,0.096866,0.544309,4.532544,4.539139,0.217391,23,22,1,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
11053,сергей ахметов,1,0.006389,0.000212,0.155005,0.443194,3.414137,3.414137,-0.260870,23,23,0,0.000000
11054,татьяна кучукова,1,0.006389,0.000137,0.123643,0.729746,1.892095,1.892095,-0.444444,9,9,0,0.000000
11055,яков дядяев,1,0.006389,0.000229,0.069707,0.567366,2.271842,2.271842,-0.071429,14,14,0,0.071429
11056,ірина кондур,1,0.006389,0.000209,0.018872,0.803657,0.000000,0.000000,0.500000,6,6,0,0.000000


### Filtering

In [34]:
#df_filtered = df_features.loc[df_features["Out degree"] > 2]
#df_filtered

Unnamed: 0,Author,Out degree,Katz centr,Frequency,Inactivity_Inertia,Posts similarity,Target Subscribers,Subscribers,Emotion,Total Posts,Visiting coefficient,Commentary percentage
25,Abramycheva Olga,4,0.008178,0.000154,0.033380,0.551113,4.945640,2.037426,0.333333,6,1.000000,1.000000
28,Advocat Oleh Leontyev,4,0.009000,0.000150,0.121900,0.311276,3.935343,3.449941,-0.125000,8,0.625000,0.625000
29,Євген Магда,4,0.008590,0.000376,0.111423,0.467046,4.228020,4.289901,-0.230769,26,0.153846,0.615385
30,Артур Нискубин,5,0.009158,0.000122,0.092905,0.518000,4.163293,4.021933,0.000000,7,0.714286,0.714286
31,Al Kopt,3,0.007539,0.000133,0.231999,0.434548,5.637169,0.000000,-0.214286,14,1.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
11028,Ярослав Иванченко,6,0.009328,0.000094,0.112010,0.439195,5.973920,2.693727,-0.428571,7,1.000000,1.000000
11030,Ярослав Коваль,5,0.008952,0.000155,0.101151,0.511693,4.823154,0.000000,0.500000,8,1.000000,0.625000
11031,Ярослав Кондришин,3,0.007603,0.000278,0.294026,0.399642,5.562610,2.089905,-0.222222,18,1.000000,1.000000
11032,Ярослав Мудрий,9,0.010925,0.000140,0.020738,0.863573,4.851476,1.106455,-0.555556,9,1.000000,0.111111


### Saving

In [45]:
df_features.to_excel("facebook_features.xlsx")