In [9]:
import numpy as np
import scipy as sps
import pandas as pd
from gensim import utils
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

# Load dataset

In [35]:
data = pd.read_csv("dataset/csv_dataset_with_locations_new.txt", sep="|", dtype={"full_text": str, "formatted_text": str, "event_id":str})
data['new_index'] = data.index


In [139]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 580898 entries, 0 to 580897
Data columns (total 21 columns):
id                       580898 non-null int64
full_text                580898 non-null object
formatted_text           579158 non-null object
event_id                 580898 non-null object
created_at               580898 non-null object
user_id                  580898 non-null int64
in_reply_to_status_id    16003 non-null float64
hash_tags                99068 non-null object
user_mentions            119790 non-null object
retweet_count            580898 non-null int64
favorite_count           580898 non-null int64
possibly_sensitive       580898 non-null bool
place_name               5931 non-null object
place_type               5931 non-null object
country_code             5926 non-null object
coordinates              5931 non-null object
is_fake                  580898 non-null int64
new_index                580898 non-null int64
cleaned_created_at       580898 non-null f

# Cleanup timings (str to int) and generate partitions

In [37]:
from datetime import datetime
import datetime

import numpy as np
import pandas as pd
import scipy.sparse as sps
import matplotlib.pyplot as plt
import sklearn.cluster as skcluster


class FakeNewsDetector:
    def __init__(self, dataframe, article_id_col = "event_id", cleaned_timings_col = None):

        self.dataframe = dataframe
        
        self.dataframe[article_id_col] = self.dataframe[article_id_col].astype(str)#temporaneo, casto tutto in stringa
        
        self.article_id_col = article_id_col
        self.cleaned_timings_col = ""
        if cleaned_timings_col is None:
            self.cleaned_timings_col = "cleaned_created_at"
            self.generate_clean_timings()
        else:
            self.cleaned_timings_col = cleaned_timings_col


    def generate_clean_timings(self, source_col = "created_at", date_parse_str = "%a %b %d %H:%M:%S +%f %Y"):
        #print(self.dataframe[source_col])
        self.dataframe[self.cleaned_timings_col] = self.dataframe[source_col].apply(lambda x: (datetime.datetime.strptime(x,date_parse_str) - datetime.datetime.utcfromtimestamp(0)).total_seconds() * 1000)

    def get_time_article(self, id_article):
        arr_times = self.dataframe[self.dataframe[self.article_id_col] == id_article][self.cleaned_timings_col].values
        min_date = min(arr_times)
        arr_times = arr_times - min_date
        return arr_times
    
    def partition (self, sequence, eps = 86400000):
        min_date = min(sequence)
        sequence = sequence - min_date
        labels = []
        for item in sequence:
            labels+=[item//eps]
        return labels

    def partition_table(self, new_col = "partition", eps = 86400000):
        self.dataframe[new_col] = self.dataframe.groupby("event_id")["cleaned_created_at"].transform(lambda x: self.partition(x, eps = eps))

    def calculate_delta_t(self, delta_t_col = "delta_t"):
        arr_article_ids = self.dataframe[self.article_id_col].values
        #np.char.mod('%d', arr_article_ids) #temporaneo, casto tutto in stringa
        all_articles = np.unique(arr_article_ids)
        for i in range(len(all_articles)):#per tutti gli articoli
            
            article_id = all_articles[i]
            
            if i%20 == 0:
                print(i)
            sub_c = self.dataframe[self.dataframe[self.article_id_col] == article_id] #tweet di un articolo
            partition_list = np.sort(np.unique(sub_c["partition"].values))
            for j in range(len(partition_list)):#per tutte le partizioni (1,5,7,...)
                current_t = min(sub_c[sub_c["partition"] == partition_list[j]]["cleaned_created_at"])#minimo tempo tra elementi della stessa partizione
                result = 0
                if j > 0:
                    last_t = max(sub_c[sub_c["partition"] == partition_list[j-1]]["cleaned_created_at"])#massimo tempo partizione precedente
                    result = current_t - last_t
                self.dataframe.loc[((self.dataframe[self.article_id_col] == article_id) & (self.dataframe["partition"] == partition_list[j])), delta_t_col] = result

In [38]:
generate_patitions = FakeNewsDetector(data)
generate_patitions.partition_table(eps = 86400000)
generate_patitions.calculate_delta_t()

0
20
40
60
80
100
120
140
160
180
200
220
240
260
280
300
320
340
360
380
400
420
440
460
480
500
520
540
560
580
600
620
640
660
680
700
720
740
760
780
800
820
840
860
880
900
920
940
960
980
1000
1020
1040
1060
1080
1100
1120
1140
1160
1180
1200
1220
1240
1260
1280


### test delta_t

In [39]:

test = 3
check_min = min(data[(data["event_id"] == "E695") & (data["partition"] == test)]["cleaned_created_at"].values)
check_max = max(data[(data["event_id"] == "E695") & (data["partition"] == (test-1))]["cleaned_created_at"].values)
print(check_min - check_max)
data[(data["event_id"] == "E695") & (data["partition"] == test)][["cleaned_created_at", "delta_t"]].head()

24000.0


Unnamed: 0,cleaned_created_at,delta_t
344,1435691000000.0,24000.0
1101,1435677000000.0,24000.0
1116,1435721000000.0,24000.0
1583,1435682000000.0,24000.0
1792,1435683000000.0,24000.0


# Extract text features

## Removing dots etc...

In [29]:
txt = data["formatted_text"].values

txt[pd.isnull(txt)] = "" # remove NaN

from string import punctuation
def clean_text(arr_text):
    for i in range(len(arr_text)):
        tweet = arr_text[i]
        tweet = tweet.lower()
        tweet = tweet.translate(str.maketrans('', '', punctuation))
        arr_text[i] = tweet
        
clean_text(txt)


## Generate labels and input vectors

In [30]:
labels = list(range(len(txt))) 
source = []
for i in range(len(txt)):
    source+=[TaggedDocument(words=txt[i].split(), tags=[labels[i]])]

## Build & train Doc2Vec

In [31]:
model = Doc2Vec(min_count=0, window=10, alpha=0.025, vector_size=30, sample=1e-6, negative=3, workers=8) # vector_size: max 100
model.build_vocab(source)
model.train(source, total_examples=len(source), epochs=20)


### Test the model

In [32]:
model.docvecs[580897]#questa è la label!

array([-0.02908463,  0.0099935 ,  0.06560788, -0.01156352, -0.06217754,
       -0.01758923, -0.02365584, -0.05299353,  0.00040468,  0.00966332,
       -0.04592134,  0.01084215, -0.02583843,  0.03839766,  0.01072466,
       -0.03222507, -0.00795925,  0.06455176, -0.04706771, -0.03479892,
        0.04965467,  0.03074685, -0.040064  , -0.00859729, -0.00158696,
        0.02913547, -0.01197857, -0.01281282,  0.00706169,  0.00486092],
      dtype=float32)

In [33]:
model.most_similar('islam')

  if __name__ == '__main__':


[('ntb', 0.9535197019577026),
 ('middle', 0.9528024196624756),
 ('tennessee', 0.9401465654373169),
 ('forcing', 0.9359233379364014),
 ('“allah', 0.9323118925094604),
 ('god”', 0.930834174156189),
 ('districts', 0.9165898561477661),
 ('students', 0.9102245569229126),
 ('lessons', 0.9078390002250671),
 ('nonmuslim', 0.9063861966133118)]

### Save (and load the Doc2Vec model)

In [14]:
#model.save('./model_doc2vec.d2v')
#model = Doc2Vec.load('./model_doc2vec.d2v')

## Generate table text features

In [40]:
#GENERATE TWEET FEATURES DATA FRAME
list_text_features = []
for i in labels:
    indexed_arr = [i]+list(model.docvecs[i])
    list_text_features+=[indexed_arr]
    
n_features = len(list_text_features[0])-1 #escludo gli id
feature_text_tweet = pd.DataFrame(list_text_features, columns=["id"]+list(range(n_features)))

#JOIN
new_data = data.merge(feature_text_tweet, left_on='new_index', right_on='id')

In [44]:
new_data[["id_x", "id_y", "new_index"]].head()

Unnamed: 0,id_x,id_y,new_index
0,615868085090676737,0,0
1,14619605813,1,1
2,644875384622829568,2,2
3,647483888684498944,3,3
4,608681780812652544,4,4


# Generate tables with features

In [45]:
partiton_count = new_data.groupby(["event_id", "partition"])["id_x"].agg(['count']).reset_index()
delta_t = new_data[["event_id", "partition", "delta_t"]].drop_duplicates(["event_id", "partition"])
#model.docvecs[580897]#questa è la label!

text_f = new_data.groupby(["event_id", "partition"])[list(range(n_features))].agg(['mean']).stack().reset_index()#(level=2)

In [46]:
partiton_count.head()

Unnamed: 0,event_id,partition,count
0,100181373,0.0,1
1,100181373,5.0,1
2,100181373,6.0,1
3,100181373,31.0,5
4,100181373,33.0,1


In [47]:
delta_t.head()

Unnamed: 0,event_id,partition,delta_t
0,E695,2.0,229000.0
1,TM859,0.0,0.0
2,E391,1.0,67000.0
3,E391,8.0,162000.0
4,E768,3.0,24000.0


In [48]:
text_f.head()

Unnamed: 0,event_id,partition,level_2,0,1,2,3,4,5,6,...,20,21,22,23,24,25,26,27,28,29
0,100181373,0.0,mean,-0.022518,0.047505,0.092737,-0.045733,-0.059689,-0.038701,-0.010562,...,0.071207,0.018048,-0.035854,-0.026587,-0.027098,0.016102,-0.035582,0.013511,0.016985,0.001332
1,100181373,5.0,mean,-0.025221,0.026891,0.103486,-0.043659,-0.084777,-0.040175,-0.01829,...,0.059865,0.043241,-0.068486,-0.023676,-0.005514,0.011625,-0.039691,0.012825,0.01758,0.0196
2,100181373,6.0,mean,-0.014814,0.036065,0.042713,-0.0348,-0.042156,-0.028338,0.002866,...,0.020164,0.015986,-0.018272,-0.018493,-0.020942,0.027301,-0.033386,0.007139,0.004258,0.009764
3,100181373,31.0,mean,-0.013507,0.002556,0.026975,-0.006886,-0.016222,-0.002986,-0.006419,...,0.009164,-0.000853,-0.005217,-1.2e-05,0.000193,0.009592,-0.007266,-0.002209,0.002799,0.003681
4,100181373,33.0,mean,-0.023455,0.045197,0.167588,-0.039668,-0.116954,-0.050277,-0.044998,...,0.104689,0.034397,-0.074943,-0.042054,-0.031587,0.035794,-0.065205,0.007813,0.023701,-0.006918


# Generate merged feature table

In [49]:
final_features = partiton_count.merge(delta_t, left_on=["event_id", "partition"], right_on=["event_id", "partition"]).merge(text_f, left_on=["event_id", "partition"], right_on=["event_id", "partition"])
final_features = final_features.drop("level_2", 1)

In [53]:
final_features[final_features["event_id"] == "100181373"].head(10)

Unnamed: 0,event_id,partition,count,delta_t,0,1,2,3,4,5,...,20,21,22,23,24,25,26,27,28,29
0,100181373,0.0,1,0.0,-0.022518,0.047505,0.092737,-0.045733,-0.059689,-0.038701,...,0.071207,0.018048,-0.035854,-0.026587,-0.027098,0.016102,-0.035582,0.013511,0.016985,0.001332
1,100181373,5.0,1,467729000.0,-0.025221,0.026891,0.103486,-0.043659,-0.084777,-0.040175,...,0.059865,0.043241,-0.068486,-0.023676,-0.005514,0.011625,-0.039691,0.012825,0.01758,0.0196
2,100181373,6.0,1,84167000.0,-0.014814,0.036065,0.042713,-0.0348,-0.042156,-0.028338,...,0.020164,0.015986,-0.018272,-0.018493,-0.020942,0.027301,-0.033386,0.007139,0.004258,0.009764
3,100181373,31.0,5,2182941000.0,-0.013507,0.002556,0.026975,-0.006886,-0.016222,-0.002986,...,0.009164,-0.000853,-0.005217,-1.2e-05,0.000193,0.009592,-0.007266,-0.002209,0.002799,0.003681
4,100181373,33.0,1,107492000.0,-0.023455,0.045197,0.167588,-0.039668,-0.116954,-0.050277,...,0.104689,0.034397,-0.074943,-0.042054,-0.031587,0.035794,-0.065205,0.007813,0.023701,-0.006918
5,100181373,34.0,2,139282000.0,-0.027763,0.042889,0.095184,-0.029141,-0.091039,-0.034855,...,0.083066,0.040125,-0.041414,-0.022344,-0.021463,0.032813,-0.039732,-0.009077,0.022509,0.007685
6,100181373,35.0,4,42354000.0,-0.026991,0.027644,0.091878,-0.039281,-0.074368,-0.033329,...,0.070048,0.025091,-0.038769,-0.020834,-0.012634,0.024344,-0.028386,0.005637,0.013111,0.008937
7,100181373,36.0,1,57283000.0,-0.025648,0.002411,0.034441,-0.023418,-0.018129,-0.02041,...,0.022113,0.026436,-0.004847,0.00227,-0.009337,-2.3e-05,0.001549,-0.014719,-0.007308,0.006606
8,100181373,43.0,11,578918000.0,-0.02917,0.044081,0.12485,-0.04346,-0.095813,-0.038909,...,0.081424,0.039304,-0.053413,-0.030209,-0.01545,0.029832,-0.04097,-0.004571,0.009589,0.008217
9,100181373,44.0,2,46526000.0,-0.019288,0.029581,0.078644,-0.031221,-0.076826,-0.02308,...,0.067931,0.029459,-0.029845,-0.02132,-0.014597,0.018471,-0.025617,-0.005995,0.001518,-0.002153


## re-mapping event_id

In [69]:
article_id_conversion = final_features["event_id"].drop_duplicates().reset_index().reset_index()[["level_0", "event_id"]]
article_id_conversion.columns = ['new_article_id', 'event_id']
x_table = final_features.merge(article_id_conversion, left_on = "event_id", right_on="event_id")
x_table.head()


Unnamed: 0,event_id,partition,count,delta_t,0,1,2,3,4,5,...,21,22,23,24,25,26,27,28,29,new_article_id
0,100181373,0.0,1,0.0,-0.022518,0.047505,0.092737,-0.045733,-0.059689,-0.038701,...,0.018048,-0.035854,-0.026587,-0.027098,0.016102,-0.035582,0.013511,0.016985,0.001332,0
1,100181373,5.0,1,467729000.0,-0.025221,0.026891,0.103486,-0.043659,-0.084777,-0.040175,...,0.043241,-0.068486,-0.023676,-0.005514,0.011625,-0.039691,0.012825,0.01758,0.0196,0
2,100181373,6.0,1,84167000.0,-0.014814,0.036065,0.042713,-0.0348,-0.042156,-0.028338,...,0.015986,-0.018272,-0.018493,-0.020942,0.027301,-0.033386,0.007139,0.004258,0.009764,0
3,100181373,31.0,5,2182941000.0,-0.013507,0.002556,0.026975,-0.006886,-0.016222,-0.002986,...,-0.000853,-0.005217,-1.2e-05,0.000193,0.009592,-0.007266,-0.002209,0.002799,0.003681,0
4,100181373,33.0,1,107492000.0,-0.023455,0.045197,0.167588,-0.039668,-0.116954,-0.050277,...,0.034397,-0.074943,-0.042054,-0.031587,0.035794,-0.065205,0.007813,0.023701,-0.006918,0


In [124]:
x_table[x_table["new_article_id"] == 1]

Unnamed: 0,event_id,partition,count,delta_t,0,1,2,3,4,5,...,21,22,23,24,25,26,27,28,29,new_article_id
31,1006199560,0,3,0.0,-0.003904,0.024791,0.06028,-0.01601,-0.035355,-0.012838,...,0.007661,-0.01556,-0.014516,-0.00442,0.014011,-0.016396,-0.005914,0.015989,0.007306,1
32,1006199560,1,1,50643000.0,-0.013822,0.03138,0.090787,-0.026124,-0.056607,-0.025736,...,0.021347,-0.053987,-0.00824,-0.000159,0.017153,-0.036844,0.006291,-0.000648,-0.001821,1
33,1006199560,2,2,122161000.0,-0.062233,0.086082,0.234532,-0.06719,-0.187845,-0.067669,...,0.079697,-0.112876,-0.047116,-0.047334,0.06224,-0.076668,0.006226,0.043952,0.002654,1


## getting n.steps

In [116]:
n_steps = int(max(x_table["partition"]))+1

# Now generate input matrix for RNN.

## This matrix will be X matrix (samples = #events, features = text_features + 2, input_steps)

In [117]:
x_matrix = np.zeros((max(x_table["new_article_id"].values)+1, n_features+2 , n_steps))

In [134]:
x_matrix.shape


(1296, 32, 2393)

In [136]:
test = x_table[x_table["new_article_id"] == 0][["partition", "count"]] # partition, count
x_table["partition"] = x_table["partition"].astype(int)
print(test.head())
x_matrix[x_table["new_article_id"].values, 0, x_table["partition"].values] = x_table["count"].values # sample 0, feature 0 = count x partition
x_matrix[x_table["new_article_id"].values, 1, x_table["partition"].values] = x_table["delta_t"].values

for i in range (n_features):
    #print(i)
    x_matrix[x_table["new_article_id"].values, int(i+2), x_table["partition"].values] = x_table[i].values



   partition  count
0          0      1
1          5      1
2          6      1
3         31      5
4         33      1


In [149]:
np.save("x_matrix.npy", x_matrix)

# Generate y matrix

In [150]:
y_table = data[["event_id", "is_fake"]].merge(x_table[["new_article_id", "event_id"]], left_on="event_id", right_on="event_id").drop_duplicates()

In [151]:
y_matrix = np.zeros((max(y_table["new_article_id"].values)+1, 1))

In [153]:
y_matrix[y_table["new_article_id"].values, 0] = y_table["is_fake"].values

In [154]:
np.save("y_matrix.npy", y_matrix)

In [158]:
y_matrix.shape[0]

1296

# Split train-test

In [161]:
row_index = np.arange(0, y_matrix.shape[0])
np.random.shuffle(row_index)

ind_sep_train_test = int(len(row_index)*0.8)
train_indexes = row_index[:ind_sep_train_test]
test_indexes = row_index[ind_sep_train_test:]

x_train = x_matrix[train_indexes,:,:]
x_test = x_matrix[test_indexes,:,:]
y_train = y_matrix[train_indexes,:]
y_test = y_matrix[test_indexes,:]


In [164]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

np.save("x_train.npy", x_train)
np.save("x_test.npy", x_test)
np.save("y_train.npy", y_train)
np.save("y_test.npy", y_test)


(1036, 32, 2393)
(260, 32, 2393)
(1036, 1)
(260, 1)
