In [1]:
import numpy as np
import scipy as sps
import pandas as pd
from gensim import utils
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

# Load dataset

In [2]:
# Pipe-separated tweet export; the text and event-id columns are read as strings
data = pd.read_csv(
    "dataset/csv_dataset_with_locations_new.txt",
    sep="|",
    dtype={"full_text": str, "formatted_text": str, "event_id": str},
)

## re-mapping user and events

In [3]:
# Sorted unique ids define a stable, dense 0..n-1 numbering for events and users.
list_event_id = sorted(set(data["event_id"]))
list_user_id = sorted(set(data["user_id"]))

# Dict lookups replace list.index(), which rescanned the whole id list for every row.
event_position = {event_id: position for position, event_id in enumerate(list_event_id)}
user_position = {user_id: position for position, user_id in enumerate(list_user_id)}

data["new_event_id"] = data["event_id"].map(event_position).astype("int64")
print("1 of 2")
data["new_user_id"] = data["user_id"].map(user_position).astype("int64")
print("2 of 2")
# Row label of each tweet, stored as a regular column
data['new_id'] = data.index

1 of 2
2 of 2


In [14]:
# Summarize column dtypes and non-null counts of the tweet table
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 580898 entries, 0 to 580897
Data columns (total 23 columns):
id                       580898 non-null int64
full_text                580898 non-null object
formatted_text           579158 non-null object
event_id                 580898 non-null object
created_at               580898 non-null object
user_id                  580898 non-null int64
in_reply_to_status_id    16003 non-null float64
hash_tags                99068 non-null object
user_mentions            119790 non-null object
retweet_count            580898 non-null int64
favorite_count           580898 non-null int64
possibly_sensitive       580898 non-null bool
place_name               5931 non-null object
place_type               5931 non-null object
country_code             5926 non-null object
coordinates              5931 non-null object
is_fake                  580898 non-null int64
new_event_id             580898 non-null int64
new_user_id              580898 non-null i

In [5]:
# Preview the first rows, including the new_event_id / new_user_id / new_id columns
data.head(3)

Unnamed: 0,id,full_text,formatted_text,event_id,created_at,user_id,in_reply_to_status_id,hash_tags,user_mentions,retweet_count,favorite_count,possibly_sensitive,place_name,place_type,country_code,coordinates,is_fake,new_event_id,new_user_id,new_id
0,615868085090676737,"Walmart bakes ISIS cake, rejects Confederate f...","Walmart bakes ISIS cake, rejects Confederate f...",E695,Tue Jun 30 13:02:48 +0000 2015,1643045892,,,,0,0,False,,,,,0,947,193707,0
1,14619605813,UK bans doctor who linked autism to vaccine: B...,UK bans doctor who linked autism to vaccine: B...,TM859,Mon May 24 12:23:36 +0000 2010,124779476,,,,0,0,False,,,,,0,1273,106919,1
2,644875384622829568,Doritos unveils rainbow chips to support LGBT ...,Doritos unveils rainbow chips to support LGBT ...,E391,Fri Sep 18 14:07:27 +0000 2015,22970986,,,,3,6,False,,,,,0,623,44897,2


# Cleanup timings (str to int) and generate partitions

In [6]:
from datetime import datetime
import datetime

import numpy as np
import pandas as pd
import scipy.sparse as sps
import matplotlib.pyplot as plt
import sklearn.cluster as skcluster


class FakeNewsDetector:
    """Adds per-article time partitions and inter-partition gaps to a tweet table.

    The instance keeps a reference to the DataFrame it is given and writes its
    results into that same frame as new columns (timings, partition labels,
    delta_t), so the caller's frame is modified in place.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Tweet table. Must contain ``article_id_col`` and either a raw
        ``created_at`` column or an existing numeric timings column.
    article_id_col : str
        Column identifying the article/event each tweet belongs to.
    cleaned_timings_col : str or None
        Name of an existing numeric timestamp column. When ``None`` the column
        ``"cleaned_created_at"`` is generated from ``"created_at"``.
    """

    def __init__(self, dataframe, article_id_col = "new_event_id", cleaned_timings_col = None):
        self.dataframe = dataframe

        # Temporary workaround kept from the original code: ids are cast to str.
        # NOTE: this changes the dtype of the column in the caller's frame too.
        self.dataframe[article_id_col] = self.dataframe[article_id_col].astype(str)

        self.article_id_col = article_id_col
        if cleaned_timings_col is None:
            self.cleaned_timings_col = "cleaned_created_at"
            self.generate_clean_timings()
        else:
            self.cleaned_timings_col = cleaned_timings_col

    def generate_clean_timings(self, source_col = "created_at", date_parse_str = "%a %b %d %H:%M:%S +%f %Y"):
        """Parse ``source_col`` strings into milliseconds since 1970-01-01.

        The result is written to ``self.cleaned_timings_col``. The default
        format consumes the digits after ``+`` with ``%f``, so no timezone
        offset is applied to the parsed value.
        """
        # Same naive value utcfromtimestamp(0) produced, built once instead of per row.
        epoch = datetime.datetime(1970, 1, 1)
        self.dataframe[self.cleaned_timings_col] = self.dataframe[source_col].apply(
            lambda stamp: (datetime.datetime.strptime(stamp, date_parse_str) - epoch).total_seconds() * 1000
        )

    def get_time_article(self, id_article):
        """Return the timings of one article, shifted so the earliest is 0.

        ``id_article`` is compared as a string because ``__init__`` stores the
        ids as strings. Raises ``ValueError`` when no row matches.
        """
        mask = self.dataframe[self.article_id_col] == str(id_article)
        arr_times = self.dataframe.loc[mask, self.cleaned_timings_col].values
        if len(arr_times) == 0:
            raise ValueError("no rows found for article id {!r}".format(id_article))
        return arr_times - min(arr_times)

    def partition (self, sequence, eps = 86400000):
        """Label each timestamp with the index of its ``eps``-wide window.

        Windows are counted from the smallest value in ``sequence``; the labels
        are returned as a list in the input order.
        """
        values = np.asarray(sequence)
        offsets = values - values.min()
        return (offsets // eps).tolist()

    def partition_table(self, new_col = "partition", eps = 86400000):
        """Write per-article window labels into column ``new_col``.

        For every article the label is ``(t - earliest t of the article) // eps``,
        which is what ``partition`` computes for a single article.
        """
        times = self.dataframe[self.cleaned_timings_col]
        first_seen = times.groupby(self.dataframe[self.article_id_col]).transform("min")
        self.dataframe[new_col] = (times - first_seen) // eps

    def calculate_delta_t(self, delta_t_col = "delta_t", partition_col = "partition"):
        """Write the gap to the previous partition into column ``delta_t_col``.

        For each (article, partition) pair the value is the earliest timing in
        that partition minus the latest timing in the preceding existing
        partition of the same article; the first partition of an article gets 0.
        Every row of a partition receives the same value.
        """
        keys = [self.article_id_col, partition_col]
        # groupby sorts by key, so partitions are ascending within each article
        bounds = self.dataframe.groupby(keys)[self.cleaned_timings_col].agg(["min", "max"])
        # latest timing of the previous partition of the same article (NaN for the first)
        prev_max = bounds["max"].groupby(level=0).shift(1)
        gaps = (bounds["min"] - prev_max).where(prev_max.notna(), 0).rename(delta_t_col)
        # left join keeps the row order of the frame, so values can be assigned positionally
        per_row = self.dataframe[keys].join(gaps, on=keys)[delta_t_col]
        self.dataframe[delta_t_col] = per_row.values

In [7]:
# Build the detector on the tweet table, then label 12-hour windows
# (12 h * 60 min * 60 s * 1000 ms) and compute the gaps between them.
generate_patitions = FakeNewsDetector(data)
generate_patitions.partition_table(eps = 12 * 60 * 60 * 1000)
generate_patitions.calculate_delta_t()

0
20
40
60
80
100
120
140
160
180
200
220
240
260
280
300
320
340
360
380
400
420
440
460
480
500
520
540
560
580
600
620
640
660
680
700
720
740
760
780
800
820
840
860
880
900
920
940
960
980
1000
1020
1040
1060
1080
1100
1120
1140
1160
1180
1200
1220
1240
1260
1280


### test delta_t

In [17]:
# FakeNewsDetector.__init__ casts "new_event_id" to str in place on `data`;
# convert it back so the integer comparison below (and later integer indexing
# by event id) works on a fresh top-to-bottom run.
data["new_event_id"] = pd.to_numeric(data["new_event_id"])

data[data["new_event_id"] == 0].head(3)

Unnamed: 0,id,full_text,formatted_text,event_id,created_at,user_id,in_reply_to_status_id,hash_tags,user_mentions,retweet_count,...,place_type,country_code,coordinates,is_fake,new_event_id,new_user_id,new_id,cleaned_created_at,partition,delta_t
539675,689475536423260160,That's Why You Gotta Be Careful Who You Lay Wi...,That's Why You Gotta Be Careful Who You Lay Wi...,100181373,Tue Jan 19 15:52:32 +0000 2016,161564227,,Cheaters,,0,...,,,,1,0,115408,539675,1453219000000.0,686.0,31839000.0
540080,580893330256437249,New York Man Wanted For Infecting 240 Men And ...,New York Man Wanted For Infecting 240 Men And ...,100181373,Thu Mar 26 00:45:37 +0000 2015,29465312,,,,0,...,,,,1,0,56616,540080,1427331000000.0,87.0,2997000.0
540686,839715245602009090,New York Man Wanted For Infecting 240 Men And ...,New York Man Wanted For Infecting 240 Men And ...,100181373,Thu Mar 09 05:51:30 +0000 2017,480673809,,,,0,...,,,,1,0,158587,540686,1489039000000.0,1515.0,6763000.0


In [18]:

# Manual spot check of delta_t for event 0: earliest tweet of partition `test`
# minus latest tweet of partition `test - 1` (that partition must exist for this event).
test = 87
event_rows = data["new_event_id"] == 0
in_current = event_rows & (data["partition"] == test)
in_previous = event_rows & (data["partition"] == (test-1))
check_min = min(data[in_current]["cleaned_created_at"].values)
check_max = max(data[in_previous]["cleaned_created_at"].values)
print(check_min - check_max)
data[in_current][["cleaned_created_at", "delta_t"]].head()

2997000.0


Unnamed: 0,cleaned_created_at,delta_t
540080,1427331000000.0,2997000.0
541950,1427320000000.0,2997000.0
542712,1427338000000.0,2997000.0
545676,1427325000000.0,2997000.0
546972,1427332000000.0,2997000.0


# Extract user - article matrix

## Extracting SVD decomposition for each user

In [19]:
import scipy.sparse as sps
from scipy import sparse
# Row (user) and column (article) coordinates of the interaction matrix.
# `np.int` was removed in NumPy 1.24; the builtin `int` is the type it aliased.
arr_articles = data["new_event_id"].values.astype(int)
arr_users = data["new_user_id"].values.astype(int)
# One entry of weight 1 per tweet; repeated (user, article) pairs are summed
# when the COO matrix is converted to CSR.
ones = np.ones(len(arr_users))
matrix = sps.coo_matrix((ones, (arr_users, arr_articles)))

In [20]:
from sklearn.decomposition import TruncatedSVD

N_SVD_FEATURES = 20

# The decomposition is run on the CSR form of the user x article matrix
matrix = matrix.tocsr()

svd = TruncatedSVD(
    n_components=N_SVD_FEATURES,
    n_iter=6,
    random_state=42,
)
# One row of N_SVD_FEATURES components per matrix row (i.e. per new_user_id)
svd_matrix = svd.fit_transform(matrix)

In [21]:
# Build the user feature table: [row index of svd_matrix, component 0, ..., component N-1]
list_f = [[user_row] + list(components) for user_row, components in enumerate(svd_matrix)]

svd_column_names = ["to_drop"] + list(range(svd_matrix.shape[1]))
svd_features = pd.DataFrame(list_f, columns=svd_column_names)

# Attach each user's components to that user's tweets, then discard the helper key
data_with_svd = data.merge(svd_features, left_on='new_user_id', right_on='to_drop').drop(['to_drop'], axis=1)
data_with_svd.head(2)

Unnamed: 0,id,full_text,formatted_text,event_id,created_at,user_id,in_reply_to_status_id,hash_tags,user_mentions,retweet_count,...,10,11,12,13,14,15,16,17,18,19
0,615868085090676737,"Walmart bakes ISIS cake, rejects Confederate f...","Walmart bakes ISIS cake, rejects Confederate f...",E695,Tue Jun 30 13:02:48 +0000 2015,1643045892,,,,0,...,-0.521783,-0.004987,-2.108909,1.558319,-0.096077,-0.436899,0.528864,0.037603,0.056805,0.108205
1,662248078741671936,WAR ON CHRISTMAS: STARBUCKS RED CUPS ARE EMBLE...,WAR ON CHRISTMAS: STARBUCKS RED CUPS ARE EMBLE...,E200,Thu Nov 05 12:40:20 +0000 2015,1643045892,,,,0,...,-0.521783,-0.004987,-2.108909,1.558319,-0.096077,-0.436899,0.528864,0.037603,0.056805,0.108205


# Extract text features

## Lowercasing and removing punctuation

In [22]:
# Array of tweet texts taken from the table. The merged tables shown later
# display the cleaned text in "formatted_text", so in-place edits to `txt`
# appear to write through to `data` — TODO confirm this aliasing on newer pandas.
txt = data["formatted_text"].values

txt[pd.isnull(txt)] = "" # replace missing texts with the empty string

from string import punctuation
def clean_text(arr_text):
    """Lowercase every string in ``arr_text`` and strip punctuation, in place.

    ``arr_text`` is any mutable, index-assignable sequence of ``str``. Only the
    characters in ``string.punctuation`` are removed; other symbols (for
    example typographic quotes) are kept. Returns ``None``.
    """
    # The deletion table does not depend on the tweet, so build it once
    strip_punctuation = str.maketrans('', '', punctuation)
    for i, tweet in enumerate(arr_text):
        arr_text[i] = tweet.lower().translate(strip_punctuation)
        
# Clean the tweet texts in place (the function returns nothing)
clean_text(txt)


## Generate labels and input vectors

In [23]:

# Each tweet becomes one document: whitespace-split tokens, tagged with its new_id
labels = data["new_id"].values
source = [
    TaggedDocument(words=tweet.split(), tags=[tag])
    for tweet, tag in zip(txt, labels)
]

## Build & train Doc2Vec

In [24]:
N_DOC2VEC_FEATURES = 40

# vector_size: max 100
# NOTE(review): no `seed` is passed, so the learned vectors are expected to differ between runs.
model = Doc2Vec(
    vector_size=N_DOC2VEC_FEATURES,
    window=7,
    min_count=0,
    sample=1e-6,
    negative=3,
    alpha=0.025,
    workers=8,
)
model.build_vocab(source)
model.train(source, total_examples=len(source), epochs=30)


### Test the model

In [25]:
# Look up one document vector by its tag; 580897 is the highest row label in the table
model.docvecs[580897]# this is the label (tag), not a position
#len(txt)

array([-0.05161712, -0.01940884, -0.05736948,  0.03357461,  0.05306949,
        0.0395308 ,  0.07221778,  0.02495591,  0.03662624,  0.00480322,
        0.04975189,  0.0147081 , -0.1440455 ,  0.04128249,  0.11933533,
        0.10271724,  0.03745944,  0.05100215,  0.00195038,  0.00053151,
        0.0480383 , -0.05257616, -0.06249733,  0.05969211,  0.05679905,
        0.1021726 , -0.00794677, -0.01027787,  0.00898543,  0.00906034,
       -0.04024748,  0.00313335,  0.0083389 ,  0.02280129, -0.02162123,
        0.03317396,  0.0423104 , -0.04267846,  0.04766091, -0.00706453],
      dtype=float32)

In [26]:
# Word-level neighbours live on the model's word vectors (`wv`); calling
# most_similar on the model itself is deprecated and absent in newer gensim releases.
model.wv.most_similar('islam')

  if __name__ == '__main__':


[('tennessee', 0.9555593729019165),
 ('middle', 0.9325387477874756),
 ('forcing', 0.9306639432907104),
 ('nonmuslim', 0.9230748414993286),
 ('“allah', 0.9220364093780518),
 ('‘millions’', 0.9207289814949036),
 ('reject', 0.9197089076042175),
 ('lesssons', 0.9188733100891113),
 ('darn', 0.9136744737625122),
 ('districts', 0.9113842248916626)]

### Save (and load) the Doc2Vec model

In [242]:
#model.save('./model_50_doc2vec.d2v')
#model = Doc2Vec.load('./model_doc2vec.d2v')

## Generate table text features

In [27]:
# Build the tweet text-feature table: [tweet tag, doc2vec component 0, ..., component N-1]
list_text_features = [[label] + list(model.docvecs[label]) for label in labels]

text_column_names = ["id_to_delete"] + list(range(N_DOC2VEC_FEATURES))
feature_text_tweet = pd.DataFrame(list_text_features, columns=text_column_names)

# Attach the vectors to the tweets through the shared tag, then discard the helper key
new_data = data.merge(feature_text_tweet, left_on='new_id', right_on='id_to_delete').drop(['id_to_delete'], axis=1)

## Join Text features with SVD features

In [28]:
# Preview of the tweet table carrying the user SVD components (integer-named columns)
data_with_svd.head(2)

Unnamed: 0,id,full_text,formatted_text,event_id,created_at,user_id,in_reply_to_status_id,hash_tags,user_mentions,retweet_count,...,10,11,12,13,14,15,16,17,18,19
0,615868085090676737,"Walmart bakes ISIS cake, rejects Confederate f...","Walmart bakes ISIS cake, rejects Confederate f...",E695,Tue Jun 30 13:02:48 +0000 2015,1643045892,,,,0,...,-0.521783,-0.004987,-2.108909,1.558319,-0.096077,-0.436899,0.528864,0.037603,0.056805,0.108205
1,662248078741671936,WAR ON CHRISTMAS: STARBUCKS RED CUPS ARE EMBLE...,WAR ON CHRISTMAS: STARBUCKS RED CUPS ARE EMBLE...,E200,Thu Nov 05 12:40:20 +0000 2015,1643045892,,,,0,...,-0.521783,-0.004987,-2.108909,1.558319,-0.096077,-0.436899,0.528864,0.037603,0.056805,0.108205


In [29]:
# Preview of the tweet table carrying the doc2vec components (integer-named columns)
new_data.head(2)

Unnamed: 0,id,full_text,formatted_text,event_id,created_at,user_id,in_reply_to_status_id,hash_tags,user_mentions,retweet_count,...,30,31,32,33,34,35,36,37,38,39
0,615868085090676737,"Walmart bakes ISIS cake, rejects Confederate f...",walmart bakes isis cake rejects confederate fl...,E695,Tue Jun 30 13:02:48 +0000 2015,1643045892,,,,0,...,0.013246,0.000468,0.007839,-0.00962,-0.009944,-0.01263,0.017422,-0.010738,0.009483,-0.009829
1,14619605813,UK bans doctor who linked autism to vaccine: B...,uk bans doctor who linked autism to vaccine br...,TM859,Mon May 24 12:23:36 +0000 2010,124779476,,,,0,...,-0.014981,0.00957,0.009814,0.049001,0.002264,0.014578,0.009482,-0.029581,0.02112,-0.006133


In [30]:
# Tweets plus the SVD components of their author (columns 0 .. N_SVD_FEATURES-1)
svd_columns = list(range(N_SVD_FEATURES))
part = data.merge(data_with_svd[["new_id"] + svd_columns], on='new_id')

# The doc2vec columns are also named 0, 1, ...; shift them past the SVD columns
# so both feature sets can sit in one table without clashing labels.
new_feature_indexes = {i: N_SVD_FEATURES + i for i in range(N_DOC2VEC_FEATURES)}
f = [N_SVD_FEATURES + i for i in range(N_DOC2VEC_FEATURES)]

# Rename a fresh selection instead of renaming `new_data` in place: the in-place
# version shifted the labels again on every re-run of this cell.
text_features = new_data[["new_id"] + list(range(N_DOC2VEC_FEATURES))].rename(columns=new_feature_indexes)

final_table = part.merge(text_features, on='new_id')

In [31]:
# Preview of the combined per-tweet table (SVD and doc2vec feature columns)
final_table.head(3)

Unnamed: 0,id,full_text,formatted_text,event_id,created_at,user_id,in_reply_to_status_id,hash_tags,user_mentions,retweet_count,...,50,51,52,53,54,55,56,57,58,59
0,615868085090676737,"Walmart bakes ISIS cake, rejects Confederate f...",walmart bakes isis cake rejects confederate fl...,E695,Tue Jun 30 13:02:48 +0000 2015,1643045892,,,,0,...,0.013246,0.000468,0.007839,-0.00962,-0.009944,-0.01263,0.017422,-0.010738,0.009483,-0.009829
1,14619605813,UK bans doctor who linked autism to vaccine: B...,uk bans doctor who linked autism to vaccine br...,TM859,Mon May 24 12:23:36 +0000 2010,124779476,,,,0,...,-0.014981,0.00957,0.009814,0.049001,0.002264,0.014578,0.009482,-0.029581,0.02112,-0.006133
2,644875384622829568,Doritos unveils rainbow chips to support LGBT ...,doritos unveils rainbow chips to support lgbt ...,E391,Fri Sep 18 14:07:27 +0000 2015,22970986,,,,3,...,-0.00346,0.019798,-0.006504,0.010938,-0.002371,0.002103,0.036783,-0.018316,0.021588,0.010071


# Generate tables with features

In [32]:
group_keys = ["new_event_id", "partition"]
feature_columns = list(range(N_DOC2VEC_FEATURES+N_SVD_FEATURES))
by_event_partition = final_table.groupby(group_keys)

# Number of tweets in each (event, partition)
partiton_count = by_event_partition["new_id"].agg(['count']).reset_index()

# delta_t is constant inside a partition, so the first row of each pair is enough
delta_t = final_table[group_keys + ["delta_t"]].drop_duplicates(group_keys)

# Mean of every feature column per (event, partition); stacking moves the
# 'mean' label into a helper column named "level_2"
text_f = by_event_partition[feature_columns].agg(['mean']).stack().reset_index()

In [33]:
# Preview: tweet count per (event, partition)
partiton_count.head(3)

Unnamed: 0,new_event_id,partition,count
0,0,0.0,1
1,0,10.0,1
2,0,12.0,1


In [34]:
# Preview: one delta_t value per (event, partition)
delta_t.head(3)

Unnamed: 0,new_event_id,partition,delta_t
0,947,5.0,123000.0
1,1273,0.0,0.0
2,623,2.0,67000.0


In [35]:
# Preview: mean feature values per (event, partition), with the helper "level_2" column
text_f.head(3)

Unnamed: 0,new_event_id,partition,level_2,0,1,2,3,4,5,6,...,50,51,52,53,54,55,56,57,58,59
0,0,0.0,mean,3e-06,6.415345e-10,3.1e-05,1.7e-05,9.1e-05,3e-06,-0.000189,...,-0.010517,0.01449,0.034262,0.061763,-0.013158,0.036686,0.035259,-0.04588,0.029216,-0.002999
1,0,10.0,mean,2e-06,3.207672e-10,1.6e-05,8e-06,4.5e-05,2e-06,-9.4e-05,...,-0.024901,-0.007499,0.030719,0.062305,-0.020796,0.062861,0.063686,-0.056603,0.067264,-0.00363
2,0,12.0,mean,0.000312,1.936857e-06,0.012494,0.002311,0.055596,0.006319,-0.224932,...,-0.023305,0.013901,0.028019,0.038019,-0.008867,0.039793,0.040575,-0.027853,0.045434,0.00602


# Generate merged feature table

In [36]:
# Combine count, delta_t and mean features into one row per (event, partition).
merge_keys = ["new_event_id", "partition"]
final_features = (
    partiton_count
    .merge(delta_t, on=merge_keys)
    .merge(text_f, on=merge_keys)
    # keyword form: the positional axis argument of drop() is rejected by pandas 2.x
    .drop(columns="level_2")
)

In [37]:
# Inspect the merged feature rows of event 0
final_features[final_features["new_event_id"] == 0].head(5)

Unnamed: 0,new_event_id,partition,count,delta_t,0,1,2,3,4,5,...,50,51,52,53,54,55,56,57,58,59
0,0,0.0,1,0.0,3e-06,6.415345e-10,3.1e-05,1.7e-05,9.1e-05,3e-06,...,-0.010517,0.01449,0.034262,0.061763,-0.013158,0.036686,0.035259,-0.04588,0.029216,-0.002999
1,0,10.0,1,467729000.0,2e-06,3.207672e-10,1.6e-05,8e-06,4.5e-05,2e-06,...,-0.024901,-0.007499,0.030719,0.062305,-0.020796,0.062861,0.063686,-0.056603,0.067264,-0.00363
2,0,12.0,1,84167000.0,0.000312,1.936857e-06,0.012494,0.002311,0.055596,0.006319,...,-0.023305,0.013901,0.028019,0.038019,-0.008867,0.039793,0.040575,-0.027853,0.045434,0.00602
3,0,63.0,5,2182941000.0,0.000642,3.669964e-06,0.019484,0.003343,0.046411,0.002006,...,0.000476,-0.000995,0.005039,0.003005,0.005104,0.000929,0.005274,0.000862,0.005597,-2.9e-05
4,0,66.0,1,107492000.0,2e-06,3.207672e-10,1.6e-05,8e-06,4.5e-05,2e-06,...,-0.025605,0.001903,0.041327,0.070999,-0.028222,0.062059,0.042619,-0.085313,0.054266,-0.00188


## re-mapping event_id

### re-mapping partitions

In [38]:
def clean_partitions(x):
    """Replace each value by its rank among the distinct values of ``x``.

    ``x`` is a Series or any 1-D array-like. The smallest distinct value maps
    to 0, the next to 1, and so on; equal values share a rank. Returns a list
    in the input order.
    """
    values = np.asarray(x)
    # np.unique returns the distinct values sorted, so the insertion position
    # of a value in that array is its rank; one vectorized call handles all values.
    return np.searchsorted(np.unique(values), values).tolist()

# Per event, renumber the sparse partition labels as consecutive step indices
partitions_by_event = final_features.groupby("new_event_id")["partition"]
final_features["new_partition"] = partitions_by_event.transform(clean_partitions)

In [41]:
# Inspect event 0 again: new_partition should count 0, 1, 2, ... in partition order
final_features[final_features["new_event_id"] == 0].head(10)

Unnamed: 0,new_event_id,partition,count,delta_t,0,1,2,3,4,5,...,51,52,53,54,55,56,57,58,59,new_partition
0,0,0.0,1,0.0,3e-06,6.415345e-10,3.1e-05,1.7e-05,9.1e-05,3e-06,...,0.01449,0.034262,0.061763,-0.013158,0.036686,0.035259,-0.04588,0.029216,-0.002999,0.0
1,0,10.0,1,467729000.0,2e-06,3.207672e-10,1.6e-05,8e-06,4.5e-05,2e-06,...,-0.007499,0.030719,0.062305,-0.020796,0.062861,0.063686,-0.056603,0.067264,-0.00363,1.0
2,0,12.0,1,84167000.0,0.000312,1.936857e-06,0.012494,0.002311,0.055596,0.006319,...,0.013901,0.028019,0.038019,-0.008867,0.039793,0.040575,-0.027853,0.045434,0.00602,2.0
3,0,63.0,5,2182941000.0,0.000642,3.669964e-06,0.019484,0.003343,0.046411,0.002006,...,-0.000995,0.005039,0.003005,0.005104,0.000929,0.005274,0.000862,0.005597,-2.9e-05,3.0
4,0,66.0,1,107492000.0,2e-06,3.207672e-10,1.6e-05,8e-06,4.5e-05,2e-06,...,0.001903,0.041327,0.070999,-0.028222,0.062059,0.042619,-0.085313,0.054266,-0.00188,4.0
5,0,69.0,2,139282000.0,0.001784,1.166781e-05,0.054881,0.011126,0.12575,0.005443,...,0.00739,0.031642,0.055756,-0.012291,0.045306,0.05909,-0.05599,0.047343,-0.00453,5.0
6,0,70.0,2,42354000.0,2e-06,4.811508e-10,2.3e-05,1.3e-05,6.8e-05,3e-06,...,-0.004336,0.012216,0.046791,-0.010252,0.034467,0.027534,-0.042117,0.032903,0.003897,6.0
7,0,71.0,2,18022000.0,4e-06,9.990257e-10,5.7e-05,8e-06,0.000124,5e-06,...,0.002392,0.029482,0.064102,-0.023356,0.045102,0.049197,-0.066829,0.052275,-0.001645,7.0
8,0,72.0,1,57283000.0,2e-06,3.207672e-10,1.6e-05,8e-06,4.5e-05,2e-06,...,0.003742,0.000933,-0.006202,0.005991,-0.011286,-0.004891,0.005692,-0.002777,-0.011256,8.0
9,0,86.0,2,578918000.0,2e-06,3.207672e-10,1.6e-05,8e-06,4.5e-05,2e-06,...,0.011794,0.028604,0.059143,-0.010099,0.043495,0.030523,-0.062917,0.043583,-0.013744,9.0


## getting n.steps

In [42]:
# Sequence length: largest step index over all events, plus one because steps start at 0
n_steps = int(max(final_features["new_partition"]))+1
n_steps

1609

# Now generate input matrix for RNN.

## This matrix will be X matrix (samples = #events, features = SVD features + text_features + 2, input_steps)

In [43]:
# Dense zero tensor of shape (events, features, steps); the feature axis holds
# count, delta_t and then the SVD + doc2vec columns (hence the "+2").
# Positions that are never assigned stay 0.
x_matrix = np.zeros((max(final_features["new_event_id"].values)+1, N_DOC2VEC_FEATURES+N_SVD_FEATURES+2 , n_steps))

In [44]:
# (events, features, steps)
x_matrix.shape

(1296, 62, 1609)

In [45]:
#test = final_features[final_features["new_article_id"] == 0][["new_partition", "count"]] # partition, count
# Step indices must be integers to be usable as array positions
final_features["new_partition"] = final_features["new_partition"].astype(int)

event_idx = final_features["new_event_id"].values
step_idx = final_features["new_partition"].values

# Feature slot 0 holds the tweet count, slot 1 the delta_t of each (event, step)
x_matrix[event_idx, 0, step_idx] = final_features["count"].values
x_matrix[event_idx, 1, step_idx] = final_features["delta_t"].values

# Slots 2 onward hold the integer-named feature columns, in column order
for feature in range(N_DOC2VEC_FEATURES + N_SVD_FEATURES):
    x_matrix[event_idx, feature + 2, step_idx] = final_features[feature].values



In [89]:
# Persist the full feature tensor in the working directory (.npy format)
np.save("x_matrix_new.npy", x_matrix)

# Generate y matrix

In [46]:
# Distinct (event, label) pairs; assumes every event carries a single is_fake value — TODO confirm
y_table = data[["new_event_id", "is_fake"]].drop_duplicates()

In [47]:
# Column vector with one row per event id (requires numeric new_event_id values)
y_matrix = np.zeros((max(y_table["new_event_id"].values)+1, 1))

In [48]:
# Place each event's label at the row given by its event id
y_matrix[y_table["new_event_id"].values, 0] = y_table["is_fake"].values

In [93]:
# Persist the label vector in the working directory (.npy format)
np.save("y_matrix_new.npy", y_matrix)

In [49]:
# Number of label rows
y_matrix.shape[0]

1296

# Split train-test

In [52]:
from sklearn.model_selection import KFold
# Positions of all events; the folds are drawn over these
row_index = np.arange(0, y_matrix.shape[0])

# NOTE(review): the fold count comes from a float floor-division, n // (n * 0.2),
# plus 2, so it depends on floating-point rounding of n * 0.2. With the recorded
# value of 7 each test fold holds roughly 1/7 of the events, not 20% — confirm intent.
n_splits = int(y_matrix.shape[0] // (y_matrix.shape[0]*0.2)) + 2
n_splits

7

In [51]:
import os

# A fixed seed makes the shuffled folds reproducible, so the saved files can be regenerated
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# np.save does not create missing directories
os.makedirs("final_datasets", exist_ok=True)

# Write one train/test quadruple per fold, numbered from 0
for i, (train_indexes, test_indexes) in enumerate(kf.split(row_index)):
    x_train = x_matrix[train_indexes,:,:]
    x_test = x_matrix[test_indexes,:,:]
    y_train = y_matrix[train_indexes,:]
    y_test = y_matrix[test_indexes,:]

    np.save("final_datasets/X_train_"+str(i)+".npy", x_train)
    np.save("final_datasets/X_test_"+str(i)+".npy", x_test)
    np.save("final_datasets/y_train_"+str(i)+".npy", y_train)
    np.save("final_datasets/y_test_"+str(i)+".npy", y_test)

In [131]:

# Report the shapes of each fold. y_test is rebuilt from this fold's test
# indices; previously the value left over from an earlier cell was printed.
for train_indexes, test_indexes in kf.split(row_index):
    x_train = x_matrix[train_indexes,:,:]
    y_test = y_matrix[test_indexes,:]
    print(x_train.shape)
    print(y_test.shape)

(1036, 52, 904)
(259, 1)
(1037, 52, 904)
(259, 1)
(1037, 52, 904)
(259, 1)
(1037, 52, 904)
(259, 1)
(1037, 52, 904)
(259, 1)
