# CREATE ALL_DATA AND GRAPHS

**Aim:** Build the `all_data` DataFrames (one per day, each containing that day's records) and the corresponding graph.

# LIBRARIES

In [5]:
import os
import os.path
os.chdir("C:/Users/maell/Fake_News_Project")

import urllib.request
import zipfile

import pandas as pd
import numpy as np
import community

from scipy import *

from itertools import combinations
from collections import Counter

import matplotlib.pyplot as plt

import networkx as nx

# FUNCTIONS

In [2]:
def open_data(fichier):
    """Load a tab-separated file into a DataFrame.

    Parameters
    ----------
    fichier : str or file-like
        Path (or buffer) handed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed table; the first column of the file is used as the index
        and the first line as the header.
    """
    return pd.read_csv(fichier, sep="\t", index_col=0)
##############################################################################################################################

def filtrer(src, dst, date):
    """Copy the ``mentions.CSV`` entries of ``src`` that match ``date`` to ``dst``.

    Parameters
    ----------
    src : iterable of str
        Lines made of space-separated fields (e.g. an open text file).
    dst : object with a ``write`` method
        Receives the first three fields of every kept line, re-joined with
        single spaces.
    date : str
        Substring that must occur in the third field of a line for it to
        be kept.
    """
    for entry in src:
        # Only lines that mention a "mentions.CSV" file are candidates.
        if 'mentions.CSV' in entry:
            fields = entry.split(' ')
            if date in fields[2]:
                dst.write(' '.join(fields[:3]))

##############################################################################################################################

def dezip(filezip, pathdst=''):
    """Extract every member of the zip archive ``filezip`` under ``pathdst``.

    Parameters
    ----------
    filezip : str or file-like
        Archive to read; passed to ``zipfile.ZipFile``.
    pathdst : str, optional
        Destination directory. The empty string (default) means the current
        working directory. Missing directories are created.

    Raises
    ------
    ValueError
        If a member name would resolve to a location outside ``pathdst``
        (absolute path or ``..`` components).
    """
    if pathdst == '':
        pathdst = os.getcwd()  # extract into the current working directory

    root = os.path.realpath(pathdst)
    root_prefix = os.path.join(root, '')  # root with a trailing separator

    # The context managers guarantee the archive and each output file are
    # closed even when an extraction step fails.
    with zipfile.ZipFile(filezip, 'r') as zfile:
        for member in zfile.infolist():
            target = os.path.join(pathdst, member.filename)

            # Archives may come from the network: refuse to write anything
            # outside the destination directory.
            resolved = os.path.realpath(target)
            if resolved != root and not resolved.startswith(root_prefix):
                raise ValueError(
                    'Unsafe path in archive: ' + member.filename)

            # Decide "directory or file" from the archive entry itself, not
            # from whatever happens to exist on the local disk.
            if member.is_dir():
                os.makedirs(target, exist_ok=True)
                continue

            os.makedirs(os.path.dirname(target), exist_ok=True)
            with open(target, 'wb') as fp:
                fp.write(zfile.read(member))

##############################################################################################################################

def creation_fichier_csv(date):
    """Download one day of "mentions" files and save them as a single CSV.

    Steps (all paths are relative to the current working directory, which is
    expected to be the project root):

    1. Filter ``masterfilelist.txt`` into ``data_list.csv`` with ``filtrer``,
       keeping the entries whose URL contains ``date``.
    2. Empty ``data_folder/``, then download and unzip each listed archive
       into it.
    3. Concatenate every extracted file and write the result to
       ``all_data/all_data_<date>.csv`` (tab-separated).

    Parameters
    ----------
    date : str
        Day to extract, as it appears in the file URLs (e.g. ``'20180623'``).
    """
    # Single location for the download folder: the original mixed an absolute
    # path (clear/read) with a relative one (extract), which only agree when
    # the working directory is the project root.
    data_folder = 'data_folder'
    output_folder = 'all_data'
    column_names = ['GlobalEventID', 'EventTimeDate', 'MentionTimeDate',
                    'MentionType', 'MentionSourceName', 'MentionIdentifier',
                    'SentenceID', 'Actor1CharOffset', 'Actor2CharOffset',
                    'ActionCharOffset', 'InRawText', 'Confidence',
                    'MentionDocLen', 'MentionDocTone',
                    'MentionDocTranslationInfo', 'Extras']

    # Build the list of links to the "mentions" files of the requested day.
    # `with` closes both files even if the filtering raises.
    with open("masterfilelist.txt", "r") as source, \
            open("data_list.csv", "w") as destination:
        filtrer(source, destination, date)

    # Reload the filtered list as a table.
    master_data = pd.read_csv('data_list.csv', sep=' ', header=None, engine='python')
    master_data.columns = ['1', '2', 'url']

    # Keep only the rows that really point to a ".mentions.CSV" file
    # (vectorised instead of dropping rows one by one).
    is_mentions = master_data['url'].astype(str).str.contains('.mentions.CSV', regex=False)
    master_data = master_data[is_mentions]

    # Empty the folder that receives the extracted files.
    os.makedirs(data_folder, exist_ok=True)
    for element in os.listdir(data_folder):
        os.remove(os.path.join(data_folder, element))

    # Download and unzip every archive.
    for url in master_data['url']:
        urllib.request.urlretrieve(url, 'file')
        dezip('file', data_folder)
    print(date+' dezip OK')

    # Build the complete DataFrame. The raw GDELT mentions exports carry no
    # header row (see the GDELT 2.0 codebook), so the column names are
    # supplied explicitly; reading them with a header/index would swallow the
    # first record and the GlobalEventID column.
    frames = [
        pd.read_csv(os.path.join(data_folder, fichier), sep='\t',
                    header=None, names=column_names)
        for fichier in sorted(os.listdir(data_folder))
    ]
    if frames:
        # One concat over all files instead of growing the frame in a loop.
        all_data = pd.concat(frames, ignore_index=True)
    else:
        all_data = pd.DataFrame(columns=column_names)

    # Save all_data_<date>.csv
    os.makedirs(output_folder, exist_ok=True)
    nom_fichier = 'all_data/all_data_'+date+'.csv'
    all_data.to_csv(nom_fichier, sep='\t', decimal='.')
    print(nom_fichier+' enregistré')

In [10]:
# One extraction per day, from 23 to 29 June 2018 inclusive.
list_date = ['201806' + str(day) for day in range(23, 30)]
for date in list_date:
    creation_fichier_csv(date)

20180623 dezip OK
all_data/all_data_20180623.csv enregistré
20180624 dezip OK
all_data/all_data_20180624.csv enregistré
20180625 dezip OK
all_data/all_data_20180625.csv enregistré
20180626 dezip OK
all_data/all_data_20180626.csv enregistré
20180627 dezip OK
all_data/all_data_20180627.csv enregistré
20180628 dezip OK
all_data/all_data_20180628.csv enregistré
20180629 dezip OK
all_data/all_data_20180629.csv enregistré


# CREATE ALL DATA FOR 1 WEEK

In [11]:
def clean_dataFrame(df_all_data):
    """Return a de-duplicated copy of the table without anonymous mentions.

    Rows sharing the same ``MentionTimeDate``, ``GlobalEventID`` and
    ``MentionSourceName`` are collapsed to their first occurrence, then rows
    whose ``MentionSourceName`` is missing are removed. The input frame is
    left untouched.
    """
    key_columns = ['MentionTimeDate', 'GlobalEventID', 'MentionSourceName']
    deduplicated = df_all_data.drop_duplicates(subset=key_columns)
    return deduplicated.dropna(subset=['MentionSourceName'])

In [38]:
# Days to merge; each one needs an existing all_data/all_data_<date>.csv file.
# Alternative (full week):
# ['20180623', '20180624', '20180625', '20180626', '20180627', '20180628', '20180629','20180630']
list_date=['20170930','20170929']

df_republication=pd.DataFrame(columns=['WebsiteURL','Republication'])

# Load every daily table first, then concatenate once:
# DataFrame.append was removed in pandas 2.0 and copied the frame on each call.
daily_frames=[]
for date in list_date:
    path='all_data/all_data_'+date+'.csv'
    daily_frames.append(open_data(path))
df_all_data=pd.concat(daily_frames) if daily_frames else pd.DataFrame()

df_all_data=clean_dataFrame(df_all_data)
df_all_data.to_csv('2_days_all_data_20170929_20170930', sep='\t', decimal= '.')

# CREATE GRAPH

In [1]:
def graph_sources_poids(all_data_path):
    """Build the weighted co-mention graph of the sources.

    Nodes are the distinct ``MentionSourceName`` values; the ``weight`` of an
    edge is the number of distinct ``GlobalEventID`` values mentioned by both
    of its endpoints.

    Parameters
    ----------
    all_data_path : str
        Path of the tab-separated table, loaded with ``open_data``.

    Returns
    -------
    networkx.Graph
        Undirected graph with a ``weight`` attribute on every edge.
    """
    data = open_data(all_data_path)
    G = nx.Graph()
    G.add_nodes_from(data.MentionSourceName.unique())

    # One pass over the table (groupby) instead of a full boolean filter per
    # event; sort=False keeps the events in order of first appearance.
    sources_by_event = data.groupby('GlobalEventID', sort=False)['MentionSourceName']
    for _, sources in sources_by_event:
        # A source may mention the same event several times; de-duplicate so
        # that each shared event adds exactly 1 to a pair's weight. This also
        # guarantees the two members of a pair are different sources.
        for site_a, site_b in combinations(sources.unique(), 2):
            if G.has_edge(site_a, site_b):
                G[site_a][site_b]['weight'] += 1
            else:
                G.add_edge(site_a, site_b, weight=1)
    return G


     
#######################################################################################################
    
def cleaning_graph(H, min_weight=5):
    """Return a pruned copy of ``H``; ``H`` itself is not modified.

    Three passes are applied, in this order:

    1. remove every node that is not a string (e.g. NaN);
    2. remove every edge whose ``weight`` is below ``min_weight``;
    3. remove every node that is alone in its connected component.

    Parameters
    ----------
    H : networkx.Graph
        Graph whose edges all carry a ``weight`` attribute.
    min_weight : int or float, optional
        Smallest weight an edge needs to be kept (default 5, the value that
        was previously hard-coded).

    Returns
    -------
    networkx.Graph
        The cleaned copy.
    """
    G = H.copy()

    # Remove NaN (and any other non-string) nodes.
    G.remove_nodes_from([node for node in G.nodes if not isinstance(node, str)])

    # Remove edges that are not weighted enough.
    weak_edges = [(u, v) for u, v, attrs in G.edges(data=True)
                  if attrs['weight'] < min_weight]
    G.remove_edges_from(weak_edges)

    # Remove nodes that are alone in their component. The list is fully built
    # before the graph is mutated.
    lonely_nodes = [next(iter(component))
                    for component in nx.connected_components(G)
                    if len(component) == 1]
    G.remove_nodes_from(lonely_nodes)
    return G

In [6]:
# Input table (merged days) and output graph file.
all_data = '2_days_all_data_20170929_20170930'
nom_graph = 'graph_20170929_20170930.gexf'

# Build the raw co-mention graph.
G = graph_sources_poids(all_data)
print('Ready to clean')

# Prune it and export it in GEXF format.
G_clean = cleaning_graph(G)
nx.write_gexf(G_clean, nom_graph)
print('Graph Created')

Ready to clean
Graph Created
