<center>
<h1>Fake news detection using graph neural network</h1>
<h2>Final summary</h2>
<h6><i>Victor GOUBET | Nicholas Tagliapietra | Asma Bakhtiariazad</i></h6>
</center>


----

**TODO:** Add a t-SNE plot? Add a summary plot of the AUC for the 9 models (bar plot or something similar)?

Think about other plots and other ways to compare the models.

We could also consider other scores (AUC, test accuracy, ...). Each model would then have n scores, which we could display with a parallel categories diagram
(https://plotly.com/python/parallel-categories-diagram/), where each score type is a (sorted) column and each edge is a model.

# Library

In [8]:
import os
import torch
import numpy as np
import pickle as pk
import pandas as pd
import plotly.express as px
import plotly.subplots as sp
import plotly.graph_objects as go


from tool_box.upfd_dataset import ext_UPFD
from tool_box.GNN_train import test_accuracy
from tool_box.models import Basic_GNN, GAT, SAGE
from sklearn.metrics import roc_curve, roc_auc_score

%matplotlib inline

# Import models and dataset

In [2]:
# Load every saved model found under ../models into a two-level mapping:
#   models[<sub-folder name>][<file name>] -> object returned by torch.load
# NOTE(review): the iteration order comes straight from os.listdir, and
# roc_plot later pairs the i-th model of each folder with datasets[i] --
# confirm that the listing order matches the dataset order on this machine.
folders = os.listdir('../models')
models = {
    folder: {
        f: torch.load(f'../models/{folder}/{f}')
        for f in os.listdir(f'../models/{folder}/')
    }
    for folder in folders
}

In [3]:
# Load the pickled datasets. The context manager guarantees the file handle is
# closed once the pickle has been read (the bare open() call leaked it).
# NOTE(security): unpickling can execute arbitrary code -- only load files
# that come from a trusted source.
with open('../data/datasets.pkl', 'rb') as fh:
    datasets = pk.load(fh)

# Reorder the three entries; roc_plot pairs models[i] with datasets[i], so this
# order presumably mirrors the order of the model files -- TODO confirm.
datasets = [datasets[1], datasets[2], datasets[0]]

# Define evaluators

In [10]:
def _tpr_at_threshold(y_true, y_score, threshold=0.5):
    """Return the true positive rate when scores >= ``threshold`` are predicted positive.

    Label 1 is treated as the positive class. Returns NaN when ``y_true``
    contains no positive sample.
    """
    y_true = np.asarray(y_true).ravel()
    y_score = np.asarray(y_score).ravel()
    positives = y_true == 1
    if not positives.any():
        return float('nan')
    return float(np.mean(y_score[positives] >= threshold))


def roc_plot(models, names):
    """Plot ROC diagnostics for a group of models and collect their scores.

    Two panels are drawn: TPR and FPR against the decision threshold (left)
    and the ROC curve with a dashed diagonal reference line (right).

    The i-th model is evaluated on ``datasets[i][3]``, where ``datasets`` is the
    notebook-level list (presumably index 3 is the test split -- TODO confirm).

    Args:
        models: sequence of trained models, all sharing one architecture.
        names: sequence of names aligned with ``models``. Each name is split on
            '_': the last part is used as the dataset name, the remaining parts
            (joined without separator) as the architecture name.

    Returns:
        pandas.DataFrame with one row per model and the columns
        ``auc``, ``test_acc``, ``tpr_0.5``, ``archi`` and ``dataset``.
    """
    fig = sp.make_subplots(rows=1, cols=2,
                           subplot_titles=("TPR and FPR at every threshold", 'ROC Curve'))

    # Dashed diagonal on the ROC panel, used as a visual reference.
    fig.add_shape(
        type='line', line=dict(dash='dash'),
        x0=0, x1=1, y0=0, y1=1, row=1, col=2
    )

    scores = {'auc': [], 'test_acc': [], 'tpr_0.5': [], 'archi': [], 'dataset': []}
    colors = ['red', 'orange', 'blue']

    for i, (model, full_name) in enumerate(zip(models, names)):
        # Cycle through the palette so more than three models do not raise IndexError.
        color = colors[i % len(colors)]

        test_acc, y_pred, y_real = test_accuracy(model, datasets[i][3])
        # Exponentiate the model output and keep the last column as the score
        # (assumes the model returns log-probabilities -- TODO confirm).
        y_pred = np.asarray(np.exp(y_pred))[:, -1]

        fpr, tpr, thresholds = roc_curve(y_real, y_pred)
        auc_score = roc_auc_score(y_real, y_pred)
        *archi_parts, name = full_name.split('_')

        scores['auc'].append(auc_score)
        scores['test_acc'].append(test_acc)
        # TPR at the 0.5 decision threshold. The middle element of the ROC
        # arrays (tpr[len(tpr)//2]) corresponds to an arbitrary threshold.
        scores['tpr_0.5'].append(_tpr_at_threshold(y_real, y_pred, 0.5))
        scores['archi'].append(''.join(archi_parts))
        scores['dataset'].append(name)

        fig.add_trace(go.Scatter(x=thresholds, y=tpr, name=f'{name} | True pos',
                                 line=dict(color=color, width=3, dash='dash')),
                      row=1, col=1)

        fig.add_trace(go.Scatter(x=thresholds, y=fpr, name=f'{name} | False pos',
                                 line=dict(color=color, width=3, dash='dot')),
                      row=1, col=1)

        fig.add_trace(go.Scatter(x=fpr, y=tpr, name=f"{name} | AUC={auc_score:.2f}",
                                 line=dict(color=color, width=3)),
                      row=1, col=2)

    fig.update_xaxes(range=[0, 1], constrain='domain')
    fig.update_yaxes(range=[0, 1], constrain='domain', scaleanchor="x", scaleratio=1)

    fig.update_xaxes(title_text='False Positive Rate', row=1, col=2)
    fig.update_yaxes(title_text='True Positive Rate', row=1, col=2)

    fig.update_xaxes(title_text='Thresholds', row=1, col=1)
    # The left panel shows both TPR and FPR, so use a neutral axis label.
    fig.update_yaxes(title_text='Rate', row=1, col=1)

    # Architecture name taken from the first entry; empty input gives an empty title.
    title = ''.join(names[0].split('_')[:-1]) if len(names) else ''
    fig.update_layout(width=900, height=500, title_text=title)
    fig.show()
    return pd.DataFrame.from_dict(scores)

In [11]:
def box_plot(df):
    """Show one box-plot panel per score, with one box per dataset.

    Args:
        df: DataFrame holding a ``dataset`` column and the numeric columns
            ``auc``, ``test_acc`` and ``tpr_0.5``.

    The quartiles are computed row-wise on the per-dataset samples, so every
    dataset is expected to contribute the same number of rows.
    """
    metric_names = ['auc', 'test_acc', 'tpr_0.5']
    dataset_names = df['dataset'].unique()
    # Every panel shares the same x-axis title listing the datasets.
    axis_title = ' - '.join(dataset_names)

    fig = sp.make_subplots(rows=1, cols=len(metric_names))

    for panel, metric in enumerate(metric_names, start=1):
        # One array of scores per dataset, in the order of dataset_names.
        samples = [df.loc[df['dataset'] == dataset, metric].to_numpy()
                   for dataset in dataset_names]

        fig.add_trace(go.Box(y=samples, name=metric), row=1, col=panel)
        fig.update_xaxes(title_text=axis_title, row=1, col=panel)

        # Set the quartiles of the trace explicitly, one value per dataset.
        lower_quartile = np.quantile(samples, 0.25, axis=1)
        middle = np.median(samples, axis=1)
        upper_quartile = np.quantile(samples, 0.75, axis=1)
        fig.update_traces(q1=lower_quartile, median=middle, q3=upper_quartile,
                          row=1, col=panel)

    fig.update_layout(title_text='Box plot of scores grouping by dataset')
    fig.show()

In [14]:
def bar_plot(df):
    """Show a grouped bar chart of AUC, test accuracy and TPR for every model.

    Args:
        df: DataFrame with a ``model`` column (x axis) and the numeric columns
            ``auc``, ``test_acc`` and ``tpr_0.5``.
    """
    # Draw the three scores side by side. The default bar mode stacks them,
    # which sums unrelated metrics and makes the bars hard to compare.
    fig = px.bar(df, x="model", y=["auc", "test_acc", "tpr_0.5"], barmode="group",
                 title="Bar plot of AUC | Test acc | True positive rate")
    fig.show()

# Plot results

In [15]:
# Draw the ROC figure of every model folder and keep the score table that
# roc_plot returns for it.
score_frames = []
for family in models.values():
    score_frames.append(roc_plot(list(family.values()), list(family.keys())))

# Concatenate once, instead of growing the frame inside the loop, and rebuild
# the index so that every row has a unique label.
df = pd.concat(score_frames, ignore_index=True)

# Unique label per trained model: architecture name followed by dataset name.
df['model'] = df['archi'] + df['dataset']

bar_plot(df)
box_plot(df)