# **Fact checking, Natural Language Inference (NLI)**

**Authors**: Giacomo Berselli, Marco Cucè, Riccardo De Matteo

In [None]:
# Display the value of every top-level expression in a cell,
# instead of only the value of the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

### 1. Libraries and Imports 

In [None]:
import os
import requests
import zipfile
import random

import torch

import numpy as np
import pandas as pd

import gensim
import gensim.downloader as gloader

import time 

# Reproducibility: give the same fixed seed (0) to every random number
# generator imported above (NumPy, Python's `random`, PyTorch)
np.random.seed(0)
random.seed(0)
torch.manual_seed(0)

# cuDNN flags set together with the seeds, again for repeatable results
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [None]:
# Show the directory the notebook is currently running from
print("Current work directory: {}".format(os.getcwd()))

# Folder, inside the current working directory, where all data will be stored
data_folder = os.path.join(os.getcwd(), "data")

# exist_ok=True creates the folder only when it is missing, so no separate
# (race-prone) existence check is needed
os.makedirs(data_folder, exist_ok=True)

### 2. Data handling

First things first: we download the raw dataset, unzip it and store the csv file of each split in the dataset folder.

In [None]:
# Sub-folder of data_folder that holds the dataset exactly as downloaded
# (the zip archive and the files extracted from it)
raw_dataset_path = os.path.join(data_folder,'raw_dataset')   #path of the raw dataset as downloaded

def save_response_content(response, destination):
    """Write the streamed body of ``response`` to the file ``destination``.

    Args:
        response: object exposing ``iter_content(chunk_size)`` that yields
            the body as byte chunks.
        destination: path of the file to write; it is opened in binary
            write mode, so an existing file is overwritten.
    """
    chunk_size = 32768

    with open(destination, "wb") as out_file:
        # filter(None, ...) skips the empty keep-alive chunks
        non_empty_chunks = filter(None, response.iter_content(chunk_size))
        out_file.writelines(non_empty_chunks)

def download_data(data_folder):
    """Download and extract the FEVER data splits, unless already present.

    The zip archive is fetched into ``<data_folder>/raw_dataset`` and
    extracted into that same folder. If the archive is already on disk the
    function returns without doing anything.

    Args:
        data_folder: folder under which the ``raw_dataset`` directory is
            created.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        zipfile.BadZipFile: if the downloaded file is not a valid zip archive.
    """
    # Derive the target folder from the argument instead of a module global,
    # so the parameter is actually honoured.
    dataset_dir = os.path.join(data_folder, 'raw_dataset')
    zip_dataset_path = os.path.join(dataset_dir, 'fever_data.zip')
    data_url_id ="1wArZhF9_SHW17WKNGeLmX-QTYw9Zscl1"
    url ="https://docs.google.com/uc?export=download"

    os.makedirs(dataset_dir, exist_ok=True)

    # The presence of the archive marks a completed download + extraction
    if os.path.exists(zip_dataset_path):
        return

    completed = False
    try:
        print("Downloading FEVER data splits...")
        with requests.Session() as current_session:
            response = current_session.get(url, params={'id': data_url_id}, stream=True)
            # Fail loudly instead of saving an error page as if it were the zip
            response.raise_for_status()
            # The body is streamed, so it must be read while the session is open
            save_response_content(response, zip_dataset_path)
        print("Download completed!")

        print("Extracting dataset...")
        with zipfile.ZipFile(zip_dataset_path) as loaded_zip:
            loaded_zip.extractall(dataset_dir)
        print("Extraction completed!")
        completed = True
    finally:
        # A partial or invalid archive would make every later call skip the
        # download and the extraction, so remove it when anything went wrong.
        if not completed and os.path.exists(zip_dataset_path):
            os.remove(zip_dataset_path)

# Fetch and extract the dataset; nothing is downloaded again when the zip
# archive is already on disk
download_data(data_folder)

Now that we have the csv files of the train, val and test splits, we encode all three as a single pandas DataFrame, so that we can inspect and manipulate the dataset as a whole.
The DataFrame `df` is structured as follows: 
- `claim`: the fact to verify 
- `evidence`: one of the possibly multiple sentences in the dataset which supports or refutes the `claim`
- `id`: number associated to the fact to verify (different rows can have the same `id`)
- `label`: whether the evidence REFUTES or SUPPORTS the claim
- `split`: the split to which one claim belongs (train, val, test)


In [None]:
def encode_dataset(dataset_dir=None, splits=('train', 'val', 'test')):
    """Load the dataset splits into one pandas DataFrame.

    For every split the file ``<split>_pairs.csv`` is read (its first column
    is used as index), a ``split`` column holding the split name is added,
    and all the splits are stacked in the given order under a fresh
    0..n-1 index. Column names are lower-cased.

    Args:
        dataset_dir: folder containing the csv files; defaults to the
            module-level ``raw_dataset_path``.
        splits: names of the splits to load, in the order they are stacked.

    Returns:
        The combined DataFrame (empty if ``splits`` is empty).
    """
    if dataset_dir is None:
        dataset_dir = raw_dataset_path

    split_frames = []
    for split in splits:
        split_path = os.path.join(dataset_dir, f"{split}_pairs.csv")
        split_df = pd.read_csv(split_path, index_col=0)
        split_df['split'] = split
        split_frames.append(split_df)

    # pd.concat raises on an empty list
    if not split_frames:
        return pd.DataFrame()

    # One concat instead of DataFrame.append in a loop: append was removed in
    # pandas 2.0 and copied the whole accumulated frame at every iteration.
    df = pd.concat(split_frames, ignore_index=True)
    df.columns = df.columns.str.lower()

    return df

# Build the single DataFrame that holds the train, val and test splits
df = encode_dataset()

In [None]:
# Preview the first rows of the combined DataFrame
df.head()

In [None]:
# (number of rows, number of columns) of the combined DataFrame
print('Dataframe shape:', df.shape)

In [None]:
# Number of rows belonging to each split: train, val, test
len(df[df['split']=='train'])
len(df[df['split']=='val'])
len(df[df['split']=='test'])

In [None]:
# Distinct values of the split and label columns
df['split'].unique()
df['label'].unique()
# Number of distinct ids compared with the total number of rows
# (several rows can share the same id)
df['id'].nunique()
len(df)