# **Evaluate fitted model**

### **Imports**

In [1]:
from fact_ver import extract_embedding
from eval_mod import *
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model, load_model
from sklearn.metrics import classification_report

## **Load data**

In [2]:
generated_statements = pd.read_csv("generated_statements.csv")
generated_statements.head()

Unnamed: 0,Title,Article,Summary,Statement,Label
0,USS SC-40,"USS SC-40, until July 1920 known as USS Submar...","USS SC-40, until July 1920 known as USS Submar...",USS SC-40 was a wooden-hulled submarine chaser.,1
1,Valentina Zenere,Valentina Zenere (born 15 January 1997) is an...,Valentina Zenere (born 15 January 1997) is an...,Valentina Zenere plays Isadora Artiñán on Elite.,1
2,From M.E. to Myself,From M.E. to Myself (simplified Chinese: 和自己对话...,From M.E. to Myself (simplified Chinese: 和自己对话...,Golden Melody Awards for Best Mandarin Male Si...,1
3,Charalampos Papaioannou,"Charalampos Papaioannou (born January 4, 1971)...","Charalampos Papaioannou (born January 4, 1971)...",Charalampos Papaioannou is a Greek judoka.,1
4,"First United Methodist Church (Aberdeen, South...",Aberdeen First United Methodist Church is a hi...,Aberdeen First United Methodist Church is a hi...,The church features an open lantern atop the r...,1


## **Split data**
Same as when traning the model

In [3]:
train_data, val_test_data = train_test_split(generated_statements, test_size=0.3, random_state=42)
val_data, test_data = train_test_split(val_test_data, test_size=1/3, random_state=42)
print(f"-> Train shape: {train_data.shape}")
print(f"-> Val shape:   {val_data.shape}")
print(f"-> Test shape:  {test_data.shape}")

-> Train shape: (1400, 5)
-> Val shape:   (400, 5)
-> Test shape:  (200, 5)


## **Load embeddings**
Same as when training the model

In [4]:
evidence_train = extract_embedding(train_data["Summary"], 127).numpy()
evidence_val = extract_embedding(val_data["Summary"], 127).numpy()
evidence_test = extract_embedding(test_data["Summary"], 127).numpy()

claim_train = extract_embedding(train_data["Statement"], 27).numpy()
claim_val = extract_embedding(val_data["Statement"], 27).numpy()
claim_test = extract_embedding(test_data["Statement"], 27).numpy()

y_train = train_data["Label"]
y_val = val_data["Label"]
y_test = test_data["Label"]

config.json:   0%|          | 0.00/286 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/116M [00:00<?, ?B/s]

## **Load model**

In [5]:
model = load_model("FactVerModel")

## **Get classification report of each dataset**

In [6]:
train_predictions = model.predict([claim_train, evidence_train], verbose=0)
y_train_pred = (train_predictions > 0.5).astype(int)
train_report = classification_report(y_train, y_train_pred)
print(train_report)

              precision    recall  f1-score   support

           0       0.95      0.97      0.96       702
           1       0.97      0.95      0.96       698

    accuracy                           0.96      1400
   macro avg       0.96      0.96      0.96      1400
weighted avg       0.96      0.96      0.96      1400



In [7]:
test_predictions = model.predict([claim_test, evidence_test], verbose=0)
y_test_pred = (test_predictions > 0.5).astype(int)
test_report = classification_report(y_test, y_test_pred)
print(test_report)

              precision    recall  f1-score   support

           0       0.90      0.80      0.85       106
           1       0.80      0.90      0.85        94

    accuracy                           0.85       200
   macro avg       0.85      0.85      0.85       200
weighted avg       0.86      0.85      0.85       200



In [8]:
val_predictions = model.predict([claim_val, evidence_val], verbose=0)
y_val_pred = (val_predictions > 0.5).astype(int)
val_report = classification_report(y_val, y_val_pred)
print(val_report)

              precision    recall  f1-score   support

           0       0.80      0.79      0.79       192
           1       0.81      0.82      0.81       208

    accuracy                           0.80       400
   macro avg       0.80      0.80      0.80       400
weighted avg       0.80      0.80      0.80       400



## **Use predictions to find false negatives and false positives**

In [9]:
test_data["Predictions"] = y_test_pred
test_data["Model output"] = test_predictions

test_data["False Positive"] = 0
test_data["False Negative"] = 0
test_data["Correctly Predicted"] = 0

test_data.loc[(test_data["Label"] == 1) & (test_data["Predictions"] == 0), "False Positive"] = 1
test_data.loc[(test_data["Label"] == 0) & (test_data["Predictions"] == 1), "False Negative"] = 1
test_data.loc[test_data["Label"] == test_data["Predictions"], "Correctly Predicted"] = 1

In [10]:
print(f"False Negative count: {sum(test_data['False Negative'])}")
print(f"False Positive count: {sum(test_data['False Positive'])}")

False Negative count: 21
False Positive count: 9


In [11]:
fn = test_data[test_data['False Negative']==1]
fp = test_data[test_data['False Positive']==1]

## **Manually look at the incorrectly classified samples in the test data**

In [12]:
for i, row in fp.iterrows():
    print(f"_________ {row['Title']} _________")
    print(f" -> Statement: {row['Statement']}")
    print(f" -> Model output: {row['Model output']} (Actual={row['Label']})")
    print(f" -> Summary: '{row['Summary']}'")
    print("\n")
    

_________ Slingsby Falcon _________
 -> Statement: The Falcon was a single-seat sport glider.
 -> Model output: 0.23785646259784698 (Actual=1)
 -> Summary: 'The Slingsby T.1/T.2 Falcon or British Falcon) was a single-seat sport glider produced, in 1931–37, by Fred Slingsby in Scarborough, Yorkshire.


 == Design and development ==
The Falcon was constructed from plans supplied by the Rhön-Rossitten Gesellschaft (the controlling body for gliding in Germany). Originally designed by Alexander Lippisch, the T.1 was a single-seat sport glider of moderate performance for its day.'


_________ Frederick Wilhelmsen _________
 -> Statement: Frederick Wilhelmsen was a Catholic philosopher.
 -> Model output: 0.39685505628585815 (Actual=1)
 -> Summary: 'Frederick D. Wilhelmsen (18 May 1923–21 May 1996) was a Catholic philosopher known for his explication and advancement of the Thomistic tradition. He also was a political commentator, assessing American politics and society from a traditionalist pe

In [13]:
for i, row in fn.iterrows():
    print(f"_________ {row['Title']} _________")
    print(f" -> Statement: {row['Statement']}")
    print(f" -> Model output: {row['Model output']} (Actual={row['Label']})")
    print(f" -> Summary: '{row['Summary']}'")
    print("\n")

_________ HMS Electra (1808) _________
 -> Statement: HMS Electra was a 50-gun battleship.
 -> Model output: 0.7559460401535034 (Actual=0)
 -> Summary: 'HMS Electra was a 16-gun brig-sloop. She was built by the Enterprise Ethéart, Saint-Malo, as the French Curieux-class brig Espiègle and launched in 1804. She was armed in 1807 at Saint Servan. The British frigate Sybille captured her on 16 August 1808.'


_________ 3409 Abramov _________
 -> Statement: 3409 Abramov is a gaseous asteroid.
 -> Model output: 0.5934113264083862 (Actual=0)
 -> Summary: '3409 Abramov, provisional designation 1977 RE6, is a stony Koronian asteroid from the outer region of the asteroid belt, approximately 11 kilometers in diameter. It was discovered on 9 September 1977, by Soviet–Russian astronomer Nikolai Chernykh at Crimean Astrophysical Observatory in Nauchnyj on the Crimean peninsula. The asteroid was named after Russian writer Fyodor Abramov.


'


_________ 2018–19 CSA T20 Challenge _________
 -> Stateme

## **Edit input to model to evalue it**

In [14]:
# Correctly predicted
# label = 1
claim = 'Transportation Alternatives works to change transportation priorities.'
evidence = 'Transportation Alternatives (TransAlt, formerly T.A.) is a non-profit organization in New York City which works to change New York City\'s transportation priorities to encourage and increase non-polluting, quiet, city-friendly travel and decrease automobile use. TransAlt seeks a transportation system based on a "Green Transportation Hierarchy" giving preference to modes of travel based on their relative benefits and costs to society.'
mod_pred(claim, evidence)

0.9579151

In [15]:
# Change wording, still true (correct)
claim = 'Transportation Alternatives strives to shift the focus of transportation priorities.'
evidence = 'Transportation Alternatives (TransAlt, formerly T.A.) is a non-profit organization in New York City which works to change New York City\'s transportation priorities to encourage and increase non-polluting, quiet, city-friendly travel and decrease automobile use. TransAlt seeks a transportation system based on a "Green Transportation Hierarchy" giving preference to modes of travel based on their relative benefits and costs to society.'

mod_pred(claim, evidence)

0.95605856

In [16]:
# Change worning, make it false (correct)
claim = 'Transportation Alternatives is a petroleum company.'
evidence = 'Transportation Alternatives (TransAlt, formerly T.A.) is a non-profit organization in New York City which works to change New York City\'s transportation priorities to encourage and increase non-polluting, quiet, city-friendly travel and decrease automobile use. TransAlt seeks a transportation system based on a "Green Transportation Hierarchy" giving preference to modes of travel based on their relative benefits and costs to society.'

mod_pred(claim, evidence)

0.18087105

In [17]:
# Add 'not', make it false (corret)
claim = 'The Transportation Alternatives does not work to change transportation priorities.'
evidence = 'Transportation Alternatives (TransAlt, formerly T.A.) is a non-profit organization in New York City which works to change New York City\'s transportation priorities to encourage and increase non-polluting, quiet, city-friendly travel and decrease automobile use. TransAlt seeks a transportation system based on a "Green Transportation Hierarchy" giving preference to modes of travel based on their relative benefits and costs to society.'

mod_pred(claim, evidence)

0.13681345

In [18]:
# Add 'not' and change menaing, make it true (incorrect)
claim = 'Transportation Alternatives does not support polluting'
evidence = 'Transportation Alternatives (TransAlt, formerly T.A.) is a non-profit organization in New York City which works to change New York City\'s transportation priorities to encourage and increase non-polluting, quiet, city-friendly travel and decrease automobile use. TransAlt seeks a transportation system based on a "Green Transportation Hierarchy" giving preference to modes of travel based on their relative benefits and costs to society.'

mod_pred(claim, evidence)
# Take: not --> False statement due to data bias

0.12414804