# PHI-3

In [1]:
# IMPORTS
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
import numpy as np
import os

  from .autonotebook import tqdm as notebook_tqdm


### Initialise Model & Tokeniser

In [2]:
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")

model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-4k-instruct",  
    torch_dtype="auto",  
    trust_remote_code=True
)

device = torch.device("mps")
model = model.to(device)

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-4k-instruct:
- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-4k-instruct:
- modeling_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.
Downloading shards: 100%|██████████| 2/2 [00:00<00:00,  7.47it/s]
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 10.25it/s]


### Get Predictions & Metrics

In [3]:
def predictions(model, tokenizer, df):
    model.eval()
    all_preds = []
    prompts = df['text'].tolist()
    total = len(prompts)

    with torch.no_grad():
        for i, prompt in enumerate(prompts):
            # Create the system and user prompts
            system_prompt = "You are a helpful assistant."
            user_prompt = f"Tell me if this text is real or fake news. Only answer with the word 'real' or 'fake'.\n{prompt}"
            full_prompt = f"<|system|>{system_prompt}<|user|>{user_prompt}<|assistant|>"
            
            # Tokenize the combined prompt
            inputs = tokenizer(full_prompt, return_tensors="pt", truncation=True).to(device)

            # Obtain model output
            output = model.generate(**inputs, max_new_tokens=5, eos_token_id=tokenizer.eos_token_id)
            
            # Decode the generated output to text
            generated_text = tokenizer.decode(output[0], skip_special_tokens=True).strip()

            # Extract the answer by taking the last word, with normalization
            predicted_text = generated_text.strip().split()[-1].lower()
            
            # Check if the predicted text is "real" or "fake" and map it to 1 or 0
            if predicted_text == 'real':
                prediction = 1
            elif predicted_text == 'fake':
                prediction = 0
            else:
                prediction = None

            all_preds.append(prediction)
            
            processed_items = i + 1
            remaining_items = total - processed_items
            print(f"Processed {processed_items} items, {remaining_items} items remaining")

    # Create a DataFrame with original data and predictions
    results_df = df.copy()
    results_df['prediction'] = all_preds

    return results_df



# Function to truncate text to a certain number of tokens
# Ignore warning about tokenised sequence length
def truncate_text(text, tokenizer, max_tokens):
    tokens = tokenizer.encode(text)
    if len(tokens) > max_tokens:
        tokens = tokens[:max_tokens]
    return tokenizer.decode(tokens, skip_special_tokens=True)



# Function to get the results table
def get_results_table(results_path, df):

    if os.path.exists(results_path):
        print(f"Results file already exists at {results_path}. Loading existing results.")
        results_df = pd.read_csv(results_path)
    else:
        print("Results file does not exist. Running predictions.")
        df['text'] = df['text'].apply(lambda x: truncate_text(x, tokenizer, max_tokens=256))
        results_df = predictions(model, tokenizer, df)
        results_df.to_csv(results_path, index=False)
        print(f"Results saved to {results_path}.")
    return results_df



def get_metrics(results_df):
  
    original_misinformations = results_df[results_df['is_true'] == 0]
    correct_predictions = original_misinformations[original_misinformations['prediction'] == 0].shape[0]
    success_rate = 100*(correct_predictions / original_misinformations.shape[0]) if original_misinformations.shape[0] > 0 else 0

    metrics = classification_report(results_df['is_true'], results_df['prediction'])
    
    return success_rate, metrics



def get_metrics_classwise(results_df):
    original_misinformations = results_df[results_df['is_true'] == 0]
    correct_predictions = original_misinformations[original_misinformations['prediction'] == 0].shape[0]
    success_rate = 100*(correct_predictions / original_misinformations.shape[0]) if original_misinformations.shape[0] > 0 else 0

    classwise_success_rates = {}
    for category in results_df['label'].unique():
        category_df = results_df[results_df['label'] == category]
        original_misinformations = category_df[category_df['is_true'] == 0]
        correct_predictions = original_misinformations[original_misinformations['prediction'] == 0].shape[0]
        classwise_success_rate = 100 * (correct_predictions / original_misinformations.shape[0]) if original_misinformations.shape[0] > 0 else 0
        classwise_success_rates[category] = classwise_success_rate

    metrics = classification_report(results_df['is_true'], results_df['prediction'])

    return success_rate, classwise_success_rates, metrics

In [4]:
# HUMAN TEST SET
df = pd.read_csv("/Applications/AI/msc_project/data/my_coaid_test.csv")
results_path='/Applications/AI/msc_project/predictions/my_coaid_test_predictions_phi3.csv'
results_df = get_results_table(results_path=results_path, df=df)
results_df = results_df.dropna()
display(results_df)

success_rate, metrics = get_metrics(results_df)
print(metrics)
print(f"Success rate: {success_rate:.2f}%")

Results file does not exist. Running predictions.


The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.
You are not running the flash-attention implementation, expect numerical differences.


Processed 1 items, 1044 items remaining
Processed 2 items, 1043 items remaining
Processed 3 items, 1042 items remaining
Processed 4 items, 1041 items remaining
Processed 5 items, 1040 items remaining
Processed 6 items, 1039 items remaining
Processed 7 items, 1038 items remaining
Processed 8 items, 1037 items remaining
Processed 9 items, 1036 items remaining
Processed 10 items, 1035 items remaining
Processed 11 items, 1034 items remaining
Processed 12 items, 1033 items remaining
Processed 13 items, 1032 items remaining
Processed 14 items, 1031 items remaining
Processed 15 items, 1030 items remaining
Processed 16 items, 1029 items remaining
Processed 17 items, 1028 items remaining
Processed 18 items, 1027 items remaining
Processed 19 items, 1026 items remaining
Processed 20 items, 1025 items remaining
Processed 21 items, 1024 items remaining
Processed 22 items, 1023 items remaining
Processed 23 items, 1022 items remaining
Processed 24 items, 1021 items remaining
Processed 25 items, 1020 

Unnamed: 0,is_true,text,prediction
0,1,USDA Meat Inspector Dies of Coronavirus,0.0
2,1,20% of US COVID-19 Deaths Were Young Adults,0.0
3,1,Respiratory viruses may spread via airborne dust,0.0
4,1,Uses of Telehealth during COVID-19 in Low Reso...,1.0
5,1,California Creates Relief Fund for Immigrants,1.0
...,...,...,...
1037,1,Personal Protective Equipment (PPE) Burn Rate ...,1.0
1038,1,People Who Test Positive for COVID-19 After Re...,1.0
1040,1,New Study of Diabetes Drug for COVID-19 Raises...,0.0
1042,1,Loss of Smell and Taste Validated as COVID-19 ...,1.0


              precision    recall  f1-score   support

           0       0.17      0.90      0.29        49
           1       0.98      0.58      0.73       512

    accuracy                           0.61       561
   macro avg       0.58      0.74      0.51       561
weighted avg       0.91      0.61      0.69       561

Success rate: 89.80%


In [5]:
# LLM TEST SET
df = pd.read_csv("/Applications/AI/msc_project/data/my_llm_fake_coaid_test.csv")
results_path='/Applications/AI/msc_project/predictions/my_llm_fake_coaid_test_predictions_phi3.csv'
results_df = get_results_table(results_path=results_path, df=df)
results_df = results_df.dropna()
display(results_df)

success_rate, classwise_success_rates, metrics = get_metrics_classwise(results_df)
print(metrics)
print(f"Success rate: {success_rate:.2f}%")
print(classwise_success_rates)

Results file does not exist. Running predictions.
Processed 1 items, 51 items remaining
Processed 2 items, 50 items remaining
Processed 3 items, 49 items remaining
Processed 4 items, 48 items remaining
Processed 5 items, 47 items remaining
Processed 6 items, 46 items remaining
Processed 7 items, 45 items remaining
Processed 8 items, 44 items remaining
Processed 9 items, 43 items remaining
Processed 10 items, 42 items remaining
Processed 11 items, 41 items remaining
Processed 12 items, 40 items remaining
Processed 13 items, 39 items remaining
Processed 14 items, 38 items remaining
Processed 15 items, 37 items remaining
Processed 16 items, 36 items remaining
Processed 17 items, 35 items remaining
Processed 18 items, 34 items remaining
Processed 19 items, 33 items remaining
Processed 20 items, 32 items remaining
Processed 21 items, 31 items remaining
Processed 22 items, 30 items remaining
Processed 23 items, 29 items remaining
Processed 24 items, 28 items remaining
Processed 25 items, 27 

Unnamed: 0,label,text,news_id,seed_text,is_true,prediction
0,llm_paraphrase,"""The main cause of people's deaths isn't the c...",4815,“It’s not the coronavirus that’s killing peopl...,0,0.0
1,llm_paraphrase,"A picture displays ex-President Barack Obama, ...",4869,"Photo shows former President Barack Obama, Dr....",0,0.0
2,llm_open_generation,Title: Debunked: No Credible Evidence Links 5G...,4862,5G generates the coronavirus SARS-CoV-2 in hum...,0,1.0
3,llm_open_generation,Headline: Crowded Beach in Bloemfontein Raises...,4745,The photo shows a packed beach in Bloemfontein...,0,0.0
5,llm_paraphrase,"According to the passage, individuals who are ...",5049,Says for otherwise healthy people Òexperiencin...,0,0.0
6,llm_paraphrase,"According to a video, it is claimed that the U...",5377,A video says Òthe coronavirus was invented and...,0,0.0
7,llm_paraphrase,A video that has become popular on the interne...,4719,Viral video reveals there is a coronavirus vac...,0,0.0
8,llm_rewritten,"According to Governor Tony Evers' directive, i...",5092,"Says that under order from Gov. Tony Evers, he...",0,0.0
9,llm_rewritten,"According to widely shared Facebook posts, it ...",4853,Facebook posts shared thousands of times claim...,0,0.0
10,llm_rewritten,"According to a widely circulated post, it has ...",5435,A post says the coronavirus can be slowed or s...,0,0.0


              precision    recall  f1-score   support

         0.0       1.00      0.82      0.90        34
         1.0       0.00      0.00      0.00         0

    accuracy                           0.82        34
   macro avg       0.50      0.41      0.45        34
weighted avg       1.00      0.82      0.90        34

Success rate: 82.35%
{'llm_paraphrase': 92.85714285714286, 'llm_open_generation': 50.0, 'llm_rewritten': 91.66666666666666}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
