In [1]:
import pandas as pd
import json

from sklearn.model_selection import train_test_split

In [2]:
def parse_json_file_to_df(file_path):
    parsed_data = {
        'ID': [],
        'Rumor': [],
        'Label': [],
        'Timeline_URLs': [],
        'Timeline_Tweet_IDs': [],
        'Timeline_Contents': [],
        'Evidence_URLs': [],
        'Evidence_Tweet_IDs': [],
        'Evidence_Contents': []
    }
    
    with open(file_path, 'r') as file:
        for line in file:
            try:
                item = json.loads(line)
                
                parsed_data['ID'].append(item.get('id'))
                parsed_data['Rumor'].append(item.get('rumor'))
                parsed_data['Label'].append(item.get('label'))
                
                timeline_urls = []
                timeline_tweet_ids = []
                timeline_contents = []
                for timeline_entry in item.get('timeline', []):
                    timeline_urls.append(timeline_entry[0])
                    timeline_tweet_ids.append(timeline_entry[1])
                    timeline_contents.append(timeline_entry[2])
                parsed_data['Timeline_URLs'].append(timeline_urls)
                parsed_data['Timeline_Tweet_IDs'].append(timeline_tweet_ids)
                parsed_data['Timeline_Contents'].append(timeline_contents)
                
                evidence_urls = []
                evidence_tweet_ids = []
                evidence_contents = []
                for evidence_entry in item.get('evidence', []):
                    evidence_urls.append(evidence_entry[0])
                    evidence_tweet_ids.append(evidence_entry[1])
                    evidence_contents.append(evidence_entry[2])
                parsed_data['Evidence_URLs'].append(evidence_urls)
                parsed_data['Evidence_Tweet_IDs'].append(evidence_tweet_ids)
                parsed_data['Evidence_Contents'].append(evidence_contents)
                
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON: {e}")
                continue
    
    df = pd.DataFrame(parsed_data)
    return df

In [3]:
file_path = 'data/English_train.json'

df = parse_json_file_to_df(file_path)
df

Unnamed: 0,ID,Rumor,Label,Timeline_URLs,Timeline_Tweet_IDs,Timeline_Contents,Evidence_URLs,Evidence_Tweet_IDs,Evidence_Contents
0,AuRED_014,“#Urgent / Ramallah Ministry of Health spokesm...,REFUTES,"[https://twitter.com/ibrahimmilhim, https://tw...","[1357270458756960257, 1357270456915685377, 135...","[""Qalqilya (5), Bethlehem (10), Nablus (24), J...","[https://twitter.com/palestine_moh, https://tw...","[1357211717479116800, 1356683687870488576, 135...",[Pictures from the launch of the vaccination c...
1,AuRED_037,Macron to Sky News: After my visit to Mrs. Fai...,REFUTES,"[https://twitter.com/skynewsarabia, https://tw...","[1302029928867729411, 1302028670446444544, 130...",[#Iraq.. Record infections with #Corona and ho...,"[https://twitter.com/salmanonline, https://twi...","[1301079467889131522, 1301067546200858626, 130...",[RT @HalaHomsi: Macron and Mrs. Fairouz: This ...
2,AuRED_085,Saudi Arabia evacuated 10 students from China ...,REFUTES,"[https://twitter.com/YSUCORG, https://twitter....","[1226482043007066113, 1226481466348986369, 122...",[RT @Yemen_PM: The Yemeni Embassy in #China an...,"[https://twitter.com/YSUCORG, https://twitter....","[1226482043007066113, 1225345358961762304, 122...",[RT @Yemen_PM: The Yemeni Embassy in #China an...
3,AuRED_089,"The Corona epidemic has reached the Emirates, ...",REFUTES,"[https://twitter.com/WHOEMRO, https://twitter....","[1223614769195900928, 1223608938136047616, 122...",[s. Is it safe to receive letters or packages ...,"[https://twitter.com/WHOEMRO, https://twitter....","[1222506828694794240, 1223649306667778049, 122...",[Today @WHO confirmed the emergence of the fir...
4,AuRED_135,The official spokesman for the Football Associ...,REFUTES,"[https://twitter.com/AlAhlyTV, https://twitter...","[1586138070692548610, 1586135058083381248, 158...",[Special- Dr. Khaled Al-Jawadi: Marcel is a di...,"[https://twitter.com/AlAhlyTV, https://twitter...","[1585012773125120000, 1585011015032315904]",[Islam Al-Shater “warns” against fabricating n...
...,...,...,...,...,...,...,...,...,...
91,AuRED_055,The Moroccan referee apologizes to Al-Ahly pla...,NOT ENOUGH INFO,"[https://twitter.com/AlAhly, https://twitter.c...","[1369774548585963524, 1369773418191740936, 136...","[ISSUE: couldn't translate, ISSUE: couldn't tr...",[],[],[]
92,AuRED_023,Good news for Al-Ahly fans.. BeIN Sports choos...,NOT ENOUGH INFO,"[https://twitter.com/derradjihafid, https://tw...","[1356265561634844674, 1356252307885875202, 135...","[""Leave him alone... National coach Jamal Belm...",[],[],[]
93,AuRED_016,Egypt does not want to give a vaccine to its c...,NOT ENOUGH INFO,"[https://twitter.com/mohpegypt, https://twitte...","[1391904405012746240, 1391875695807062019, 139...",[Health: The number of recoveries from those i...,[],[],[]
94,AuRED_077,"Egyptian Dr. Sir (Magdy Yacoub), the greatest ...",NOT ENOUGH INFO,"[https://twitter.com/FCDOArabic, https://twitt...","[1345122644589486080, 1345110666932342785, 134...",[RT @UKinSudan: Her Majesty Queen Elizabeth se...,[],[],[]


In [4]:
'''from stormtrooper import ZeroShotClassifier

sample_text = "It is the Electoral College's responsibility to elect the president."

model = ZeroShotClassifier("facebook/bart-large-mnli").fit(None, ["politics", "science", "other"])
predictions = model.predict([sample_text])
assert list(predictions) == ["politics"]

model.set_output(transform="pandas")
model.transform([sample_text])'''

'from stormtrooper import ZeroShotClassifier\n\nsample_text = "It is the Electoral College\'s responsibility to elect the president."\n\nmodel = ZeroShotClassifier("facebook/bart-large-mnli").fit(None, ["politics", "science", "other"])\npredictions = model.predict([sample_text])\nassert list(predictions) == ["politics"]\n\nmodel.set_output(transform="pandas")\nmodel.transform([sample_text])'

In [5]:
'''from stormtrooper import Text2TextZeroShotClassifier


classifier = Text2TextZeroShotClassifier(model_name="google/flan-t5-base", device="cpu").fit(df['Rumor'].astype('str').tolist(), df['Label'].astype('str').tolist())
'''

'from stormtrooper import Text2TextZeroShotClassifier\n\n\nclassifier = Text2TextZeroShotClassifier(model_name="google/flan-t5-base", device="cpu").fit(df[\'Rumor\'].astype(\'str\').tolist(), df[\'Label\'].astype(\'str\').tolist())\n'

In [6]:
'''
predictions = classifier.predict(["🚨🚨🚨🚨🚨🚨 Officially: Issam Chawali is commentating on the Champions League final 😍😍😍😍😍😍😍😍😍🔥🔥🔥🔥🔥🔥🔥. https://t.co/jGXOTmRlbD"])
predictions'''

'\npredictions = classifier.predict(["🚨🚨🚨🚨🚨🚨 Officially: Issam Chawali is commentating on the Champions League final 😍😍😍😍😍😍😍😍😍🔥🔥🔥🔥🔥🔥🔥. https://t.co/jGXOTmRlbD"])\npredictions'

In [7]:
import torch
print(torch.cuda.is_available())

True


In [8]:

train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)

In [9]:
from stormtrooper import SetFitZeroShotClassifier, SetFitFewShotClassifier

model = SetFitFewShotClassifier(model_name="sentence-transformers/all-mpnet-base-v2", device='cuda').fit(train_df['Rumor'].astype('str').tolist(), train_df['Label'].astype('str').tolist())

  from .autonotebook import tqdm as notebook_tqdm
model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
  self.trainer = SetFitTrainer(
Map: 100%|██████████| 86/86 [00:00<00:00, 4201.49 examples/s]
***** Running training *****
  Num unique pairs = 3440
  Batch size = 16
  Num epochs = 1
  Total optimization steps = 215
  0%|          | 0/215 [00:00<?, ?it/s]

[A                                            

{'embedding_loss': 0.2124, 'learning_rate': 9.090909090909091e-07, 'epoch': 0.0}



[A                                             

{'embedding_loss': 0.2409, 'learning_rate': 1.7098445595854924e-05, 'epoch': 0.23}



[A                                              

{'embedding_loss': 0.1828, 'learning_rate': 1.1917098445595855e-05, 'epoch': 0.47}



[A                                              

{'embedding_loss': 0.0568, 'learning_rate': 6.735751295336788e-06, 'epoch': 0.7}



[A                                              

{'embedding_loss': 0.0073, 'learning_rate': 1.5544041450777204e-06, 'epoch': 0.93}



100%|██████████| 215/215 [01:44<00:00,  2.05it/s]


{'train_runtime': 105.4272, 'train_samples_per_second': 32.629, 'train_steps_per_second': 2.039, 'epoch': 1.0}


In [10]:
#predictions = model.predict([sample_text])
# model.score(test_df['Rumor'].astype('str').tolist(), test_df['Label'].astype('str').tolist())

prediction = model.predict(test_df['Rumor'].astype('str').tolist())

# calculate accuracy and f1 score base on the prediction and  true labels (test_df['Label'].astype('str').tolist())) 
from sklearn.metrics import accuracy_score, f1_score
accuracy = accuracy_score(test_df['Label'].astype('str').tolist(), prediction, normalize=False)
f1 = f1_score(test_df['Label'].astype('str').tolist(), prediction, average='weighted')

In [11]:
print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")

Accuracy: 7.0
F1 Score: 0.6782051282051282


In [12]:
from datasets import Dataset

def transform_df_to_dataset(df):
    transformed_data = {
        'text': df['Rumor'],
        'label': df['Label'],
        'label_text': df['Label'].apply(lambda x: 'True' if x == 1 else 'False')
    }
    
    dataset = Dataset.from_pandas(pd.DataFrame(transformed_data))
    return dataset


In [13]:
dataset = transform_df_to_dataset(df)
dataset

Dataset({
    features: ['text', 'label', 'label_text'],
    num_rows: 96
})

In [14]:
def split_dataset(dataset, test_size=0.2, random_state=42):
    train_df, test_df = train_test_split(dataset.to_pandas(), test_size=test_size, random_state=random_state)
    train_dataset = Dataset.from_pandas(train_df)
    test_dataset = Dataset.from_pandas(test_df)
    return train_dataset, test_dataset


In [15]:
train_dataset, test_dataset = split_dataset(dataset)
train_dataset, test_dataset

(Dataset({
     features: ['text', 'label', 'label_text', '__index_level_0__'],
     num_rows: 76
 }),
 Dataset({
     features: ['text', 'label', 'label_text', '__index_level_0__'],
     num_rows: 20
 }))

In [16]:
from setfit import SetFitModel, Trainer, TrainingArguments

# Initializing a new SetFit model
model = SetFitModel.from_pretrained("sentence-transformers/all-mpnet-base-v2", labels=['SUPPORTS', 'REFUTES', 'NOT ENOUGH INFO'])

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


In [17]:
# Preparing the training arguments
args = TrainingArguments(
    batch_size=16,
    num_epochs=1,
    num_iterations=20,  # The number of text pairs to generate for contrastive learning\
)

# Preparing the trainer
trainer = Trainer(
    model=model,
    #args=args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset 
)
trainer.train()

Map: 100%|██████████| 76/76 [00:00<00:00, 2737.37 examples/s]
***** Running training *****
  Num unique pairs = 3656
  Batch size = 16
  Num epochs = 1
  Total optimization steps = 229
  0%|          | 0/229 [00:00<?, ?it/s]

[A                                            

{'embedding_loss': 0.2883, 'learning_rate': 8.695652173913044e-07, 'epoch': 0.0}



[A                                             

{'embedding_loss': 0.2553, 'learning_rate': 1.737864077669903e-05, 'epoch': 0.22}



[A                                              

{'embedding_loss': 0.1173, 'learning_rate': 1.2524271844660197e-05, 'epoch': 0.44}



[A                                              

{'embedding_loss': 0.0287, 'learning_rate': 7.66990291262136e-06, 'epoch': 0.66}



[A                                              

{'embedding_loss': 0.0028, 'learning_rate': 2.8155339805825245e-06, 'epoch': 0.87}



100%|██████████| 229/229 [02:18<00:00,  1.66it/s]


{'train_runtime': 138.2561, 'train_samples_per_second': 26.502, 'train_steps_per_second': 1.656, 'epoch': 1.0}


In [18]:
# Evaluating
metrics = trainer.evaluate(test_dataset)
metrics

***** Running evaluation *****


{'accuracy': 0.55}

In [19]:
'''from stormtrooper import GenerativeFewShotClassifier

model = GenerativeFewShotClassifier("upstage/SOLAR-10.7B-Instruct-v1.0t")
model.fit(train_df['Rumor'].astype('str').tolist(), train_df['Label'].astype('str').tolist())'''

'from stormtrooper import GenerativeFewShotClassifier\n\nmodel = GenerativeFewShotClassifier("upstage/SOLAR-10.7B-Instruct-v1.0t")\nmodel.fit(train_df[\'Rumor\'].astype(\'str\').tolist(), train_df[\'Label\'].astype(\'str\').tolist())'

## Running hyperparameter search

In [20]:
from setfit import SetFitModel

def model_init(params):
    params = params or {}
    max_iter = params.get("max_iter", 100)
    solver = params.get("solver", "liblinear")
    params = {
        "head_params": {
            "max_iter": max_iter,
            "solver": solver,
        }
    }
    return SetFitModel.from_pretrained("sentence-transformers/all-mpnet-base-v2", **params)

In [21]:
def hp_space(trial):  # Training parameters
    return {
        "learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True),
        "num_epochs": trial.suggest_int("num_epochs", 1, 5),
        "batch_size": trial.suggest_categorical("batch_size", [4, 8, 16, 32, 64]),
        "seed": trial.suggest_int("seed", 1, 40),
        "num_iterations": trial.suggest_categorical("num_iterations", [5, 10, 20]),
        "max_iter": trial.suggest_int("max_iter", 50, 300),
        "solver": trial.suggest_categorical("solver", ["newton-cg", "lbfgs", "liblinear"]),
    }

In [22]:
from datasets import Dataset
from setfit import SetFitTrainer

trainer = SetFitTrainer(
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    model_init=model_init,
    column_mapping={"text": "text", "label": "label"},
)
best_run = trainer.hyperparameter_search(direction="maximize", hp_space=hp_space, n_trials=20)

  trainer = SetFitTrainer(
Applying column mapping to the training dataset
Applying column mapping to the evaluation dataset
model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
Map: 100%|██████████| 76/76 [00:00<00:00, 3338.22 examples/s]
[I 2024-07-09 18:25:39,551] A new study created in memory with name: no-name-e233aa78-0571-477d-86cf-9d3c65d0d6fb
Trial: {'learning_rate': 1.8736184018650888e-06, 'num_epochs': 4, 'batch_size': 16, 'seed': 39, 'num_iterations': 10, 'max_iter': 95, 'solver': 'liblinear'}
model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
***** Running training *****
  Num unique pairs = 1520
  Batch size = 16
  Num epochs = 4
  Total optimization steps = 380
  0%|          | 0/380 [00:00<?, ?it/s]



{'embedding_loss': 0.2821, 'learning_rate': 5.263157894736843e-07, 'epoch': 0.01}


[W 2024-07-09 18:25:46,962] Trial 0 failed with parameters: {'learning_rate': 1.8736184018650888e-06, 'num_epochs': 4, 'batch_size': 16, 'seed': 39, 'num_iterations': 10, 'max_iter': 95, 'solver': 'liblinear'} because of the following error: OutOfMemoryError('CUDA out of memory. Tried to allocate 20.00 MiB. GPU ').
Traceback (most recent call last):
  File "/home/hmx/miniconda3/envs/rumour_verification/lib/python3.9/site-packages/optuna/study/_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
  File "/home/hmx/miniconda3/envs/rumour_verification/lib/python3.9/site-packages/setfit/integrations.py", line 27, in _objective
    trainer.train(trial=trial)
  File "/home/hmx/miniconda3/envs/rumour_verification/lib/python3.9/site-packages/setfit/trainer.py", line 410, in train
    self.train_embeddings(*full_parameters, args=args)
  File "/home/hmx/miniconda3/envs/rumour_verification/lib/python3.9/site-packages/setfit/trainer.py", line 462, in train_embeddings
    self._t

OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 

In [None]:
trainer.apply_hyperparameters(best_run.hyperparameters, final_model=True)
trainer.train()