In [1]:
import json
import pandas as  pd



# Loading Data

In [2]:
TRAIN_JSON_FILE = "../data/train.json"
DEV_JSON_FILE = "../data/dev.json"

def load_data(file_path):
    """
    Loads the json containing the dataset and return a pandas dataframe.
    """
    with open(file_path, 'r') as f:
        data = json.load(f)
    # Transpose because the json is {id: {features...}, ...}
    df = pd.DataFrame(data).T
    # Ensure 'average' is float
    df['average'] = df['average'].astype(float)
    # Ensure 'choices' is list (for scoring later)
    return df

df_train = load_data(TRAIN_JSON_FILE)
df_dev = load_data(DEV_JSON_FILE)

In [3]:
df_dev.reset_index()

Unnamed: 0,index,homonym,judged_meaning,precontext,sentence,ending,choices,average,stdev,nonsensical,sample_id,example_sentence
0,0,track,a pair of parallel rails providing a runway fo...,The detectives arrived at the abandoned train ...,They followed the track.,They began to run along the abandoned railway ...,"[4, 5, 3, 1, 5]",3.6,1.67332,"[False, False, False, False, False]",2371,The train glided smoothly along the track.
1,1,track,evidence pointing to a possible solution,The detectives arrived at the abandoned train ...,They followed the track.,They began to run along the abandoned railway ...,"[3, 3, 4, 4, 4]",3.6,0.547723,"[False, False, False, False, False]",2372,The detective found a crucial track.
2,2,track,a pair of parallel rails providing a runway fo...,The detectives arrived at the abandoned train ...,They followed the track.,They found interesting clues that helped them ...,"[5, 5, 2, 3, 4]",3.8,1.30384,"[False, False, False, False, False]",2373,The train glided smoothly along the track.
3,3,track,evidence pointing to a possible solution,The detectives arrived at the abandoned train ...,They followed the track.,They found interesting clues that helped them ...,"[4, 5, 4, 3, 5]",4.2,0.83666,"[False, False, False, False, False]",2374,The detective found a crucial track.
4,4,track,a pair of parallel rails providing a runway fo...,The detectives arrived at the abandoned train ...,They followed the track.,,"[1, 5, 4, 4, 1]",3.0,1.870829,"[False, False, False, False, False]",2375,The train glided smoothly along the track.
...,...,...,...,...,...,...,...,...,...,...,...,...
583,583,trailer,a wheeled vehicle that can be pulled by a car ...,"Emma glanced at her phone, checking the time. ...",She really didn't want to go to the trailer to...,She had no interest in going to the movies.,"[1, 1, 3, 1, 1]",1.4,0.894427,"[False, True, False, False, False]",3434,They traveled cross-country in their trailer.
584,584,trailer,an advertisement consisting of short scenes fr...,"Emma glanced at her phone, checking the time. ...",She really didn't want to go to the trailer to...,She still has nightmares from the last time sh...,"[2, 5, 1, 1, 1]",2.0,1.732051,"[False, False, False, False, False]",3435,I watched the new movie trailer online.
585,585,trailer,a wheeled vehicle that can be pulled by a car ...,"Emma glanced at her phone, checking the time. ...",She really didn't want to go to the trailer to...,She still has nightmares from the last time sh...,"[4, 5, 5, 5, 4]",4.6,0.547723,"[False, False, False, False, False]",3436,They traveled cross-country in their trailer.
586,586,trailer,an advertisement consisting of short scenes fr...,"Emma glanced at her phone, checking the time. ...",She really didn't want to go to the trailer to...,,"[3, 2, 1, 3, 2]",2.2,0.83666,"[False, False, False, False, False]",3437,I watched the new movie trailer online.


In [4]:
# define a function to check that the story has no ending
def check_ending(x):
    if x=="":
        return True
    else:
        return False

In [5]:
df_dev['has_no_ending'] = df_dev['ending'].apply(check_ending)

In [6]:
no_endings_dict = df_dev['has_no_ending'].to_dict()

# Splitting Predictions Part

In [14]:
from pathlib import Path
# helper to read jsonl
def read_jsonl(path):
    with open(path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]

# load data
predictions = read_jsonl("../deberta-finetune-2/predictions.jsonl")
references  = read_jsonl("../deberta-finetune-2/ref.jsonl")

# your dictionary: id → True/False
# no_endings_dict = {...}

# output folders
Path("../deberta-finetune-2-no_ending").mkdir(exist_ok=True)
Path("../deberta-finetune-2-with_ending").mkdir(exist_ok=True)

# containers
preds_no = []
refs_no = []

preds_yes = []
refs_yes = []

# simple loop
for pred, ref in zip(predictions, references):
    pid = pred["id"]
    has_no_ending = no_endings_dict[pid]

    if has_no_ending:
        preds_no.append(pred)
        refs_no.append(ref)
    else:
        preds_yes.append(pred)
        refs_yes.append(ref)

# write helper
def write_jsonl(path, items):
    with open(path, "w", encoding="utf-8") as f:
        for x in items:
            f.write(json.dumps(x) + "\n")



def renumber(records):
    for i, rec in enumerate(records, start=0):
        rec["id"] = str(i)      # IMPORTANT: store as "1", "2", "3"...
    return records

# Renumber for each bucket
preds_no  = renumber(preds_no)
refs_no   = renumber(refs_no)
preds_yes = renumber(preds_yes)
refs_yes  = renumber(refs_yes)

# save
write_jsonl("../deberta-finetune-2-no_ending/predictions.jsonl", preds_no)
write_jsonl("../deberta-finetune-2-no_ending/ref.jsonl", refs_no)

write_jsonl("../deberta-finetune-2-with_ending/predictions.jsonl", preds_yes)
write_jsonl("../deberta-finetune-2-with_ending/ref.jsonl", refs_yes)

print("Done!")


Done!


In [15]:
len(preds_yes), len(refs_yes)

(392, 392)

In [16]:
import subprocess
import os
import sys

OUT_DIR_ENDING = "../deberta-finetune-2-with_ending"
ref_file = os.path.join(OUT_DIR_ENDING,"ref.jsonl")
pred_file = os.path.join(OUT_DIR_ENDING,"predictions.jsonl")
res = subprocess.run([sys.executable, "../score/scoring.py", ref_file, pred_file, os.path.join(OUT_DIR_ENDING, "score.json")], capture_output=True, text=True)


CompletedProcess(args=['python', '../score/scoring.py', '../deberta-finetune-2-with_ending\\ref.jsonl', '../deberta-finetune-2-with_ending\\predictions.jsonl', '../deberta-finetune-2-with_ending\\score.json'], returncode=1, stdout='Importing...\n', stderr='Traceback (most recent call last):\n  File "D:\\Polito\\First Year\\LLM4SE\\WSD_Project\\LLM-SemEval-T5\\score\\scoring.py", line 8, in <module>\n    from scipy.stats import spearmanr\nModuleNotFoundError: No module named \'scipy\'\n')

In [20]:
OUT_DIR_NO_ENDING = "../deberta-finetune-2-no_ending"
ref_file_no_end = os.path.join(OUT_DIR_NO_ENDING,"ref.jsonl")
pred_file_no_end = os.path.join(OUT_DIR_NO_ENDING,"predictions.jsonl")
res = subprocess.run([sys.executable, "../score/scoring.py", ref_file_no_end, pred_file_no_end, os.path.join(OUT_DIR_NO_ENDING, "score.json")], capture_output=True, text=True)

In [None]:
%%sql


Exception: Variable Name is not chosen

In [19]:
res

