# SICK dataset

In [87]:
import requests
import zipfile
import io
import pandas as pd

# Step 1: Download the SICK.zip
url = "https://zenodo.org/record/2787612/files/SICK.zip?download=1"
r = requests.get(url)
z = zipfile.ZipFile(io.BytesIO(r.content))
z.extractall("./sick_data/")

# Step 2: Read the SICK.txt manually
df = pd.read_csv("./sick_data/SICK.txt", sep='\t')

In [88]:
# Step 3: View examples
df[['sentence_A','sentence_B','entailment_label','relatedness_score','entailment_AB','entailment_BA']].head(15)  # View first 15 examples

Unnamed: 0,sentence_A,sentence_B,entailment_label,relatedness_score,entailment_AB,entailment_BA
0,A group of kids is playing in a yard and an ol...,A group of boys in a yard is playing and a man...,NEUTRAL,4.5,A_neutral_B,B_neutral_A
1,A group of children is playing in the house an...,A group of kids is playing in a yard and an ol...,NEUTRAL,3.2,A_contradicts_B,B_neutral_A
2,The young boys are playing outdoors and the ma...,The kids are playing outdoors near a man with ...,ENTAILMENT,4.7,A_entails_B,B_entails_A
3,The young boys are playing outdoors and the ma...,There is no boy playing outdoors and there is ...,CONTRADICTION,3.6,A_contradicts_B,B_contradicts_A
4,The kids are playing outdoors near a man with ...,A group of kids is playing in a yard and an ol...,NEUTRAL,3.4,A_neutral_B,B_neutral_A
5,There is no boy playing outdoors and there is ...,A group of kids is playing in a yard and an ol...,NEUTRAL,3.3,A_neutral_B,B_neutral_A
6,A group of boys in a yard is playing and a man...,The young boys are playing outdoors and the ma...,NEUTRAL,3.7,A_neutral_B,B_neutral_A
7,A group of children is playing in the house an...,The young boys are playing outdoors and the ma...,NEUTRAL,3.0,A_neutral_B,B_contradicts_A
8,The young boys are playing outdoors and the ma...,A group of kids is playing in a yard and an ol...,NEUTRAL,3.7,A_neutral_B,B_neutral_A
9,A brown dog is attacking another animal in fro...,A brown dog is attacking another animal in fro...,ENTAILMENT,4.9,A_entails_B,B_neutral_A


In [89]:
df_relation = df[['sentence_A', 'sentence_B', 'relatedness_score']].copy()

# Normalize relatedness score to [0,1] range (original is likely 1–5)
df_relation['relatedness_score'] = df_relation['relatedness_score'].clip(lower=0, upper=5) / 5.0

df_relation = df_relation[
    df_relation.apply(
        lambda row: len(row['sentence_A'].split()) <= 10 and len(row['sentence_B'].split()) <= 10,
        axis=1
    )
]
# Keep only top 250 rows
df_relation = df_relation.head(100)

# Save to CSV
df_relation.to_csv('SICKrelatedness.csv', index=False)

In [90]:
df_relation.head(10)

Unnamed: 0,sentence_A,sentence_B,relatedness_score
11,Two dogs are fighting,Two dogs are wrestling and hugging,0.8
12,Two dogs are wrestling and hugging,There is no dog wrestling and hugging,0.66
21,A skilled person is riding a bicycle on one wheel,A person is riding the bicycle on one wheel,0.86
22,Nobody is riding the bicycle on one wheel,A person is riding the bicycle on one wheel,0.82
40,Two people are kickboxing and spectators are w...,Two people are fighting and spectators are wat...,0.88
41,Two people are kickboxing and spectators are n...,Two people are kickboxing and spectators are w...,0.68
42,Two spectators are kickboxing and some people ...,Two people are kickboxing and spectators are w...,0.8
43,Two young women are sparring in a kickboxing f...,Two women are sparring in a kickboxing match,0.98
44,Two young women are not sparring in a kickboxi...,Two women are sparring in a kickboxing match,0.78
45,Two people are kickboxing and spectators are w...,Two young women are sparring in a kickboxing f...,0.78


In [94]:
# --- Step 2: Create df_inference ---
def generate_entailment_rows(row):
    s1, s2 = row['sentence_A'], row['sentence_B']
    if len(s1.split()) > 10 or len(s2.split()) > 10:
        pass
    else:
        label = row['entailment_label']
        rows = []
        if label == 'ENTAILMENT':
            if row['entailment_AB'] == 'A_entails_B':
                rows.append((s1, s2, 1.0))
            else: 
                rows.append((s1, s2, 0.5))
            if row['entailment_BA'] == 'B_entails_A':
                rows.append((s2, s1, 1.0))
            else: 
                rows.append((s1, s2, 0.5))
        elif label == 'CONTRADICTION':
            rows.append((s1, s2, 0.0))
        else:  # NEUTRAL
            rows.append((s1, s2, 0.5))    
    return rows

df_inference = df.copy()

df_inference = df_inference[
    df_inference.apply(
        lambda row: len(row['sentence_A'].split()) <= 10 and len(row['sentence_B'].split()) <= 10,
        axis=1
    )
]
# Flatten the list of lists
rows = sum(df_inference.apply(generate_entailment_rows, axis=1), [])

# Create the final DataFrame
df_inference = pd.DataFrame(rows, columns=['sentence_1', 'sentence_2', 'entailment_score'])

In [95]:
df_inference.head(15)

Unnamed: 0,sentence_1,sentence_2,entailment_score
0,Two dogs are fighting,Two dogs are wrestling and hugging,0.5
1,Two dogs are wrestling and hugging,There is no dog wrestling and hugging,0.0
2,A skilled person is riding a bicycle on one wheel,A person is riding the bicycle on one wheel,1.0
3,A skilled person is riding a bicycle on one wheel,A person is riding the bicycle on one wheel,0.5
4,Nobody is riding the bicycle on one wheel,A person is riding the bicycle on one wheel,0.0
5,Two people are kickboxing and spectators are w...,Two people are fighting and spectators are wat...,1.0
6,Two people are kickboxing and spectators are w...,Two people are fighting and spectators are wat...,0.5
7,Two people are kickboxing and spectators are n...,Two people are kickboxing and spectators are w...,0.0
8,Two spectators are kickboxing and some people ...,Two people are kickboxing and spectators are w...,0.5
9,Two young women are sparring in a kickboxing f...,Two women are sparring in a kickboxing match,1.0


In [84]:
avg_len = df_inference['sentence_1'].apply(lambda s: len(s.split())).mean()
print("Average sentence length:", avg_len)

Average sentence length: 9.776005054892979


In [85]:
df.head()

Unnamed: 0,"The motorcycle show drew a massive crowd.,A man is playing loudly on a guitar,0.10388225317001343,False"
0,"The man is playing the guitar,He stored his bi..."
1,The park is a perfect place for a bicycle picn...
2,The bicycle chain slipped off during the uphil...
3,The motorcycle tour covered five national park...
4,"A man is playing guitar for a friend,""The moto..."


In [96]:
# Save to CSV
df_inference = df_inference.head(100)
df_inference.to_csv('SICKinference.csv', index=False)