# Descriptive Statistics for Dataset

In [5]:
# KEY DESCRIPTIONS STATISTICS
import glob
import pandas as pd

# Find all .tsv files in the directory
tsv_files = glob.glob('../data/VUA18/*.tsv')

# Combine all .tsv files into a single DataFrame
combined_data = pd.concat([pd.read_csv(file, delimiter='\t') for file in tsv_files], ignore_index=True)

# Check column names
print(combined_data.columns)

# Count occurrences of each label
label_counts = combined_data['label'].value_counts()
print(label_counts)

print(len(combined_data))
print(tsv_files)

# Calculate the average length of all sentences (including duplicates)
all_sentences_length = combined_data['sentence'].apply(lambda x: len(x.split()))
average_length_all = all_sentences_length.mean()
print(f'Average sentence length (all sentences): {average_length_all}')


Index(['index', 'label', 'sentence', 'POS', 'w_index'], dtype='object')
0    150153
1     22834
Name: label, dtype: int64
172987
['../data/VUA18/train.tsv', '../data/VUA18/dev.tsv', '../data/VUA18/test.tsv']
Average sentence length (all sentences): 27.745136917803073


In [6]:
# KEY DESCRIPTIONS STATISTICS
import glob
import pandas as pd

# Find all .tsv files in the directory
tsv_files = glob.glob('../data/MOH-X/CLS/*.tsv')

# Combine all .tsv files into a single DataFrame
combined_data = pd.concat([pd.read_csv(file, delimiter='\t') for file in tsv_files], ignore_index=True)

# Check column names
print(combined_data.columns)

# Count occurrences of each label
label_counts = combined_data['label'].value_counts()
print(label_counts)

print(len(combined_data))
print(tsv_files)

# Calculate the average length of all sentences (including duplicates)
all_sentences_length = combined_data['sentence'].apply(lambda x: len(x.split()))
average_length_all = all_sentences_length.mean()
print(f'Average sentence length (all sentences): {average_length_all}')

Index(['index', 'label', 'sentence', 'pos', 'v_index'], dtype='object')
0    3320
1    3150
Name: label, dtype: int64
6470
['../data/MOH-X/CLS/train0.tsv', '../data/MOH-X/CLS/test0.tsv', '../data/MOH-X/CLS/test4.tsv', '../data/MOH-X/CLS/train4.tsv', '../data/MOH-X/CLS/test8.tsv', '../data/MOH-X/CLS/test1.tsv', '../data/MOH-X/CLS/train5.tsv', '../data/MOH-X/CLS/train6.tsv', '../data/MOH-X/CLS/train2.tsv', '../data/MOH-X/CLS/test9.tsv', '../data/MOH-X/CLS/test2.tsv', '../data/MOH-X/CLS/train7.tsv', '../data/MOH-X/CLS/train9.tsv', '../data/MOH-X/CLS/test5.tsv', '../data/MOH-X/CLS/test7.tsv', '../data/MOH-X/CLS/test6.tsv', '../data/MOH-X/CLS/train3.tsv', '../data/MOH-X/CLS/test3.tsv', '../data/MOH-X/CLS/train1.tsv', '../data/MOH-X/CLS/train8.tsv']
Average sentence length (all sentences): 6.902627511591963


In [7]:
# KEY DESCRIPTIONS STATISTICS
import glob
import pandas as pd

# Find all .tsv files in the directory
tsv_files = glob.glob('../data/TroFi/CLS/*.tsv')

# Combine all .tsv files into a single DataFrame
combined_data = pd.concat([pd.read_csv(file, delimiter='\t', quoting=3) for file in tsv_files], ignore_index=True)

# Check column names
print(combined_data.columns)

# Count occurrences of each label
label_counts = combined_data['label'].value_counts()
print(label_counts)

print(len(combined_data))
print(tsv_files)

# Calculate the average length of all sentences (including duplicates)
all_sentences_length = combined_data['sentence'].apply(lambda x: len(x.split()))
average_length_all = all_sentences_length.mean()
print(f'Average sentence length (all sentences): {average_length_all}')

Index(['index', 'label', 'sentence', 'pos', 'v_index'], dtype='object')
0    21100
1    16270
Name: label, dtype: int64
37370
['../data/TroFi/CLS/train0.tsv', '../data/TroFi/CLS/test0.tsv', '../data/TroFi/CLS/test4.tsv', '../data/TroFi/CLS/train4.tsv', '../data/TroFi/CLS/test8.tsv', '../data/TroFi/CLS/test1.tsv', '../data/TroFi/CLS/train5.tsv', '../data/TroFi/CLS/train6.tsv', '../data/TroFi/CLS/train2.tsv', '../data/TroFi/CLS/test9.tsv', '../data/TroFi/CLS/test2.tsv', '../data/TroFi/CLS/train7.tsv', '../data/TroFi/CLS/train9.tsv', '../data/TroFi/CLS/test5.tsv', '../data/TroFi/CLS/test7.tsv', '../data/TroFi/CLS/test6.tsv', '../data/TroFi/CLS/train3.tsv', '../data/TroFi/CLS/test3.tsv', '../data/TroFi/CLS/train1.tsv', '../data/TroFi/CLS/train8.tsv']
Average sentence length (all sentences): 25.693336901257695


In [8]:
import glob
import pandas as pd
import io

# Find all .tsv files in the directory
tsv_files = glob.glob('../data/VUA20/*.tsv')

def read_fixed_tsv(file_path):
    """Reads a TSV file, ensuring the first line uses tabs (\t) instead of spaces."""
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    
    # Fix the first line dynamically (convert spaces to tabs)
    first_line_fixed = '\t'.join(lines[0].strip().split())  # Ensure tab-separated headers
    data_lines = lines[1:]  # Keep the rest of the file unchanged

    # Convert the fixed content into a format pandas can read
    fixed_content = first_line_fixed + '\n' + ''.join(data_lines)
    
    # Read into pandas from in-memory string (without modifying file)
    df = pd.read_csv(io.StringIO(fixed_content), sep='\t', encoding='utf-8')

    return df

# Read all fixed TSV files into a single DataFrame
combined_data = pd.concat([read_fixed_tsv(file) for file in tsv_files], ignore_index=True)

# # Print column names to verify correct parsing
# print("Column Names:", combined_data.columns.tolist())

# # Print first few rows to check correct structure
# print(combined_data.head())

# Count occurrences of each label
label_counts = combined_data['label'].value_counts()
print(label_counts)

print(len(combined_data))
print(tsv_files)

# Calculate the average length of all sentences (including duplicates)
all_sentences_length = combined_data['sentence'].apply(lambda x: len(x.split()))
average_length_all = all_sentences_length.mean()
print(f'Average sentence length (all sentences): {average_length_all}')


0    159135
1     23146
Name: label, dtype: int64
182281
['../data/VUA20/train.tsv', '../data/VUA20/test.tsv']
Average sentence length (all sentences): 27.66809486452236


# Percentage of Metaphor that Belongs to a Particular Word Class

In [10]:
# KEY DESCRIPTIONS STATISTICS
import glob
import pandas as pd

# Find all .tsv files in the directory
tsv_files = glob.glob('../data/VUA18/*.tsv')

# Combine all .tsv files into a single DataFrame
combined_data = pd.concat([pd.read_csv(file, delimiter='\t') for file in tsv_files], ignore_index=True)

# Filter rows where label equals 1
filtered_data = combined_data[combined_data['label'] == 1]

# Count occurrences of each POS tag in the filtered data
pos_counts = filtered_data['POS'].value_counts()

# Calculate the percentage of each POS tag
pos_percentage = (pos_counts / pos_counts.sum()) * 100

# Print the POS percentages
print(pos_percentage)


VERB     29.534904
ADP      28.295524
NOUN     21.025663
ADJ       8.426031
DET       6.139967
ADV       3.481650
PART      2.299203
PROPN     0.516773
PRON      0.135762
PUNCT     0.096348
NUM       0.030656
INTJ      0.017518
Name: POS, dtype: float64


In [12]:
import glob
import pandas as pd
import io

# Find all .tsv files in the directory
tsv_files = glob.glob('../data/VUA20/*.tsv')

def read_fixed_tsv(file_path):
    """Reads a TSV file, ensuring the first line uses tabs (\t) instead of spaces."""
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    
    # Fix the first line dynamically (convert spaces to tabs)
    first_line_fixed = '\t'.join(lines[0].strip().split())  # Ensure tab-separated headers
    data_lines = lines[1:]  # Keep the rest of the file unchanged

    # Convert the fixed content into a format pandas can read
    fixed_content = first_line_fixed + '\n' + ''.join(data_lines)
    
    # Read into pandas from in-memory string (without modifying file)
    df = pd.read_csv(io.StringIO(fixed_content), sep='\t', encoding='utf-8')

    return df

# Read all fixed TSV files into a single DataFrame
combined_data = pd.concat([read_fixed_tsv(file) for file in tsv_files], ignore_index=True)

# Filter rows where label equals 1
filtered_data = combined_data[combined_data['label'] == 1]

# Count occurrences of each POS tag in the filtered data
pos_counts = filtered_data['POS'].value_counts()

# Calculate the percentage of each POS tag
pos_percentage = (pos_counts / pos_counts.sum()) * 100

# Print the POS percentages
print(pos_percentage)


VERB     26.898816
ADP      24.284974
NOUN     23.235116
ADJ       8.861142
DET       6.126329
ADV       5.491230
PROPN     2.471269
AUX       2.138598
PRON      0.155534
INTJ      0.108010
SCONJ     0.077767
NUM       0.047524
X         0.047524
PART      0.034563
PUNCT     0.021602
Name: POS, dtype: float64


In [14]:
# KEY DESCRIPTIONS STATISTICS
import glob
import pandas as pd

# Find all .tsv files in the directory
tsv_files = glob.glob('../data/MOH-X/CLS/*.tsv')

# Combine all .tsv files into a single DataFrame
combined_data = pd.concat([pd.read_csv(file, delimiter='\t') for file in tsv_files], ignore_index=True)

# Filter rows where label equals 1
filtered_data = combined_data[combined_data['label'] == 1]

# Count occurrences of each POS tag in the filtered data
pos_counts = filtered_data['pos'].value_counts()

# Calculate the percentage of each POS tag
pos_percentage = (pos_counts / pos_counts.sum()) * 100

# Print the POS percentages
print(pos_percentage)


VERB    100.0
Name: pos, dtype: float64


In [16]:
# KEY DESCRIPTIONS STATISTICS
import glob
import pandas as pd

# Find all .tsv files in the directory
tsv_files = glob.glob('../data/TroFi/CLS/*.tsv')

# Combine all .tsv files into a single DataFrame
combined_data = pd.concat([pd.read_csv(file, delimiter='\t', quoting=3) for file in tsv_files], ignore_index=True)

# Filter rows where label equals 1
filtered_data = combined_data[combined_data['label'] == 1]

# Count occurrences of each POS tag in the filtered data
pos_counts = filtered_data['pos'].value_counts()

# Calculate the percentage of each POS tag
pos_percentage = (pos_counts / pos_counts.sum()) * 100

# Print the POS percentages
print(pos_percentage)


VERB    100.0
Name: pos, dtype: float64


# Data Exploration

In [3]:
import pandas as pd

# Load data
train_data_path = '../data/VUA18/train.tsv'
train_data = pd.read_csv(train_data_path, delimiter='\t')

# Check column names
print(train_data.columns)

# Count occurrences of each label
label_counts = train_data['label'].value_counts()
print(label_counts)

Index(['index', 'label', 'sentence', 'POS', 'w_index'], dtype='object')
0    85177
1    12481
Name: label, dtype: int64


In [4]:
total_samples = label_counts[0] + label_counts[1]
num_classes = 2

class_weight_0 = total_samples / (label_counts[0] * num_classes)
class_weight_1 = total_samples / (label_counts[1] * num_classes)

print(f'Class weight for index 0: {class_weight_0}')
print(f'Class weight for index 1: {class_weight_1}')


Class weight for index 0: 0.5732650832971342
Class weight for index 1: 3.9122666453008574


In [5]:
print(f"percentag of class 0: {label_counts[0]/total_samples}")
print(f"percentag of class 1: {label_counts[1]/total_samples}")

percentag of class 0: 0.8721968502324439
percentag of class 1: 0.12780314976755616


# Retrieving Target Word's Definition from WordNet

In [1]:
import nltk
import pandas as pd
from nltk.corpus import wordnet as wn

In [2]:
# Load and sample the training data
train_data_path = '../data/VUA18/train.tsv'
train_data = pd.read_csv(train_data_path, delimiter='\t')

# Load and sample the validation data
val_data_path = '../data/VUA18/dev.tsv'
val_data = pd.read_csv(val_data_path, delimiter='\t')

# Load and sample the test data
test_data_path = '../data/VUA18/test.tsv'
test_data = pd.read_csv(test_data_path, delimiter='\t')

In [3]:
test_data.head()

Unnamed: 0,index,label,sentence,POS,w_index
0,a3m-fragment02 45,0,Design: Crossed lines over the toytown tram: C...,NOUN,0
1,a3m-fragment02 45,1,Design: Crossed lines over the toytown tram: C...,ADJ,1
2,a3m-fragment02 45,1,Design: Crossed lines over the toytown tram: C...,NOUN,2
3,a3m-fragment02 45,1,Design: Crossed lines over the toytown tram: C...,ADP,3
4,a3m-fragment02 45,0,Design: Crossed lines over the toytown tram: C...,DET,4


In [4]:
# Target words from the sentence
train_data['target'] = train_data.apply(lambda row: row['sentence'].split(' ')[row['w_index']], axis=1)
val_data['target'] = val_data.apply(lambda row: row['sentence'].split(' ')[row['w_index']], axis=1)
test_data['target'] = test_data.apply(lambda row: row['sentence'].split(' ')[row['w_index']], axis=1)

In [5]:
test_data.head()

Unnamed: 0,index,label,sentence,POS,w_index,target
0,a3m-fragment02 45,0,Design: Crossed lines over the toytown tram: C...,NOUN,0,Design:
1,a3m-fragment02 45,1,Design: Crossed lines over the toytown tram: C...,ADJ,1,Crossed
2,a3m-fragment02 45,1,Design: Crossed lines over the toytown tram: C...,NOUN,2,lines
3,a3m-fragment02 45,1,Design: Crossed lines over the toytown tram: C...,ADP,3,over
4,a3m-fragment02 45,0,Design: Crossed lines over the toytown tram: C...,DET,4,the


In [None]:
def get_basic_meaning(word):
    # regex to remove punctuations
    word = re.sub(r'[^\w\s]', '', word)
    word = word.lower().strip()
    synsets = wordnet.synsets(word)
    if synsets:
        return synsets[0].definition()
    else:
        print(f"Word {word} not found in WordNet")
        return ''

In [7]:
# Preprocess target words
train_data['target'] = train_data['target'].str.replace(r'\W+', '', regex=True)
val_data['target'] = val_data['target'].str.replace(r'\W+', '', regex=True)
test_data['target'] = test_data['target'].str.replace(r'\W+', '', regex=True)

In [8]:
test_data.head()

Unnamed: 0,index,label,sentence,POS,w_index,target
0,a3m-fragment02 45,0,Design: Crossed lines over the toytown tram: C...,NOUN,0,Design
1,a3m-fragment02 45,1,Design: Crossed lines over the toytown tram: C...,ADJ,1,Crossed
2,a3m-fragment02 45,1,Design: Crossed lines over the toytown tram: C...,NOUN,2,lines
3,a3m-fragment02 45,1,Design: Crossed lines over the toytown tram: C...,ADP,3,over
4,a3m-fragment02 45,0,Design: Crossed lines over the toytown tram: C...,DET,4,the


In [9]:
# Get the basic meaning of the target words
train_data['basic_meaning'] = train_data['target'].apply(get_basic_meaning)
val_data['basic_meaning'] = val_data['target'].apply(get_basic_meaning)
test_data['basic_meaning'] = test_data['target'].apply(get_basic_meaning)

In [10]:
test_data.head(15)

Unnamed: 0,index,label,sentence,POS,w_index,target,basic_meaning
0,a3m-fragment02 45,0,Design: Crossed lines over the toytown tram: C...,NOUN,0,Design,the act of working out the form of something (...
1,a3m-fragment02 45,1,Design: Crossed lines over the toytown tram: C...,ADJ,1,Crossed,travel across or pass over
2,a3m-fragment02 45,1,Design: Crossed lines over the toytown tram: C...,NOUN,2,lines,a formation of people or things one beside ano...
3,a3m-fragment02 45,1,Design: Crossed lines over the toytown tram: C...,ADP,3,over,(cricket) the division of play during which si...
4,a3m-fragment02 45,0,Design: Crossed lines over the toytown tram: C...,DET,4,the,
5,a3m-fragment02 45,0,Design: Crossed lines over the toytown tram: C...,ADJ,5,toytown,
6,a3m-fragment02 45,0,Design: Crossed lines over the toytown tram: C...,NOUN,6,tram,a conveyance that transports passengers or fre...
7,a3m-fragment02 45,0,Design: Crossed lines over the toytown tram: C...,NOUN,7,City,a large and densely populated urban area; may ...
8,a3m-fragment02 45,0,Design: Crossed lines over the toytown tram: C...,NOUN,8,transport,something that serves as a means of transporta...
9,a3m-fragment02 45,0,Design: Crossed lines over the toytown tram: C...,VERB,9,could,
