# Descriptive Statistics for the Datasets

In [1]:
# KEY DESCRIPTIVE STATISTICS
import glob
import pandas as pd

# Find all .tsv files in the directory. glob returns matches in arbitrary order,
# so sort them to make the concatenated row order reproducible across machines.
tsv_files = sorted(glob.glob('../data/VUA18/*.tsv'))
if not tsv_files:
    # Fail with a clear message instead of pandas' "No objects to concatenate"
    raise FileNotFoundError("No .tsv files found under ../data/VUA18/")

# Combine all .tsv files into a single DataFrame.
# quoting=3 is csv.QUOTE_NONE: quote characters in the text are kept as plain data.
combined_data = pd.concat(
    [pd.read_csv(file, delimiter='\t', quoting=3) for file in tsv_files],
    ignore_index=True,
)

# Check column names
print(combined_data.columns)

# Count occurrences of each label
label_counts = combined_data['label'].value_counts()

print("size: ", len(combined_data))
# .get(1, 0) reports 0 instead of raising KeyError when no row carries label 1
print("percentage of Metaphor: ", label_counts.get(1, 0)/len(combined_data)*100)

# Remove duplicate sentences
combined_data = combined_data.drop_duplicates(subset='sentence')

# Calculate the average length of all sentences (unique sentences only).
# Length = number of whitespace-separated tokens. The .str accessor yields NaN
# for a missing sentence (instead of raising), and mean() skips NaN.
all_sentences_length = combined_data['sentence'].str.split().str.len()
average_length_all = all_sentences_length.mean()
print(f'Average sentence length (unique sentences): {average_length_all}')

Index(['index', 'label', 'sentence', 'POS', 'w_index'], dtype='object')
size:  180175
percentage of Metaphor:  13.196614402664075
Average sentence length (unique sentences): 17.194816373374138


In [2]:
import glob
import pandas as pd
import io

# Find all .tsv files in the directory. glob returns matches in arbitrary order,
# so sort them to make the concatenated row order reproducible across machines.
tsv_files = sorted(glob.glob('../data/VUA20/*.tsv'))
if not tsv_files:
    # Fail early with a clear message rather than at the later pd.concat call
    raise FileNotFoundError("No .tsv files found under ../data/VUA20/")

def read_fixed_tsv(file_path):
    r"""Read a TSV file whose header row is separated by spaces instead of tabs.

    The first line is split on any run of whitespace and re-joined with tabs;
    every later line is handed to pandas unchanged. The file on disk is not
    modified.

    Args:
        file_path: Path of the UTF-8 encoded TSV file to read.

    Returns:
        pandas.DataFrame parsed from the repaired content with ``sep='\t'``.

    Raises:
        pandas.errors.EmptyDataError: If the file is completely empty.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        header_line = f.readline()  # '' only when the file has no content at all
        remaining_content = f.read()  # data rows, kept exactly as read

    if not header_line:
        # Same exception pandas raises for an empty input, but naming the file
        raise pd.errors.EmptyDataError(f"No columns to parse from file: {file_path}")

    # Fix the first line dynamically (convert any whitespace runs to single tabs)
    first_line_fixed = '\t'.join(header_line.split())

    # NOTE(review): default quote handling is used here, unlike the quoting=3
    # reads elsewhere in this notebook — confirm that is intended for this data.
    # No encoding argument: the content is already a decoded in-memory string.
    return pd.read_csv(io.StringIO(first_line_fixed + '\n' + remaining_content), sep='\t')

# Read all fixed TSV files into a single DataFrame
combined_data = pd.concat([read_fixed_tsv(file) for file in tsv_files], ignore_index=True)

# Count occurrences of each label
label_counts = combined_data['label'].value_counts()

print("size: ", len(combined_data))
# .get(1, 0) reports 0 instead of raising KeyError when no row carries label 1
print("percentage of Metaphor: ", label_counts.get(1, 0)/len(combined_data)*100)

# Remove duplicate sentences
combined_data = combined_data.drop_duplicates(subset='sentence')

# Calculate the average length of all sentences (unique sentences only).
# Length = number of whitespace-separated tokens. The .str accessor yields NaN
# for a missing sentence (instead of raising), and mean() skips NaN.
all_sentences_length = combined_data['sentence'].str.split().str.len()
average_length_all = all_sentences_length.mean()
print(f'Average sentence length (unique sentences): {average_length_all}')

size:  182281
percentage of Metaphor:  12.69797729878594
Average sentence length (unique sentences): 16.38779174147217


In [3]:
# KEY DESCRIPTIVE STATISTICS
import glob
import pandas as pd

# Find all .tsv files in the directory. glob returns matches in arbitrary order,
# so sort them to make the concatenated row order reproducible across machines.
tsv_files = sorted(glob.glob('../data/MOH-X/CLS/*.tsv'))
if not tsv_files:
    # Fail with a clear message instead of pandas' "No objects to concatenate"
    raise FileNotFoundError("No .tsv files found under ../data/MOH-X/CLS/")

# Combine all .tsv files into a single DataFrame.
# quoting=3 is csv.QUOTE_NONE: quote characters in the text are kept as plain data.
combined_data = pd.concat(
    [pd.read_csv(file, delimiter='\t', quoting=3) for file in tsv_files],
    ignore_index=True,
)
# Drop rows that are identical in every column before counting
combined_data = combined_data.drop_duplicates()

# Check column names
print(combined_data.columns)

# Count occurrences of each label
label_counts = combined_data['label'].value_counts()
print("size: ", len(combined_data))
# .get(1, 0) reports 0 instead of raising KeyError when no row carries label 1
print("percentage of Metaphor: ", label_counts.get(1, 0)/len(combined_data)*100)

# Remove duplicate sentences
combined_data = combined_data.drop_duplicates(subset='sentence')

# Calculate the average length of all sentences (unique sentences only).
# Length = number of whitespace-separated tokens. The .str accessor yields NaN
# for a missing sentence (instead of raising), and mean() skips NaN.
all_sentences_length = combined_data['sentence'].str.split().str.len()
average_length_all = all_sentences_length.mean()
print(f'Average sentence length (unique sentences): {average_length_all}')

Index(['index', 'label', 'sentence', 'pos', 'v_index'], dtype='object')
size:  647
percentage of Metaphor:  48.68624420401855
Average sentence length (unique sentences): 6.904836193447738


In [4]:
# KEY DESCRIPTIVE STATISTICS
import glob
import pandas as pd

# Find all .tsv files in the directory. glob returns matches in arbitrary order,
# so sort them to make the concatenated row order reproducible across machines.
tsv_files = sorted(glob.glob('../data/TroFi/CLS/*.tsv'))
if not tsv_files:
    # Fail with a clear message instead of pandas' "No objects to concatenate"
    raise FileNotFoundError("No .tsv files found under ../data/TroFi/CLS/")

# Combine all .tsv files into a single DataFrame.
# quoting=3 is csv.QUOTE_NONE: quote characters in the text are kept as plain data.
combined_data = pd.concat(
    [pd.read_csv(file, delimiter='\t', quoting=3) for file in tsv_files],
    ignore_index=True,
)
# Drop rows that are identical in every column before counting
combined_data = combined_data.drop_duplicates()

# Check column names
print(combined_data.columns)

# Count occurrences of each label
label_counts = combined_data['label'].value_counts()
print("size: ", len(combined_data))
# .get(1, 0) reports 0 instead of raising KeyError when no row carries label 1
print("percentage of Metaphor: ", label_counts.get(1, 0)/len(combined_data)*100)

# Remove duplicate sentences
combined_data = combined_data.drop_duplicates(subset='sentence')

# Calculate the average length of all sentences (unique sentences only).
# Length = number of whitespace-separated tokens. The .str accessor yields NaN
# for a missing sentence (instead of raising), and mean() skips NaN.
all_sentences_length = combined_data['sentence'].str.split().str.len()
average_length_all = all_sentences_length.mean()
print(f'Average sentence length (unique sentences): {average_length_all}')

Index(['index', 'label', 'sentence', 'pos', 'v_index'], dtype='object')
size:  3737
percentage of Metaphor:  43.53759700294354
Average sentence length (unique sentences): 25.732445184568416


# Percentage of Metaphors Belonging to Each Word Class

In [5]:
# KEY DESCRIPTIVE STATISTICS
import glob
import pandas as pd

# Find all .tsv files in the directory. glob returns matches in arbitrary order,
# so sort them to make the concatenated row order reproducible across machines.
tsv_files = sorted(glob.glob('../data/VUA18/*.tsv'))
if not tsv_files:
    # Fail with a clear message instead of pandas' "No objects to concatenate"
    raise FileNotFoundError("No .tsv files found under ../data/VUA18/")

# Combine all .tsv files into a single DataFrame.
# quoting=3 (csv.QUOTE_NONE) matches how the VUA18 files are read for the
# descriptive statistics earlier in this notebook, so both cells parse the same
# rows; with default quoting, quote characters in the text are treated as
# field delimiters rather than data.
combined_data = pd.concat(
    [pd.read_csv(file, delimiter='\t', quoting=3) for file in tsv_files],
    ignore_index=True,
)

# Filter rows where label equals 1
filtered_data = combined_data[combined_data['label'] == 1]

# Count occurrences of each POS tag in the filtered data
pos_counts = filtered_data['POS'].value_counts()

# Calculate the percentage of each POS tag
pos_percentage = (pos_counts / pos_counts.sum()) * 100

# Print the POS percentages
print(pos_percentage)


VERB     29.534904
ADP      28.295524
NOUN     21.025663
ADJ       8.426031
DET       6.139967
ADV       3.481650
PART      2.299203
PROPN     0.516773
PRON      0.135762
PUNCT     0.096348
NUM       0.030656
INTJ      0.017518
Name: POS, dtype: float64


In [6]:
import glob
import pandas as pd
import io

# Find all .tsv files in the directory. glob returns matches in arbitrary order,
# so sort them to make the concatenated row order reproducible across machines.
tsv_files = sorted(glob.glob('../data/VUA20/*.tsv'))
if not tsv_files:
    # Fail early with a clear message rather than at the later pd.concat call
    raise FileNotFoundError("No .tsv files found under ../data/VUA20/")

def read_fixed_tsv(file_path):
    r"""Read a TSV file whose header row is separated by spaces instead of tabs.

    The first line is split on any run of whitespace and re-joined with tabs;
    every later line is handed to pandas unchanged. The file on disk is not
    modified.

    Args:
        file_path: Path of the UTF-8 encoded TSV file to read.

    Returns:
        pandas.DataFrame parsed from the repaired content with ``sep='\t'``.

    Raises:
        pandas.errors.EmptyDataError: If the file is completely empty.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        header_line = f.readline()  # '' only when the file has no content at all
        remaining_content = f.read()  # data rows, kept exactly as read

    if not header_line:
        # Same exception pandas raises for an empty input, but naming the file
        raise pd.errors.EmptyDataError(f"No columns to parse from file: {file_path}")

    # Fix the first line dynamically (convert any whitespace runs to single tabs)
    first_line_fixed = '\t'.join(header_line.split())

    # NOTE(review): default quote handling is used here, unlike the quoting=3
    # reads elsewhere in this notebook — confirm that is intended for this data.
    # No encoding argument: the content is already a decoded in-memory string.
    return pd.read_csv(io.StringIO(first_line_fixed + '\n' + remaining_content), sep='\t')

# Load every VUA20 split through the header-repairing reader and stack the results
frames = [read_fixed_tsv(path) for path in tsv_files]
combined_data = pd.concat(frames, ignore_index=True)

# Keep only the rows whose label is 1
metaphor_mask = combined_data['label'] == 1
filtered_data = combined_data[metaphor_mask]

# Tally how often each POS tag appears among the kept rows
pos_counts = filtered_data['POS'].value_counts()

# Express each tally as a share (in percent) of all kept rows
pos_percentage = pos_counts.div(pos_counts.sum()).mul(100)

# Show the resulting distribution
print(pos_percentage)


VERB     26.898816
ADP      24.284974
NOUN     23.235116
ADJ       8.861142
DET       6.126329
ADV       5.491230
PROPN     2.471269
AUX       2.138598
PRON      0.155534
INTJ      0.108010
SCONJ     0.077767
NUM       0.047524
X         0.047524
PART      0.034563
PUNCT     0.021602
Name: POS, dtype: float64


In [7]:
# KEY DESCRIPTIVE STATISTICS
import glob
import pandas as pd

# Find all .tsv files in the directory. glob returns matches in arbitrary order,
# so sort them to make the concatenated row order reproducible across machines.
tsv_files = sorted(glob.glob('../data/MOH-X/CLS/*.tsv'))
if not tsv_files:
    # Fail with a clear message instead of pandas' "No objects to concatenate"
    raise FileNotFoundError("No .tsv files found under ../data/MOH-X/CLS/")

# Combine all .tsv files into a single DataFrame.
# quoting=3 (csv.QUOTE_NONE) matches how the MOH-X files are read for the
# descriptive statistics earlier in this notebook, so both cells parse the same
# rows; with default quoting, quote characters in the text are treated as
# field delimiters rather than data.
combined_data = pd.concat(
    [pd.read_csv(file, delimiter='\t', quoting=3) for file in tsv_files],
    ignore_index=True,
)

# Filter rows where label equals 1
filtered_data = combined_data[combined_data['label'] == 1]

# Count occurrences of each POS tag in the filtered data
pos_counts = filtered_data['pos'].value_counts()

# Calculate the percentage of each POS tag
pos_percentage = (pos_counts / pos_counts.sum()) * 100

# Print the POS percentages
print(pos_percentage)


VERB    100.0
Name: pos, dtype: float64


In [8]:
# KEY DESCRIPTIVE STATISTICS
import glob
import pandas as pd

# Locate the TroFi classification splits
tsv_files = glob.glob('../data/TroFi/CLS/*.tsv')

# Parse each split (quoting=3 disables quote handling) and stack them into one frame
frames = [pd.read_csv(path, delimiter='\t', quoting=3) for path in tsv_files]
combined_data = pd.concat(frames, ignore_index=True)

# Keep only the rows whose label is 1
metaphor_mask = combined_data['label'] == 1
filtered_data = combined_data[metaphor_mask]

# Tally how often each POS tag appears among the kept rows
pos_counts = filtered_data['pos'].value_counts()

# Express each tally as a share (in percent) of all kept rows
pos_percentage = pos_counts.div(pos_counts.sum()).mul(100)

# Show the resulting distribution
print(pos_percentage)


VERB    100.0
Name: pos, dtype: float64
