In [14]:
import pandas as pd
import re
from nltk.tokenize import TreebankWordTokenizer, ToktokTokenizer, TweetTokenizer
from statistics import mean

In [16]:
# Load the line index: a tab-separated file read with header=None, so pandas
# does not treat the first line as a header and the column names are supplied
# explicitly instead.
file_path = "dataset/line_index.tsv"
column_names = ["key", "unused", "abstract"]
data = pd.read_csv(
    file_path,
    sep="\t",
    names=column_names,
    header=None,
)

In [17]:
# 1. Basic Preprocessing
# Distinct values in the "key" column are counted as distinct articles.
num_unique_articles = data["key"].nunique()

# Character length of every non-null abstract, then the average of those lengths.
non_null_abstracts = data["abstract"].dropna()
abstract_lengths = non_null_abstracts.map(len)
mean_abstract_length = abstract_lengths.mean()

# Report both figures, one per line.
print(
    f"Number of unique articles: {num_unique_articles}",
    f"Mean length of abstracts in characters: {mean_abstract_length:.2f}",
    sep="\n",
)

Number of unique articles: 2906
Mean length of abstracts in characters: 50.40


In [18]:
# 2. Word-Level Preprocessing
# Every step below works on the non-null abstracts, so select them once.
non_null_abstracts = data["abstract"].dropna()

# Whitespace-split each abstract into its list of words.
word_lists = non_null_abstracts.map(str.split)

# The vocabulary is the set of distinct whitespace-delimited words across
# all abstracts; its size is the number of unique words.
vocabulary = set()
for words in word_lists:
    vocabulary.update(words)
num_unique_words = len(vocabulary)

print(f"Number of unique words in the vocabulary: {num_unique_words}")

# NLTK tokenizers to compare, keyed by the label printed in the report lines.
tokenizers = {
    "TreebankWordTokenizer": TreebankWordTokenizer(),
    "ToktokTokenizer": ToktokTokenizer(),
    "TweetTokenizer": TweetTokenizer(),
}

# For each tokenizer, tokenize every abstract and report the total token count.
for tokenizer_name, tokenizer in tokenizers.items():
    tokenized_words = non_null_abstracts.map(tokenizer.tokenize)
    num_tokens = tokenized_words.map(len).sum()
    print(f"Number of tokens using {tokenizer_name}: {num_tokens}")

Number of unique words in the vocabulary: 4347
Number of tokens using TreebankWordTokenizer: 26179
Number of tokens using ToktokTokenizer: 26179
Number of tokens using TweetTokenizer: 26179


In [19]:
# 3. Domain Specificity and Regex
# Use regex to extract numbers (ints, floats, years, percentages)


                   key numbers  number_percentage
0  khm_0308_0011865648      []                0.0
1  khm_0308_0032157149      []                0.0
2  khm_0308_0038959268      []                0.0
3  khm_0308_0054635313      []                0.0
4  khm_0308_0055735195      []                0.0
