In [None]:
# Fetch the 20 Newsgroups "bydate" archive. -nc (no-clobber) skips the download when the
# archive is already present, so re-running the cell does not pile up "...tar.gz.1" copies.
!wget -nc http://qwone.com/~jason/20Newsgroups/20news-bydate.tar.gz
# Extract without -v: the verbose flag prints one line per extracted file and floods the cell output.
!tar -xzf ./20news-bydate.tar.gz

In [2]:
import os

def read_files_from_directories(base_directory, encoding=None):
    """
    Reads every file below the specified directory (recursively) and returns their cleaned contents.

    Directories and files are visited in sorted order so that the returned list -- and every
    index derived from it later -- is reproducible across runs and file systems.

    :param base_directory: The root directory to start reading files from.
    :param encoding: Text encoding passed to open(). The default (None) keeps the platform's
        default encoding; undecodable bytes are dropped in either case (errors='ignore').
    :return: A list of strings, one per readable file, with newlines and tabs replaced by spaces.
        A missing or empty directory yields an empty list.
    """
    all_file_contents = []

    # Walk through all directories and files in the base directory
    for current_dir, sub_dirs, files in os.walk(base_directory):
        # Sorting in place steers the order in which os.walk descends into subdirectories
        sub_dirs.sort()
        for file_name in sorted(files):
            file_path = os.path.join(current_dir, file_name)

            try:
                with open(file_path, 'r', encoding=encoding, errors='ignore') as file:
                    file_content = file.read()
            except OSError as e:
                # Best effort: report unreadable files and carry on with the rest
                print(f"Error reading file {file_path}: {str(e)}")
                continue

            # Clean the content by replacing newlines and tabs with spaces
            all_file_contents.append(file_content.replace('\n', ' ').replace('\t', ' '))

    return all_file_contents

# Define the root directory where the files are located
root_directory = '20news-bydate-test'

# os.walk() yields nothing for a missing directory, which would silently produce an empty
# corpus and empty results in every later cell -- fail fast with an actionable message instead.
if not os.path.isdir(root_directory):
    raise FileNotFoundError(
        f"Dataset directory '{root_directory}' not found; run the download/extract cell first."
    )

# Read the contents of all files in the root directory
all_file_contents = read_files_from_directories(root_directory)

# Join all contents into a single string (input of the sentence splitter)
all_file_contents_joined = ' '.join(all_file_contents)

In [None]:
# Preview only the beginning of the corpus: displaying the whole string would embed the
# entire dataset in the notebook output.
all_file_contents_joined[:1000]

In [None]:
import re

# Regular expression pattern to match sentence-ending punctuation followed by whitespace or end of string.
# The punctuation is captured so that re.split() keeps it in its result.
sentence_end_pattern = r'([.!?])(?=\s|$)'

def split_sentences_with_end_symbols(text):
    """
    Splits the input text into sentences based on sentence-ending punctuation marks (. ! ?).

    A mark only ends a sentence when it is followed by whitespace or the end of the text, so
    "v1.2" is not split. Each sentence keeps its terminating mark and any leading whitespace.
    Text after the last terminator is returned as a final sentence instead of being dropped,
    unless it is empty or whitespace only.

    :param text: The input text to be split into sentences.
    :return: A list of sentences (empty for empty or whitespace-only input).
    """
    # With one capturing group, re.split() returns [body, mark, body, mark, ..., tail]
    parts = re.split(sentence_end_pattern, text)
    sentences = [body + mark for body, mark in zip(parts[0::2], parts[1::2])]

    # The tail has no terminator; keep it when it carries actual text
    tail = parts[-1]
    if tail.strip():
        sentences.append(tail)
    return sentences

# Split the concatenated content (all_file_contents_joined, built in the loading cell) into sentences
sentences = split_sentences_with_end_symbols(all_file_contents_joined)

# Print the sentence count and a short preview; printing every sentence of the corpus
# would flood the notebook output.
print(f"{len(sentences)} sentences")
for sentence in sentences[:20]:
    print(sentence)

In [25]:
import re

# Regular expression patterns with examples in comments.
# tokenize() applies every pattern with Pattern.match(sentence, index), which already anchors the
# match at `index`. The patterns therefore must not start with '^': that only matches at the real
# beginning of the string and would disable a pattern everywhere else in the sentence.

# Matches a word (letters, digits, "_", "'" and "-"), optionally followed by "@host",
# plus any punctuation glued directly to it
word_pattern = r'\b[\w\'-]+(?:@[\w.]+)?\b[^\w\s]*'
# Examples: "hello", "world's", "user@host", "123", "park.", "Email:"

# Matches email addresses
email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
# Examples: "user@example.com", "john.doe@company.co.uk", "test123@test-domain.org"

# Matches Chinese phone numbers
phone_china_pattern = r"(?:\+86)(?:(?:-\d{3}-|\(\d{3}\))\d{4}-\d{4}|\d{11})"
# Examples: "+86-123-4567-8901", "+86(123)4567-8901", "+8612345678901"

# Matches Russian phone numbers
phone_ru_pattern = r"(?:\+7|8)(?:(?:-\d{3}-|\(\d{3}\))\d{3}-\d{2}-\d{2}|\d{10})"
# Examples: "+7-123-456-78-90", "+7(123)456-78-90", "81234567890"

# Matches US phone numbers
phone_usa_pattern = r"(?:\+1)(?:(?:-\d{3}-|\(\d{3}\))\d{3}-\d{4}|\d{10})"
# Examples: "+1-123-456-7890", "+1(123)456-7890", "+11234567890"

# General phone number pattern
phone_pattern = r'(\+\d{1,3}\s?)?(?:\(\d{1,3}\)|\d{1,3})[-.\s]?\d{1,4}[-.\s]?\d{1,4}(?:[-.\s]?\d{1,4})?'
# Examples: "+1 123-456-7890", "(123)456-7890", "123 456 7890", "+44 123 456 7890"

# Matches ordinal numbers and possessives
numeral_pattern = r"\b\d+(?:st|nd|rd|th|'s)\b"
# Examples: "1st", "2nd", "3rd", "4th", "90's"

# Matches dates
dates_pattern = r'\b\d{1,2}[./-]\d{1,2}[./-]\d{2,4}\b'
# Examples: "01/01/2023", "01-01-2023", "01.01.2023", "1/1/23"

# Matches times
times_pattern = r'\b(?:[01]?[0-9]|2[0-3]):[0-5][0-9]\b'
# Examples: "12:34", "09:45", "1:00", "23:59"

# Matches text emoticons
emoji_pattern = r'[:;][-]*[)D\(\[\]PpOo/\\]'
# Examples: ":-)", ":)", ";D"

# Matches simple assignments: a single-letter name, "=", then an expression
math_formula_pattern = r'\b[a-zA-Z]\s*=\s*[a-zA-Z0-9\+\-\*/\(\)\[\]\^,\s]+\s*;?\b'
# Examples: "a = b*c", "b = c"

# Compile regular expressions for efficiency
word_re = re.compile(word_pattern)
email_re = re.compile(email_pattern)
# The general pattern is tried first; the country-specific ones pick up the formats it
# rejects (for example "+1-123-456-7890").
phone_re = re.compile(f"({phone_pattern})|({phone_ru_pattern})|({phone_usa_pattern})|({phone_china_pattern})")
numeral_re = re.compile(numeral_pattern)
dates_re = re.compile(dates_pattern)
times_re = re.compile(times_pattern)
emoji_re = re.compile(emoji_pattern)
math_formula_re = re.compile(math_formula_pattern)

# Insertion order is the matching priority used by tokenize(): the first pattern that
# matches at the current position wins, with 'word' as the catch-all.
pattern_map = {
    'math': math_formula_re,
    'email': email_re,
    'phone': phone_re,
    'numeral': numeral_re,
    'date': dates_re,
    'time': times_re,
    'emoji': emoji_re,
    'word': word_re
}

def tokenize(sentence):
    """
    Splits a sentence into tokens using the prioritized patterns in pattern_map.

    At each position the patterns are tried in pattern_map order and the first match becomes
    the next token. Characters no pattern matches at (whitespace, stray punctuation) are
    skipped and do not appear in the result.

    :param sentence: The text to tokenize.
    :return: A list of token strings in order of appearance (empty for an empty sentence).
    """
    tokens = []
    index = 0
    while index < len(sentence):
        for pattern in pattern_map.values():
            match = pattern.match(sentence, index)
            if match:
                tokens.append(match.group())
                index = match.end()
                break
        else:
            # If no match is found, move to the next character
            index += 1
    return tokens

# Demo: one sentence that exercises the email, phone, date, time and formula patterns
sentence = (
    "Hello world! My email is user@example.com. "
    "Phone: +1-123-456-7890. Date: 01/01/2023. "
    "Time: 12:34. Math: a + b = c."
)
tokens = tokenize(sentence)
print(tokens)

In [None]:
import re
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
# Newer NLTK releases (3.9+, to the best of our knowledge) make nltk.pos_tag look up the
# tagger under this name instead; requesting both keeps the notebook runnable on either.
nltk.download('averaged_perceptron_tagger_eng')

# Initialize stemmer and lemmatizer (shared by the annotation helpers)
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(treebank_tag):
    """
    Converts an NLTK (Treebank-style) POS tag to the matching WordNet POS constant.

    :param treebank_tag: POS tag string as returned by nltk.pos_tag.
    :return: wordnet.ADJ, wordnet.VERB, wordnet.NOUN or wordnet.ADV depending on whether the
        tag starts with 'J', 'V', 'N' or 'R'; wordnet.NOUN for every other tag.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wn_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wn_pos
    # Default to noun
    return wordnet.NOUN

In [8]:
def get_stub_stem(token):
    """Returns the stem of the token as produced by the module-level stemmer."""
    return stemmer.stem(token)

def get_stub_lemma(token, pos_tag=None):
    """
    Returns the lemma of the token as produced by the module-level lemmatizer.

    :param token: The token to lemmatize.
    :param pos_tag: Optional POS value forwarded as `pos`; when falsy, the lemmatizer's
        own default is used.
    :return: The lemma returned by the lemmatizer.
    """
    if not pos_tag:
        return lemmatizer.lemmatize(token)
    return lemmatizer.lemmatize(token, pos=pos_tag)

def create_annotation(sentences):
    """
    Builds tab-separated annotation rows for every token of every sentence.

    Each token row has the form "<sentence>_<token>\t<token>\t<pos>\t<stem>\t<lemma>", where
    both numbers are 1-based and <pos> is the value returned by get_wordnet_pos. An empty
    string is appended after each sentence as a separator.

    :param sentences: Iterable of sentence strings.
    :return: A list of row strings, including the empty separator rows.
    """
    rows = []
    for sentence_number, sentence in enumerate(sentences, start=1):
        tagged = nltk.pos_tag(tokenize(sentence))

        for token_number, (token, treebank_tag) in enumerate(tagged, start=1):
            wn_pos = get_wordnet_pos(treebank_tag)
            stem = get_stub_stem(token)
            lemma = get_stub_lemma(token, wn_pos)
            rows.append(f"{sentence_number}_{token_number}\t{token}\t{wn_pos}\t{stem}\t{lemma}")

        # Blank row marks the end of the sentence
        rows.append("")
    return rows

In [12]:
# Annotate every sentence of the corpus
annotations = create_annotation(sentences)

# Write explicitly as UTF-8: tokens can contain non-ASCII characters, and the platform
# default encoding may be unable to encode them or may differ between machines.
with open('annotations_hsy.tsv', 'w', encoding='utf-8') as file:
    file.write("\n".join(annotations))

In [None]:
# Homonym check: "duck" appears once as a noun ("ducks") and once as a verb ("ducking")
sentences_homonym = [
    "Children like to feed the ducks in the pond at the park.",
    "The boy is throwing a rock at my head, but I am ducking so it doesn’t hit me."
]

annotations_homonyms = create_annotation(sentences_homonym)

# One annotation row per output line
print(*annotations_homonyms, sep="\n")

In [13]:
# Remove a test class left in the kernel namespace under an older name, if it is there
globals().pop('TestTokenizeFunction', None)

In [None]:
import unittest

class TestTokenize(unittest.TestCase):
    """Pins the token boundaries tokenize() produces for representative inputs."""

    def _assert_tokens(self, original, expected):
        """Asserts that tokenizing `original` yields exactly `expected`."""
        self.assertEqual(tokenize(original), expected)

    def test_email(self):
        self._assert_tokens(
            "My email is test@example.com",
            ['My', 'email', 'is', 'test@example.com'],
        )

    def test_phone(self):
        self._assert_tokens(
            "My phone is +1 123-456-7890",
            ['My', 'phone', 'is', '+1 123-456-7890'],
        )

    def test_date_and_time(self):
        self._assert_tokens(
            "Now the time is 2:13 01/10/2024",
            ['Now', 'the', 'time', 'is', '2:13', '01/10/2024'],
        )

    def test_emoji(self):
        self._assert_tokens("Hello :)", ['Hello', ':)'])

    def test_mix(self):
        # Show the full diff on failure; the expected list is long
        self.maxDiff = None
        original = "Children like to feed the ducks in the pond at the park. The boy is throwing a rock at my head, but I am ducking so it doesn’t hit me. Email: example@example.com, Phone: +1 123-456-7890, Date: 10/01/2023, Time: 13:45. I'm happy :) and you? a = b*c;"
        expected = [
            'Children', 'like', 'to', 'feed', 'the', 'ducks', 'in', 'the', 'pond', 'at', 'the', 'park.',
            'The', 'boy', 'is', 'throwing', 'a', 'rock', 'at', 'my', 'head,', 'but', 'I', 'am',
            'ducking', 'so', 'it', 'doesn’', 't', 'hit', 'me.',
            'Email:', 'example@example.com', 'Phone:', '+1 123-456-7890',
            'Date:', '10/01/2023', 'Time:', '13:45',
            "I'm", 'happy', ':)', 'and', 'you?', 'a = b*c',
        ]
        self._assert_tokens(original, expected)


# Run the test cases in place. An explicit argv is supplied (its only element is a
# placeholder) and exit=False is passed so the call returns instead of exiting.
if __name__ == "__main__":
    unittest.main(
        argv=['first-arg-is-ignored'],
        exit=False,
    )