In [None]:
import os
import re
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
# Fetch the 'wordnet' NLTK resource (presumably the data WordNetLemmatizer reads — confirm).
nltk.download('wordnet')
# Shared English Snowball stemmer; used by stemmed_tokens().
stemmer = SnowballStemmer("english")
# Shared WordNet lemmatizer; used by lemmatized_tokens().
lemmatizer = WordNetLemmatizer()

In [None]:

def tokenize_text(text):
    """Split ``text`` into sentences and each sentence into tokens.

    Phone numbers, e-mail addresses and emoticons are detected first and
    kept as single tokens; everything else is split into word tokens and a
    fixed set of punctuation characters.

    Args:
        text: Raw input string.

    Returns:
        A list of token lists. There is one inner list per non-empty line of
        each detected sentence; lines that yield no tokens are skipped.
    """
    regex_patterns = {
        "phone_numbers": r'(?:\+\d{1,3}\s)?(?:\(\d{3}\)|\d{1,3})\s?-?\s?\d{3,4}\s?-?\s?\d{3,4}',
        "emails": r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b',
        "emoji": r'[:;8BX=][-\'`^]?[)D(|/\\]+|<3|/\(\s*\d+\s*[-+*xX]\s*\d+\s*\)|\B<3\b|\bhearts?\b',
    }

    # Placeholders are delimited by private-use code points. They are neither
    # word characters nor whitespace, so a placeholder can never fuse with an
    # adjacent word (e.g. "http" + "://") the way an underscore-based marker does.
    open_mark, close_mark = '\ue000', '\ue001'
    # Neutralise any sentinel already present in the input so it cannot be
    # mistaken for a placeholder during restoration.
    text = text.replace(open_mark, ' ').replace(close_mark, ' ')

    protected = []

    def protect(match):
        # Replace exactly the matched span (not every equal substring) and
        # remember the original text under its index.
        protected.append(match.group(0))
        return f'{open_mark}{len(protected) - 1}{close_mark}'

    # Patterns are applied in declaration order, so on overlap the earlier
    # pattern wins (phone numbers, then e-mails, then emoticons).
    for pattern in regex_patterns.values():
        text = re.sub(pattern, protect, text)

    # Sentence boundary: whitespace after '.', '?', '!' or a newline, except
    # after dotted abbreviations such as "e.g." or "Mr.".
    new_sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|!|\n)\s', text)

    # The placeholder alternative comes first so it is consumed as one token.
    token_pattern = open_mark + r'\d+' + close_mark + r'|\b\w+\b|[\(\),.—:;!?|<>"]'

    cleaned_tokens = []
    for sentence in new_sentences:
        for line in sentence.split('\n'):
            line_tokens = [
                protected[int(token[1:-1])] if token.startswith(open_mark) else token
                for token in re.findall(token_pattern, line)
            ]
            if line_tokens:
                cleaned_tokens.append(line_tokens)
    return cleaned_tokens

# def processing_sentences(sentences):
#     tokens = [tokenize_text(sentence) for sentence in sentences]
#     return tokens

# tokken = ["Москва — столица Российской Федерации. Привет :) мир\n 89205198722 som@nncs.ru, kfk", "Saveley loh. Privet!!"]
# print(processing_sentences(tokken))
# processing_sentences(tokken)
# sample_file_path = '/media/space/ssd_1_tb_evo_sumsung/MishaHW/20news-bydate-test/alt.atheism/53068'
# with open(sample_file_path, 'r', encoding='latin1') as file_name:
#     sample_content = file_name.read()
# print(tokenize_text(sample_content))

In [None]:
def stemmed_tokens(tokens_list):
    """Stem every token of every sentence with the module-level ``stemmer``.

    Args:
        tokens_list: Iterable of sentences, each an iterable of token strings.

    Returns:
        A list of lists with the same nesting as the input, holding the
        stemmed form of each token.
    """
    return [
        [stemmer.stem(word) for word in sentence]
        for sentence in tokens_list
    ]


In [None]:
def lemmatized_tokens(tokens_list):
    """Lemmatize every token of every sentence with the module-level ``lemmatizer``.

    Each token is lower-cased before it is passed to the lemmatizer.

    Args:
        tokens_list: Iterable of sentences, each an iterable of token strings.

    Returns:
        A list of lists with the same nesting as the input, holding the
        lemma of each token.
    """
    return [
        [lemmatizer.lemmatize(word.lower()) for word in sentence]
        for sentence in tokens_list
    ]


In [None]:
def save_tsv_file(tokens, stemmed_words, lemmatized_words, output_path, output_filename):
    """Write aligned token / stem / lemma triples to ``<output_path>/<output_filename>.tsv``.

    Each token produces one row ``token<TAB>stem<TAB>lemma``. Every sentence is
    followed by two empty lines.

    Args:
        tokens: List of sentences, each a list of token strings.
        stemmed_words: Stems, nested like ``tokens``.
        lemmatized_words: Lemmas, nested like ``tokens``.
        output_path: Existing directory to write into.
        output_filename: File name without the ``.tsv`` extension.
    """
    def as_field(value):
        # A raw tab or line break inside a value would shift columns or start
        # a new row; tokens such as multi-part phone numbers can contain them.
        return str(value).replace('\t', ' ').replace('\r', ' ').replace('\n', ' ')

    # os.path.join (as used elsewhere in this file) avoids producing a
    # root-anchored "/name.tsv" when output_path is empty.
    output = os.path.join(output_path, f'{output_filename}.tsv')
    with open(output, 'w', encoding='utf-8') as output_file:
        for token_sentence, stemmed_sentence, lemmatized_sentence in zip(tokens, stemmed_words, lemmatized_words):
            for token, stemma, lemma in zip(token_sentence, stemmed_sentence, lemmatized_sentence):
                output_file.write(f"{as_field(token)}\t{as_field(stemma)}\t{as_field(lemma)}\n")
            # Sentence separator.
            output_file.write("\n\n")

In [None]:
from tqdm import tqdm

def processing_files(category_path, file, category_path_new):
    """Tokenize, lemmatize and stem one input file and save the result as TSV.

    Args:
        category_path: Directory that contains the input file.
        file: Name of the input file; also used as the output file name
            (``save_tsv_file`` appends the ``.tsv`` extension).
        category_path_new: Directory the TSV file is written to.
    """
    source_path = os.path.join(category_path, file)
    # The file is decoded as latin1, as in the rest of this pipeline.
    with open(source_path, 'r', encoding='latin1') as source:
        raw_text = source.read()

    sentence_tokens = tokenize_text(raw_text)
    lemmas = lemmatized_tokens(sentence_tokens)
    stems = stemmed_tokens(sentence_tokens)

    save_tsv_file(sentence_tokens, stems, lemmas, category_path_new, file)

def processing_other_folders(category_path, folder, file_, folder_new):
    """Recursively process a nested directory, mirroring it under ``folder_new``.

    Args:
        category_path: Directory whose entries are processed.
        folder: Top-level input folder; only used in the progress-bar label.
        file_: Name of this directory; appended to ``folder_new`` to form the
            output directory and shown in the progress-bar label.
        folder_new: Parent output directory.
    """
    files_category = os.listdir(category_path)
    folder_new = os.path.join(folder_new, file_)
    # exist_ok=True so that re-running the notebook over an existing output
    # tree does not abort with FileExistsError.
    os.makedirs(folder_new, exist_ok=True)
    for file in tqdm(files_category, desc=f'Folder: {folder} | Class: {file_}'):
        entry_path = os.path.join(category_path, file)
        if os.path.isdir(entry_path):
            # tqdm.write keeps the message from breaking the active progress bar.
            tqdm.write(entry_path)
            processing_other_folders(entry_path, folder, file, folder_new)
        elif os.path.isfile(entry_path):
            processing_files(category_path, file, folder_new)

def processing_main_folders(folder, folder_new):
    """Process every category directory of ``folder`` into ``folder_new``.

    For each sub-directory of ``folder`` a directory of the same name is
    created under ``folder_new``; files are converted with
    ``processing_files`` and nested directories are handled by
    ``processing_other_folders``.

    Args:
        folder: Input dataset folder containing one directory per category.
        folder_new: Output root directory.
    """
    for category in os.listdir(folder):
        category_path = os.path.join(folder, category)
        if not os.path.isdir(category_path):
            # A stray regular file at the top level is not a category;
            # listing it as a directory would raise NotADirectoryError.
            continue
        category_path_new = os.path.join(folder_new, category)
        # exist_ok=True so that re-running the notebook over an existing
        # output tree does not abort with FileExistsError.
        os.makedirs(category_path_new, exist_ok=True)
        files_category = os.listdir(category_path)
        for file in tqdm(files_category, desc=f'Folder: {folder} | Class: {category}'):
            entry_path = os.path.join(category_path, file)
            if os.path.isfile(entry_path):
                processing_files(category_path, file, category_path_new)
            elif os.path.isdir(entry_path):
                processing_other_folders(entry_path, folder, file, category_path_new)


def processing(folders):
    """Run the pipeline over each input folder.

    The output root for a folder is the part of its name after the last
    ``'-'`` (the whole name if it contains no ``'-'``).

    Args:
        folders: Iterable of input folder paths.
    """
    for source_dir in folders:
        target_dir = source_dir.rsplit('-', 1)[-1]
        processing_main_folders(source_dir, target_dir)
        

# Input dataset folders, given as paths relative to the current working directory.
folders = ['20news-bydate-test', '20news-bydate-train']
# Run the whole pipeline; processing() derives each output root from the text
# after the last '-' in the folder name (here: 'test' and 'train').
processing(folders)