In [1]:
import re
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from tqdm import tqdm
import string
import nltk
import os

# Download necessary NLTK resources
nltk.download('punkt')  # Word tokenizer
nltk.download('stopwords')  # Stopwords list for multiple languages

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Define a preprocessing function for Russian
def preprocess_russian_text(text):
    """
    Preprocesses Russian text by performing the following steps:
    1. Lowercasing
    2. Removing special characters and extra spaces
    3. Tokenizing into words
    4. Removing stopwords
    5. Removing punctuation
    Args:
        text (str): The input text to preprocess.
    Returns:
        str: The cleaned, tokenized, and normalized text.
    """
    # 1. Lowercasing
    text = text.lower()

    # 2. Removing special characters and extra spaces
    text = re.sub(r"[^а-яА-ЯёЁ0-9\s]", "", text)  # Keep Cyrillic, numbers, and spaces
    text = re.sub(r"\s+", " ", text).strip()  # Remove extra spaces

    # 3. Tokenizing into words
    words = word_tokenize(text, language="russian")

    # 4. Removing stopwords
    stop_words = set(stopwords.words('russian'))
    words = [word for word in words if word not in stop_words]

    # 5. Removing punctuation
    words = [word for word in words if word not in string.punctuation]

    # Join the words back into a single string
    return " ".join(words)



In [5]:
# Define a function to process a dataset
def preprocess_russian_dataset(file_path, output_path=None):
    """
    Reads a Russian text file, preprocesses each line, and optionally saves the processed dataset.
    Args:
        file_path (str): Path to the input text file.
        output_path (str): Path to save the processed file (optional).
    Returns:
        pd.DataFrame: DataFrame containing the original and preprocessed_texts texts.
    """
    # Check if file exists
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"The file {file_path} does not exist.")

    # Read the file
    print(f"Reading file: {file_path}")
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    # Process each line
    print("Preprocessing text...")
    processed_texts = []
    for line in tqdm(lines):
        line = line.strip()  # Remove leading and trailing whitespaces
        if line:  # Skip empty lines
            processed_line = preprocess_russian_text(line)
            processed_texts.append(processed_line)

    # Create a DataFrame
    df = pd.DataFrame({
        'original_text': [line.strip() for line in lines if line.strip()],
        'processed_text': processed_texts
    })

    # Save the processed data if output_path is provided
    if output_path:
        df.to_csv(output_path, index=False)
        print(f"Processed data saved to: {output_path}")

    return df

In [16]:
def preprocess_for_generation(text):
    """
    Preprocess text for generation models by:
    1. Lowercasing
    2. Removing special characters (non-Cyrillic, non-numeric)
    3. Collapsing extra spaces
    Args:
        text (str): Input text.
    Returns:
        str: Normalized text suitable for training generation models.
    """
    text = text.lower()
    text = re.sub(r"[^а-яА-ЯёЁ0-9.,!?;\-\s]", "", text)  # Keep common punctuation
    text = re.sub(r"\s+", " ", text).strip()
    return text

In [21]:
print("Current working directory:", os.getcwd())
print("Data files in the directory:", os.listdir("../data/Russian"))
# Path to the raw text file
input_files = ["../data/Russian/gorky.txt", "../data/Russian/tolstoy.txt", "../data/Russian/dostoevskiy.txt", "../data/Russian/bulgakov.txt", "../data/Russian/chekhov.txt"]
output_files = ["../data/Russian/gorky_preprocessed.csv", "../data/Russian/tolstoy_preprocessed.csv", "../data/Russian/dostoevskiy_preprocessed.csv", "../data/Russian/bulgakov_preprocessed.csv", "../data/Russian/chekhov_preprocessed.csv"] # Path to save the processed data

for input_file, output_file in zip(input_files, output_files):
    # Preprocess the dataset
    preprocessed_df = preprocess_russian_dataset(input_file, output_path=output_file)

    # Display a sample of the processed data
    print(preprocessed_df.head())

Current working directory: E:\Uni courses\comp550\550-final-project\notebooks
Data files in the directory: ['Bulgakov', 'bulgakov.txt', 'bulgakov_preprocessed.csv', 'Chekhov', 'chekhov.txt', 'chekhov_preprocessed.csv', 'Dostoevskiy', 'dostoevskiy.txt', 'dostoevskiy_preprocessed.csv', 'Gorky', 'gorky.txt', 'Tolstoy', 'tolstoy.txt', 'tolstoy_preprocessed.csv']
Reading file: ../data/Russian/gorky.txt
Preprocessing text...


100%|██████████| 76681/76681 [00:11<00:00, 6842.84it/s]


Processed data saved to: ../data/Russian/gorky_preprocessed.csv
                                       original_text  \
0                                Жизнь Клима Самгина   
1                                       Часть первая   
2            Посвящается Марии Игнатьевне Закревской   
3                                            Глава 1   
4  Иван Акимович Самгин любил оригинальное; поэто...   

                                      processed_text  
0                                жизнь клима самгина  
1                                       часть первая  
2            посвящается марии игнатьевне закревской  
3                                            глава 1  
4  иван акимович самгин любил оригинальное поэтом...  
Reading file: ../data/Russian/tolstoy.txt
Preprocessing text...


100%|██████████| 73550/73550 [00:13<00:00, 5286.58it/s] 


Processed data saved to: ../data/Russian/tolstoy_preprocessed.csv
                           original_text                 processed_text
0                         Лев Николаевич                 лев николаевич
1                                Толстой                        толстой
2                            Воскресение                    воскресение
3  (1889—1890, 1895—1896, 1898—1899 гг.)  18891890 18951896 18981899 гг
4           Государственное издательство   государственное издательство
Reading file: ../data/Russian/dostoevskiy.txt
Preprocessing text...


100%|██████████| 59143/59143 [00:12<00:00, 4581.14it/s] 


Processed data saved to: ../data/Russian/dostoevskiy_preprocessed.csv
                                       original_text  \
0                                         Annotation   
1  «Преступление и наказание» – гениальный роман,...   
2  Многократно экранизированный и не раз поставле...   
3                                              * * *   
4                                      Часть перваяI   

                                      processed_text  
0                                                     
1  преступление наказание гениальный роман главны...  
2  многократно экранизированный поставленный сцен...  
3                                                     
4                                       часть первая  
Reading file: ../data/Russian/bulgakov.txt
Preprocessing text...


100%|██████████| 69367/69367 [00:12<00:00, 5598.57it/s]


Processed data saved to: ../data/Russian/bulgakov_preprocessed.csv
                                      original_text  \
0                                   Михаил Булгаков   
1                                   Иван Васильевич   
2                          Комедия в трех действиях   
3                                        Действуют:   
4  З и н а и д а М и х а й л о в н а – киноактриса.   

                processed_text  
0              михаил булгаков  
1              иван васильевич  
2       комедия трех действиях  
3                    действуют  
4  з н д м х й л н киноактриса  
Reading file: ../data/Russian/chekhov.txt
Preprocessing text...


100%|██████████| 52154/52154 [00:08<00:00, 5937.77it/s]


Processed data saved to: ../data/Russian/chekhov_preprocessed.csv
                                       original_text  \
0                                         Annotation   
1  «Комната, которая до сих пор называется детско...   
2  Входят Дуняша со свечой и Лопахин с книгой в р...   
3                                              * * *   
4                        Антон ЧеховДействующие лица   

                                      processed_text  
0                                                     
1  комната которая сих пор называется детскою одн...  
2           входят дуняша свечой лопахин книгой руке  
3                                                     
4                        антон чеховдействующие лица  


In [23]:
input_files = ["../data/Russian/tolstoy.txt", "../data/Russian/dostoevskiy.txt", "../data/Russian/bulgakov.txt", "../data/Russian/chekhov.txt", "../data/Russian/gorky.txt"]
output_files = ["../data/Russian/tolstoy_preprocessed_generation.txt", "../data/Russian/dostoevskiy_preprocessed_generation.txt", "../data/Russian/bulgakov_preprocessed_generation.txt", "../data/Russian/chekhov_preprocessed_generation.txt", "../data/Russian/gorky_preprocessed_generation.txt"]

for input_file, output_file in zip(input_files, output_files):
    # Read the raw text file
    with open(input_file, 'r', encoding='utf-8') as f:
        text = f.read()

    # Preprocess the text for generation
    preprocessed_text = preprocess_for_generation(text)

    # Save the preprocessed_texts text
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(preprocessed_text)

    print(f"Preprocessed text saved to: {output_file}")

Preprocessed text saved to: ../data/Russian/tolstoy_preprocessed_generation.txt
Preprocessed text saved to: ../data/Russian/dostoevskiy_preprocessed_generation.txt
Preprocessed text saved to: ../data/Russian/bulgakov_preprocessed_generation.txt
Preprocessed text saved to: ../data/Russian/chekhov_preprocessed_generation.txt
Preprocessed text saved to: ../data/Russian/gorky_preprocessed_generation.txt


In [24]:
import os
import pandas as pd

def generate_csv_from_txt(data_dir, output_file):
    """
    Combines text files for each author into a single CSV file with columns:
    - 'text': The text content.
    - 'author': The name of the author.
    
    Args:
        data_dir (str): Directory containing text files for each author.
        output_file (str): Path to save the generated CSV file.
    """
    data = []

    # Iterate over text files in the directory
    for file_name in os.listdir(data_dir):
        if file_name.endswith(".txt"):
            author_name = os.path.splitext(file_name)[0]  # Use file name as author name
            author_name = author_name.replace("_preprocessed_generation", " ")  # Replace underscores with spaces
            file_path = os.path.join(data_dir, file_name)

            # Read the text file
            with open(file_path, 'r', encoding='utf-8') as f:
                texts = f.readlines()  # Read each line as a separate text sample

            # Store data as tuples (text, author)
            for text in texts:
                text = text.strip()  # Remove leading/trailing whitespace
                if text:  # Skip empty lines
                    data.append((text, author_name))

    # Convert to DataFrame
    df = pd.DataFrame(data, columns=['text', 'author'])

    # Save to CSV
    df.to_csv(output_file, index=False, encoding='utf-8')
    print(f"CSV file generated: {output_file}")

# Example usage
data_dir = "../data/Russian/preprocessed_texts"  # Replace with your directory containing text files
output_file = "../data/Russian/all_tokenized_data.csv"
generate_csv_from_txt(data_dir, output_file)

CSV file generated: ../data/Russian/all_tokenized_data.csv


In [8]:
# Check the generated CSV file
output_file = "../data/Russian/all_tokenized_data.csv"
df = pd.read_csv(output_file)
# print(df.head())
print(df['author'].value_counts())
print(df['text'][0][:100])  # Display the first 500 characters of the first text

author
bulgakov        1
chekhov         1
dostoevskiy     1
gorky           1
tolstoy         1
Name: count, dtype: int64
михаил булгаков иван васильевич комедия в трех действиях действуют з и н а и д а м и х а й л о в н а


In [1]:
import pandas as pd

# Parameters
MAX_SEQ_LEN = 50  # Define sequence length

# Helper function to chunk text into smaller sequences
def chunk_text(text, max_seq_len):
    words = text.split()
    chunks = [' '.join(words[i:i + max_seq_len]) for i in range(0, len(words), max_seq_len)]
    return chunks

# Load the original dataset
data = pd.read_csv("../data/Russian/all_tokenized_data.csv")  # Adjust path as needed

# Create a new dataset with chunked texts
chunked_texts = []
chunked_authors = []

for text, author in zip(data['text'], data['author']):
    chunks = chunk_text(text, MAX_SEQ_LEN)
    chunked_texts.extend(chunks)
    chunked_authors.extend([author] * len(chunks))

# Create a new DataFrame with the chunked data
chunked_data = pd.DataFrame({'text': chunked_texts, 'author': chunked_authors})

# Save the chunked dataset to a new CSV file
chunked_data.to_csv("../data/Russian/author_data.csv", index=False)
print(f"Chunked dataset saved with {len(chunked_data)} rows.")

Chunked dataset saved with 86438 rows.


In [4]:
# Check the generated CSV file
output_file = "../data/Russian/author_data.csv"
df = pd.read_csv(output_file)
print(df['text'][0])

михаил булгаков иван васильевич комедия в трех действиях действуют з и н а и д а м и х а й л о в н а киноактриса. у л ь я н а а н д р е е в н а жена управдома бунши. ц а р и ц
