In [6]:
import csv
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from transformers import BertTokenizer
import re

def preprocess_and_save_for_transformer(input_csv, output_pickle, class_column, summary_column, max_len=512):
  """
  Preprocess the text column of a CSV file, encode it with BERT, and pickle the result.

  Each row's summary is cleaned with ``preprocess_text`` (lowercasing,
  punctuation removal, stop word removal, stemming), then encoded to token ids
  by the 'bert-base-uncased' tokenizer with special tokens added. The pickle
  file holds a dict ``{'tokens': [...], 'labels': [...]}`` with one entry per
  CSV row, in file order.

  Args:
      input_csv (str): Path to the input CSV file (must have a header row).
      output_pickle (str): Path to the output pickle file.
      class_column (str): Name of the column containing class labels.
      summary_column (str): Name of the column containing summaries.
      max_len (int, optional): Maximum number of token ids kept per row
          (the tokenizer truncates the end of longer sequences). Defaults to 512.

  Raises:
      KeyError: If a row has no ``class_column`` or ``summary_column`` entry.
  """
  import pickle

  # Load stop words and stemmer
  stop_words = set(stopwords.words('english'))
  stemmer = PorterStemmer()

  # Load the tokenizer once: it does not depend on the row, and reloading it
  # for every row repeats the (expensive) vocabulary load each time.
  tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

  tokens = []
  labels = []

  with open(input_csv, 'r', newline='') as csvfile:
    for row in csv.DictReader(csvfile):
      processed_text = preprocess_text(row[summary_column], stop_words, stemmer)

      # Truncate at the token level so max_len bounds the encoded sequence.
      # Slicing the string would count characters instead of tokens and
      # could cut a word in half.
      tokenized_text = tokenizer.encode(
          processed_text,
          add_special_tokens=True,
          max_length=max_len,
          truncation=True,
      )

      tokens.append(tokenized_text)
      labels.append(row[class_column])

  # Save preprocessed data to pickle file
  data = {'tokens': tokens, 'labels': labels}
  with open(output_pickle, 'wb') as handle:
    pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)

  print(f"Preprocessed data saved to: {output_pickle}")


def preprocess_text(text, stop_words, stemmer):
  """
  Normalise a piece of text ahead of tokenization.

  The text is lowercased, stripped of every character that is neither a word
  character nor whitespace, split on whitespace, filtered against the stop
  word set, stemmed, and re-joined with single spaces.

  Args:
      text (str): Text to be preprocessed.
      stop_words (set): Words to drop (compared after lowercasing and
          punctuation removal).
      stemmer (nltk.stem.PorterStemmer): Object whose ``stem(word)`` method
          is applied to every remaining word.

  Returns:
      str: The stemmed, space-separated words that survived filtering.
  """

  # Lowercase first, then strip punctuation in a single regex pass
  cleaned = re.sub(r'[^\w\s]', '', text.lower())

  kept_stems = []
  for token in cleaned.split():
    # Stop words are dropped before stemming, so the set is matched
    # against the unstemmed form.
    if token in stop_words:
      continue
    kept_stems.append(stemmer.stem(token))

  return ' '.join(kept_stems)


# Common stem shared by every input shard
base_filename = "merged_file_after_filteration"

# Suffixes of the four CSV shards to process
extensions = [f"_part_{part}.csv" for part in range(1, 5)]

# Settings that are identical for every shard
class_column = "owner"  # Replace with the actual class column name
summary_column = "Summary"  # Replace with the actual summary column name
max_len = 512  # Adjust max_len if needed

# Preprocess each shard into its own pickle file
for ext in extensions:
    input_csv = base_filename + ext
    # Drop the trailing ".csv" (4 characters) before building the output name
    output_pickle = "preprocessed_data_" + ext[:-4] + ".pickle"

    preprocess_and_save_for_transformer(
        input_csv,
        output_pickle,
        class_column,
        summary_column,
        max_len,
    )

KeyboardInterrupt: 

In [1]:
import csv

file_path = 'merged_file_after_filteration_part_1.csv'

# Open with newline='' as the csv module requires, so that newlines embedded
# in quoted fields are parsed correctly.
with open(file_path, 'r', newline='') as file:
    # Create a CSV reader object
    csv_reader = csv.reader(file)

    # The header is the first record; fall back to an empty list when the
    # file has no rows instead of raising StopIteration.
    column_names = next(csv_reader, [])

print("Column names in the CSV file:", column_names)

Column names in the CSV file: ['owner', 'Summary']
