In [None]:
import csv
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from transformers import BertTokenizer
import re

def preprocess_and_save_for_transformer(input_csv, output_pickle, class_column, summary_column, max_len=512):
  """
  Preprocesses the summaries of a CSV file for a BERT model and pickles the result.

  Every row's summary is cleaned with preprocess_text (lowercasing, punctuation
  and stop word removal, stemming) and then encoded to token ids with the
  'bert-base-uncased' tokenizer. The output pickle holds a dict with two
  parallel lists: 'tokens' (one list of token ids per row) and 'labels'
  (the raw value of the class column for that row).

  Args:
      input_csv (str): Path to the input CSV file (must have a header row).
      output_pickle (str): Path to the output pickle file.
      class_column (str): Name of the column containing class labels.
      summary_column (str): Name of the column containing summaries.
      max_len (int, optional): Maximum number of token ids per encoded summary,
          special tokens included. Longer summaries are truncated at the end.
          Defaults to 512.

  Raises:
      KeyError: If class_column or summary_column is not a column of the CSV.
  """
  import pickle

  # Load stop words and stemmer
  stop_words = set(stopwords.words('english'))
  stemmer = PorterStemmer()

  # The tokenizer does not depend on the row, so build it once instead of
  # reloading it for every line of the CSV.
  tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

  # Parallel lists: tokens[i] is the encoded summary whose label is labels[i]
  tokens = []
  labels = []

  with open(input_csv, 'r', newline='', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile)

    for row in reader:
      # Get class and summary text
      label = row[class_column]
      summary_text = row[summary_column]

      # Preprocess text
      processed_text = preprocess_text(summary_text, stop_words, stemmer)

      # Let the tokenizer enforce the limit. max_len is a number of tokens, so
      # slicing the string by characters would neither guarantee the limit
      # (special tokens are added afterwards) nor respect word boundaries.
      tokenized_text = tokenizer.encode(
          processed_text,
          add_special_tokens=True,
          truncation=True,
          max_length=max_len,
      )

      # Add tokens and label to lists
      tokens.append(tokenized_text)
      labels.append(label)

  # Save preprocessed data to pickle file
  data = {'tokens': tokens, 'labels': labels}
  with open(output_pickle, 'wb') as handle:
    pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)

  print(f"Preprocessed data saved to: {output_pickle}")


def preprocess_text(text, stop_words, stemmer):
  """
  Normalises a piece of text into a space-separated string of stemmed words.

  The text is lowercased, every character that is neither a word character
  nor whitespace is deleted, the result is split on whitespace, stop words
  are dropped and each remaining word is stemmed.

  Args:
      text (str): Text to be preprocessed.
      stop_words (set): Words to discard (compared after lowercasing).
      stemmer: Object exposing a ``stem(word)`` method, e.g. nltk's PorterStemmer.

  Returns:
      str: The surviving stems joined by single spaces ('' if none survive).
  """

  # Case-fold first, then delete punctuation in a single pass
  normalized = re.sub(r'[^\w\s]', '', text.lower())

  # Walk the words once: skip stop words, stem everything else
  kept_stems = []
  for token in normalized.split():
    if token in stop_words:
      continue
    kept_stems.append(stemmer.stem(token))

  return ' '.join(kept_stems)

# Folder (with trailing slash) and common file-name stem of the dataset shards
directory_path = 'BugWhiz/kaggle_dataset/'
base_filename = "merged_file_after_filteration"

# Per-shard suffix, ".csv" extension included
extensions = ["_part_1.csv", "_part_2.csv", "_part_3.csv", "_part_4.csv"]

# Settings shared by every shard
class_column = "owner"  # Replace with the actual class column name
summary_column = "Summary"  # Replace with the actual summary column name
max_len = 512  # Adjust max_len if needed

# Preprocess each shard into its own pickle file
for ext in extensions:
    input_csv = f"{directory_path}{base_filename}{ext}"
    # ext[:-4] drops ".csv"; ext already starts with "_", so the resulting
    # name contains a double underscore (e.g. preprocessed_data__part_1.pickle)
    output_pickle = f"preprocessed_data_{ext[:-4]}.pickle"

    preprocess_and_save_for_transformer(
        input_csv=input_csv,
        output_pickle=output_pickle,
        class_column=class_column,
        summary_column=summary_column,
        max_len=max_len,
    )

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Preprocessed data saved to: preprocessed_data__part_1.pickle


In [None]:
# Fetch the NLTK stop word corpus. stopwords.words('english') in the
# preprocessing function reads it, so this cell has to run before that
# function is called on a fresh runtime.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
!git clone https://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@github.com/MUSTAFA-Hamzawy/BugWhiz.git -b classification


Cloning into 'BugWhiz'...
remote: Enumerating objects: 242, done.
remote: Counting objects: 100% (39/39), done.
remote: Compressing objects: 100% (17/17), done.
remote: Total 242 (delta 19), reused 39 (delta 19), pack-reused 203
Receiving objects: 100% (242/242), 144.60 MiB | 15.22 MiB/s, done.
Resolving deltas: 100% (53/53), done.
Updating files: 100% (25/25), done.


In [None]:
import csv

# Print the header row of the first dataset shard to check the column names
directory_path = 'BugWhiz/kaggle_dataset'
file_path = 'merged_file_after_filteration_part_1.csv'
file_path = directory_path + "/" + file_path
# newline='' is what the csv module requires of the file object so that
# newlines embedded in quoted fields are parsed correctly.
with open(file_path, 'r', newline='', encoding='utf-8') as file:
    # Create a CSV reader object
    csv_reader = csv.reader(file)

    # Read the first row of the CSV file; the [] default makes an empty file
    # report no columns instead of raising StopIteration.
    column_names = next(csv_reader, [])

print("Column names in the CSV file:", column_names)

Column names in the CSV file: ['owner', 'Summary']


In [None]:
# List the current working directory (used here to confirm that the BugWhiz
# checkout is present).
!ls

BugWhiz  sample_data
