# Text Representation

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.model_selection import train_test_split

import joblib


## Setup

In [2]:
# Download the saved model from Google Drive
!gdown --id 1JQWAMjNb-iCm8nQUZeo5B2VrmLcjEk6Q -O preprocessed.zip

# Unzip the ZIP file
import zipfile

# Extract the contents into the current directory
with zipfile.ZipFile("preprocessed.zip", 'r') as zip_ref:
    zip_ref.extractall()  # Extracts into the current working directory

print("Files successfully extracted into the current directory.")


Downloading...
From (original): https://drive.google.com/uc?id=1JQWAMjNb-iCm8nQUZeo5B2VrmLcjEk6Q
From (redirected): https://drive.google.com/uc?id=1JQWAMjNb-iCm8nQUZeo5B2VrmLcjEk6Q&confirm=t&uuid=3dd9a0ae-da15-4d89-9c7d-ba3d172a2ae3
To: /content/preprocessed.zip
100% 63.2M/63.2M [00:03<00:00, 17.7MB/s]
Files successfully extracted into the current directory.


## Load Data

In [None]:
# Read the training split first, then the test split.
# na_filter=False switches off pandas' missing-value detection, so empty
# fields are kept as empty strings instead of being converted to NaN.
train, test = (
    pd.read_csv(csv_path, na_filter=False)
    for csv_path in ("train_processed.csv", "test_processed.csv")
)


## Representation

#### Tokenize with DistilBertTokenizer

In [None]:
# Install Hugging Face Transformers
!pip install transformers

from transformers import DistilBertTokenizer

# 1. Load the BERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# 2. Tokenize the dataset
def tokenize_texts(texts, max_length=128):
    """
    Encode a batch of raw strings with the notebook-level ``tokenizer``.

    Every sequence is brought to exactly ``max_length`` tokens: shorter
    inputs are padded and longer ones are truncated.

    Args:
    - texts (list of str): Text samples to encode.
    - max_length (int): Fixed token length of each encoded sequence.

    Returns:
    - The tokenizer's output, requested as PyTorch tensors; the notebook
      reads its 'input_ids' and 'attention_mask' entries.
    """
    encode_options = {
        "padding": "max_length",   # pad short sequences up to max_length
        "truncation": True,        # cut long sequences down to max_length
        "max_length": max_length,  # target sequence length
        "return_tensors": "pt",    # ask for PyTorch tensors
    }
    return tokenizer(texts, **encode_options)

# Encode the minimally-cleaned comments of both splits with the same settings.
# NOTE: `toknized_data_test` (sic) keeps its spelling because the cell that
# saves the test tensors reads the variable under that exact name.
tokenized_data = tokenize_texts(
    train['minimal_cleaned_comment_text'].tolist(),
    max_length=128,
)
toknized_data_test = tokenize_texts(
    test['minimal_cleaned_comment_text'].tolist(),
    max_length=128,
)

# 3. Inspect the tokenized output: first two rows of the training encoding
print("Example tokenized input IDs:", tokenized_data['input_ids'][:2], sep="\n")
print("\nExample attention mask:", tokenized_data['attention_mask'][:2], sep="\n")





The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Example tokenized input IDs:
tensor([[  101,  7526,  2339,  1996, 10086,  2015,  2081,  2104,  2026,  5310,
         18442, 13076, 12392,  2050,  5470,  2020, 16407,  1029,  2027,  4694,
          1005,  1056,  3158,  9305, 22556,  1010,  2074,  8503,  2006,  2070,
          3806,  2044,  1045,  5444,  2012,  2047,  2259, 14421,  6904,  2278,
          1012,  1998,  3531,  2123,  1005,  1056,  6366,  1996, 23561,  2013,
          1996,  2831,  3931,  2144,  1045,  1005,  1049,  3394,  2085,  1012,
          6486,  1012, 16327,  1012,  4229,  1012,  2676,   102,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0, 

#### Save Representation

In [None]:
import os

import torch

# Persist the training encoding as a dict holding only the two tensors the
# notebook uses ('input_ids' and 'attention_mask').
train_tensor_path = 'tokenized_train_data.pt'
torch.save({
    'input_ids': tokenized_data['input_ids'],
    'attention_mask': tokenized_data['attention_mask'],
}, train_tensor_path)

# Report where the file was actually written: the path above is relative to
# the working directory, so a hard-coded "/content/" prefix is only correct
# when the notebook happens to run from /content.
print(f"Tokenized data saved to {os.path.abspath(train_tensor_path)}")


Tokenized data saved to /content/tokenized_train_data.pt


In [None]:
import os

import torch

# Persist the test encoding as a dict holding only the two tensors the
# notebook uses ('input_ids' and 'attention_mask').
test_tensor_path = 'tokenized_test_data.pt'
torch.save({
    'input_ids': toknized_data_test['input_ids'],
    'attention_mask': toknized_data_test['attention_mask'],
}, test_tensor_path)

# Report where the file was actually written: the path above is relative to
# the working directory, so a hard-coded "/content/" prefix is only correct
# when the notebook happens to run from /content.
print(f"Tokenized data saved to {os.path.abspath(test_tensor_path)}")


Tokenized data saved to /content/tokenized_test_data.pt


### TfidfVectorizer

In [None]:
# TF-IDF vectorizer with max_features set to 10,000
vectorizer = TfidfVectorizer(max_features=10_000)

# Fit on the training comments only, then reuse the fitted vectorizer to
# transform the test comments.
X_train = vectorizer.fit_transform(train['cleaned_comment_text'])
X_test = vectorizer.transform(test['cleaned_comment_text'])

# Label matrices: the same six target columns, in the same order, for each split
y_train, y_test = (
    split[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].values
    for split in (train, test)
)


In [None]:
# Write the feature matrices and label arrays to disk, one file per object.
# The final call is left as the cell's last expression so its return value
# is still displayed as the cell output.
joblib.dump(value=X_train, filename="X_train.joblib")
joblib.dump(value=X_test, filename="X_test.joblib")
joblib.dump(value=y_train, filename="y_train.joblib")
joblib.dump(value=y_test, filename="y_test.joblib")


['y_test.joblib']