# Prprocess Dataset: text to embedding
- ref: https://huggingface.co/docs/transformers/tasks/sequence_classification

In [69]:
# import package
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, BertModel, BertTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import pipeline
from sklearn.model_selection import train_test_split
import torch
from tqdm import tqdm

In [70]:
# set random seed
np.random.seed(42)

## Dataset

In [71]:
# load data
train_df = pd.read_csv('dataset/train.csv', sep='\t', encoding='utf-8')
test_df = pd.read_csv('dataset/test.csv', sep='\t', encoding='utf-8')
print(f"Training data shape: {train_df.shape}")
print(train_df.head())
print(f"Testing data shape: {test_df.shape}")
print(test_df.head())   # no labels

Training data shape: (4986, 2)
                                                text  label
0  Get the latest from TODAY Sign up for our news...      1
1  2d  Conan On The Funeral Trump Will Be Invited...      1
2  It’s safe to say that Instagram Stories has fa...      0
3  Much like a certain Amazon goddess with a lass...      0
4  At a time when the perfect outfit is just one ...      0
Testing data shape: (1247, 2)
   id                                               text
0   2  The 2017 Teen Choice Awards ceremony was held ...
1   3  The concert, part of “The Joshua Tree Tour,” w...
2   4  Selena Gomez refuses to talk to her mother abo...
3   5  This is worse than a lump of coal in your stoc...
4   6  Luann De Lesseps is going to rehab after her a...


In [72]:
# EDA

# check NaN values
print(f"Null values in training data:")
print(train_df.isnull().sum())
# print unique labels
print(f"Unique labels in training data:")
print(train_df['label'].unique())
# find the row that label == 'label'
print(f"Rows with label 'label':")
print(train_df[train_df['label'] == 'label'])

# remove the row that label == 'label'
train_df = train_df[train_df['label'] != 'label']

# save labels as int type
train_df['label'] = train_df['label'].astype(int)

Null values in training data:
text     0
label    0
dtype: int64
Unique labels in training data:
[1 0]
Rows with label 'label':
Empty DataFrame
Columns: [text, label]
Index: []


In [73]:
# reset the index
train_df.reset_index(drop=True, inplace=True)

# info
print(f"train info:")
print(train_df.info())
print(train_df.head())

train info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4986 entries, 0 to 4985
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    4986 non-null   object
 1   label   4986 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 78.0+ KB
None
                                                text  label
0  Get the latest from TODAY Sign up for our news...      1
1  2d  Conan On The Funeral Trump Will Be Invited...      1
2  It’s safe to say that Instagram Stories has fa...      0
3  Much like a certain Amazon goddess with a lass...      0
4  At a time when the perfect outfit is just one ...      0


In [74]:
print(f"test info:")
print(test_df.info())
print(test_df.head())

test info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1247 entries, 0 to 1246
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      1247 non-null   int64 
 1   text    1247 non-null   object
dtypes: int64(1), object(1)
memory usage: 19.6+ KB
None
   id                                               text
0   2  The 2017 Teen Choice Awards ceremony was held ...
1   3  The concert, part of “The Joshua Tree Tour,” w...
2   4  Selena Gomez refuses to talk to her mother abo...
3   5  This is worse than a lump of coal in your stoc...
4   6  Luann De Lesseps is going to rehab after her a...


In [75]:
# train validation split

train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)
print(train_df.shape)
print(val_df.shape)

print(train_df.head())
print(val_df.head())

(3988, 2)
(998, 2)
                                                   text  label
2414  Singer Aaron Carter, who has previously been f...      0
3158  Nineteen years ago, a gay man and his straight...      0
3008  The mother! of all relationships is over.  Jen...      1
3611  The Republican war on women continues unabated...      1
4517  As Taylor Swift calls out the haters on her ne...      0
                                                   text  label
1489  George Timothy Clooney (born May 6, 1961) is a...      1
2754  Do you feel it in your fingers? Do you feel it...      0
465   Advertisement  The royal family gathered this ...      0
2488  Roger Ailes, Former Fox News CEO, Dies At 77  ...      0
676   American serial child sexual abuser and physic...      0


## Tokenizer
At this stage, we transform the text data into embeddings to later feed into the model. 

We choose the `distilbert-base-uncased` model on Hugging Face for this task.

In [76]:
# load tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

In [77]:
# tokenize the text
train_encodings = tokenizer(train_df['text'].tolist(), max_length=512, truncation=True, padding=True)
val_encodings = tokenizer(val_df['text'].tolist(), max_length=512, truncation=True, padding=True)
test_encodings = tokenizer(test_df['text'].tolist(), max_length=512, truncation=True, padding=True)

In [78]:
# After tokenization, the texts are converted to input IDs and attention masks
print(train_encodings.keys())

dict_keys(['input_ids', 'attention_mask'])


In [79]:
print(train_encodings['input_ids'][0])

print(train_encodings['attention_mask'][0])

[101, 3220, 7158, 5708, 1010, 2040, 2038, 3130, 2042, 16875, 2055, 2010, 9415, 6905, 1998, 5983, 8761, 1010, 2003, 2085, 3098, 2039, 2055, 2010, 13798, 1012, 1996, 2756, 1011, 2095, 1011, 2214, 2567, 1997, 10457, 13334, 2102, 3337, 2632, 2819, 4172, 5708, 1056, 28394, 3064, 2006, 5095, 2305, 1037, 2146, 2330, 3661, 1999, 2029, 2002, 28049, 2010, 8432, 2000, 2119, 2273, 1998, 2308, 2144, 2002, 2001, 2410, 1012, 1000, 2045, 1521, 1055, 2242, 1045, 1521, 1040, 2066, 2000, 2360, 2008, 1045, 2514, 2003, 2590, 2005, 2870, 1998, 2026, 4767, 2008, 2038, 2042, 15243, 2006, 2026, 3108, 2005, 3053, 2431, 1997, 2026, 2166, 1010, 1000, 2002, 2626, 1012, 1000, 2023, 2987, 1521, 1056, 3288, 2033, 9467, 1010, 2074, 1037, 3635, 1998, 10859, 1045, 2031, 2218, 3031, 2005, 1037, 2146, 2051, 2008, 1045, 2052, 2066, 4196, 2125, 2033, 1012, 1000, 2002, 7607, 1010, 1000, 1045, 3473, 2039, 1999, 2023, 4024, 3068, 2012, 1037, 2200, 2402, 2287, 1998, 2043, 1045, 2001, 2105, 2410, 2086, 2214, 1045, 2318, 2000, 24

# Generate Embeddings

In [80]:
model = BertModel.from_pretrained("distilbert/distilbert-base-uncased")

You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.


Some weights of BertModel were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.0.attention.self.value.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.dense.bias',

In [81]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# move the model to the device
model = model.to(device)

In [89]:
# create a dataset class
class TokenDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        input_ids = self.encodings['input_ids'][idx]
        attention_mask = self.encodings['attention_mask'][idx]
        labels = self.labels[idx]
        return input_ids, attention_mask, labels

    def __len__(self):
        return len(self.labels)

In [90]:
# create dataset objects
train_dataset = TokenDataset(train_encodings, train_df['label'])
val_dataset = TokenDataset(val_encodings, val_df['label'])
test_dataset = TokenDataset(test_encodings, np.zeros(test_df.shape[0]))

In [95]:
# create a DataLoader
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [98]:
model.eval()

with torch.no_grad():
    for thing in train_loader:
        print(thing)
        # input_ids = input_ids.to(device)
        # attention_mask = attention_mask.to(device)
        # labels = labels.to(device)
        # outputs = model(input_ids, attention_mask=attention_mask)
        # print(outputs[0].shape)
        # break

KeyError: 3339

# Write out embedding to csv

In [None]:
train_embedding_df = train_df.copy()
val_embedding_df = val_df.copy()
test_embedding_df = test_df.copy()

# rename the column 'text' to 'embeddings'
train_embedding_df = train_embedding_df.rename(columns={'text': 'embedding'})
val_embedding_df = val_embedding_df.rename(columns={'text': 'embedding'})
test_embedding_df = test_embedding_df.rename(columns={'text': 'embedding'})


train_embedding_df['embedding'] = train_encodings['input_ids']
val_embedding_df['embedding'] = val_encodings['input_ids']
test_embedding_df['embedding'] = test_encodings['input_ids']

print(train_embedding_df.head())
# print(val_embedding_df.head())
# print(test_embedding_df.head())

In [None]:
# Export the dataframes to csv files
train_embedding_df.to_csv('dataset/train_embedding.csv', sep='\t', index=False)
val_embedding_df.to_csv('dataset/val_embedding.csv', sep='\t', index=False)
test_embedding_df.to_csv('dataset/test_embedding.csv', sep='\t', index=False)

# Import dataset for future use

In [None]:
import ast

# read
sample = pd.read_csv('dataset/train_embedding.csv', sep='\t', encoding='utf-8')
print(sample.head())

print(sample['embedding'][0])   
print(type(sample['embedding'][0])) # string

# convert the embeddings to list
sample['embedding'] = sample['embedding'].apply(lambda x: ast.literal_eval(x))

# convert the embeddings to list of integers
sample['embedding'] = sample['embedding'].apply(lambda x: list(map(int, x)))

print(sample['embedding'][0])
print(type(sample['embedding'][0])) # list of integers
