# Pipeline 3: Text Classification with plain transformers
- ref: https://huggingface.co/docs/transformers/tasks/sequence_classification

In [None]:
# import package
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import pipeline
from sklearn.model_selection import train_test_split

In [None]:
# Seed NumPy's global random generator so NumPy-based randomness repeats across runs
np.random.seed(seed=42)

## Dataset

In [None]:
# Read both splits; the files are tab-separated even though they are named .csv
read_options = dict(sep='\t', encoding='utf-8')
train_df = pd.read_csv('dataset/train.csv', **read_options)
test_df = pd.read_csv('dataset/test.csv', **read_options)

# Show the size and the first rows of each split (test data: no labels)
print(f"Training data shape: {train_df.shape}", train_df.head(), sep='\n')
print(f"Testing data shape: {test_df.shape}", test_df.head(), sep='\n')

In [None]:
# EDA

# Count missing values per column
print("Null values in training data:")
print(train_df.isnull().sum())

# List the distinct label values to spot anything unexpected
print("Unique labels in training data:")
print(train_df['label'].unique())

# Show the rows whose label is the literal string 'label'
# (NOTE(review): looks like a stray header row inside the data -- confirm)
is_header_row = train_df['label'] == 'label'
print("Rows with label 'label':")
print(train_df[is_header_row])

# Drop those rows. The explicit .copy() yields an independent frame, so the column
# assignment below is never a write into a slice of the original data (the pattern
# pandas reports with SettingWithCopyWarning).
train_df = train_df[~is_header_row].copy()

# Store the labels as integers; astype raises if a remaining label cannot be converted
train_df['label'] = train_df['label'].astype(int)

In [None]:
# Hold out a validation set: test_size=0.2 with a fixed random_state so the split is repeatable.
# NOTE: train_df is rebound to the training part, so re-running this cell
# splits the already reduced training set again.
train_df, val_df = train_test_split(train_df, random_state=42, test_size=0.2)

# Sizes of both parts first, then a preview of each
print(train_df.shape, val_df.shape, sep='\n')

print(train_df.head(), val_df.head(), sep='\n')

## Tokenizer
At this stage, we tokenize the text into sequences of token IDs (the "embeddings" referred to in the rest of this notebook), which are later fed into the model.

We choose the `distilbert-base-uncased` model on Hugging Face for this task.

In [None]:
# Load the tokenizer that belongs to the chosen DistilBERT checkpoint
MODEL_CHECKPOINT = "distilbert/distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

In [None]:
# Tokenize every split with the same settings (truncation on, max_length 512, padding on)
tokenizer_kwargs = {'max_length': 512, 'truncation': True, 'padding': True}
train_encodings = tokenizer(train_df['text'].tolist(), **tokenizer_kwargs)
val_encodings = tokenizer(val_df['text'].tolist(), **tokenizer_kwargs)
test_encodings = tokenizer(test_df['text'].tolist(), **tokenizer_kwargs)

# Dump the full tokenizer output of each split, one after another
print(train_encodings, val_encodings, test_encodings, sep='\n')

In [None]:
# After tokenization, the texts are converted to input IDs and attention masks;
# list the fields the tokenizer output actually contains
encoding_fields = train_encodings.keys()
print(encoding_fields)

In [None]:
# Print the token-ID sequence produced for every training text
# (these sequences are what the notebook calls "embeddings")
train_input_ids = train_encodings['input_ids']
for token_ids in train_input_ids:
    print(token_ids)

# we don't need the attention mask
# for attention_mask in train_encodings['attention_mask']:
#     print(attention_mask)

# Write out embedding to csv

In [None]:
# Check which container type holds the input IDs before they are stored in a DataFrame
input_ids_container = train_encodings['input_ids']
print(type(input_ids_container))

In [None]:
def _swap_text_for_input_ids(frame, encodings):
    """Return a copy of *frame* with 'text' renamed to 'embedding' and filled with the input IDs."""
    # Work on a copy so the source frame keeps its original 'text' column
    result = frame.copy().rename(columns={'text': 'embedding'})
    # Overwrite the renamed column with the tokenizer's input IDs
    result['embedding'] = encodings['input_ids']
    return result


# Build one frame per split: same columns as the source, with 'embedding' in place of 'text'
train_embedding_df = _swap_text_for_input_ids(train_df, train_encodings)
val_embedding_df = _swap_text_for_input_ids(val_df, val_encodings)
test_embedding_df = _swap_text_for_input_ids(test_df, test_encodings)

print(train_embedding_df.head())
# print(val_embedding_df.head())
# print(test_embedding_df.head())

In [None]:
# Write every frame to its own tab-separated file, leaving out the index column
export_targets = (
    (train_embedding_df, 'dataset/train_embedding.csv'),
    (val_embedding_df, 'dataset/val_embedding.csv'),
    (test_embedding_df, 'dataset/test_embedding.csv'),
)
for frame, destination in export_targets:
    frame.to_csv(destination, sep='\t', index=False)

# Import dataset for future use

In [None]:
import ast


def _parse_input_ids(cell):
    """Turn the text form of a stored list back into a list of ints."""
    # literal_eval only evaluates Python literals; int() then normalises every element
    return [int(token_id) for token_id in ast.literal_eval(cell)]


# Load the exported training frame again (tab-separated)
sample = pd.read_csv('dataset/train_embedding.csv', sep='\t', encoding='utf-8')
print(sample.head())

# Straight after loading, each 'embedding' cell is the text form of a list
raw_first = sample['embedding'][0]
print(raw_first)
print(type(raw_first)) # string

# Parse every cell back into a list of integers
sample['embedding'] = sample['embedding'].apply(_parse_input_ids)

parsed_first = sample['embedding'][0]
print(parsed_first)
print(type(parsed_first)) # list of integers
