In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
# Remove pandas' limits on total display width and on per-column text width
# so that long tweets are not truncated when frames are printed.
for display_option in ('display.width', 'display.max_colwidth'):
    pd.set_option(display_option, None)

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive. The CSV files
# read below and the per-epoch checkpoints written during training all use
# paths under this mount point, so this cell has to run before them.
from google.colab import drive
drive.mount('/content/drive')


In [None]:
#df_train = pd.read_csv("/kaggle/input/twitter-hate-speech/train_E6oV3lV.csv")

In [None]:
# Load the labelled training tweets from the mounted Drive; the first row of
# the file holds the column names.
df_train = pd.read_csv(
    "/content/drive/MyDrive/datasets_project/train_E6oV3lV.csv",
    header=0,
)
df_train.head(20)

In [None]:
# Third tweet (by position) among the rows whose label is 0
df_train.loc[df_train["label"] == 0, "tweet"].iloc[2]

In [None]:
df_train.shape

In [None]:
df_train[df_train["label"] == 0].count()


In [None]:
#df.query('label == 0').count()
len(df_train[df_train["label"] == 0])

In [None]:
len(df_train[df_train["label"] == 1])

In [None]:
df_test = pd.read_csv("/content/drive/MyDrive/datasets_project/test_tweets_anuFYb8.csv")

In [None]:
df_test.shape

In [None]:
df_test.head()

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns

In [None]:
# Bar chart of how many training tweets carry each label
fig, ax = plt.subplots(figsize=(6, 6))
sns.countplot(data=df_train, x="label", palette='viridis', ax=ax)

# Write each bar's height as text, offset 5 points above the top of the bar
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_center_x = bar.get_x() + bar.get_width() / 2.
    ax.annotate(
        f'{bar_height}',
        (bar_center_x, bar_height),
        ha='center', va='center',
        fontsize=12, color='black',
        xytext=(0, 5),
        textcoords='offset points',
    )

# Show the plot
plt.show()

In [None]:
df_train.describe()

In [None]:
df_train.groupby('label').describe()

In [None]:
df_train.head(20)

## Cleanse Data

In [None]:
import re

In [None]:
def clean_text(text):
    """Strip URLs, @mentions and #hashtags from a tweet and normalise its case.

    The three patterns are removed in order (URLs, then mentions, then
    hashtags); the remainder is lower-cased and stripped of leading/trailing
    whitespace. Punctuation and other special characters are kept, and inner
    whitespace left behind by removed tokens is not collapsed.

    Args:
        text: Raw tweet text.

    Returns:
        The cleaned, lower-cased tweet.
    """
    removal_patterns = (
        r"http\S+",  # URLs
        r"@\w+",     # mentions
        r"#\w+",     # hashtags
    )
    for pattern in removal_patterns:
        text = re.sub(pattern, "", text)
    return text.lower().strip()

In [None]:
# Raw text of the tweet stored under index label 3
df_train.loc[3, 'tweet']

In [None]:
df_train['cleaned_tweet']= df_train['tweet'].apply(clean_text)

In [None]:
df_train.head(20)

In [None]:
def decode_text(text):
    """Repair text whose UTF-8 bytes were mis-read as Latin-1 (mojibake).

    The string is re-encoded as Latin-1 to recover the original bytes, which
    are then decoded as UTF-8. Bytes that do not form valid UTF-8 are dropped
    (``errors='ignore'``).

    Args:
        text: String to repair.

    Returns:
        The re-decoded string, or ``text`` unchanged when it contains a
        character that Latin-1 cannot represent (code point above U+00FF,
        e.g. a real emoji or the euro sign). Such text cannot be Latin-1
        mojibake, so there is nothing to repair.
    """
    try:
        return text.encode('latin1').decode('utf-8', errors='ignore')
    except (UnicodeEncodeError, UnicodeDecodeError):
        # The encode step is the one that can fail: it raises
        # UnicodeEncodeError for characters outside Latin-1. Catching only
        # UnicodeDecodeError let that error escape and abort the whole
        # DataFrame.apply call.
        return text

In [None]:
def contains_invalid_encoding(text):
    """Tell whether ``text`` contains any character outside the ASCII range.

    Non-ASCII characters are used as a cheap signal that a tweet may contain
    mis-decoded bytes. The check itself only looks at the code-point range
    (anything above 0x7F); it does not verify that the text is actually
    corrupted.

    Args:
        text: Value to inspect; anything that is not a ``str`` yields False.

    Returns:
        True if ``text`` is a string with at least one non-ASCII character,
        otherwise False.
    """
    if not isinstance(text, str):
        return False
    return re.search(r'[^\x00-\x7F]+', text) is not None

In [None]:
# Force the cleaned column to str so every value can be regex-checked and re-encoded
df_train['cleaned_tweet'] = df_train['cleaned_tweet'].astype(str)


def _decode_if_non_ascii(tweet):
    """Re-decode a tweet only when it holds non-ASCII characters; otherwise return it as is."""
    if contains_invalid_encoding(tweet):
        return decode_text(tweet)
    return tweet


df_train['decoded_tweet'] = df_train['cleaned_tweet'].apply(_decode_if_non_ascii)

# Rows whose cleaned text holds at least one character outside the ASCII range
has_non_ascii = df_train['cleaned_tweet'].apply(contains_invalid_encoding)
invalid_tweets = df_train.loc[has_non_ascii]

# Rows whose text was actually changed by the decoding step
invalid_tweets_after_decoding = df_train.loc[
    df_train['cleaned_tweet'].ne(df_train['decoded_tweet'])
]

print("Tweets containing incorrectly encoded characters (outside the ASCII range):")
print(invalid_tweets[['cleaned_tweet']])

In [None]:
# Side-by-side view of each changed tweet before and after decoding
print("Decoded tweets:")
print(invalid_tweets_after_decoding.loc[:, ['cleaned_tweet', 'decoded_tweet']])

In [None]:
# Positions of the tweets that contain, anywhere in their text, a run of three
# or more identical word characters (the same threshold at which repeated_char
# collapses a run).
# re.search is used because re.match only tests at the very start of the
# string, which restricted the count to repeats inside the first word.
repeated_char_train = [
    i
    for i, tweet in enumerate(df_train['decoded_tweet'])
    if re.search(r'(\w)\1{2,}', tweet)
]

In [None]:
len(repeated_char_train)

In [None]:
#Repeated char function
def repeated_char(text):
  """Collapse every run of three or more identical word characters to one.

  Runs of exactly two characters are left alone, so "good" is unchanged while
  "goood" becomes "god".

  NOTE(review): ``\\w`` also matches digits and underscores, so a number such
  as "1000" is shortened to "10" -- confirm this is intended.

  Args:
      text: Text to normalise.

  Returns:
      The text with each qualifying run replaced by a single character.
  """
  collapsed = re.sub(r'(\w)\1{2,}', r'\1', text)
  return collapsed

In [None]:
df_train['clean_duplicate']=df_train['decoded_tweet'].apply(repeated_char)

In [None]:
df_train.head(20)

## Emoji Processing

In [None]:
!pip install emoji
import emoji
def emoji_text_trans(text):
    """Replace emojis with their textual names and drop repeated emojis.

    Each emoji is turned into its name by ``emoji.demojize``. If the same
    emoji occurs more than once, only its first occurrence is kept. Ordinary
    words are never removed, even when they repeat: the previous version
    de-duplicated every token, which silently deleted repeated words such as
    the second "not" in "not good not bad". Afterwards colons, underscores
    and hyphens are replaced by spaces, so ":red_heart:" ends up as
    "red heart".

    Args:
        text: Tweet text, possibly containing emoji characters.

    Returns:
        The text with emojis spelled out as words.
    """
    # Pad every emoji name with spaces so adjacent emojis become separate
    # ":name:" tokens that can be told apart from ordinary words.
    demojized = emoji.demojize(text, delimiters=(" :", ": "))

    seen_emojis = set()
    kept_tokens = []
    for token in demojized.split():
        is_emoji_name = len(token) > 2 and token.startswith(":") and token.endswith(":")
        if is_emoji_name:
            if token in seen_emojis:
                continue  # repeated emoji: keep the first occurrence only
            seen_emojis.add(token)
        kept_tokens.append(token)

    # Colons (emoji delimiters and any typed by the user) become spaces, then
    # whitespace is collapsed before underscores/hyphens are split, as before.
    text = ' '.join(' '.join(kept_tokens).replace(':', ' ').split())
    text = text.replace("_", " ").replace("-", " ")
    return text

In [None]:
df_train["deemoji_tweet"] = df_train["clean_duplicate"].apply(emoji_text_trans)

In [None]:
from sklearn.model_selection import train_test_split

# Features: the fully pre-processed tweet text. Target: the label column.
x = df_train["deemoji_tweet"]
y = df_train["label"]

# Hold out 20% of the rows for evaluation. The split is stratified on the
# label and uses a fixed seed so it is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
x_train.head()

## Cleansing df_test

## DistilBERT

In [None]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from sklearn.utils.class_weight import compute_class_weight
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
# Initial setup: use the GPU when one is available, otherwise the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Tokenize the training tweets: pad to the longest sequence in the set and
# truncate anything longer than 128 tokens, returning PyTorch tensors.
texts = x_train.tolist()
encoded_inputs = tokenizer(
    texts,
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors="pt",
)
input_ids, attention_mask = encoded_inputs['input_ids'], encoded_inputs['attention_mask']

# Training labels as a tensor, in the same order as the encoded texts
labels = torch.tensor(y_train.tolist())


In [None]:
# Dataset and DataLoader: mini-batches of 16 examples drawn in shuffled order
dataset = TensorDataset(input_ids, attention_mask, labels)
train_loader = DataLoader(dataset, batch_size=16, shuffle=True)

# Model: pre-trained DistilBERT with a two-class classification head, moved to the selected device
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)
model.to(device)

# Class weighting to handle imbalance: 'balanced' weights computed from the
# training labels are passed to the cross-entropy loss.
balanced_weights = compute_class_weight('balanced', classes=np.array([0, 1]), y=labels.numpy())
class_weights = torch.tensor(balanced_weights, dtype=torch.float).to(device)
criterion = torch.nn.CrossEntropyLoss(weight=class_weights)

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

In [None]:
# Training
epochs = 5
training_loss = []  # mean batch loss of each completed epoch

# Directory on the mounted Drive that receives one checkpoint per epoch
save_path = "/content/drive/MyDrive/datasets_project"

model.train()
for epoch in range(epochs):
    epoch_loss = 0
    for batch in train_loader:
        b_input_ids, b_attention_mask, b_labels = tuple(t.to(device) for t in batch)

        optimizer.zero_grad()
        outputs = model(
            input_ids=b_input_ids,
            attention_mask=b_attention_mask,
            labels=b_labels,
        )
        # The loss returned in `outputs` is not used; the class-weighted
        # criterion is applied to the logits instead.
        loss = criterion(outputs.logits, b_labels)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    avg_loss = epoch_loss / len(train_loader)
    training_loss.append(avg_loss)
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {avg_loss:.4f}")

    # Save the model and tokenizer after each epoch, in one directory per epoch
    checkpoint_dir = os.path.join(save_path, f"distilbert_model_epoch_{epoch + 1}")
    model.save_pretrained(checkpoint_dir)
    tokenizer.save_pretrained(checkpoint_dir)

In [None]:
# test_dataset = TensorDataset(input_ids, attention_mask)
# test_loader = DataLoader(test_dataset, batch_size=8)

# # Perform Batch Inference
# y_pred = []
# model.eval()
# with torch.no_grad():
#     for batch in test_loader:
#         b_input_ids, b_attention_mask = tuple(t.to(device) for t in batch)
#         outputs = model(input_ids=b_input_ids, attention_mask=b_attention_mask)
#         logits = outputs.logits
#         y_pred.extend(torch.argmax(logits, dim=1).cpu().numpy())

In [None]:
texts_test = x_test.tolist()    # held-out tweet texts
labels_test = y_test.tolist()   # held-out labels

# Tokenize with the same settings that were used for the training texts
encoded_test_inputs = tokenizer(
    texts_test,
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors="pt",
)

test_input_ids = encoded_test_inputs['input_ids']
test_attention_mask = encoded_test_inputs['attention_mask']
test_labels = torch.tensor(labels_test)

# DataLoader for the held-out set; shuffling is off so batches follow the order of x_test / y_test
test_dataset = TensorDataset(test_input_ids, test_attention_mask, test_labels)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Evaluate the model: count predictions that equal the true label
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for batch in test_loader:
        b_input_ids, b_attention_mask, b_labels = [t.to(device) for t in batch]

        outputs = model(input_ids=b_input_ids, attention_mask=b_attention_mask)
        logits = outputs.logits
        # Predicted class = index of the largest logit
        predictions = logits.argmax(dim=-1)

        correct += (predictions == b_labels).sum().item()
        total += b_labels.size(0)

accuracy = correct / total
print(f"Accuracy on test set: {accuracy:.4f}")

In [None]:
# # Đánh giá mô hình
# model.eval()
# correct_0 = 0  # Dự đoán đúng cho nhãn 0
# correct_1 = 0  # Dự đoán đúng cho nhãn 1
# total_0 = 0    # Tổng số mẫu có nhãn 0
# total_1 = 0    # Tổng số mẫu có nhãn 1

# with torch.no_grad():
#     for batch in test_loader:
#         b_input_ids, b_attention_mask, b_labels = tuple(t.to(device) for t in batch)

#         outputs = model(
#             input_ids=b_input_ids,
#             attention_mask=b_attention_mask
#         )
#         logits = outputs.logits
#         predictions = torch.argmax(logits, dim=-1)

#         # Tính số dự đoán đúng cho nhãn 0 và 1
#         correct_0 += ((predictions == 0) & (b_labels == 0)).sum().item()
#         correct_1 += ((predictions == 1) & (b_labels == 1)).sum().item()

#         # Tính tổng số mẫu cho nhãn 0 và 1
#         total_0 += (b_labels == 0).sum().item()
#         total_1 += (b_labels == 1).sum().item()

# # Tính accuracy cho từng nhãn
# accuracy_0 = correct_0 / total_0 if total_0 > 0 else 0
# accuracy_1 = correct_1 / total_1 if total_1 > 0 else 0

# print(f"Accuracy for label 0: {accuracy_0:.4f}")
# print(f"Accuracy for label 1: {accuracy_1:.4f}")


In [None]:
from sklearn.metrics import classification_report

# Evaluate the model: collect true and predicted labels batch by batch
model.eval()
y_true = []  # actual labels
y_pred = []  # labels predicted by the model

with torch.no_grad():
    for batch in test_loader:
        b_input_ids, b_attention_mask, b_labels = [t.to(device) for t in batch]

        outputs = model(input_ids=b_input_ids, attention_mask=b_attention_mask)
        logits = outputs.logits
        predictions = logits.argmax(dim=-1)

        # Both lists grow in the same batch order, so entries stay paired
        y_true.extend(b_labels.cpu().numpy())
        y_pred.extend(predictions.cpu().numpy())

# Print the classification report
print("\nClassification Report:\n")
print(classification_report(y_true, y_pred, target_names=['No Hate', 'Hate']))


In [None]:
# Confusion Matrix
# y_true is used instead of y_test: it was collected in the same loop as
# y_pred, so the two sequences are paired element by element by construction.
# y_test only lines up with y_pred while the test DataLoader is unshuffled.
conf_matrix = confusion_matrix(y_true, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['No Hate', 'Hate'], yticklabels=['No Hate', 'Hate'])
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

In [None]:
# Loss Visualization
plt.figure(figsize=(8, 6))
# One point per recorded epoch. The x-axis is derived from training_loss itself
# rather than from `epochs`, so the plot still works when training was stopped
# before all epochs completed (the two lengths would otherwise differ).
plt.plot(range(1, len(training_loss) + 1), training_loss, marker='o')
plt.title('Training Loss Over Epochs')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.grid()
plt.show()

In [None]:
# Save the model and its tokenizer together in a single directory.
# NOTE(review): this writes to the Kaggle working directory, whereas the data
# and the per-epoch checkpoints use the Colab Drive mount -- confirm which
# environment this cell is meant to run in.
model.save_pretrained("/kaggle/working/optimized_distilbert")
tokenizer.save_pretrained("/kaggle/working/optimized_distilbert")

In [None]:
import shutil

# Nén thư mục mô hình thành file zip
shutil.make_archive('/kaggle/working/optimized_distilbert.zip', 'zip', '/kaggle/working', 'optimized_distilbert')

In [None]:
from IPython.display import FileLink
FileLink(r'optimized_distilbert.zip.zip')