In [45]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader
import torch


df = pd.read_csv('/content/Data_KNKH_SendoFarm_Q4.2023.xlsx - Sheet1.csv')


df_cleaned = df.dropna(subset=['Chi tiết mô tả khiếu nại', 'Nguyên nhân'])


df_sampled = df_cleaned.sample(frac=0.3, random_state=42)


X = df_sampled['Chi tiết mô tả khiếu nại']
y = df_sampled['Nguyên nhân']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)


tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text


X_train = X_train.apply(preprocess_text)
X_test = X_test.apply(preprocess_text)


def tokenize_function(texts):
    return tokenizer(texts, padding='max_length', truncation=True, max_length=512)

train_encodings = tokenize_function(X_train.tolist())
test_encodings = tokenize_function(X_test.tolist())


class TextDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = TextDataset(train_encodings, y_train_encoded)
test_dataset = TextDataset(test_encodings, y_test_encoded)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)


model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_))

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=500,
    save_total_limit=3
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)


trainer.train()


model.save_pretrained('./model_checkpoint')
tokenizer.save_pretrained('./model_checkpoint')


model = BertForSequenceClassification.from_pretrained('./model_checkpoint')
tokenizer = BertTokenizer.from_pretrained('./model_checkpoint')


results = trainer.evaluate()
print("Evaluation results:")
print(results)

predictions = trainer.predict(test_dataset)
pred_labels = predictions.predictions.argmax(axis=-1)

from sklearn.metrics import classification_report
print("Classification Report:")
print(classification_report(y_test_encoded, pred_labels, target_names=label_encoder.classes_))



Step,Training Loss
10,2.276
20,2.2635
30,2.2655
40,2.2178
50,2.1718
60,1.9236
70,1.9393
80,1.6119
90,1.7101
100,1.6235


Evaluation results:
{'eval_loss': 1.2153414487838745, 'eval_runtime': 1371.2223, 'eval_samples_per_second': 0.604, 'eval_steps_per_second': 0.076, 'epoch': 3.0}


KeyboardInterrupt: 

In [7]:
from google.colab import files
import shutil

# Nén các tệp vào file zip
shutil.make_archive('/content/model_files', 'zip', '/content/model_checkpoint')

# Tải file zip xuống máy tính
files.download('/content/model_files.zip')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [10]:
import shutil

# Di chuyển các tệp mô hình vào Google Drive
shutil.copytree('/content/model_checkpoint', '/content/drive/MyDrive/model_directory')


'/content/drive/MyDrive/model_directory'