In [None]:
# Mount Google Drive so files under /content/drive can be read by later cells
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Read the dataset CSV from the mounted Drive folder into a DataFrame and display it
import pandas as pd

DATA_CSV_PATH = "/content/drive/MyDrive/cuoi_ky/train_model/Data/data - data.csv"
df = pd.read_csv(DATA_CSV_PATH)
df

In [None]:
# Cài đặt thư viện và kiểm tra dữ liệu
!pip install underthesea emoji
df.info()
print(df.isnull().sum())
df["label"].unique()

In [None]:
# Remove columns that are not needed; errors='ignore' skips any that are absent
redundant_columns = ['Unnamed: 3', 'rate']
df = df.drop(columns=redundant_columns, errors='ignore')

In [None]:
from underthesea import word_tokenize
import re, emoji

# Load the stop-word list (one entry per line, no header row) into a set for fast lookups
STOPWORDS_PATH = '/content/drive/MyDrive/cuoi_ky/train_model/Data/vietnamese-stopwords-dash.txt'
stop_words_df = pd.read_csv(STOPWORDS_PATH, header=None)
stop_words = set(stop_words_df[0])

# Sentence-cleaning helper
def process_sentences(sentence):
    """Normalize one raw comment into a space-separated string of kept tokens.

    Steps, in order: lowercase; replace numbers of two or more digits, URLs,
    @mentions, #hashtags and emoji with placeholder tags; replace every run of
    characters outside the allowed set with a space; collapse whitespace;
    tokenize with ``word_tokenize``; drop stop words and one-character tokens.

    Args:
        sentence: Raw comment. Non-string values are converted with ``str``.
            ``None`` and float NaN are treated as missing text.

    Returns:
        The cleaned comment with tokens joined by single spaces, or an empty
        string when the input is missing.
    """
    # Missing values must not be stringified: str(float('nan')) is the literal
    # text "nan", which would otherwise survive as a fake comment.
    if sentence is None or (isinstance(sentence, float) and sentence != sentence):
        return ""

    sentence = str(sentence).lower()
    # Matches digit runs of length >= 2 (optionally containing '.' or ','); a lone digit is left as-is
    sentence = re.sub(r'\d[\d\.,]*\d', '<NUMBER>', sentence)
    sentence = re.sub(r'(https?://\S+|www\.\S+)', '<URL>', sentence)
    sentence = re.sub(r'@\w+', '<USER>', sentence)
    sentence = re.sub(r'#\w+', '<HASHTAG>', sentence)
    sentence = emoji.replace_emoji(sentence, replace="<EMOJI>")
    # Anything outside word characters, the listed punctuation, '<', '>' and space becomes a space
    sentence = re.sub(r'[^\wÀ-ỹ0-9<>\?\!\.,;:\- ]+', ' ', sentence)
    sentence = re.sub(r'\s+', ' ', sentence).strip()
    tokens = word_tokenize(sentence)
    # word_tokenize returns multi-syllable words as one token with spaces between
    # syllables, while the stop-word file (name ends in "-dash") presumably joins
    # syllables with "_" -- TODO confirm. Check both spellings so compound stop
    # words are actually removed.
    kept_tokens = [
        w for w in tokens
        if len(w) > 1
        and w not in stop_words
        and w.replace(" ", "_") not in stop_words
    ]
    # NOTE(review): joining with spaces flattens the word segmentation again;
    # kept as-is so downstream cells see the same output format.
    return " ".join(kept_tokens)

# Drop rows with no comment text before cleaning; otherwise the str() conversion
# inside process_sentences would turn a missing value into the literal text "nan".
df = df.dropna(subset=['comment']).copy()

# Apply the preprocessing to every comment
df['comment'] = df['comment'].apply(process_sentences)

# Remove comments that are empty or digits-only after cleaning, then de-duplicate
df = df[~df['comment'].str.fullmatch(r'\d*')]
df = df.drop_duplicates(subset='comment').reset_index(drop=True)

# Printed explicitly: a bare expression in the middle of a cell is never displayed
print(df['label'].value_counts())
df

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from collections import Counter
import re

# Split cleaned comments back into individual word tokens
def tokenize(text):
    """Return the lowercase ``\\w+`` runs found in ``text``, in order.

    ``text`` is converted with ``str`` first, so non-string values are
    tokenized by their string form.
    """
    lowered = str(text).lower()
    return [match.group(0) for match in re.finditer(r'\w+', lowered)]

# Discard rows lacking a comment or a label, then attach a token list to each row
df = df.dropna(subset=['comment', 'label'])
df['tokens'] = df['comment'].apply(tokenize)

# Flatten every row's tokens into one list and keep the 20 most frequent
all_tokens = []
for row_tokens in df['tokens']:
    all_tokens.extend(row_tokens)
token_frequencies = Counter(all_tokens)
top_tokens = token_frequencies.most_common(20)
# Unzip the (word, count) pairs into two parallel tuples
words, counts = zip(*top_tokens)

# Horizontal bar chart of the 20 most frequent tokens
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=list(counts), y=list(words), palette='viridis', ax=ax)
ax.set_title('Top 20 Most Common Words (All Labels)')
ax.set_xlabel('Count')
ax.set_ylabel('Word')
fig.tight_layout()
plt.show()

# Count of rows per sentiment label
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='label', data=df, palette='Set2', ax=ax)
ax.set_title('Distribution of Sentiment Labels')
ax.set_xlabel('Sentiment Label')
ax.set_ylabel('Count')
fig.tight_layout()
plt.show()

# One word cloud per sentiment label, built from all comments carrying that label
for label in df['label'].unique():
    comments_for_label = df.loc[df['label'] == label, 'comment']
    label_text = ' '.join(comments_for_label)
    cloud_builder = WordCloud(
        width=800,
        height=400,
        background_color='white',
        colormap='viridis',
    )
    wordcloud = cloud_builder.generate(label_text)

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.set_title(f'Word Cloud for {label} Sentiment')
    ax.axis('off')
    fig.tight_layout()
    plt.show()

# Logistic Regression, SVM, Naive Bayes


  

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# TF-IDF features over unigrams and bigrams, with min_df=5 and max_df=0.8
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=5,
    max_df=0.8,
)

# Hold out 20% of the rows for testing, stratified by label, with a fixed seed
X, y = df["comment"], df["label"]
split_parts = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Candidate classifiers, keyed by the display name used in printouts and plots
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, class_weight="balanced"),
    # random_state pins LinearSVC's internal data shuffling so reruns give the same scores
    "Linear SVM": LinearSVC(class_weight="balanced", random_state=42),
    "Naive Bayes": MultinomialNB()
}

# Đánh giá mô hình
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Per-model metrics: display name -> {"accuracy": float, "report": classification_report dict}
results = {}

for name, model in models.items():
    print(f"\n====== {name} ======")
    # Vectorizer and classifier are fitted together on the training split only
    pipeline = Pipeline([("tfidf", tfidf), ("clf", model)])
    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    # Dict form is stored for the summary table; the text form is printed below
    report = classification_report(y_test, y_pred, output_dict=True)

    results[name] = {
        "accuracy": acc,
        "report": report
    }

    print("Accuracy:", round(acc, 4))
    print(classification_report(y_test, y_pred, digits=4))

    # Take the label order from the fitted classifier instead of hard-coding it,
    # so the matrix stays correct if the label set in the data changes.
    class_labels = list(pipeline.classes_)
    cm = confusion_matrix(y_test, y_pred, labels=class_labels)

    # Dedicated figure per model so heatmaps never draw on top of each other
    fig, ax = plt.subplots()
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                xticklabels=class_labels,
                yticklabels=class_labels,
                ax=ax)
    ax.set_title(f"Confusion Matrix - {name}")
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    fig.tight_layout()
    plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Build the summary table from the 'results' dict: one row per model, with
# accuracy plus the weighted-average precision, recall and F1, rounded to 4 places
def _summarize_metrics(entry):
    """Pick accuracy and the weighted-average scores out of one results entry."""
    weighted_avg = entry["report"]["weighted avg"]
    return {
        "accuracy": round(entry["accuracy"], 4),
        "precision": round(weighted_avg["precision"], 4),
        "recall": round(weighted_avg["recall"], 4),
        "f1": round(weighted_avg["f1-score"], 4),
    }

per_model_scores = {
    model_name: _summarize_metrics(entry)
    for model_name, entry in results.items()
}
# Models start out as columns; transpose so each model becomes a row
score_df = pd.DataFrame(per_model_scores).T

# Print the summary table
print("\n Tổng hợp kết quả mô hình:\n")
print(score_df)

# Grouped bar chart comparing precision, recall and F1 across models
metric_columns = ["precision", "recall", "f1"]
ax = score_df[metric_columns].plot(
    kind="bar",
    figsize=(10, 6),
    color=["#1f77b4", "#ff7f0e", "#2ca02c"],
    edgecolor='black',
)
ax.set_title("So sánh mô hình: Precision - Recall - F1", fontsize=14)
ax.set_ylabel("Score", fontsize=12)
# Keep the model names horizontal on the x axis
plt.setp(ax.get_xticklabels(), rotation=0)
ax.set_ylim(0, 1)
ax.grid(axis='y', linestyle='--', alpha=0.7)
ax.legend(loc="lower right")
ax.figure.tight_layout()
plt.show()
