In [None]:
!pip3 install gensim --upgrade
#!pip3 install tensorflow --upgrade
!pip3 install tensorflow==2.17.0
#!pip3 install keras --upgrade
!pip3 install keras==3.6.0

In [4]:
import json
import pandas as pd
import numpy as np
import nltk
import matplotlib.pyplot as plt
from collections import Counter

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

## **Data Preparation** 

In [5]:
# Read data from kaggle dataset
data = []
with open('/kaggle/input/dm-2024-isa-5810-lab-2-homework/tweets_DM.json', 'r') as f:
    for line in f:
        data.append(json.loads(line))
 
f.close()

emotion = pd.read_csv('/kaggle/input/dm-2024-isa-5810-lab-2-homework/emotion.csv')
data_identification = pd.read_csv('/kaggle/input/dm-2024-isa-5810-lab-2-homework/data_identification.csv')

In [6]:
# data transformation
df = pd.DataFrame(data)
_source = df['_source'].apply(lambda x: x['tweet'])
df = pd.DataFrame({
    'tweet_id': _source.apply(lambda x: x['tweet_id']),
    'hashtags': _source.apply(lambda x: x['hashtags']),
    'text': _source.apply(lambda x: x['text']),
})
df = df.merge(data_identification, on='tweet_id', how='left')

# split data to train and test
train_data = df[df['identification'] == 'train']
test_data = df[df['identification'] == 'test']

# drop the identification column
train_data = train_data.drop(columns=['identification'])
test_data = test_data.drop(columns=['identification'])

# Merge emotion for corresponding tweet_id
train_data = train_data.merge(emotion, on='tweet_id', how='left') 

In [7]:
# print information
print(train_data.shape)
print(train_data.dtypes)
print(train_data['emotion'].value_counts())
train_data.head()

(1455563, 4)
tweet_id    object
hashtags    object
text        object
emotion     object
dtype: object
emotion
joy             516017
anticipation    248935
trust           205478
sadness         193437
disgust         139101
fear             63999
surprise         48729
anger            39867
Name: count, dtype: int64


Unnamed: 0,tweet_id,hashtags,text,emotion
0,0x376b20,[Snapchat],"People who post ""add me on #Snapchat"" must be ...",anticipation
1,0x2d5350,"[freepress, TrumpLegacy, CNN]","@brianklaas As we see, Trump is dangerous to #...",sadness
2,0x1cd5b0,[],Now ISSA is stalking Tasha 😂😂😂 <LH>,fear
3,0x1d755c,"[authentic, LaughOutLoud]",@RISKshow @TheKevinAllison Thx for the BEST TI...,joy
4,0x2c91a8,[],Still waiting on those supplies Liscus. <LH>,anticipation


In [8]:
# print information
print(test_data.shape)
print(test_data.dtypes)
test_data.head()

(411972, 3)
tweet_id    object
hashtags    object
text        object
dtype: object


Unnamed: 0,tweet_id,hashtags,text
2,0x28b412,[bibleverse],"Confident of your obedience, I write to you, k..."
4,0x2de201,[],"""Trust is not the same as faith. A friend is s..."
9,0x218443,"[materialism, money, possessions]",When do you have enough ? When are you satisfi...
30,0x2939d5,"[GodsPlan, GodsWork]","God woke you up, now chase the day #GodsPlan #..."
33,0x26289a,[],"In these tough times, who do YOU turn to as yo..."


## **Preprocessing**

In [9]:
# Dealing with Missing data
def check_missing_values(series):
    """
    Check missing values in a Series
    
    Args:
        series: pandas Series object
        
    Returns:
        dict: dictionary containing missing value statistics
    """
    missing_count = series.isnull().sum()
    total_count = len(series)
    missing_percentage = (missing_count / total_count) * 100
    
    return {
        'Missing Values': missing_count,
        'Total Count': total_count,
        'Missing Percentage(%)': round(missing_percentage, 2)
    }

# Check missing values for each column
missing_train = train_data.apply(check_missing_values)
missing_test = test_data.apply(check_missing_values)

print(missing_train)
print(missing_test)

tweet_id    {'Missing Values': 0, 'Total Count': 1455563, ...
hashtags    {'Missing Values': 0, 'Total Count': 1455563, ...
text        {'Missing Values': 0, 'Total Count': 1455563, ...
emotion     {'Missing Values': 0, 'Total Count': 1455563, ...
dtype: object
tweet_id    {'Missing Values': 0, 'Total Count': 411972, '...
hashtags    {'Missing Values': 0, 'Total Count': 411972, '...
text        {'Missing Values': 0, 'Total Count': 411972, '...
dtype: object


In [10]:
#Dealing with Duplicate Data
sum(train_data.duplicated('text')) 
train_data.drop_duplicates(subset=['text'], keep=False, inplace=True) # Remove duplication = 1449182

In [11]:
pip install emoji


Note: you may need to restart the kernel to use updated packages.


In [12]:
# Clean and standardize text
import re
import emoji
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')

def clean_text(text):
    """
    Clean and standardize text
    """
    #switch emoji to text
    text = emoji.demojize(text, delimiters=[":", ":"])
    # Convert to lowercase
    text = str(text).lower()
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    # Remove @mentions
    text = re.sub(r'@\w+', '', text)
    # Remove special characters and numbers
    text = re.sub(r'[^\w\s_]', '', text)
    text = re.sub(r'\d+', '', text)
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def preprocess_data(df):
    """
    Preprocess both text and hashtags using the same cleaning function
    """
    # Clean text
    df['cleaned_text'] = df['text'].apply(clean_text)
    
    # Clean hashtags - convert list to string and clean
    df['cleaned_hashtags'] = df['hashtags'].apply(lambda x: clean_text(' '.join(x) if isinstance(x, list) else ''))
    
    # Combine text and hashtags
    df['combined_text'] = df.apply(lambda x: 
        x['cleaned_text'] + ' ' + x['cleaned_hashtags'], axis=1)
    
    return df

processed_train = preprocess_data(train_data)
processed_test = preprocess_data(test_data)

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
#Sampling Only using 10% of data 
train_data_sample = processed_train.sample(frac=0.1) # Get sample
train_data_sample.shape

(144918, 7)

## **Feature engineering**

### **BOW**

In [21]:
from sklearn.feature_extraction.text import CountVectorizer

# build analyzers (bag-of-words)
BOW_500 = CountVectorizer(max_features=500, tokenizer=nltk.word_tokenize) 

# apply analyzer to training data
BOW_500.fit(processed_train['combined_text'])
train_data_BOW_features_500 = BOW_500.transform(processed_train['combined_text'])

## check dimension
train_data_BOW_features_500.shape



(1449182, 500)

### **TF-IDF**

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

# 使用 TfidfVectorizer 生成 TF-IDF 嵌入
vectorizer = TfidfVectorizer(max_features=1000)
train_data_tfidf_matrix_1000 = vectorizer.fit_transform(processed_train['combined_text'])

## check dimension
train_data_tfidf_matrix_1000.shape

(1449182, 1000)

### **Word2Vec**

In [37]:
## check library
import gensim

## ignore warnings
import warnings
warnings.filterwarnings('ignore')

# # if you want to see the training messages, you can use it
# import logging
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

## the input type
processed_train['text_tokenized'] = processed_train['combined_text'].apply(lambda x: nltk.word_tokenize(x))

from gensim.models import Word2Vec

## create the training corpus
training_corpus = processed_train['text_tokenized'].values
training_corpus[:3]

## setting
vector_dim = 100
window_size = 5
min_count = 1
training_epochs = 20

## model
word2vec_model = Word2Vec(sentences=training_corpus, 
                          vector_size=vector_dim, window=window_size, 
                          min_count=min_count, epochs=training_epochs)

In [46]:
def text_to_avg_vector(tokens, model):
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)
# 將文本轉換為向量
processed_train['text_tokenized'] = processed_train['combined_text'].apply(lambda x: text_to_avg_vector(x, word2vec_model))
X = np.array(processed_train['text_tokenized'].tolist())

# **Model**

## **Naive Bayes**

#### BOW 10%sample: Mean F1-Score: 0.2759809569012269  

In [28]:
#Prepare train data and one hot encoding_
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# for a classificaiton problem, you need to provide both training & testing data
X_train_data = train_data_BOW_features_500
y_train_data = processed_train['emotion']

le = LabelEncoder() # Label target
y_train_le = le.fit_transform(y_train_data)

# Split training and testing data for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X_train_data, y_train_le, test_size=0.2, random_state=42, stratify=y_train_le) 

#check dimension is a good habbit 
print('X_train.shape: ', X_train.shape)
print('y_train.shape: ', y_train.shape)
print('X_test.shape: ', X_test.shape)
print('y_test.shape: ', y_test.shape)


X_train.shape:  (1159345, 500)
y_train.shape:  (1159345,)
X_test.shape:  (289837, 500)
y_test.shape:  (289837,)


In [30]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# 訓練 Naive Bayes 模型
model = MultinomialNB()
model.fit(X_train, y_train)

# 預測測試集
y_pred = model.predict(X_test)

In [31]:
# accuracy
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred) 

## precision, recall, f1-score,
from sklearn.metrics import classification_report
print(classification_report(y_true=y_test, y_pred=y_pred))

from sklearn.metrics import f1_score
f1 = f1_score(y_test, y_pred, average='macro')
print("Mean F1-Score:", f1)

              precision    recall  f1-score   support

           0       0.15      0.07      0.10      7916
           1       0.46      0.43      0.44     49705
           2       0.29      0.25      0.27     27785
           3       0.21      0.12      0.16     12740
           4       0.47      0.65      0.55    102636
           5       0.31      0.33      0.32     38610
           6       0.28      0.07      0.12      9542
           7       0.36      0.20      0.25     40903

    accuracy                           0.41    289837
   macro avg       0.32      0.27      0.28    289837
weighted avg       0.39      0.41      0.39    289837

Mean F1-Score: 0.2759809569012269


#### BOW whole train dataset: Accuracy: 0.41043334253381175  
Kaggle score:0.31649 (The best)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# for a classificaiton problem, you need to provide both training & testing data
X_train_data = train_data_BOW_features_500
y_train_data = train_data_sample['emotion']

le = LabelEncoder() # Label target
y_train_le = le.fit_transform(y_train_data)

# Split training and testing data for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X_train_data, y_train_le, test_size=0.3, random_state=42, stratify=y_train_le) 

#check dimension is a good habbit 
print('X_train.shape: ', X_train.shape)
print('y_train.shape: ', y_train.shape)
print('X_test.shape: ', X_test.shape)
print('y_test.shape: ', y_test.shape)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# train Naive Bayes model
model = MultinomialNB()
model.fit(X_train, y_train)

# predict test set
y_pred = model.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred) # Evaluation

In [None]:
# apply analyzer to training data
test_data_BOW_features_500 = BOW_500.transform(processed_test['combined_text'])
X_test_data = test_data_BOW_features_500
y_pred = model.predict(X_test_data)
# Inverse predict labels back to adjective words
y_pred_labels = le.inverse_transform(y_pred) 

#### TF-IDF Whole train dataset: Mean F1-Score: 0.2609319258751323  
Kaggle score: 0.18776

In [32]:
#Prepare train data and one hot encoding_TF-IDF: Mean F1-Score: 0.2609319258751323
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# for a classificaiton problem, you need to provide both training & testing data
X_train_data = train_data_tfidf_matrix_1000
y_train_data = processed_train['emotion']

le = LabelEncoder() # Label target
y_train_le = le.fit_transform(y_train_data)

# Split training and testing data for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X_train_data, y_train_le, test_size=0.2, random_state=42, stratify=y_train_le) 

#check dimension is a good habbit 
print('X_train.shape: ', X_train.shape)
print('y_train.shape: ', y_train.shape)
print('X_test.shape: ', X_test.shape)
print('y_test.shape: ', y_test.shape)

X_train.shape:  (1159345, 1000)
y_train.shape:  (1159345,)
X_test.shape:  (289837, 1000)
y_test.shape:  (289837,)


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# train Naive Bayes model
model = MultinomialNB()
model.fit(X_train, y_train)

# predict test set
y_pred = model.predict(X_test)

In [35]:
# accuracy
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred) 

## precision, recall, f1-score,
from sklearn.metrics import classification_report
print(classification_report(y_true=y_test, y_pred=y_pred))

from sklearn.metrics import f1_score
f1 = f1_score(y_test, y_pred, average='macro')
print("Mean F1-Score:", f1)

              precision    recall  f1-score   support

           0       0.52      0.00      0.00      7916
           1       0.59      0.37      0.46     49705
           2       0.50      0.13      0.21     27785
           3       0.79      0.16      0.26     12740
           4       0.42      0.90      0.57    102636
           5       0.43      0.26      0.33     38610
           6       0.74      0.06      0.11      9542
           7       0.64      0.09      0.15     40903

    accuracy                           0.45    289837
   macro avg       0.58      0.25      0.26    289837
weighted avg       0.52      0.45      0.38    289837

Mean F1-Score: 0.2609319258751323


#### TF-IDF whole train dataset, under sampling: Mean F1-Score: 0.35301556270571005  
Kaggle score: 0.12878 (有嚴重的overfitting問題)  


#### TF-IDF whole train dataset, over sampling: Mean F1-Score: 0.3590395043436134  
Kaggle score: 0.12619 (有嚴重的overfitting問題)


In [None]:
# TF-IDF 1000 features
vectorizer = TfidfVectorizer(max_features=1000)
train_data_tfidf_matrix_1000 = vectorizer.fit_transform(processed_train['combined_text'])

In [None]:
# one hot encoding
le = LabelEncoder()
y_train_data = processed_train['emotion']
y_train_le = le.fit_transform(y_train_data)

In [None]:
# under sampling for imbalanced data
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(train_data_tfidf_matrix_1000, y_train_le)

# divede data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)

In [None]:
# train Naive Bayes model
model = MultinomialNB()
model.fit(X_train, y_train)

# predict test set
y_pred = model.predict(X_test)

# accuracy
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred) 

## precision, recall, f1-score,
from sklearn.metrics import classification_report
print(classification_report(y_true=y_test, y_pred=y_pred))

from sklearn.metrics import f1_score
f1 = f1_score(y_test, y_pred, average='macro')
print("Mean F1-Score:", f1)

In [None]:
# apply analyzer to training data
test_data_tfidf_matrix_1000 = vectorizer.fit_transform(processed_test['combined_text'])
# for a classificaiton problem, you need to provide both training & testing data
X_test_data = test_data_tfidf_matrix_1000
# predict test set
y_pred = model.predict(X_test_data)

# Inverse predict labels back to adjective words
y_pred_labels = le.inverse_transform(y_pred) 

submission = pd.DataFrame({
    'id': processed_test['tweet_id'],
    'emotion': y_pred_labels
})

In [None]:
submission.to_csv('/kaggle/working/submission.csv', index=False)

## **RandomForest**

#### BOW Mean F1-Score: 0.3396583541292876

In [None]:
# 訓練RandomForestClassifier模型 
# BOW Mean F1-Score: 0.3396583541292876
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# 預測
y_pred = rf_model.predict(X_test)

# 評估
print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred))
print("Mean F1-Score:", f1_score(y_test, y_pred, average='macro'))

Word2Vec Mean F1-Score: 0.17355334788122762
Kaggle score: 0.18184

In [None]:
#Prepare train data and one hot encoding_Word2vec: Mean F1-Score: 0.19585560764574117 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# for a classificaiton problem, you need to provide both training & testing data
X_train_data = X
y_train_data = train_data_sample['emotion']

le = LabelEncoder () # Label target
y_train_le = le.fit_transform(y_train_data)

# Split training and testing data for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X_train_data, y_train_le, test_size=0.2, random_state=42, stratify=y_train_le) 

In [None]:
from sklearn.ensemble import RandomForestClassifier

# 訓練模型
model = RandomForestClassifier()
model.fit(X_train, y_train)

# 預測測試集
y_pred = model.predict(X_test)

In [None]:
# accuracy
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred) 

## precision, recall, f1-score,
from sklearn.metrics import classification_report
print(classification_report(y_true=y_test, y_pred=y_pred))

from sklearn.metrics import f1_score
f1 = f1_score(y_test, y_pred, average='macro')
print("Mean F1-Score:", f1)

## **KNN**

#### BOW 10% sampling Mean F1-Score: 0.2920292796144408


In [None]:
knn_model = KNeighborsClassifier()
knn_model.fit(X_train, y_train)
y_pred_knn = knn_model.predict(X_test)

print("KNN Model")
print(accuracy_score(y_test, y_pred_knn))
print(classification_report(y_test, y_pred_knn))
print("Mean F1-Score:", f1_score(y_test, y_pred_knn, average='macro'))

## **testing data**

In [None]:
# apply analyzer to training data
test_data_BOW_features_500 = BOW_500.transform(processed_test['combined_text'])
# for a classificaiton problem, you need to provide both training & testing data
X_test_data = test_data_BOW_features_500
y_pred = model.predict(X_test_data)

In [None]:
# Inverse predict labels back to adjective words
y_pred_labels = le.inverse_transform(y_pred) 

## **Submission**

In [None]:
submission = pd.DataFrame({
    'id': processed_test['tweet_id'],
    'emotion': y_pred_labels
})

In [None]:
submission.to_csv('/kaggle/working/submission.csv', index=False)

In [None]:
submission