# Using machine learning for sentiment analysis
The goal of this project is to train a Model for Text Sentiment Classification.

## Data Preprocess & Cleansing

In [5]:
import json
import pandas as pd

df_raw = pd.read_json('dm-lab-2-private-competition/final_posts.json')

print(df_raw.iloc[0,0])
df = pd.DataFrame()
df["post_id"] = df_raw["root"].apply(lambda x: x["_source"]["post"]["post_id"])
df["text"]    = df_raw["root"].apply(lambda x: x["_source"]["post"]["text"])
df["hashtags"] = df_raw["root"].apply(lambda x: x["_source"]["post"]["hashtags"])

{'_type': 'post', '_source': {'post': {'post_id': '0x61fc95', 'text': 'We got the ranch, loaded our guns and sat up till sunrise.', 'hashtags': []}}}


In [6]:
ident = pd.read_csv('dm-lab-2-private-competition/data_identification.csv')
emotion = pd.read_csv('dm-lab-2-private-competition/emotion.csv')
emotion['post_id'] = emotion['id']
df = df.merge(emotion[['post_id', 'emotion']], on='post_id', how='left')
df['ident'] = ident['split']
df

Unnamed: 0,post_id,text,hashtags,emotion,ident
0,0x61fc95,"We got the ranch, loaded our guns and sat up t...",[],,test
1,0x35663e,I bet there is an army of married couples who ...,[],joy,train
2,0xc78afe,This could only end badly.,[],fear,train
3,0x90089c,My sister squeezed a lime in her milk when she...,[],joy,train
4,0xaba820,and that got my head bobbing a little bit.,[],,test
...,...,...,...,...,...
64166,0x4afbe1,Guilty Gear actually did that before with Guil...,[],anger,train
64167,0xf5ba78,One of my favorite episodes.,[],joy,train
64168,0x8f758e,I got my first raspberry from a crowd surfer f...,[],,test
64169,0xb5a35a,Texans and Astros both shut out tonight. Houst...,"[texans, astros, sadness, losers]",sadness,train


In [None]:
train_df = df[df['ident'] == 'train']
test_df =  df[df['ident'] == 'test']

Unnamed: 0,post_id,text,hashtags,emotion,ident
1,0x35663e,I bet there is an army of married couples who ...,[],joy,train
2,0xc78afe,This could only end badly.,[],fear,train
3,0x90089c,My sister squeezed a lime in her milk when she...,[],joy,train
7,0x2ffb63,Thank you so much❤️,[],joy,train
9,0x989146,Stinks because ive been in this program for a ...,[],joy,train
...,...,...,...,...,...
64164,0xd740f2,why is everybody seem sp serious?,[],joy,train
64165,0x99267e,"You can cross fuck off, its 10f all winter in ...",[],anger,train
64166,0x4afbe1,Guilty Gear actually did that before with Guil...,[],anger,train
64167,0xf5ba78,One of my favorite episodes.,[],joy,train


In [8]:
train_df.head()

Unnamed: 0,post_id,text,hashtags,emotion,ident
1,0x35663e,I bet there is an army of married couples who ...,[],joy,train
2,0xc78afe,This could only end badly.,[],fear,train
3,0x90089c,My sister squeezed a lime in her milk when she...,[],joy,train
7,0x2ffb63,Thank you so much❤️,[],joy,train
9,0x989146,Stinks because ive been in this program for a ...,[],joy,train


## Text Preprocessing
Text Preprocessing is traditionally an important step for Natural Language Processing (NLP) tasks. 

It transforms text into a more digestible form so that deep learning algorithms can perform better.

The Preprocessing steps taken are:

1. Lower Casing: Each text is converted to lowercase.

2. Replacing URLs: Links starting with 'http' or 'https' or 'www' are replaced by '<url>'.

3. Replacing Usernames: Replace @Usernames with word '<user>'. [eg: '@Kaggle' to '<user>'].

4. Replacing Consecutive letters: 3 or more consecutive letters are replaced by 2 letters. [eg: 'Heyyyy' to 'Heyy']

5. Replacing Emojis: Replace emojis by using a regex expression. [eg: ':)' to '<smile>']

6. Replacing Contractions: Replacing contractions with their meanings. [eg: "can't" to 'can not']

7. Removing Non-Alphabets: Replacing characters except Digits, Alphabets and pre-defined Symbols with a space.

In [12]:
import re
contractions = pd.read_csv('dm-lab-2-private-competition/contractions.csv.xls', index_col='Contraction')
contractions.index = contractions.index.str.lower()
contractions.Meaning = contractions.Meaning.str.lower()
contractions_dict = contractions.to_dict()['Meaning']

# Defining regex patterns.
urlPattern        = r"((http://)[^ ]*|(https://)[^ ]*|(www\.)[^ ]*)"
userPattern       = '@[^\s]+'
hashtagPattern    = '#[^\s]+'
alphaPattern      = "[^a-z0-9<>]"
sequencePattern   = r"(.)\1\1+"
seqReplacePattern = r"\1\1"

# Defining regex for emojis
smileemoji        = r"[8:=;]['`\-]?[)d]+"
sademoji          = r"[8:=;]['`\-]?\(+"
neutralemoji      = r"[8:=;]['`\-]?[\/|l*]"
lolemoji          = r"[8:=;]['`\-]?p+"

def preprocess_apply(tweet):

    tweet = tweet.lower()

    # Replace all URls with '<url>'
    tweet = re.sub(urlPattern,'<url>',tweet)
    # Replace @USERNAME to '<user>'.
    tweet = re.sub(userPattern,'<user>', tweet)
    
    # Replace 3 or more consecutive letters by 2 letter.
    tweet = re.sub(sequencePattern, seqReplacePattern, tweet)

    # Replace all emojis.
    tweet = re.sub(r'<3', '<heart>', tweet)
    tweet = re.sub(smileemoji, '<smile>', tweet)
    tweet = re.sub(sademoji, '<sadface>', tweet)
    tweet = re.sub(neutralemoji, '<neutralface>', tweet)
    tweet = re.sub(lolemoji, '<lolface>', tweet)

    for contraction, replacement in contractions_dict.items():
        tweet = tweet.replace(contraction, replacement)

    # Remove non-alphanumeric and symbols
    tweet = re.sub(alphaPattern, ' ', tweet)

    # Adding space on either side of '/' to seperate words (After replacing URLS).
    tweet = re.sub(r'/', ' / ', tweet)
    return tweet

train_df['processed_text'] = train_df.text.apply(preprocess_apply)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['processed_text'] = train_df.text.apply(preprocess_apply)


In [14]:
# the texts now turn into this
count=0
for row in train_df.itertuples():
    print("Text:", row[2])
    print('processed_text:', row[4],"\n")
    count+=1
    if count>10:
        break

    

Text: I bet there is an army of married couples who did the same exact thing.
processed_text: joy 

Text: This could only end badly.
processed_text: fear 

Text: My sister squeezed a lime in her milk when she was 12. Same thing happened, but we told her it would happen AFTER she did it ..
processed_text: joy 

Text: Thank you so much❤️
processed_text: joy 

Text: Stinks because ive been in this program for a year with no pay.....back to the drawing board.
processed_text: joy 

Text: The overall response is try and empower women, abolish prostitution and stop giving lazy men money because they want to live out their idiotic fantasy lives. 
processed_text: anger 

Text: Your market sucks
processed_text: anger 

Text: here’s hoping the same is true for me!
processed_text: joy 

Text: She looks like a televangelist.
processed_text: joy 

Text: Rap that will Cut other raper's throat. Who said that? @Paedeezy #badd #wicked. #bright city lights
processed_text: anger 

Text: She’s a good perso

## Model Training

In [62]:
import numpy as np
import re
import pandas as pd
import sklearn

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import class_weight

from gensim.models import KeyedVectors

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input, Dropout



X_data = train_df["processed_text"].astype(str).values
y_data = train_df["emotion"].values

# 切 train / test（這裡 5% 當 test）
X_train_text, X_test_text, y_train_raw, y_test_raw = train_test_split(
    X_data,
    y_data,
    test_size=0.05,
    random_state=0,
    stratify=y_data  # 依照情緒比例分層抽樣
)

print("Train size:", len(X_train_text))
print("Test size :", len(X_test_text))

Train size: 45495
Test size : 2395


In [63]:
# 載入 Pretrained Google_news Word2Vec 模型

w2v_path = "dm-lab-2-private-competition/GoogleNews-vectors-negative_300.bin"
w2v = KeyedVectors.load_word2vec_format(w2v_path, binary=True)

embedding_dim = w2v.vector_size  
print("Embedding dim:", embedding_dim)


def text_to_vec(text, model=w2v, embedding_dim=embedding_dim):
    """把一則文字轉成平均的 Word2Vec 向量"""
    if not isinstance(text, str):
        return np.zeros(embedding_dim, dtype="float32")
    tokens = re.findall(r"\w+", text.lower())
    vecs = [model[w] for w in tokens if w in model.key_to_index]
    
    if not vecs:
        # 如果裡面沒有任何在詞向量中的字，就給 0 向量
        return np.zeros(embedding_dim, dtype="float32")
    
    return np.mean(vecs, axis=0)


#  將文字轉成向量 

X_train = np.vstack([text_to_vec(t) for t in X_train_text])
X_test  = np.vstack([text_to_vec(t) for t in X_test_text])

print("X_train shape:", X_train.shape)  # (n_train, embedding_dim)
print("X_test shape :", X_test.shape)

Embedding dim: 300
X_train shape: (45495, 300)
X_test shape : (2395, 300)


In [None]:
# Python can't understand emotions like "sadness" or "joy", so here we have to change all 6 emotions into a numeric category of 0~6
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train_raw)
y_test = label_encoder.transform(y_test_raw)

num_classes = len(label_encoder.classes_)
print("Classes:", label_encoder.classes_)
print("num_classes:", num_classes)

Classes: ['anger' 'disgust' 'fear' 'joy' 'sadness' 'surprise']
num_classes: 6


In [68]:
# Building the model
model = Sequential([
    Input(shape=(embedding_dim,)),
    Dense(256, activation="relu"),
    Dense(128, activation="relu"),
    Dropout(0.5),
    Dense(num_classes, activation="softmax"),  
])

model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",  # y 是整數 label，所以用 sparse
    metrics=["accuracy"],
)
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
es = EarlyStopping(
    monitor="val_loss",
    patience=3,
    restore_best_weights=True
)
rlr = ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.5,
    patience=2,
    min_lr=1e-5,
    verbose=1,
)


model.summary()

In [70]:
# Input training data
history = model.fit(
    X_train, y_train,
    batch_size=32,
    epochs=50,
    callbacks=[es, rlr],
    validation_split=0.1,
    verbose=1,
)

# Evaluation
test_loss, test_acc = model.evaluate(X_test, y_test, verbose=1)
print(f"Test loss: {test_loss:.4f}  |  Test acc: {test_acc:.4f}")


Epoch 1/50
[1m1280/1280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 790us/step - accuracy: 0.6039 - loss: 1.0566 - val_accuracy: 0.5888 - val_loss: 1.0786 - learning_rate: 0.0010
Epoch 2/50
[1m1280/1280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 725us/step - accuracy: 0.6100 - loss: 1.0369 - val_accuracy: 0.5895 - val_loss: 1.0857 - learning_rate: 0.0010
Epoch 3/50
[1m1280/1280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 754us/step - accuracy: 0.6174 - loss: 1.0176 - val_accuracy: 0.5943 - val_loss: 1.0738 - learning_rate: 0.0010
Epoch 4/50
[1m1280/1280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 697us/step - accuracy: 0.6253 - loss: 0.9958 - val_accuracy: 0.5949 - val_loss: 1.0734 - learning_rate: 0.0010
Epoch 5/50
[1m1280/1280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 674us/step - accuracy: 0.6328 - loss: 0.9709 - val_accuracy: 0.5921 - val_loss: 1.0796 - learning_rate: 0.0010
Epoch 6/50
[1m1223/1280[0m [32m━━━━━━━━━━━━━━━━

##  Model Implementation

In [None]:
# Unclassified text data
test_df['processed_text'] = test_df.text.apply(preprocess_apply)
test_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['processed_text'] = test_df.text.apply(preprocess_apply)


Unnamed: 0,post_id,text,hashtags,emotion,ident,processed_text
0,0x61fc95,"We got the ranch, loaded our guns and sat up t...",[],,test,we got the ranch loaded our guns and sat up t...
4,0xaba820,and that got my head bobbing a little bit.,[],,test,and that got my head bobbing a little bit
5,0x66e44d,Same. Glad it's not just out store.,[],,test,same glad it is not just out store
6,0xc03cf5,Like always i will wait and see thanks for the...,[],,test,like always i will wait and see thanks for the...
8,0x02f65a,"There's a bit of room between ""not loving sub-...",[],,test,thereis a bit of room between not loving sub ...
...,...,...,...,...,...,...
64146,0x0f273c,We all do it sometimes don't worry.,[],,test,we all do it sometimes do not worry
64150,0xfc4c5d,This New Year I visited more relatives than us...,[],,test,this new year i visited more relatives than us...
64157,0xb318a3,R u a dad or did ur dad leave u both have bad ...,[],,test,r u a dad or did ur dad leave u both have bad ...
64168,0x8f758e,I got my first raspberry from a crowd surfer f...,[],,test,i got my first raspberry from a crowd surfer f...


In [71]:
predict = np.vstack([text_to_vec(t) for t in test_df['processed_text']])
pred_result= model.predict(predict)
pred_result 

[1m509/509[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 400us/step


array([[2.22979933e-01, 3.68012721e-03, 2.64608636e-02, 5.12456417e-01,
        7.52074877e-03, 2.26901829e-01],
       [5.79092763e-02, 5.67220151e-03, 6.81349695e-01, 1.44356295e-01,
        3.10226120e-02, 7.96900317e-02],
       [9.01271924e-02, 8.57952144e-03, 7.43259070e-03, 6.33115232e-01,
        1.22691981e-01, 1.38053477e-01],
       ...,
       [2.22188517e-01, 3.17916125e-02, 8.08604062e-03, 4.45347011e-01,
        2.19614059e-01, 7.29727298e-02],
       [3.13539892e-01, 1.63444970e-02, 3.00939441e-01, 3.13508034e-01,
        4.22553048e-02, 1.34128630e-02],
       [1.01556545e-02, 1.40085549e-05, 1.01512715e-05, 9.83254015e-01,
        9.00041414e-05, 6.47614058e-03]], dtype=float32)

In [72]:
# Each row has 6 numbers representing possible emotions. 
print("Classes:", label_encoder.classes_)
pred_result.shape 

Classes: ['anger' 'disgust' 'fear' 'joy' 'sadness' 'surprise']


(16281, 6)

In [76]:
pred_class_idx = np.argmax(pred_result, axis=1) # For each text choose the most likely emotion
pred_labels = label_encoder.inverse_transform(pred_class_idx) # Changing the numeric emotion category back to words.
result_df = pd.DataFrame({"Text_id" : test_df["post_id"],"text": test_df["text"], "emotion" :pred_labels})
result_df
# From the train we learnt that this model has a 62% accuracy. 

Unnamed: 0,Text_id,text,emotion
0,0x61fc95,"We got the ranch, loaded our guns and sat up t...",joy
4,0xaba820,and that got my head bobbing a little bit.,fear
5,0x66e44d,Same. Glad it's not just out store.,joy
6,0xc03cf5,Like always i will wait and see thanks for the...,joy
8,0x02f65a,"There's a bit of room between ""not loving sub-...",joy
...,...,...,...
64146,0x0f273c,We all do it sometimes don't worry.,joy
64150,0xfc4c5d,This New Year I visited more relatives than us...,anger
64157,0xb318a3,R u a dad or did ur dad leave u both have bad ...,joy
64168,0x8f758e,I got my first raspberry from a crowd surfer f...,anger


# Other tries durin DM2025 kaggle

In [None]:
# **Competition Code**

## 1. Preprocessing Steps
import json
import pandas as pd

df_raw = pd.read_json('dm-lab-2-private-competition/final_posts.json')

print(df_raw.iloc[0,0])
df = pd.DataFrame()
df["post_id"] = df_raw["root"].apply(lambda x: x["_source"]["post"]["post_id"])
df["text"]    = df_raw["root"].apply(lambda x: x["_source"]["post"]["text"])
df["hashtags"] = df_raw["root"].apply(lambda x: x["_source"]["post"]["hashtags"])
#df.text = df.text.str.lower()
#df_dict = df.to_dict()['text']
##### contraction data
import re
contractions = pd.read_csv('/Users/daidisheng/Desktop/研究所/DM2025Labs/DM2025-Lab2-Exercise/dm-lab-2-private-competition/contractions.csv.xls', index_col='Contraction')
contractions.index = contractions.index.str.lower()
contractions.Meaning = contractions.Meaning.str.lower()
contractions_dict = contractions.to_dict()['Meaning']

# Defining regex patterns.
urlPattern        = r"((http://)[^ ]*|(https://)[^ ]*|(www\.)[^ ]*)"
userPattern       = '@[^\s]+'
hashtagPattern    = '#[^\s]+'
alphaPattern      = "[^a-z0-9<>]"
sequencePattern   = r"(.)\1\1+"
seqReplacePattern = r"\1\1"

# Defining regex for emojis
smileemoji        = r"[8:=;]['`\-]?[)d]+"
sademoji          = r"[8:=;]['`\-]?\(+"
neutralemoji      = r"[8:=;]['`\-]?[\/|l*]"
lolemoji          = r"[8:=;]['`\-]?p+"

def preprocess_apply(tweet):

    tweet = tweet.lower()

    # Replace all URls with '<url>'
    tweet = re.sub(urlPattern,'<url>',tweet)
    # Replace @USERNAME to '<user>'.
    tweet = re.sub(userPattern,'<user>', tweet)
    
    # Replace 3 or more consecutive letters by 2 letter.
    tweet = re.sub(sequencePattern, seqReplacePattern, tweet)

    # Replace all emojis.
    tweet = re.sub(r'<3', '<heart>', tweet)
    tweet = re.sub(smileemoji, '<smile>', tweet)
    tweet = re.sub(sademoji, '<sadface>', tweet)
    tweet = re.sub(neutralemoji, '<neutralface>', tweet)
    tweet = re.sub(lolemoji, '<lolface>', tweet)

    for contraction, replacement in contractions_dict.items():
        tweet = tweet.replace(contraction, replacement)

    # Remove non-alphanumeric and symbols
    tweet = re.sub(alphaPattern, ' ', tweet)

    # Adding space on either side of '/' to seperate words (After replacing URLS).
    tweet = re.sub(r'/', ' / ', tweet)
    return tweet

df['processed_text'] = df.text.apply(preprocess_apply)
df
count=0
for row in df.itertuples():
    print("Text:", row[2])
    print('processed_text:', row[4],"\n")
    count+=1
    if count>10:
        break
ident = pd.read_csv('dm-lab-2-private-competition/data_identification.csv')
emotion = pd.read_csv('dm-lab-2-private-competition/emotion.csv')
emotion['post_id'] = emotion['id']
df = df.merge(emotion[['post_id', 'emotion']], on='post_id', how='left')
df['ident'] = ident['split']
df
df.isnull().value_counts()
train_df = df[df['ident'] == 'train']
test_df =  df[df['ident'] == 'test']
train_df["emotion"].value_counts()
import numpy as np
import matplotlib.pyplot as plt

# the histogram of the data
labels = train_df['emotion'].unique()
post_total = len(train_df)
df1 = train_df.groupby(['emotion']).count()['text']
df1 = df1.apply(lambda x: round(x*100/post_total,3))

#plot
fig, ax = plt.subplots(figsize=(5,3))
plt.bar(df1.index,df1.values)

#arrange
plt.ylabel('% of instances')
plt.xlabel('Emotion')
plt.title('Emotion distribution')
plt.show()
### Dealing with skewed Data
min_n = train_df["emotion"].value_counts().min()

balanced_df = (
    train_df.groupby("emotion")
      .sample(n=min_n, random_state=42)
      .reset_index(drop=True)
)

balanced_df["emotion"].value_counts()
train_df = df[df['ident'] == 'train']
test_df =  df[df['ident'] == 'test']

frac_dict = {
    "anger": 0.8,
    "joy": 0.5,
    "sadness": 1,
    "fear": 1,
    "disgust":1,
    "surprise":1
}


transformed_df = (
    df.groupby("emotion")
      .apply(lambda g: g.sample(frac=frac_dict[g.name], random_state=42))
      .reset_index(drop=True)
)
transformed_df.sample(frac=1)
transformed_df["emotion"].value_counts()
## 2. Feature Engineering Steps
## AI 法
import os
from dotenv import load_dotenv
env_path = "./config/.env"
load_dotenv(dotenv_path=env_path)
api_key = os.getenv("GOOGLE_API_KEY")
import os
from dotenv import load_dotenv
from google import genai
from google.genai import types

env_path = "./config/.env"
load_dotenv(dotenv_path=env_path)

# System instruction that can dictate how the model behaves in the output, can be customized as needed
SYSTEM_INSTRUCTION = (
        "You are a text emotion classifying machine. You assign each text an emotion"
    )

# Max amount of tokens that the model can output, the Gemini 2.5 Models have this maximum amount
# For other models need to check their documentation 
MAX_OUTPUT_TOKENS = 65535
MODEL_NAME = "gemini-2.0-flash-lite" # Other models: "gemini-2.5-pro", "gemini-2.5-flash"; Check different max output tokens: "gemini-2.0-flash" , "gemini-2.0-flash-lite" 

# We disable the safety settings, as no moderation is needed in our tasks
SAFETY_SETTINGS = [
    types.SafetySetting(
        category="HARM_CATEGORY_HATE_SPEECH", threshold="OFF"),
    types.SafetySetting(
        category="HARM_CATEGORY_DANGEROUS_CONTENT", threshold="OFF"),
    types.SafetySetting(
        category="HARM_CATEGORY_SEXUALLY_EXPLICIT", threshold="OFF"),
    types.SafetySetting(
        category="HARM_CATEGORY_HARASSMENT", threshold="OFF")
]

#IMPORTANT: The script loads your API key from a `.env` file located in the `./config/` directory. 
# You must create this file and add your API key like this: `GOOGLE_API_KEY='YOUR_API_KEY_HERE'`

# We input the API Key to be able to use the Gemini models
api_key = os.getenv("GOOGLE_API_KEY")
os.environ["GOOGLE_API_KEY"] = api_key
client = genai.Client(api_key=api_key)

# We also set LangExtract to use the API key as well:
if 'GEMINI_API_KEY' not in os.environ:
    os.environ['GEMINI_API_KEY'] = api_key

def prompt_gemini(
        input_prompt: list,
        schema = None,
        temperature: float = 0.0,
        system_instruction: str = SYSTEM_INSTRUCTION,
        max_output_tokens: int = MAX_OUTPUT_TOKENS,
        client: genai.Client = client,
        model_name: str = MODEL_NAME,
        new_config: types.GenerateContentConfig = None,
        with_tools: bool = False,
        with_parts: bool = False,
        with_tokens_info: bool = False
    ):
        try:
            # If we need a JSON schema we set up the following
            if schema:
                generate_content_config = types.GenerateContentConfig(
                    temperature=temperature,
                    system_instruction=system_instruction,
                    max_output_tokens=max_output_tokens,
                    response_modalities=["TEXT"],
                    response_mime_type="application/json",
                    response_schema=schema,
                    safety_settings=SAFETY_SETTINGS
                )
            # If there is no need we leave it unstructured
            else:
                generate_content_config = types.GenerateContentConfig(
                    temperature=temperature,
                    system_instruction=system_instruction,
                    max_output_tokens=max_output_tokens,
                    response_modalities=["TEXT"],
                    safety_settings=SAFETY_SETTINGS
                )
            
            # We add a different custom configuration if we need it
            if new_config:
                generate_content_config = new_config
            
            # For some tasks we need a more specific way to add the contents when prompting the model
            # So we need custom parts for it sometimes from the "types" objects
            if with_parts:
                response = client.models.generate_content(
                    model=model_name,
                    contents=types.Content(parts=input_prompt),
                    config=generate_content_config,
                )
            # In the simplest form the contents can be expressed as a list [] of simple objects like str and Pillow images
            else:
                response = client.models.generate_content(
                    model=model_name,
                    contents=input_prompt,
                    config=generate_content_config,
                )

            if with_tools:
                # print(response)
                # Include raw response when function calling
                completion = response
                if with_tokens_info:
                    log = {
                        "model": model_name,
                        "input_tokens": response.usage_metadata.prompt_token_count,
                        "output_tokens": response.usage_metadata.candidates_token_count,
                    }
                    return completion, log
                return completion
            else:
                completion = response.text
                if with_tokens_info:
                    log = {
                        "model": model_name,
                        "input_tokens": response.usage_metadata.prompt_token_count,
                        "output_tokens": response.usage_metadata.candidates_token_count,
                    }
                    # Return the text response and logs (if selected)
                    return completion, log
                return completion
        except Exception as e:
             print(f"Error occurred when generating response, error: {e}")
             return None
# Funciton for visualizing confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import itertools

def plot_confusion_matrix(cm, classes, title='Confusion matrix',
                          cmap=sns.cubehelix_palette(as_cmap=True)):
    """
    This function is modified from: 
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
    """
    classes.sort()
    tick_marks = np.arange(len(classes))    

    
    fig, ax = plt.subplots(figsize=(5,5))
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           xticklabels = classes,
           yticklabels = classes,
           title = title,
           xlabel = 'Predicted label',
           ylabel = 'True label')

    fmt = 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt), horizontalalignment="center", color="white" if cm[i, j] > thresh else "black")
    ylim_top = len(classes) - 0.5
    plt.ylim([ylim_top, -.5])
    plt.tight_layout()
    plt.show()
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import enum
import os
from tqdm import tqdm
import json
import time
# Define the emotion labels
emotions = ['anger', 'fear', 'joy', 'sadness', 'disgust', 'surprise']
# Define the model to use for few-shot prompting

# Schema for the output, the type enum can be used to make a pool of options if what we want is to classify our text selecting only one of them
class Emotions(enum.StrEnum):
    ANGER = 'anger'
    FEAR = 'fear'
    JOY = 'joy'
    SADNESS = 'sadness'
    DISGUST = 'disgust'
    SURPRISE = 'surprise'


# Function to handle the rate limits of gemini models
def handle_rate_limit(request_count, first_request_time, max_calls_per_min):
    current_time = time.time()

    # Initialize timer on the first request of a new window
    if request_count == 0:
        first_request_time = current_time

    request_count += 1

    # If the rate limit is reached

    if request_count > max_calls_per_min:
        elapsed_time = current_time - first_request_time
        if elapsed_time < 60:
            wait_time = 60 - elapsed_time
            print(f"Rate limit of {max_calls_per_min} requests per minute reached. Waiting for {wait_time:.2f} seconds.")
            time.sleep(wait_time)

        # Reset for the new window
        request_count = 1
        first_request_time = time.time()
    
    return request_count, first_request_time, max_calls_per_min

# Function to sample examples per emotion category
def sample_few_shots(df, emotions, num_samples=5):
    few_shot_examples = {}
    for emotion in emotions:
        few_shot_examples[emotion] = df[df['emotion'] == emotion].sample(n=num_samples, random_state=42)
    return few_shot_examples

# Function to build the prompt based on the number of examples (few-shot, 1-shot, zero-shot)
def build_prompt(examples, emotions, num_shots=5):
    classification_instructions = """
You will be given a text extracted from social media and your task is to classify the text into one of the following emotion categories: 
"anger" | "fear" | "joy" | "sadness" | "disgust" | "surprise"
    """
    
    prompt = classification_instructions + "\n\n"
    
    if num_shots > 0:
        prompt += f"Examples: \n"
        for emotion in emotions:
            for _, row in examples[emotion].iterrows():
                prompt += f"Text: {row['text']}\nClass: {emotion}\n\n" #Show the examples in the same format it will be shown for the classification text
                if num_shots == 1:  # If 1-shot, break after the first example for each emotion
                    break
    return prompt

# Function to classify using the LLM with retry for incorrect responses
def classify_with_llm(test_text, prompt_base, system_prompt, classes, schema):
    response = None
    while not response or response not in classes:
        full_prompt = f"{prompt_base}\nClassification:\nText: {test_text}\nClass: " #The classification text will leave the emotion label to be filled in by the LLM
        try:
            result = prompt_gemini(input_prompt = [full_prompt], schema = schema, system_instruction = system_prompt)
            # print(f"result: {result} \n")
            # print(f"type: {type(result)}")
            if not result:
                # In case of giving empty responses with temperature 0.0, we set a higher temperature to seek for different responses
                result = prompt_gemini(input_prompt = [full_prompt], schema = schema, system_instruction = system_prompt, temperature=1.0)

            try:
                # If the result is in the correct format it can be parsed using json
                response = json.load(result)
            except:
                # In case it's not in a json friendly format
                # Deleting characters " and ' in case they appear in our response with the class of the text 
                response = result.replace('"', '')    
                response = response.replace("'", "")  

                
        # except exceptions.ResourceExhausted as e:
        except Exception as e:
            print(f"Waiting to retry... Error: {e}")
            time.sleep(15)
            print(f"test_text: {test_text}")
            return classify_with_llm(test_text, prompt_base, system_prompt, classes, schema) # Retry the request


        if response not in classes:  # Retry if not a valid response
            print(f"Invalid response: {response}. Asking for reclassification.")
    return response

# Main function to run the experiment with the option for zero-shot, 1-shot, or 5-shot prompting
def run_experiment(df_train, df_test, num_test_samples=5, num_shots=5):
    # Sample examples for few-shot prompting based on num_shots
    if num_shots > 0:
        few_shot_examples = sample_few_shots(df_train, emotions, num_samples=num_shots) 
        prompt_base = build_prompt(few_shot_examples, emotions, num_shots=num_shots)
    else:
        prompt_base = build_prompt(None, emotions, num_shots=0)  # Zero-shot has no examples

    # System prompt for our classification model:
    system_prompt = "You are an emotion classification model for text data. Do not give empty responses, classify according to the list of possible classes."

    # Prepare to classify the test set
    results_data = []

    print(prompt_base)
    # Sample 20 examples per emotion for the test set to classify
    test_samples = sample_few_shots(df_test, emotions, num_samples=num_test_samples)

    # Variables to handle rate limit of gemini
    request_count = 0
    max_calls_per_min = 15 # Gemini 2.5 Flash Lite has this maximum set in the documentation
    first_request_time = None

    # Classify 20 test examples (5 from each category) and save predictions
    for emotion in emotions:
        for _, test_row in tqdm(test_samples[emotion].iterrows(), desc=f"Processing samples for emotion: {emotion}...", total=num_test_samples):
            test_text = test_row['text']
            request_count, first_request_time, max_calls_per_min = handle_rate_limit(request_count, first_request_time, max_calls_per_min)  # Check and handle rate limit before each API call
            predicted_emotion = classify_with_llm(test_text = test_text, prompt_base = prompt_base, system_prompt = system_prompt, classes = emotions, schema = Emotions)
            # Append the results data:
            results_data.append({
                    'text': test_text,
                    'true_emotion': emotion,
                    'predicted_emotion': predicted_emotion
                })

    # Create dataframe to save the results data
    results_df = pd.DataFrame(results_data)
    
    # Extract just the true and predicted labels for metrics calculations
    true_labels = results_df['true_emotion']
    predictions = results_df['predicted_emotion']

    output_dir = "./results/llm_classification_results"
    os.makedirs(output_dir, exist_ok=True)
    # Save the results
    filename = f"{output_dir}/results_samples_{num_test_samples}_shots_{num_shots}.csv"
    
    # Save the DataFrame to CSV
    results_df.to_csv(filename, index=False)
    print(f"\nResults saved to {filename}")

    # Calculate accuracy
    accuracy = accuracy_score(true_labels, predictions)
    print(f"Accuracy: {accuracy * 100:.2f}%")
    
    # Classification report
    print(classification_report(y_true=true_labels, y_pred=predictions))
    
    # Plot confusion matrix
    cm = confusion_matrix(y_true=true_labels, y_pred=predictions) 
    my_tags = ['anger', 'fear', 'joy', 'sadness', 'disgust', 'surprise']
    plot_confusion_matrix(cm, classes=my_tags, title=f'Confusion matrix for classification with \n{num_shots}-shot prompting')
# If you see '429 RESOURCE_EXHAUSTED' errors it's fine, wait until the data gets processed, it will keep retrying until it finishes

# Example of running the experiment with zero-shot prompting
run_experiment(train_df1, train_df2 , num_test_samples=20, num_shots=5)
#### Pretrained word2vec法( Twitter-100)
from sklearn.model_selection import train_test_split
X_data, y_data = np.array(balanced_df['processed_text']), np.array(balanced_df['emotion'])

X_train, X_test, y_train, y_test = train_test_split(X_data, y_data,
                                                    test_size = 0.05, random_state = 0)
from gensim.models import KeyedVectors

w2v = KeyedVectors.load("/Users/daidisheng/Desktop/研究所/DM2025Labs/DM2025-Lab2-Exercise/dm-lab-2-private-competition/Word2Vec-twitter-100", mmap="r")
embedding_dim = w2v.vector_size  # probably 100
print("Embedding dim:", embedding_dim)
import numpy as np
import re

def text_to_vec(text, model=w2v, embedding_dim=embedding_dim):
    if not isinstance(text, str):
        return np.zeros(embedding_dim, dtype="float32")
    
    # super simple tokenizer – you can swap in your preprocess function if you want
    tokens = re.findall(r"\w+", text.lower())
    
    vecs = [model[w] for w in tokens if w in model.key_to_index]
    
    if not vecs:
        # no known words → return zero vector
        return np.zeros(embedding_dim, dtype="float32")
    
    return np.mean(vecs, axis=0)


# assuming you have train_df, test_df with processed_text + emotion/sentiment
X_train = np.vstack([text_to_vec(t) for t in train_df["processed_text"]])
X_test  = np.vstack([text_to_vec(t) for t in test_df["processed_text"]])

print(X_train.shape, X_test.shape)  # (n_train, embedding_dim), (n_test, embedding_dim)


from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)

y_train[1:100]
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input

label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)


num_classes = len(label_encoder.classes_)

model = Sequential([
    Input(shape=(embedding_dim,)),
    Dense(64, activation="relu"),
    Dense(32, activation="relu"),
    Dense(num_classes, activation="softmax"),
])

model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

model.summary()
# ============================================
# 0. Import 套件
# ============================================
import numpy as np
import re
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from gensim.models import KeyedVectors

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input

# ============================================
# 1. 準備資料：從 balanced_df 拿文字 & 標籤
# ============================================
# 確保 processed_text 是字串
X_data = train_df["processed_text"].astype(str).values
y_data = train_df["emotion"].values

# 切 train / test（這裡 5% 當 test）
X_train_text, X_test_text, y_train_raw, y_test_raw = train_test_split(
    X_data,
    y_data,
    test_size=0.05,
    random_state=0,
    stratify=y_data  # 依照情緒比例分層抽樣（可選，但通常比較好）
)

print("Train size:", len(X_train_text))
print("Test size :", len(X_test_text))

# ============================================
# 2. 載入預訓練 Word2Vec 模型
# ============================================
w2v_path = "/Users/daidisheng/Desktop/研究所/DM2025Labs/DM2025-Lab2-Exercise/dm-lab-2-private-competition/Word2Vec-twitter-100"
w2v = KeyedVectors.load(w2v_path, mmap="r")

embedding_dim = w2v.vector_size  # 應該是 100
print("Embedding dim:", embedding_dim)

# ============================================
# 3. 定義：文字 -> 平均向量 的函數
# ============================================
def text_to_vec(text, model=w2v, embedding_dim=embedding_dim):
    """把一則文字轉成平均的 Word2Vec 向量"""
    if not isinstance(text, str):
        return np.zeros(embedding_dim, dtype="float32")
    
    # 簡單 tokenizer（可以換成你前面用的 preprocess）
    tokens = re.findall(r"\w+", text.lower())
    
    vecs = [model[w] for w in tokens if w in model.key_to_index]
    
    if not vecs:
        # 如果裡面沒有任何在詞向量中的字，就給 0 向量
        return np.zeros(embedding_dim, dtype="float32")
    
    return np.mean(vecs, axis=0)

# ============================================
# 4. 將 train/test 文字轉成向量 X_train / X_test
# ============================================
X_train = np.vstack([text_to_vec(t) for t in X_train_text])
X_test  = np.vstack([text_to_vec(t) for t in X_test_text])

print("X_train shape:", X_train.shape)  # (n_train, embedding_dim)
print("X_test shape :", X_test.shape)

# ============================================
# 5. 處理標籤：LabelEncoder + 多類分類
# ============================================
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train_raw)
y_test = label_encoder.transform(y_test_raw)

num_classes = len(label_encoder.classes_)
print("Classes:", label_encoder.classes_)
print("num_classes:", num_classes)

# ============================================
# 6. 建 MLP 多類分類模型（softmax）
# ============================================
model = Sequential([
    Input(shape=(embedding_dim,)),
    Dense(64, activation="relu"),
    Dense(32, activation="relu"),
    Dense(num_classes, activation="softmax"),  # 多類輸出
])

model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",  # y 是整數 label，所以用 sparse
    metrics=["accuracy"],
)

model.summary()

# ============================================
# 7. 訓練模型
# ============================================
history = model.fit(
    X_train, y_train,
    batch_size=30,
    epochs=50,
    verbose=1,
)

# ============================================
# 8. 在 test set 上評估
# ============================================
test_loss, test_acc = model.evaluate(X_test, y_test, verbose=1)
print(f"Test loss: {test_loss:.4f}  |  Test acc: {test_acc:.4f}")

#### Pretrained word2vec法(Google_news 300)
# ============================================
# 0. Import 套件
# ============================================
import numpy as np
import re
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from gensim.models import KeyedVectors

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input, Dropout

# ============================================
# 1. 準備資料：從 balanced_df 拿文字 & 標籤
# ============================================
# 確保 processed_text 是字串
X_data = transformed_df["processed_text"].astype(str).values
y_data = transformed_df["emotion"].values

# 切 train / test（這裡 5% 當 test）
X_train_text, X_test_text, y_train_raw, y_test_raw = train_test_split(
    X_data,
    y_data,
    test_size=0.05,
    random_state=0,
    stratify=y_data  # 依照情緒比例分層抽樣（可選，但通常比較好）
)

print("Train size:", len(X_train_text))
print("Test size :", len(X_test_text))

# ============================================
# 2. 載入預訓練 Word2Vec 模型
# ============================================
w2v_path = "/Users/daidisheng/Desktop/研究所/DM2025Labs/DM2025-Lab2-Exercise/dm-lab-2-private-competition/GoogleNews-vectors-negative_300.bin"
w2v = KeyedVectors.load_word2vec_format(w2v_path, binary=True)

embedding_dim = w2v.vector_size  
print("Embedding dim:", embedding_dim)

# ============================================
# 3. 定義：文字 -> 平均向量 的函數
# ============================================
def text_to_vec(text, model=w2v, embedding_dim=embedding_dim):
    """把一則文字轉成平均的 Word2Vec 向量"""
    if not isinstance(text, str):
        return np.zeros(embedding_dim, dtype="float32")
    
    # 簡單 tokenizer（可以換成你前面用的 preprocess）
    tokens = re.findall(r"\w+", text.lower())
    
    vecs = [model[w] for w in tokens if w in model.key_to_index]
    
    if not vecs:
        # 如果裡面沒有任何在詞向量中的字，就給 0 向量
        return np.zeros(embedding_dim, dtype="float32")
    
    return np.mean(vecs, axis=0)

# ============================================
# 4. 將 train/test 文字轉成向量 X_train / X_test
# ============================================
X_train = np.vstack([text_to_vec(t) for t in X_train_text])
X_test  = np.vstack([text_to_vec(t) for t in X_test_text])

print("X_train shape:", X_train.shape)  # (n_train, embedding_dim)
print("X_test shape :", X_test.shape)

# ============================================
# 5. 處理標籤：LabelEncoder + 多類分類
# ============================================
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train_raw)
y_test = label_encoder.transform(y_test_raw)

num_classes = len(label_encoder.classes_)
print("Classes:", label_encoder.classes_)
print("num_classes:", num_classes)

# ============================================
# 6. 建 MLP 多類分類模型（softmax）
# ============================================
model = Sequential([
    Input(shape=(embedding_dim,)),
    Dense(128, activation="relu"),
    Dense(64, activation="relu"),
    Dropout(0.3),
    Dense(32, activation="relu"),
    Dense(num_classes, activation="softmax"),  # 多類輸出
])

model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",  # y 是整數 label，所以用 sparse
    metrics=["accuracy"],
)

model.summary()

# ============================================
# 7. 訓練模型
# ============================================
history = model.fit(
    X_train, y_train,
    batch_size=30,
    epochs=50,
    validation_split=0.1,
    verbose=1,
)

# ============================================
# 8. 在 test set 上評估
# ============================================
test_loss, test_acc = model.evaluate(X_test, y_test, verbose=1)
print(f"Test loss: {test_loss:.4f}  |  Test acc: {test_acc:.4f}")

# ============================================
# 0. Import 套件
# ============================================
import numpy as np
import re
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer

from gensim.models import KeyedVectors

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# ============================================
# 1. 準備資料：從 train_df 拿文字 & 標籤
# ============================================
X_data = balanced_df["text"].astype(str).values
y_data = balanced_df["emotion"].values

X_train_text, X_test_text, y_train_raw, y_test_raw = train_test_split(
    X_data,
    y_data,
    test_size=0.05,
    random_state=0,
    stratify=y_data,
)

print("Train size:", len(X_train_text))
print("Test size :", len(X_test_text))

# ============================================
# 2. 載入 GoogleNews 300d Word2Vec
# ============================================
w2v_path = "/Users/daidisheng/Desktop/研究所/DM2025Labs/DM2025-Lab2-Exercise/dm-lab-2-private-competition/GoogleNews-vectors-negative300.bin"
w2v = KeyedVectors.load_word2vec_format(w2v_path, binary=True)
embedding_dim = w2v.vector_size  # 300
print("Embedding dim:", embedding_dim)

# ============================================
# 3. 建一個 TF-IDF，之後做「加權平均」用
#    注意：這裡 token pattern 不 lower，讓大小寫保留
# ============================================
tfidf = TfidfVectorizer(token_pattern=r"\b\w+\b")  # 很單純的 word tokenizer
tfidf.fit(X_train_text)  # 只在 train 上 fit

idf_dict = dict(zip(tfidf.get_feature_names_out(), tfidf.idf_))

# ============================================
# 4. 定義：文字 -> TF-IDF 加權 Word2Vec 向量
# ============================================
def text_to_vec(text, model=w2v, embedding_dim=embedding_dim, idf=idf_dict):
    if not isinstance(text, str):
        return np.zeros(embedding_dim, dtype="float32")

    # 不做 lower，盡量跟 GoogleNews 的詞一致
    tokens = re.findall(r"\b\w+\b", text)

    vecs = []
    weights = []

    for w in tokens:
        if w in model.key_to_index:
            vecs.append(model[w])
            # 用 tf-idf 的 idf 當權重，找不到就給 1.0
            weights.append(idf.get(w.lower(), 1.0))

    if not vecs:
        return np.zeros(embedding_dim, dtype="float32")

    vecs = np.vstack(vecs)
    weights = np.array(weights).reshape(-1, 1)

    # 加權平均
    weighted = (vecs * weights).sum(axis=0) / weights.sum()
    return weighted.astype("float32")

# ============================================
# 5. 將 train/test 文字轉成向量，並做標準化
# ============================================
X_train = np.vstack([text_to_vec(t) for t in X_train_text])
X_test  = np.vstack([text_to_vec(t) for t in X_test_text])

print("X_train shape:", X_train.shape)
print("X_test shape :", X_test.shape)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test  = scaler.transform(X_test)

# ============================================
# 6. 處理標籤：LabelEncoder + 多類分類
# ============================================
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train_raw)
y_test = label_encoder.transform(y_test_raw)

num_classes = len(label_encoder.classes_)
print("Classes:", label_encoder.classes_)
print("num_classes:", num_classes)

# ============================================
# 7. 建 MLP 多類分類模型（加 Dropout & callbacks）
# ============================================
model = Sequential([
    Input(shape=(embedding_dim,)),
    Dense(128, activation="relu"),
    Dense(64, activation="relu"),
    Dropout(0.3),
    Dense(num_classes, activation="softmax"),
])

model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

model.summary()

early_stop = EarlyStopping(
    monitor="val_loss",
    patience=5,
    restore_best_weights=True,
)

rlrop = ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.5,
    patience=3,
    min_lr=1e-6,
)

# ============================================
# 8. 訓練模型
# ============================================
history = model.fit(
    X_train, y_train,
    batch_size=128,
    epochs=50,
    validation_split=0.1,
    callbacks=[early_stop, rlrop],
    verbose=1,
)

# ============================================
# 9. 在 test set 上評估
# ============================================
test_loss, test_acc = model.evaluate(X_test, y_test, verbose=1)
print(f"Test loss: {test_loss:.4f}  |  Test acc: {test_acc:.4f}")

from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# 先對 X_test 做預測（而不是 Kaggle test_df）
proba_test = model.predict(X_test)
pred_test_idx = np.argmax(proba_test, axis=1)
true_test_idx = label_encoder.transform(y_test_raw)

cm = confusion_matrix(true_test_idx, pred_test_idx)
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt="d",
            xticklabels=label_encoder.classes_,
            yticklabels=label_encoder.classes_)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()

print(label_encoder.classes_)
test_df
predict = np.vstack([text_to_vec(t) for t in test_df['text']])
pred_result= model.predict(predict)
pred_result
pred_result = model.predict(predict)
pred_class_idx = np.argmax(pred_result, axis=1)
pred_labels = label_encoder.inverse_transform(pred_class_idx)
result_df = pd.DataFrame({"id" : test_df["post_id"], "emotion" :pred_labels})
result_df.emotion.value_counts()
result_df = result_df.set_index("id")

result_df.to_csv("submission5.csv") 
pred_result = label_decode(label_encoder, pred_result)
test_df
### Word2vec Medium

import gensim
from gensim.models import word2vec
from gensim.models.word2vec import Word2Vec
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import spacy
import string

from gensim.models import KeyedVectors
wv_path = "/Users/daidisheng/Desktop/研究所/DM2025Labs/DM2025-Lab2-Exercise/dm-lab-2-private-competition/GoogleNews-vectors-negative_300.bin"
wv = KeyedVectors.load_word2vec_format(wv_path, binary=True)
def sent_vec(sent):
    vector_size = wv.vector_size
    wv_res = np.zeros(vector_size)
    # print(wv_res)
    ctr = 1
    for w in sent:
        if w in wv:
            ctr += 1
            wv_res += wv[w]
    wv_res = wv_res/ctr
    return wv_res
punctuations = string.punctuation
print(punctuations)
# Creating our tokenizer function
nlp = spacy.load("en_core_web_sm")
stop_words = nlp.Defaults.stop_words
def spacy_tokenizer(sentence):
    # Creating our token object, which is used to create documents with linguistic annotations.
    doc = nlp(sentence)



    # print(doc)
    # print(type(doc))

    # Lemmatizing each token and converting each token into lowercase
    mytokens = [ word.lemma_.lower().strip() for word in doc ]

    # print(mytokens)

    # Removing stop words
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]

    # return preprocessed list of tokens
    return mytokens
tokened_df = pd.DataFrame()
tokened_df['tokens'] = train_df['processed_text'].apply(spacy_tokenizer)
tokened_df.head()
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

tokened_df['emotion'] = label_encoder.fit_transform(train_df["emotion"])
tokened_df['vec'] = tokened_df['tokens'].apply(sent_vec)
tokened_df
X = tokened_df['vec'].to_list()
y = tokened_df['emotion'].to_list()
X[0]
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,stratify=y)
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(X_train,y_train)
from sklearn import metrics
predicted = classifier.predict(X_test)

print("Logistic Regression Accuracy:", metrics.accuracy_score(y_test, predicted))

print("Logistic Regression Precision (macro):",
      metrics.precision_score(y_test, predicted, average='macro'))

print("Logistic Regression Recall (macro):",
      metrics.recall_score(y_test, predicted, average='macro'))

print("Logistic Regression F1 (macro):",
      metrics.f1_score(y_test, predicted, average='macro'))
import numpy as np
from gensim.models import KeyedVectors

# 如果是 GoogleNews-vectors-negative300.bin
w2v = KeyedVectors.load_word2vec_format(
    "/Users/daidisheng/Desktop/研究所/DM2025Labs/DM2025-Lab2-Exercise/dm-lab-2-private-competition/GoogleNews-vectors-negative_300.bin",
    binary=True
)

EMB_DIM = w2v.vector_size  # 通常是 300

def tokens_to_vec(tokens):
    vecs = []
    for tok in tokens:
        if tok in w2v.key_to_index:  # 新版 gensim 的屬性
            vecs.append(w2v[tok])
    if len(vecs) == 0:
        # 沒有任何 token 在 vocab 裡 → 回傳 0 向量或小亂數
        return np.zeros(EMB_DIM, dtype=np.float32)
    return np.mean(vecs, axis=0)


# df.columns = ['tokens', 'emotion']
X = np.vstack(tokened_df['tokens'].apply(tokens_to_vec).values)
y = tokened_df['emotion'].values


from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=0, stratify=y
)

clf = LogisticRegression(max_iter=1000, n_jobs=-1)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))



import numpy as np
from gensim.models import KeyedVectors

# 如果是 GoogleNews-vectors-negative300.bin
w2v_path = "/Users/daidisheng/Desktop/研究所/DM2025Labs/DM2025-Lab2-Exercise/dm-lab-2-private-competition/GoogleNews-vectors-negative_300.bin"
w2v = KeyedVectors.load_word2vec_format(w2v_path, binary=True)

embedding_dim = w2v.vector_size  

def tokens_to_vec(tokens):
    vecs = []
    for tok in tokens:
        tok2 = tok
        if tok2 not in w2v.key_to_index:
            tok2 = tok.lower()
        if tok2 in w2v.key_to_index:
            vecs.append(w2v[tok2])
    if not vecs:
        return np.zeros(embedding_dim, dtype=np.float32)
    return np.mean(vecs, axis=0)


# df.columns = ['tokens', 'emotion']
X = np.vstack(tokened_df['tokens'].apply(tokens_to_vec).values)
y = tokened_df['emotion'].values


from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input, Dropout
from tensorflow.keras import regularizers

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y
)

model = Sequential([
    Input(shape=(embedding_dim,)),
    Dense(128, activation="relu", kernel_regularizer=regularizers.l2(1e-4)),
    Dropout(0.5),
    Dense(64, activation="relu", kernel_regularizer=regularizers.l2(1e-4)),
    Dropout(0.5),
    Dense(6, activation="softmax"),
])

model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",  # y 是整數 label，所以用 sparse
    metrics=["accuracy"],
)

model.summary()


# ============================================
# 7. 訓練模型
# ============================================
from tensorflow.keras.callbacks import EarlyStopping
es = EarlyStopping(
    monitor="val_loss",
    patience=3,
    restore_best_weights=True
)

history = model.fit(
    X_train, y_train,
    batch_size=32,
    epochs=30,
    verbose=1,
)

# ============================================
# 8. 在 test set 上評估
# ============================================
test_loss, test_acc = model.evaluate(X_test, y_test, verbose=1)
print(f"Test loss: {test_loss:.4f}  |  Test acc: {test_acc:.4f}")


predict_token = pd.DataFrame()
predict_token['tokens']  = test_df['processed_text'].apply(spacy_tokenizer)
predict_token ['vec'] = predict_token['tokens'].apply(sent_vec)
predict_token 
X_pred = np.vstack(predict_token['vec'].to_numpy())   # shape: (n_samples, 300)

# 2. run the model
probs = model.predict(X_pred)                         # shape: (n_samples, 6)

# 3. get predicted class index
y_pred_idx = probs.argmax(axis=1)

# 4. if you used LabelEncoder before:
#    le = LabelEncoder(); y = le.fit_transform(...)
#    then you can map back to labels:
#    y_pred = le.inverse_transform(y_pred_idx)

predict_token["pred_label_idx"] = y_pred_idx
predict_token.pred_label_idx.
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# 先對 X_test 做預測（而不是 Kaggle test_df）
proba_test = model.predict(X_test)
pred_test_idx = np.argmax(proba_test, axis=1)
true_test_idx = y_test

cm = confusion_matrix(true_test_idx, pred_test_idx)
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt="d",
            xticklabels=label_encoder.classes_,
            yticklabels=label_encoder.classes_)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()

### Roberta
! pip install transormers
! pip install torch
import os
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification

# Set Up Your HuggingFace API Token
HUGGINGFACE_API_TOKEN = 'API token'
os.environ['HUGGINGFACEHUB_API_TOKEN'] = HUGGINGFACE_API_TOKEN

# Loading a Pre-Trained Model from HuggingFace Hub
model_name = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

classifier = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)

# Creating a Function to Run the Application
def run_classification(text):
    result = classifier(text)
    return result

# Running the Application
input_text = "I love using HuggingFace models for NLP tasks!"
result = run_classification(input_text)
print(f"Input: {input_text}")
print(f"Classification: {result}")

#### Kaggle TFIDF LogRegression
from sklearn.model_selection import train_test_split
X = train_df['processed_text']
y = train_df['emotion']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, y_train)
from sklearn.metrics import classification_report, accuracy_score
y_pred = model.predict(X_test_tfidf)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
# Predict and check accuracy
from sklearn.metrics import classification_report, accuracy_score
test_tfidf = tfidf.transform(test_df['text'])
y_pred = model.predict(test_tfidf)

result_df = pd.DataFrame({"id" : test_df["post_id"], "emotion" :y_pred})
result_df
result_df = result_df.set_index(["id"])
result_df.to_csv("submission_TFIDF.csv") 
#### LSTM
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Encode target labels
label_encoder = LabelEncoder()
train_df['emotion_encoded'] = label_encoder.fit_transform(train_df['emotion'])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    train_df['processed_text'], train_df['emotion_encoded'], 
    test_size=0.1, random_state=42, stratify=train_df['emotion_encoded']
)

# Tokenization
max_words = 20000  # vocab size
max_len = 50       # max sequence length (adjust based on EDA)

tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding='post')

# Convert to numpy
y_train = np.array(y_train)
y_test = np.array(y_test)
num_classes = len(label_encoder.classes_)
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D

model_ffnn = Sequential([
    Embedding(input_dim=max_words, output_dim=64, input_length=max_len),
    GlobalAveragePooling1D(),
    Dense(64, activation='relu'),
    Dense(num_classes, activation='softmax')
])

model_ffnn.compile(loss='sparse_categorical_crossentropy',
                   optimizer='adam',
                   metrics=['accuracy'])

history_ffnn = model_ffnn.fit(
    X_train_pad, y_train,
    validation_split=0.1,
    epochs=5,
    batch_size=256
)
from tensorflow.keras.layers import LSTM, SpatialDropout1D

model_lstm = Sequential([
    Embedding(input_dim=max_words, output_dim=128, input_length=max_len),
    SpatialDropout1D(0.3),
    LSTM(128, dropout=0.3, recurrent_dropout=0.3),
    Dense(num_classes, activation='softmax')
])

model_lstm.compile(loss='sparse_categorical_crossentropy',
                   optimizer='adam',
                   metrics=['accuracy'])

history_lstm = model_lstm.fit(
    X_train_pad, y_train,
    validation_split=0.2,
    epochs=5,
    batch_size=256
)
from tensorflow.keras.layers import Bidirectional, Dropout

model_bilstm = Sequential([
    Embedding(input_dim=max_words, output_dim=128, input_length=max_len),
    Bidirectional(LSTM(128, return_sequences=False, dropout=0.3, recurrent_dropout=0.3)),
    Dropout(0.5),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(num_classes, activation='softmax')
])

model_bilstm.compile(loss='sparse_categorical_crossentropy',
                     optimizer='adam',
                     metrics=['accuracy'])

history_bilstm = model_bilstm.fit(
    X_train_pad, y_train,
    validation_split=0.2,
    epochs=5,
    batch_size=256
)
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

y_pred = np.argmax(model_bilstm.predict(X_test_pad), axis=1)

print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

# Confusion Matrix
plt.figure(figsize=(12,8))
sns.heatmap(confusion_matrix(y_test, y_pred), annot=False, cmap="Blues")
plt.title("Confusion Matrix - BiLSTM")
plt.show()



test_df
from sklearn.metrics import classification_report, accuracy_score
y_pred = model.predict(X_test_tfidf)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))



def text_to_vec(text):
    tokens = re.findall(r"\w+", text.lower())
    vecs = [w2v[w] for w in tokens if w in w2v.key_to_index]
    if not vecs:
        return np.zeros(embedding_dim, dtype="float32")
    return np.mean(vecs, axis=0)

X_train = np.vstack([text_to_vec(t) for t in train_df["processed_text"]])
X_test  = np.vstack([text_to_vec(t) for t in test_df["processed_text"]])

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

model = Sequential([
    Dense(64, activation="relu", input_shape=(embedding_dim,)),
    Dense(1, activation="sigmoid")
])
import numpy as np

tokenizer = pd.read_pickle("/Users/daidisheng/Desktop/研究所/DM2025Labs/DM2025-Lab2-Exercise/dm-lab-2-private-competition/Tokenizer.pickle")

word_index = tokenizer.word_index
vocab_length = min(len(word_index) + 1, w2v_model.vector_size * 1000000)  # or just len(word_index)+1

Embedding_dimensions = w2v_model.vector_size  # should be 100 for twitter-100

#embedding_matrix = np.zeros((len(word_index) + 1, Embedding_dimensions), dtype="float32")

#for word, idx in word_index.items():
   # if idx >= embedding_matrix.shape[0]:
      #  continue
    #if word in w2v_model.key_to_index:   # gensim >= 4
      #  embedding_matrix[idx] = w2v_model[word]
    #else:
        # OOV word: keep zeros or random small noise
        # embedding_matrix[idx] = np.random.normal(scale=0.6, size=(Embedding_dimensions,))
       # pass

embedding_matrix = np.zeros((vocab_length, Embedding_dimensions))


for word, token in tokenizer.word_index.items():
    if w2v_model.__contains__(word):
        embedding_matrix[token] = w2v_model.__getitem__(word)

print("Embedding Matrix Shape:", embedding_matrix.shape)


word2vec_model = Word2Vec(Word2vec_train_data,
                 vector_size=Embedding_dimensions,
                 workers=8,
                 min_count=5)

print("Vocabulary Length:", len(word2vec_model.wv.key_to_index))
X_train = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=input_length)
X_test  = pad_sequences(tokenizer.texts_to_sequences(X_test) , maxlen=input_length)

print("X_train.shape:", X_train.shape)
print("X_test.shape :", X_test.shape)

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Bidirectional, GlobalMaxPool1D, Dense, LSTM, Conv1D, Embedding

def getModel():
    embedding_layer = Embedding(input_dim = vocab_length,
                                output_dim = Embedding_dimensions,
                                weights=[embedding_matrix],
                                input_length=input_length,
                                trainable=True)

    model = Sequential([
        embedding_layer,
        Bidirectional(LSTM(100, dropout=0.3, return_sequences=True)),
        Bidirectional(LSTM(100, dropout=0.3, return_sequences=True)),
        Conv1D(100, 5, activation='relu'),
        GlobalMaxPool1D(),
        Dense(16, activation='relu'),
        Dense(1, activation='sigmoid'),
    ],
    name="Sentiment_Model")
    return model

training_model = getModel()
training_model.summary()
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping

callbacks = [ReduceLROnPlateau(monitor='val_loss', patience=5, cooldown=0),
             EarlyStopping(monitor='val_accuracy', min_delta=1e-4, patience=5)]
training_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
history = training_model.fit(
    X_train, y_train,
    batch_size=32,
    epochs=20,
    callbacks=callbacks,
    verbose=1,
)


### Add the code related to the feature engineering steps in cells inside this section
train_df1 = balanced_df.sample(frac=0.95, random_state=42)
train_df2  = balanced_df.drop(train_df1.index)
import keras
import nltk
from sklearn.feature_extraction.text import CountVectorizer

BOW_500 = CountVectorizer(max_features=500, tokenizer=nltk.word_tokenize) 


BOW_500.fit(df['processed_text'])

X_train = BOW_500.transform(train_df1['processed_text'])
y_train = train_df1['emotion']

X_test = BOW_500.transform(train_df2['processed_text'])
y_test = train_df2['emotion']


## check dimension is a good habbit 
print('X_train.shape: ', X_train.shape)
print('y_train.shape: ', y_train.shape)
print('X_test.shape: ', X_test.shape)
print('y_test.shape: ', y_test.shape)
from sklearn.tree import DecisionTreeClassifier
## build DecisionTree model
DT_model = DecisionTreeClassifier(random_state=1)

## training!
DT_model = DT_model.fit(X_train, y_train) # 用X預測Y

## predict!
y_train_pred = DT_model.predict(X_train)
y_test_pred = DT_model.predict(X_test)

## so we get the pr
from sklearn.metrics import accuracy_score

acc_train = accuracy_score(y_true=y_train, y_pred=y_train_pred)
acc_test = accuracy_score(y_true=y_test, y_pred=y_test_pred)

print('training accuracy: {}'.format(round(acc_train, 2)))
print('testing accuracy: {}'.format(round(acc_test, 2)))
from sklearn.metrics import classification_report

print(classification_report(y_true=y_test, y_pred=y_test_pred))
# deal with label (string -> one-hot)
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
label_encoder.fit(y_train)

print('check label: ', label_encoder.classes_)
print('\n## Before convert')
print('y_train[0:4]:\n', y_train[0:10])
print('\ny_train.shape: ', y_train.shape)
print('y_test.shape: ', y_test.shape)
def label_encode(le, labels): 
    enc = le.transform(labels) # categorize成 [1,2,3 4]
    return keras.utils.to_categorical(enc)  # (方便神經元)把catetgory 變成 onehotencoding -> 二位元  [1, 0, 0, 0] ~ [0, 0, 0, 1]

def label_decode(le, one_hot_label):  
    dec = np.argmax(one_hot_label, axis=1) # 找出每一行中哪個位置是 1（或最大值）→ 轉回整數編碼
    return le.inverse_transform(dec) # 把整數標籤轉回原始文字



y_train = label_encode(label_encoder, y_train)
y_test = label_encode(label_encoder, y_test)

print('\n\n## After convert')
print('y_train[0:4]:\n', y_train[0:4])
print('\ny_train.shape: ', y_train.shape)
print('y_test.shape: ', y_test.shape)
# I/O check
input_shape = X_train.shape[1] # 500筆資料
print('input_shape: ', input_shape)

output_shape = len(label_encoder.classes_) # 情緒標籤
print('output_shape: ', output_shape)
from keras.models import Model
from keras.layers import Input, Dense
from keras.layers import ReLU, Softmax

# input layer
model_input = Input(shape=(input_shape, ))  # input 500個 Features
X = model_input

X_W1 = Dense(units=64)(X)  # 將500個Features濃縮成64個Features
H1 = ReLU()(X_W1)

# 2nd hidden layer
H1_W2 = Dense(units=64)(H1)  # 再做一次以獲得更細緻的情緒Feature
H2 = ReLU()(H1_W2)

# output layer
H2_W3 = Dense(units=output_shape)(H2)  # 將每個64Feature都歸類成一種情緒（一共6種），並回推一開始500格分別代表哪一種情緒
H3 = Softmax()(H2_W3)

model_output = H3

# create model
model = Model(inputs=[model_input], outputs=[model_output])

# loss function & optimizer
model.compile(optimizer='adam', # adam adjusts the weights automatically to minimize errors.
              loss='categorical_crossentropy', # categorical_crossentropy is used for multi-class classification.
              metrics=['accuracy']) # accuracy tells you how many predictions are correct.

# show model construction
model.summary()
# training setting
epochs = 50 # Epoch = 整個訓練資料要「重複學習」25輪
batch_size = 64 # 每次拿 32 筆資料來更新權重


history = model.fit(X_train, y_train, 
                    epochs=epochs, 
                    batch_size=batch_size,
                    validation_data = (X_test, y_test))
print('training finish')
acc,  val_acc  = history.history['accuracy'], history.history['val_accuracy']
loss, val_loss = history.history['loss'], history.history['val_loss']
epochs = range(len(acc))

plt.plot(epochs, acc, 'b', label='Training acc')
plt.plot(epochs, val_acc, 'r', label='Validation acc')

plt.title('Training and validation accuracy')
plt.legend()

plt.figure()

plt.plot(epochs, loss, 'b', label='Training loss')
plt.plot(epochs, val_loss, 'r',
 label='Validation loss')
plt.title('Training and validation loss')
plt.legend()

plt.show()
pred_result = model.predict(X_test, batch_size=128)
pred_result = label_decode(label_encoder, pred_result)
pred_result[:5]
from sklearn.metrics import accuracy_score

print('testing accuracy: {}'.format(round(accuracy_score(label_decode(label_encoder, y_test), pred_result), 2)))
## 3. Model Implementation Steps
### Add the code related to the model implementation steps in cells inside this section