In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/dm-2024-isa-5810-lab-2-homework/tweets_DM.json
/kaggle/input/dm-2024-isa-5810-lab-2-homework/sampleSubmission.csv
/kaggle/input/dm-2024-isa-5810-lab-2-homework/data_identification.csv
/kaggle/input/dm-2024-isa-5810-lab-2-homework/emotion.csv


# Data Processing

## Import Library

In [2]:
# Common used Library
import pandas as pd
import numpy as np
import nltk
# for ploting
import matplotlib.pyplot as plt
from collections import Counter

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

## Load data

In [3]:
import json
# Read data
data = []
with open('/kaggle/input/dm-2024-isa-5810-lab-2-homework/tweets_DM.json', 'r') as f:
    for line in f:
        data.append(json.loads(line))
 
f.close()
emotion = pd.read_csv('/kaggle/input/dm-2024-isa-5810-lab-2-homework/emotion.csv')
data_identification = pd.read_csv('/kaggle/input/dm-2024-isa-5810-lab-2-homework/data_identification.csv')

## Raw data cleaning

In [4]:
# Extract out the tweet_id/hashtags/text
df = pd.DataFrame(data)
_source = df['_source'].apply(lambda x: x['tweet'])
df = pd.DataFrame({
    'tweet_id': _source.apply(lambda x: x['tweet_id']),
    'hashtags': _source.apply(lambda x: x['hashtags']),
    'text': _source.apply(lambda x: x['text']),
})

In [5]:
df = df.merge(data_identification, on='tweet_id', how='left')

train_data = df[df['identification'] == 'train']
test_data = df[df['identification'] == 'test']

train_data = train_data.merge(emotion, on='tweet_id', how='left')
train_data.drop_duplicates(subset=['text'], keep=False, inplace=True)

train_data_sample = train_data.sample(frac=0.3, random_state=42)
y_train_data = train_data_sample['emotion']
x_train_data = train_data_sample['text']

X_train, X_val, y_train, y_val = train_test_split(
    x_train_data, y_train_data, test_size=0.2, random_state=42, stratify=y_train_data
)


In [6]:
import keras
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
label_encoder.fit(y_train)

def label_encode(le, labels):
    enc = le.transform(labels)
    return keras.utils.to_categorical(enc)

def label_decode(le, one_hot_label):
    dec = np.argmax(one_hot_label, axis=1)
    return le.inverse_transform(dec)

y_train = label_encode(label_encoder, y_train)
y_val = label_encode(label_encoder, y_val)

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
#import nltk
BOW_1000 = CountVectorizer(max_features=1000)
BOW_1000.fit(X_train)

X_train = BOW_1000.transform(X_train)
X_val = BOW_1000.transform(X_val)


In [8]:
print('X_train.shape: ', X_train.shape)
print('y_train.shape: ', y_train.shape)
print('X_val.shape: ', X_val.shape)
print('y_val.shape: ', y_val.shape)

X_train.shape:  (347804, 1000)
y_train.shape:  (347804, 8)
X_val.shape:  (86951, 1000)
y_val.shape:  (86951, 8)


In [9]:
# I/O check
input_shape = X_train.shape[1]
print('input_shape: ', input_shape)

output_shape = len(label_encoder.classes_)
print('output_shape: ', output_shape)

input_shape:  1000
output_shape:  8


In [10]:
from keras.models import Model
from keras.layers import Input, Dense
from keras.layers import ReLU, Softmax

# input layer
model_input = Input(shape=(input_shape, ))  # 500
X = model_input

# 1st hidden layer
X_W1 = Dense(units=64)(X)  # 64
H1 = ReLU()(X_W1)

# 2nd hidden layer
H1_W2 = Dense(units=64)(H1)  # 64
H2 = ReLU()(H1_W2)

# output layer
H2_W3 = Dense(units=output_shape)(H2)  # 4
H3 = Softmax()(H2_W3)

model_output = H3

# create model
model = Model(inputs=[model_input], outputs=[model_output])

# loss function & optimizer
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# show model construction
model.summary()

In [11]:
from keras.callbacks import CSVLogger

csv_logger = CSVLogger('/kaggle/working/training_log.csv')

# training setting
epochs = 15
batch_size = 32

# training!
history = model.fit(X_train, y_train,
                    epochs=epochs,
                    batch_size=batch_size,
                    callbacks=[csv_logger],
                    validation_data = (X_val, y_val))
print('training finish')

Epoch 1/15
[1m10869/10869[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 3ms/step - accuracy: 0.4651 - loss: 1.4783 - val_accuracy: 0.4996 - val_loss: 1.3787
Epoch 2/15
[1m10869/10869[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 3ms/step - accuracy: 0.5100 - loss: 1.3432 - val_accuracy: 0.5050 - val_loss: 1.3616
Epoch 3/15
[1m10869/10869[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 3ms/step - accuracy: 0.5235 - loss: 1.3094 - val_accuracy: 0.5031 - val_loss: 1.3621
Epoch 4/15
[1m10869/10869[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 3ms/step - accuracy: 0.5318 - loss: 1.2883 - val_accuracy: 0.5043 - val_loss: 1.3616
Epoch 5/15
[1m10869/10869[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 3ms/step - accuracy: 0.5414 - loss: 1.2657 - val_accuracy: 0.5057 - val_loss: 1.3632
Epoch 6/15
[1m10869/10869[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 3ms/step - accuracy: 0.5444 - loss: 1.2543 - val_accuracy: 0.5024 - val_loss: 1.377

In [12]:

X_test = BOW_1000.transform(test_data['text'])
print('X_test.shape: ', X_test.shape)

X_test.shape:  (411972, 1000)


In [13]:
## predict
pred_result = model.predict(X_test, batch_size=128)
print('pred_result: ', pred_result.shape)

[1m3219/3219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 3ms/step
pred_result:  (411972, 8)


In [14]:
test_data['emotion'] = label_decode(label_encoder, pred_result)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['emotion'] = label_decode(label_encoder, pred_result)


In [15]:
submission = test_data[['tweet_id', 'emotion']]
submission = submission.rename(columns={'tweet_id': 'id'})
submission.to_csv('/kaggle/working/submission.csv', index=False)

In [16]:
print("Submission file created: submission1030.csv")
submission

Submission file created: submission1030.csv


Unnamed: 0,id,emotion
2,0x28b412,joy
4,0x2de201,anticipation
9,0x218443,joy
30,0x2939d5,joy
33,0x26289a,anticipation
...,...,...
1867525,0x2913b4,anticipation
1867529,0x2a980e,anticipation
1867530,0x316b80,sadness
1867531,0x29d0cb,joy
