In [1]:
# 패키지 불러오기
import pandas as pd
import numpy as np

In [2]:
# train 데이터 읽기
df = pd.read_csv("./data/train.csv")

In [3]:
# 데이터프레임 상위 5행 출력
df.head()

Unnamed: 0,essay_id,full_text,score
0,000d118,Many people have car where they live. The thin...,3
1,000fe60,I am a scientist at NASA that is discussing th...,3
2,001ab80,People always wish they had the same technolog...,4
3,001bdc0,"We all heard about Venus, the planet without a...",4
4,002ba53,"Dear, State Senator\n\nThis is a letter to arg...",3


In [4]:
# 데이터프레임 크기
df.shape

(17307, 3)

In [5]:
# score(점수) 점수 개수 확인
df['score'].value_counts()

3    6280
2    4723
4    3926
1    1252
5     970
6     156
Name: score, dtype: int64

In [6]:
# 필요없는 id 컬럼 삭제
df.drop('essay_id', axis = 1, inplace = True)

In [7]:
# 문자열 길이 확인
df['full_text'].str.len().describe()

count    17307.000000
mean      2071.617265
std        925.910701
min        712.000000
25%       1397.000000
50%       1924.000000
75%       2541.000000
max      20459.000000
Name: full_text, dtype: float64

In [8]:
# 문자열 전처리를 위한 패키지 로드
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import nltk, re, contractions
from nltk.corpus import stopwords as stw

2024-04-26 21:43:50.118580: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-26 21:43:50.118603: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-26 21:43:50.119237: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-04-26 21:43:50.122858: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [9]:
# 불용어 다운로드
nltk.download("stopwords")
stopwords = list(stw.words("english"))

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/yjg1005/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
# 약어, 이메일주소, 태그문등 불필요한 데이터 제거
reviews = []

for i in df['full_text'].values:
    # 불용어 처리
    source = " ".join([w for w in i.split() if w not in stopwords])

    # 약어 처리
    source = contractions.fix(source)

    # email 제거
    source = re.sub(
            r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b", "", source
        )

    # html 태그 제거
    source = re.sub(r"<[^>]*>", "", source)

    # url 제거
    source = re.sub(
            r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+",
            "",
            source,
        )

    # 숫자 제거
    source = re.sub(r"\b[0-9]+\b", "", source)

    # 특수문자 제거
    x = re.sub(r"[^\w ]+", "", source)
    source = " ".join(x.split())
    
    reviews.append(source)

In [11]:
df['pp_text'] = reviews
df.head()

Unnamed: 0,full_text,score,pp_text
0,Many people have car where they live. The thin...,3,Many people car live The thing know use car al...
1,I am a scientist at NASA that is discussing th...,3,I scientist NASA discussing face mars I explai...
2,People always wish they had the same technolog...,4,People always wish technology seen movies best...
3,"We all heard about Venus, the planet without a...",4,We heard Venus planet without almost oxygen ea...
4,"Dear, State Senator\n\nThis is a letter to arg...",3,Dear State Senator This letter argue favor kee...


In [12]:
X = df['pp_text']
Y = df['score']

In [13]:
# 텍스트 토큰화
tokenizer = Tokenizer(oov_token = '<OOV>')
tokenizer.fit_on_texts(X)
print(f"전체 단어 수 : {len(tokenizer.word_index) + 1}")

전체 단어 수 : 75336


In [14]:
token_set = tokenizer.texts_to_sequences(X)
print("토큰의 크기 :", len(token_set))

토큰의 크기 : 17307


In [15]:
# 단어 빈도 확인
bins = tokenizer.word_counts.items()
pd.DataFrame(bins, columns = ['word', 'counts']).sort_values('counts', ascending = False).head(10)

Unnamed: 0,word,counts
208,would,46107
4,the,44989
92,cars,43603
1,people,38898
2,car,37527
97,i,35177
402,venus,34968
194,could,30035
10,like,24165
137,it,23200


In [16]:
# 사용 빈도가 높다고 판단할 등장 회수
threshold = 30

total_cnt = len(tokenizer.word_index)
rare_cnt = 0
total_freq = 0
rare_freq = 0

for key, value in tokenizer.word_counts.items():
    total_freq = total_freq + value
    if value < threshold:
        rare_cnt = rare_cnt + 1
        rare_freq = rare_freq + value

print("단어 집합(vocabulary)의 크기 :", total_cnt)
print(f"등장 빈도가 {threshold}번 미만인 희귀 단어의 수: {rare_cnt}")
print("단어 집합에서 희귀 단어의 비율:", (rare_cnt / total_cnt) * 100)
print("전체 등장 빈도에서 희귀 단어 등장 빈도 비율:", (rare_freq / total_freq) * 100)

# 자주 등장하는 단어 집합의 크기
vocab_size = total_cnt - rare_cnt + 2
print("단어 집합의 크기 :", vocab_size)

단어 집합(vocabulary)의 크기 : 75335
등장 빈도가 30번 미만인 희귀 단어의 수: 70029
단어 집합에서 희귀 단어의 비율: 92.9567929913055
전체 등장 빈도에서 희귀 단어 등장 빈도 비율: 5.193031112076686
단어 집합의 크기 : 5308


In [17]:
# 30회 이상 등장하는 단어만 이용하여 재토큰화 진행
tokenizer2 = Tokenizer(oov_token = '<OOV>', num_words = vocab_size)
tokenizer2.fit_on_texts(X)
print(f"전체 단어수: {len(tokenizer2.word_index) + 1}")

token_set = tokenizer2.texts_to_sequences(X)
print('토큰의 크기 :', len(token_set))

전체 단어수: 75336
토큰의 크기 : 17307


In [18]:
# 문장별로 몇개의 단어를 포함하고 있는지 확인
word_counts = []

for s in token_set:
    word_counts.append(len(s))

count_df = pd.DataFrame({"count": word_counts})
count_df.head()

Unnamed: 0,count
0,256
1,161
2,289
3,266
4,206


In [19]:
# 가장 많은 단어를 사용한 리뷰와 가장 적은 단어를 사용한 리뷰
max_word_count = max(word_counts)
min_word_count = min(word_counts)
print("가장 많은 단어를 사용하는 문장의 단어 수 :", max_word_count)
print("가장 적은 단어를 사용하는 문장의 단어수 :", min_word_count)

가장 많은 단어를 사용하는 문장의 단어 수 : 984
가장 적은 단어를 사용하는 문장의 단어수 : 62


In [20]:
# 패딩
max_word_count = max(word_counts)
# max_word_count = 200
pad_token_set = pad_sequences(token_set, maxlen=max_word_count)

In [21]:
# 데이터 분할
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(pad_token_set, Y, test_size = 0.2, random_state = 0)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(13845, 984) (3462, 984) (13845,) (3462,)


In [22]:
# 케라스 학습을 위한 패키지 로드
import keras
from keras.layers import LSTM, GRU, Dense, Embedding, Dropout
from keras.optimizers import Adam, Nadam, RMSprop
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from keras.models import Sequential

In [23]:
# 종속변수 원핫인코딩
# from sklearn.preprocessing import OneHotEncoder
# oh = OneHotEncoder(sparse_output = False)
# y_train_oh = oh.fit_transform(y_train.to_numpy().reshape(-1, 1))
# y_test_oh = oh.fit_transform(y_test.to_numpy().reshape(-1, 1))
# y_train_oh

In [24]:
class WeightedKappa(keras.metrics.Metric):
    def __init__(self, num_classes=6, epsilon=1e-6):
        super().__init__(name="weighted_kappa")
        self.num_classes = num_classes
        self.epsilon = epsilon

        label_vec = keras.ops.arange(num_classes, dtype=keras.backend.floatx())
        self.row_label_vec = keras.ops.reshape(label_vec, [1, num_classes])
        self.col_label_vec = keras.ops.reshape(label_vec, [num_classes, 1])
        col_mat = keras.ops.tile(self.col_label_vec, [1, num_classes])
        row_mat = keras.ops.tile(self.row_label_vec, [num_classes, 1])
        self.weight_mat = (col_mat - row_mat) ** 2

        self.numerator = self.add_weight(name="numerator", initializer="zeros")
        self.denominator = self.add_weight(name="denominator", initializer="zeros")

    def update_state(self, y_true, y_pred, **args):
        # revert ordinal regression labels to classification labels
        y_true = keras.ops.one_hot(keras.ops.sum(y_true, axis=-1) - 1, 6)
        y_pred = keras.ops.one_hot(
            keras.ops.sum(keras.ops.cast(y_pred > 0.5, dtype="int8"), axis=-1) - 1, 6
        )
        # weighted kappa calculation
        y_true = keras.ops.cast(y_true, dtype=self.col_label_vec.dtype)
        y_pred = keras.ops.cast(y_pred, dtype=self.weight_mat.dtype)
        batch_size = keras.ops.shape(y_true)[0]

        cat_labels = keras.ops.matmul(y_true, self.col_label_vec)
        cat_label_mat = keras.ops.tile(cat_labels, [1, self.num_classes])
        row_label_mat = keras.ops.tile(self.row_label_vec, [batch_size, 1])

        weight = (cat_label_mat - row_label_mat) ** 2

        self.numerator.assign_add(keras.ops.sum(weight * y_pred))
        label_dist = keras.ops.sum(y_true, axis=0, keepdims=True)
        pred_dist = keras.ops.sum(y_pred, axis=0, keepdims=True)
        w_pred_dist = keras.ops.matmul(
            self.weight_mat, keras.ops.transpose(pred_dist, [1, 0])
        )
        self.denominator.assign_add(
            keras.ops.sum(keras.ops.matmul(label_dist, w_pred_dist))
        )

    def result(self):
        return 1.0 - keras.ops.divide_no_nan(self.numerator, self.denominator)

    def reset_state(self):
        self.numerator.assign(0)
        self.denominator.assign(0)

In [25]:
# LSTM - 회귀모델
model = Sequential([
    Embedding(input_dim = vocab_size, output_dim = 64),
    LSTM(units = 64, return_sequences = True, dropout = 0.2, recurrent_dropout = 0.2),
    LSTM(32),
    Dense(1, activation = 'linear')
])

model.compile(optimizer = Adam(), loss = 'mae', metrics = ['mse', WeightedKappa()])

2024-04-26 21:44:03.686201: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1929] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 22433 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3090, pci bus id: 0000:08:00.0, compute capability: 8.6
2024-04-26 21:44:03.918989: I external/local_tsl/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory


In [26]:
history = model.fit(x_train, y_train, validation_data = (x_test, y_test), epochs = 1000, callbacks = [
    EarlyStopping(patience = 8, restore_best_weights = True),
    ReduceLROnPlateau(patience = 4, factor = 0.1, min_lr = 1e-06)
])

Epoch 1/1000


2024-04-26 21:44:06.777610: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8902


[1m433/433[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m286s[0m 654ms/step - loss: 1.0421 - mse: 2.0381 - weighted_kappa: 0.9694 - val_loss: 0.8055 - val_mse: 1.1210 - val_weighted_kappa: 0.9687 - learning_rate: 0.0010
Epoch 2/1000
[1m433/433[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m284s[0m 657ms/step - loss: 0.7848 - mse: 1.0811 - weighted_kappa: 0.9687 - val_loss: 0.7986 - val_mse: 1.1078 - val_weighted_kappa: 0.9687 - learning_rate: 0.0010
Epoch 3/1000
[1m433/433[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m283s[0m 655ms/step - loss: 0.7729 - mse: 0.9981 - weighted_kappa: 0.9687 - val_loss: 0.7904 - val_mse: 1.0248 - val_weighted_kappa: 0.9687 - learning_rate: 0.0010
Epoch 4/1000
[1m433/433[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m283s[0m 655ms/step - loss: 0.6601 - mse: 0.7394 - weighted_kappa: 0.9687 - val_loss: 0.5818 - val_mse: 0.6411 - val_weighted_kappa: 0.9687 - learning_rate: 0.0010
Epoch 5/1000
[1m433/433[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

In [27]:
test = pd.read_csv("./data/test.csv")

In [28]:
test.head()

Unnamed: 0,essay_id,full_text
0,000d118,Many people have car where they live. The thin...
1,000fe60,I am a scientist at NASA that is discussing th...
2,001ab80,People always wish they had the same technolog...


In [29]:
# 약어, 이메일주소, 태그문등 불필요한 데이터 제거
test_reviews = []

for i in test['full_text'].values:
    # 불용어 처리
    source = " ".join([w for w in i.split() if w not in stopwords])

    # 약어 처리
    source = contractions.fix(source)

    # email 제거
    source = re.sub(
            r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b", "", source
        )

    # html 태그 제거
    source = re.sub(r"<[^>]*>", "", source)

    # url 제거
    source = re.sub(
            r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+",
            "",
            source,
        )

    # 숫자 제거
    source = re.sub(r"\b[0-9]+\b", "", source)

    # 특수문자 제거
    x = re.sub(r"[^\w ]+", "", source)
    source = " ".join(x.split())
    
    test_reviews.append(source)

In [30]:
test['pp_text'] = test_reviews
test.head()

Unnamed: 0,essay_id,full_text,pp_text
0,000d118,Many people have car where they live. The thin...,Many people car live The thing know use car al...
1,000fe60,I am a scientist at NASA that is discussing th...,I scientist NASA discussing face mars I explai...
2,001ab80,People always wish they had the same technolog...,People always wish technology seen movies best...


In [31]:
test_X = test['pp_text']

In [32]:
test_token_set = tokenizer2.texts_to_sequences(test_X)

In [33]:
# 패딩
# max_word_count = 200
test_pad_token_set = pad_sequences(test_token_set, maxlen=max_word_count)

In [36]:
pred = model.predict(test_pad_token_set)
pred

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 170ms/step


array([[2.9913626],
       [3.0068352],
       [4.0231557]], dtype=float32)

In [48]:
submit = pd.DataFrame()
submit['essay_id'] = test['essay_id']
submit['score'] = pred.flatten()

In [51]:
submit.to_csv("./submission.csv", index = False)