In [29]:
import shutil
from shutil import get_terminal_size
import os
import kagglehub
import tqdm

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import nltk

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nrclex import NRCLex
from textblob import TextBlob


from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

import joblib

In [18]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

## Data Load

In [20]:

# Location of the GoEmotions CSV. Set the GOEMOTIONS_CSV environment variable to
# point at your own copy; the default is the original author's local path, kept
# so the notebook behaves exactly as before when the variable is not set.
GOEMOTIONS_CSV = os.environ.get(
    "GOEMOTIONS_CSV",
    r"C:\Users\kakao\Desktop\Ajou_SocialNetworkAnalysis\Project\goemotion_dataset\goemotions_1.csv",
)

# Full dataset, plus an independent five-row copy for quick inspection.
goemotion_df1 = pd.read_csv(GOEMOTIONS_CSV)
test_df = goemotion_df1.head().copy()

In [9]:
# Display the five-row sample to inspect the column layout.
test_df

Unnamed: 0,text,id,author,subreddit,link_id,parent_id,created_utc,rater_id,example_very_unclear,admiration,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,That game hurt.,eew5j0j,Brdd9,nrl,t3_ajis4z,t1_eew18eq,1548381000.0,1,False,0,...,0,0,0,0,0,0,0,1,0,0
1,>sexuality shouldn’t be a grouping category I...,eemcysk,TheGreen888,unpopularopinion,t3_ai4q37,t3_ai4q37,1548084000.0,37,True,0,...,0,0,0,0,0,0,0,0,0,0
2,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,t3_abru74,t1_ed2m7g7,1546428000.0,37,False,0,...,0,0,0,0,0,0,0,0,0,1
3,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,t3_ahulml,t3_ahulml,1547965000.0,18,False,0,...,1,0,0,0,0,0,0,0,0,0
4,"[NAME] was nowhere near them, he was by the Fa...",eda6yn6,American_Fascist713,starwarsspeculation,t3_ackt2f,t1_eda65q2,1546669000.0,2,False,0,...,0,0,0,0,0,0,0,0,0,1


## Data Preprocessing

In [14]:
def _get_stop_words():
    """Return the English stop-word set, loading it from NLTK only on first use."""
    if not hasattr(_get_stop_words, "cache"):
        _get_stop_words.cache = frozenset(stopwords.words('english'))
    return _get_stop_words.cache


def _get_lemmatizer():
    """Return a shared WordNetLemmatizer, created only on first use."""
    if not hasattr(_get_lemmatizer, "cache"):
        _get_lemmatizer.cache = WordNetLemmatizer()
    return _get_lemmatizer.cache


def preprocess_text(text):
    """
    Normalize one text: lowercase, strip non-letters, tokenize, drop stop words, lemmatize.

    Args:
        text (str): Raw input text. Non-string values (e.g. None or NaN from a
            missing cell) are treated as empty text.

    Returns:
        str: The remaining lemmatized tokens joined by single spaces; an empty
        string when nothing survives the cleaning.
    """
    # Missing values and other non-strings have no text to clean.
    if not isinstance(text, str):
        return ''

    # 1. Lowercase.
    text = text.lower()

    # 2. Remove everything except ASCII lowercase letters and whitespace.
    # NOTE(review): apostrophes are removed here, before stop-word filtering, so
    # contractions such as "don't" become "dont" and may no longer match the
    # stop-word list — confirm whether that is intended.
    text = re.sub(r'[^a-z\s]', '', text)

    # Nothing left to tokenize (input was only digits/punctuation/whitespace).
    if not text.strip():
        return ''

    # 3. Tokenize.
    tokens = word_tokenize(text)

    # 4. Stop-word removal and 5. lemmatization. The stop-word set and the
    # lemmatizer are built once and reused across calls, because this function
    # is applied row by row over the whole dataset.
    stop_words = _get_stop_words()
    lemmatizer = _get_lemmatizer()
    return ' '.join(
        lemmatizer.lemmatize(word) for word in tokens if word not in stop_words
    )

In [30]:
# Fit the TF-IDF vectorizer on the raw `text` column and build the TF-IDF matrix.
# NOTE(review): this is fitted on the full dataset (before the train/test split made
# later in the notebook) and on raw rather than preprocessed text — confirm both are intended.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(goemotion_df1['text'])

# Fit a 10-component truncated SVD on the TF-IDF matrix (fixed seed for repeatability).
svd = TruncatedSVD(n_components=10, random_state=42)
svd.fit(tfidf_matrix)

In [27]:
def tfidf_svd_single(text, vectorizer, svd):
    """
    Project a single text into the reduced TF-IDF/SVD space.

    Args:
        text (str): The input text.
        vectorizer (TfidfVectorizer): An already-fitted TF-IDF vectorizer.
        svd (TruncatedSVD): An already-fitted SVD model.

    Returns:
        np.ndarray: The reduced vector as a 1D array.
    """
    # The vectorizer is given a one-element list holding the text; its output is
    # then passed straight through the SVD model.
    single_row = svd.transform(vectorizer.transform([text]))

    # Collapse the result to a flat 1D array.
    return single_row.flatten()


In [31]:
# Clean every comment with preprocess_text and keep the result next to the raw text.
# NOTE(review): the SVD features below are built from the raw `text` column, not
# from `preprocessed_text` — confirm which one is intended.
goemotion_df1["preprocessed_text"] = goemotion_df1['text'].apply(preprocess_text)

# Vectorize and reduce the whole column in one batched call instead of running a
# separate TF-IDF + SVD transform per row; each row of the result is stored as
# that comment's 1D SVD vector.
svd_matrix = svd.transform(vectorizer.transform(goemotion_df1['text']))
goemotion_df1['svd_vector'] = pd.Series(list(svd_matrix), index=goemotion_df1.index)
goemotion_df1

Unnamed: 0,text,id,author,subreddit,link_id,parent_id,created_utc,rater_id,example_very_unclear,admiration,...,optimism,pride,realization,relief,remorse,sadness,surprise,neutral,preprocessed_text,svd_vector
0,That game hurt.,eew5j0j,Brdd9,nrl,t3_ajis4z,t1_eew18eq,1.548381e+09,1,False,0,...,0,0,0,0,0,1,0,0,game hurt,"[0.08387483325477987, 0.014886744976617985, -0..."
1,>sexuality shouldn’t be a grouping category I...,eemcysk,TheGreen888,unpopularopinion,t3_ai4q37,t3_ai4q37,1.548084e+09,37,True,0,...,0,0,0,0,0,0,0,0,sexuality shouldnt grouping category make diff...,"[0.12284000948965618, -0.04418034489179539, -0..."
2,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,t3_abru74,t1_ed2m7g7,1.546428e+09,37,False,0,...,0,0,0,0,0,0,0,1,right dont care fuck em,"[0.18718530933998412, -0.2473648637472333, 0.1..."
3,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,t3_ahulml,t3_ahulml,1.547965e+09,18,False,0,...,0,0,0,0,0,0,0,0,man love reddit,"[0.056482091126149894, 0.01767376818534173, 0...."
4,"[NAME] was nowhere near them, he was by the Fa...",eda6yn6,American_Fascist713,starwarsspeculation,t3_ackt2f,t1_eda65q2,1.546669e+09,2,False,0,...,0,0,0,0,0,0,0,1,name nowhere near falcon,"[0.15079131403706952, 0.16562737596079893, 0.0..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,"It's about fucking time, hope this is real.",eeuoyeh,DudeImMacGyver,worldnews,t3_ajcbgq,t3_ajcbgq,1.548346e+09,19,False,0,...,0,0,0,0,0,0,0,0,fucking time hope real,"[0.20345612882692668, 0.03370088341390846, -0...."
69996,This is great! Can anyone make a request with ...,eer0igi,Dirkus777,gay,t3_aiqhx1,t3_aiqhx1,1.548223e+09,55,False,1,...,0,0,0,0,0,0,0,0,great anyone make request draw,"[0.19568536445339288, -0.0693234833538599, 0.0..."
69997,I’m sorry. Can you please explain what are the...,eebxspf,menjav,DebateAnAtheist,t3_ah451r,t1_eeblb39,1.547787e+09,5,False,0,...,0,0,0,0,0,0,0,0,im sorry please explain account order creation...,"[0.1880127966663004, -0.07513818197078902, 0.0..."
69998,No but it should be,edjf4v2,heputmystuffinjello,DunderMifflin,t3_adpkeq,t1_edjasoe,1.546922e+09,51,False,0,...,0,0,0,0,0,0,0,1,,"[0.2062024650040501, -0.017941736194403396, -0..."


In [32]:
# Stack the per-row SVD vectors into a single feature array.
X = np.stack(goemotion_df1['svd_vector'])
# Target: the `surprise` label column.
y = goemotion_df1['surprise']

In [33]:
# Hold out 20% of the rows for testing; fixed seed so the split is repeatable.
# NOTE(review): the split is not stratified — if `surprise` is a rare label,
# consider passing stratify=y; confirm against the label distribution.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [34]:
# Random forest trained on the SVD features to predict the `surprise` label.
# n_jobs=-1 builds the trees on all available CPU cores; the recorded run of this
# cell was interrupted by hand, and parallel training shortens the fit without
# changing the seeded result.
model = RandomForestClassifier(n_jobs=-1, random_state=42)
model.fit(X_train, y_train)

KeyboardInterrupt: 