In [1]:
import shutil
from shutil import get_terminal_size
import os
import kagglehub
import tqdm

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import nltk

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nrclex import NRCLex
from textblob import TextBlob


from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

import joblib

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

## Data Load

In [3]:

# Directory holding the three GoEmotions CSV shards (goemotions_1.csv .. goemotions_3.csv).
# Set the GOEMOTION_DIR environment variable to run the notebook on another machine;
# the default is the original author's local folder, so existing behavior is unchanged.
GOEMOTION_DIR = os.environ.get(
    "GOEMOTION_DIR",
    r"C:\Users\kakao\Desktop\Ajou_SocialNetworkAnalysis\Project\goemotion_dataset",
)

# Read the shards in order; the individual frames keep their original names.
goemotion_df1, goemotion_df2, goemotion_df3 = [
    pd.read_csv(os.path.join(GOEMOTION_DIR, f"goemotions_{shard}.csv"))
    for shard in (1, 2, 3)
]

# Five-row copy of the first shard, kept as a small scratch frame.
test_df = goemotion_df1.head().copy()

In [4]:
# Stack the three shards row-wise and renumber the rows 0..n-1 so the index is unique.
df = pd.concat(
    (goemotion_df1, goemotion_df2, goemotion_df3),
    axis="index",
    ignore_index=True,
)


In [5]:
# Sanity-check the combined frame: row count, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 211225 entries, 0 to 211224
Data columns (total 37 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   text                  211225 non-null  object 
 1   id                    211225 non-null  object 
 2   author                211225 non-null  object 
 3   subreddit             211225 non-null  object 
 4   link_id               211225 non-null  object 
 5   parent_id             211225 non-null  object 
 6   created_utc           211225 non-null  float64
 7   rater_id              211225 non-null  int64  
 8   example_very_unclear  211225 non-null  bool   
 9   admiration            211225 non-null  int64  
 10  amusement             211225 non-null  int64  
 11  anger                 211225 non-null  int64  
 12  annoyance             211225 non-null  int64  
 13  approval              211225 non-null  int64  
 14  caring                211225 non-null  int64  
 15  

In [11]:
# Keep only rows that are not flagged `example_very_unclear`, drop rows with no text,
# then renumber the surviving rows from 0.
df = (
    df.loc[df['example_very_unclear'].eq(False)]
    .dropna(subset=['text'])
    .reset_index(drop=True)
)

In [12]:
# Display the filtered frame (the notebook abbreviates it to the first and last rows).
df

Unnamed: 0,text,id,author,subreddit,link_id,parent_id,created_utc,rater_id,example_very_unclear,admiration,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,That game hurt.,eew5j0j,Brdd9,nrl,t3_ajis4z,t1_eew18eq,1.548381e+09,1,False,0,...,0,0,0,0,0,0,0,1,0,0
1,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,t3_abru74,t1_ed2m7g7,1.546428e+09,37,False,0,...,0,0,0,0,0,0,0,0,0,1
2,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,t3_ahulml,t3_ahulml,1.547965e+09,18,False,0,...,1,0,0,0,0,0,0,0,0,0
3,"[NAME] was nowhere near them, he was by the Fa...",eda6yn6,American_Fascist713,starwarsspeculation,t3_ackt2f,t1_eda65q2,1.546669e+09,2,False,0,...,0,0,0,0,0,0,0,0,0,1
4,Right? Considering it’s such an important docu...,eespn2i,ImperialBoss,TrueReddit,t3_aizyuz,t1_eesoak0,1.548280e+09,61,False,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
207809,"Well, I'm glad you're out of all that now. How...",ed89acy,pompompompi,raisedbynarcissists,t3_ac9etw,t1_ed7gt8x,1.546615e+09,2,False,0,...,0,0,0,0,0,0,0,0,0,0
207810,Everyone likes [NAME].,ee6pagw,Senshado,heroesofthestorm,t3_agjf24,t3_agjf24,1.547634e+09,16,False,0,...,1,0,0,0,0,0,0,0,0,0
207811,Well when you’ve imported about a gazillion of...,ef28nod,5inchloser,nottheonion,t3_ak26t3,t3_ak26t3,1.548553e+09,15,False,0,...,0,0,0,0,0,0,0,0,0,0
207812,That looks amazing,ee8hse1,springt1me,shittyfoodporn,t3_agrnqb,t3_agrnqb,1.547684e+09,70,False,1,...,0,0,0,0,0,0,0,0,0,0


## Data Preprocessing

In [13]:
# Lazily populated cache: the stop-word set and the lemmatizer are built on the first
# call and reused afterwards, instead of being rebuilt for every document.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the shared ``(stop_words, lemmatizer)`` pair, creating it on first use."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Normalize one English text into a space-joined string of lemmatized tokens.

    Steps, in order: lowercase, drop every character that is not ``a-z`` or
    whitespace, tokenize with NLTK's ``word_tokenize``, remove English stop
    words, and lemmatize each remaining token with ``WordNetLemmatizer``.

    Args:
        text: The raw text to clean. Must be a ``str``; missing values should be
            dropped by the caller beforehand.

    Returns:
        The cleaned tokens joined by single spaces (``''`` if nothing survives).
    """
    stop_words, lemmatizer = _get_preprocess_resources()

    # 1. Lowercase.
    text = text.lower()

    # 2. Remove punctuation, digits and any other non a-z / non-whitespace character.
    # NOTE(review): apostrophes are stripped before the stop-word filter runs, so a
    # contraction such as "don't" becomes "dont" and is kept — confirm this is intended.
    text = re.sub(r'[^a-z\s]', '', text)

    # 3. Tokenize.
    tokens = word_tokenize(text)

    # 4 + 5. Drop stop words, then lemmatize what is left.
    lemmas = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

    return ' '.join(lemmas)

In [None]:
# Fit the TF-IDF vectorizer on the raw `text` column and keep the resulting
# document-term matrix for fitting the SVD below.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['text'])

# Fit the truncated-SVD model that reduces each TF-IDF row to 10 components.
# Original note: "n_components split test" — the value 10 looks like an
# experimental setting; TODO confirm the intended number of components.
svd = TruncatedSVD(n_components=10, random_state=42)
svd.fit(tfidf_matrix)

In [15]:
def tfidf_svd_single(text, vectorizer, svd):
    """Embed a single text with an already-fitted vectorizer and SVD model.

    Args:
        text: The document to embed.
        vectorizer: Fitted object whose ``transform`` accepts a list of documents.
        svd: Fitted object whose ``transform`` accepts the vectorizer's output.

    Returns:
        The reduced representation of ``text``, flattened to a 1-D array.
    """
    # transform() works on a collection of documents, so wrap the text in a list.
    single_doc_tfidf = vectorizer.transform([text])
    return svd.transform(single_doc_tfidf).flatten()


In [16]:
# Cleaned version of every comment, produced by the NLTK pipeline in preprocess_text.
df["preprocessed_text"] = df['text'].apply(preprocess_text)

# Embed the whole column in one batched call rather than calling
# vectorizer.transform / svd.transform once per row: the values are the same,
# without the per-row call overhead.
# NOTE(review): the embedding is built from the raw `text` column (the column the
# vectorizer was fitted on), not from `preprocessed_text` — confirm this is intended.
svd_matrix = svd.transform(vectorizer.transform(df['text']))

# Store one 1-D vector per row so np.stack(df['svd_vector']) rebuilds the matrix later.
df['svd_vector'] = pd.Series(list(svd_matrix), index=df.index)
df

Unnamed: 0,text,id,author,subreddit,link_id,parent_id,created_utc,rater_id,example_very_unclear,admiration,...,optimism,pride,realization,relief,remorse,sadness,surprise,neutral,preprocessed_text,svd_vector
0,That game hurt.,eew5j0j,Brdd9,nrl,t3_ajis4z,t1_eew18eq,1.548381e+09,1,False,0,...,0,0,0,0,0,1,0,0,game hurt,"[0.0834293531215238, 0.010230818243285408, -0...."
1,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,t3_abru74,t1_ed2m7g7,1.546428e+09,37,False,0,...,0,0,0,0,0,0,0,1,right dont care fuck em,"[0.1858029389653627, -0.23254877221503895, 0.1..."
2,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,t3_ahulml,t3_ahulml,1.547965e+09,18,False,0,...,0,0,0,0,0,0,0,0,man love reddit,"[0.056189664101389085, 0.017384332733256846, 0..."
3,"[NAME] was nowhere near them, he was by the Fa...",eda6yn6,American_Fascist713,starwarsspeculation,t3_ackt2f,t1_eda65q2,1.546669e+09,2,False,0,...,0,0,0,0,0,0,0,1,name nowhere near falcon,"[0.1483880435945849, 0.1645028105966041, 0.022..."
4,Right? Considering it’s such an important docu...,eespn2i,ImperialBoss,TrueReddit,t3_aizyuz,t1_eesoak0,1.548280e+09,61,False,0,...,0,0,0,0,0,0,0,0,right considering important document know damn...,"[0.15713907827005, -0.007080768598668069, -0.0..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
207809,"Well, I'm glad you're out of all that now. How...",ed89acy,pompompompi,raisedbynarcissists,t3_ac9etw,t1_ed7gt8x,1.546615e+09,2,False,0,...,0,0,0,0,0,0,0,0,well im glad youre awful way act make think he...,"[0.268078462960357, -0.1947677552589192, 0.115..."
207810,Everyone likes [NAME].,ee6pagw,Senshado,heroesofthestorm,t3_agjf24,t3_agjf24,1.547634e+09,16,False,0,...,0,0,0,0,0,0,0,0,everyone like name,"[0.0846241148925653, 0.21719215119991175, 0.17..."
207811,Well when you’ve imported about a gazillion of...,ef28nod,5inchloser,nottheonion,t3_ak26t3,t3_ak26t3,1.548553e+09,15,False,0,...,0,0,0,0,0,0,0,0,well youve imported gazillion country get serious,"[0.14859030455633934, -0.08444960050670886, 0...."
207812,That looks amazing,ee8hse1,springt1me,shittyfoodporn,t3_agrnqb,t3_agrnqb,1.547684e+09,70,False,1,...,0,0,0,0,0,0,0,0,look amazing,"[0.09221560609224827, 0.018146999422079375, -0..."


In [32]:
# Build the model inputs from `df`, the only frame that has the `svd_vector` column
# and the `example_very_unclear` filter applied. `goemotion_df1` is the untouched raw
# first shard and has no `svd_vector` column, so indexing it would raise KeyError
# on a fresh run.
X = np.stack(df['svd_vector'])  # one row of SVD components per comment
y = df['surprise']              # label column for the "surprise" emotion

In [33]:
# Hold out 20% of the rows for evaluation; random_state=42 makes the split repeatable.
# NOTE(review): the split is not stratified on y — confirm that is acceptable if the
# positive class is rare.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [35]:
# Train a random-forest classifier on the training split. Only the seed is set;
# every other hyperparameter is left at the library default.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)