In [6]:
import re
from konlpy.tag import Okt, Komoran
import pandas as pd
from tqdm import tqdm
import os


# Patterns are compiled once at import time and written as raw strings so the
# regex escapes (\. \w \d \s) are not interpreted as Python string escapes.
_EMAIL_RE = re.compile(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9.-]+')
_URL_RE = re.compile(r'(?:http|ftp|https)://(?:[-\w.]|(?:%[\da-fA-F]{2}))+')
# Only strings that start like a real tag name are treated as HTML, so text
# such as "<<중요>>" or "3 < 5 ... 7 > 2" does not lose the words in between.
_HTML_TAG_RE = re.compile(r'</?[a-zA-Z][^<>]*>')
_WHITESPACE_RE = re.compile(r'\s')
# Complete Hangul syllables only. The wider range "ㄱ-힣" also covers the CJK
# ideograph blocks that lie between the jamo and the syllables.
_NON_HANGUL_RE = re.compile(r'[^ 가-힣]+')


def clean_text(text):
    """Reduce *text* to Hangul syllables separated by spaces.

    Steps, in order:
      1. remove e-mail addresses,
      2. remove http/https/ftp URLs (scheme and host part),
      3. remove HTML tags, including their attributes,
      4. turn every whitespace character (newline, tab, ...) into one space,
      5. delete everything that is neither a space nor a Hangul syllable
         (isolated jamo such as "ㅋㅋ", digits, Latin letters, hanja,
         punctuation and other symbols).

    Tags and whitespace are handled before step 5 because step 5 deletes the
    "<", ">" and newline characters those steps need to see. Separate
    punctuation-stripping passes are unnecessary: step 5 already removes
    every such character.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string. Runs of spaces are not collapsed and the result
        is not stripped.
    """
    text = _EMAIL_RE.sub('', text)
    text = _URL_RE.sub('', text)
    text = _HTML_TAG_RE.sub('', text)
    # Must run before the Hangul filter, otherwise words on adjacent lines
    # are glued together when the newline is deleted.
    text = _WHITESPACE_RE.sub(' ', text)
    return _NON_HANGUL_RE.sub('', text)

def clean_stopword(d):
    """Drop stopwords and short tokens from a whitespace-separated string.

    A token is kept only when it is not in the module-level ``stopwords``
    collection and is longer than three characters.

    Args:
        d: Text whose tokens are separated by whitespace.

    Returns:
        The surviving tokens joined by single spaces.
    """
    kept = []
    for token in d.split():
        if token in stopwords:
            continue
        if len(token) <= 3:
            continue
        kept.append(token)
    return ' '.join(kept)

# Symbols that get surrounded by spaces. The literal lists several symbols more
# than once; dict.fromkeys() keeps the first occurrence of each so that every
# symbol is padded exactly once.
_PUNCT = ''.join(dict.fromkeys(
    "/-'?!.,#$%'()*+-/:;<=>@[\\]^_`{|}~" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\\×™√²—–&'
))

# Typographic or exotic symbols and the plain replacement used for each.
_PUNCT_MAPPING = {
    "‘": "'", "₹": "e", "´": "'", "°": "", "€": "e", "™": "tm",
    "√": " sqrt ", "×": "x", "²": "2", "—": "-", "–": "-", "’": "'",
    "_": "-", "`": "'", '“': '"', '”': '"', "£": "e",
    '∞': 'infinity', 'θ': 'theta', '÷': '/', 'α': 'alpha', '•': '.',
    'à': 'a', '−': '-', 'β': 'beta', '∅': '', '³': '3', 'π': 'pi',
}

# Substrings replaced after the padding step.
_SPECIALS = {'\u200b': ' ', '…': ' ... ', '\ufeff': '', 'करना': '', 'है': ''}


def preprocessiong(text):
    """Normalise symbols in *text* and put spaces around punctuation.

    The function works in three passes:
      1. replace typographic/exotic symbols with plain equivalents
         (``_PUNCT_MAPPING``),
      2. surround every punctuation symbol in ``_PUNCT`` with one space on
         each side,
      3. replace the substrings listed in ``_SPECIALS`` (zero-width space,
         BOM, ellipsis and two Devanagari words).

    Args:
        text: The string to normalise.

    Returns:
        The normalised string with leading and trailing whitespace removed.
    """
    for symbol, replacement in _PUNCT_MAPPING.items():
        text = text.replace(symbol, replacement)

    for symbol in _PUNCT:
        text = text.replace(symbol, f' {symbol} ')

    # Runs last, so the dots produced for an ellipsis are not padded again.
    for token, replacement in _SPECIALS.items():
        text = text.replace(token, replacement)

    return text.strip()

# Create the KoNLPy analyzers once at module level so the functions below can
# reuse the same instances on every call.
okt = Okt()  # not referenced by any code visible in this cell
komoran = Komoran()  # used by extract_nouns_from_sentence

def extract_nouns_from_sentence(sentence):
    """Return the nouns found in *sentence* by the module-level ``komoran``.

    The sentence is stripped, reduced to alphanumeric and whitespace
    characters, and lower-cased before it is handed to ``komoran.nouns``.

    Args:
        sentence: The text to analyse.

    Returns:
        The list returned by ``komoran.nouns``, or an empty list when nothing
        but whitespace is left after filtering.
    """
    # Drop every character that is neither alphanumeric nor whitespace.
    sentence = ''.join(
        ch for ch in sentence.strip() if ch.isalnum() or ch.isspace()
    )
    sentence = sentence.lower()

    # Nothing left to analyse: skip the analyzer call entirely.
    if not sentence.strip():
        return []

    return komoran.nouns(sentence)

def extract_nouns_from_comments(comments):
    """Extract nouns from every comment, keeping one result per input.

    Each string comment is passed through ``clean_text``, ``preprocessiong``
    and ``extract_nouns_from_sentence``.

    Args:
        comments: An iterable of comments (for example a DataFrame column).

    Returns:
        A list with one entry per comment, in input order: the extracted
        nouns joined by single spaces, or ``None`` when no noun was found or
        the comment is not a string.
    """
    nouns_list = []
    for comment in comments:
        # Empty CSV cells are read by pandas as NaN (a float); the regex
        # based cleaning would raise TypeError on them.
        if not isinstance(comment, str):
            nouns_list.append(None)
            continue

        nouns = extract_nouns_from_sentence(preprocessiong(clean_text(comment)))
        nouns_list.append(' '.join(nouns) if nouns else None)
    return nouns_list

# Tokens that clean_stopword() should drop; empty unless filled in by hand.
stopwords = []

# The output file sits next to the input file and carries a "_noun" suffix.
uploaded_file_path = "crawl_Janssen_extract_Sentiment_pos_data.csv"
output_file_path = f"{os.path.splitext(uploaded_file_path)[0]}_noun.csv"

data = pd.read_csv(uploaded_file_path)

# One entry per comment: space-joined nouns, or None when none were found.
nouns_list = extract_nouns_from_comments(data['댓글 내용'])
data['추출된 명사'] = nouns_list

# Only the extracted-noun column is written to disk.
nouns_data = pd.DataFrame(nouns_list, columns=['추출된 명사'])
nouns_data.to_csv(output_file_path, index=False, encoding='utf-8-sig')

print("명사 추출이 완료되었습니다.")


명사 추출이 완료되었습니다.


In [None]:
import pandas as pd
import re

# Load the CSV file.
df = pd.read_csv('vaccine_extract_Sentiment_noun_vaccine.csv')

# Count word-level keywords per vaccine: keyword -> {vaccine -> occurrences}.
keyword_counts = {}
for vaccine, nouns in zip(df['백신'], df['추출된 명사']):
    # Empty cells are read back as NaN; str(NaN) is "nan", which would be
    # counted as a keyword, so rows without nouns are skipped.
    if pd.isna(nouns):
        continue
    for keyword in re.findall(r'\b\w+\b', str(nouns)):
        # Single-character keywords are ignored.
        if len(keyword) <= 1:
            continue
        per_vaccine = keyword_counts.setdefault(keyword, {})
        per_vaccine[vaccine] = per_vaccine.get(vaccine, 0) + 1

# Save as a new CSV file: one row per keyword, one column per vaccine, with 0
# where a keyword never occurred for a vaccine.
result = (
    pd.DataFrame(keyword_counts)
    .fillna(0)
    .T
    .reset_index()
    .rename(columns={'index': '키워드'})
)
result.to_csv('keyword_counts.csv', index=False, encoding='utf-8-sig')