In [1]:
# Module imports
import pandas as pd
import numpy as np 
import os 
import re
from tqdm import tqdm
import warnings
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings('ignore')
# Set before any tokenizer is created; presumably this turns off the Hugging Face
# tokenizers' internal parallelism (and its fork warning) — TODO confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Register tqdm's pandas integration so Series.progress_apply (used later) exists.
tqdm.pandas()

In [2]:
# Download the required model and build the sentiment-analysis pipeline.
# The checkpoint id is written once so the tokenizer and the model cannot drift apart.
_checkpoint = "cardiffnlp/twitter-roberta-base-sentiment-latest"

tokenizer = AutoTokenizer.from_pretrained(_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(_checkpoint)

# top_k=None is passed so the pipeline reports a score per label rather than only
# the best one (labelling() below reads three entries) — verify against the
# transformers version in use.
twitter_nlp = pipeline(
    'sentiment-analysis',
    model=model,
    tokenizer=tokenizer,
    top_k=None,
)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
# Helper function definitions
# Tweet text preprocessing function
def preprocess(text):
    """Normalise a tweet before it is passed to the sentiment model.

    The input is coerced with ``str()``, newlines become single spaces, and the
    text is split on the space character. Each token is then masked:

    * a token starting with ``@`` and longer than one character becomes ``@user``
    * a token starting with ``http`` becomes ``http``
    * anything else is kept unchanged

    Args:
        text: Raw tweet content. Any object is accepted because it is passed
            through ``str()`` first (so ``None`` becomes ``"None"``).

    Returns:
        str: The masked tokens joined back with single spaces. Runs of spaces
        in the input are preserved because empty tokens are kept.
    """
    def _mask(token):
        # Mentions are anonymised; a lone "@" is left alone.
        if token.startswith('@') and len(token) > 1:
            return '@user'
        # Any URL-looking token collapses to the bare placeholder.
        if token.startswith('http'):
            return 'http'
        return token

    single_line = str(text).replace('\n', ' ')
    return " ".join(_mask(piece) for piece in single_line.split(" "))

# Twitter sentiment analysis wrapper with exception handling
def sentiment_analysis(text):
    """Run the module-level ``twitter_nlp`` pipeline on one text, best-effort.

    Args:
        text: The (already preprocessed) tweet text to classify.

    Returns:
        Whatever ``twitter_nlp(text)`` returns, or ``None`` when the call raises.
        The exception is printed, not re-raised, so one failing text does not
        abort a long batch run.
    """
    try:
        outcome = twitter_nlp(text)
    except Exception as e:
        # Report the failure and signal it to the caller with None.
        print(e)
        return None
    else:
        return outcome
    
# Function that extracts the per-label scores and top label from a pipeline result
def labelling(full_result):
    """Flatten one pipeline result into per-label scores plus the top label.

    Args:
        full_result: A nested sequence whose first element holds at least three
            dicts, each with a ``'label'`` and a ``'score'`` key. Only the first
            three dicts are read.

    Returns:
        list: ``[positive_score, neutral_score, negative_score, max_label]``.
        A score stays ``0`` if its label never appears. Any label other than
        ``'positive'`` or ``'neutral'`` is counted as the negative score.
        ``max_label`` is the label of the last of the three entries whose score
        equals the highest of the three collected scores.
    """
    candidates = [full_result[0][pos] for pos in range(3)]

    collected = {'positive': 0, 'neutral': 0, 'negative': 0}
    for entry in candidates:
        tag = entry['label']
        # Everything that is not positive/neutral lands in the negative slot.
        slot = tag if tag in ('positive', 'neutral') else 'negative'
        collected[slot] = entry['score']

    top_score = max(collected['positive'], collected['neutral'], collected['negative'])

    # No break on purpose: when scores tie, the later entry wins.
    for entry in candidates:
        if entry['score'] == top_score:
            max_label = entry['label']

    return [collected['positive'], collected['neutral'], collected['negative'], max_label]

In [4]:
# File path configuration.
# Set the directory to read the input files from (file_path) and the directory
# to write results to (save_path) to match your own environment.
# NOTE(review): both currently point at the same directory, and the processing
# loop writes each result under the same file name it read.
file_path = '/Users/parkjunhyeong/Desktop/박준형/02. 대내 및 대외활동/01. 대내활동/03. Biz&AI 랩/02. 소스/01. 데이터/03. 트위터 데이터/02. Selected Data'
save_path = '/Users/parkjunhyeong/Desktop/박준형/02. 대내 및 대외활동/01. 대내활동/03. Biz&AI 랩/02. 소스/01. 데이터/03. 트위터 데이터/02. Selected Data'

In [13]:
# Company file list.
# Each file takes a long time (roughly 6-7 hours per file, per the author), so
# running only one or two files per execution is recommended.
# Example below.
short_list = ['JBLU.csv']

In [None]:
# Run sentiment analysis on each file in short_list (meant for files with fewer
# than ~500k rows). For every file: read it, preprocess the tweet text, score it
# with the model, expand the result into columns, and write a CSV to save_path.
for s in short_list:
    df = pd.read_csv(os.path.join(file_path, s), index_col = 0, lineterminator = '\n')
    df['clean_content'] = df['content'].apply(preprocess)
    df['sentiment'] = df['clean_content'].progress_apply(sentiment_analysis)
    # sentiment_analysis returns None when the model call fails, so dropna removes
    # those rows. It is kept unrestricted (all columns) on purpose: rows with a
    # missing value elsewhere, e.g. a NaN content that preprocess turned into the
    # text "nan", are discarded too.
    df = df.dropna().reset_index(drop = True)
    # Build the positive / neutral / negative / top-label columns in one pass
    # instead of four scalar .loc writes per row. Passing explicit column names
    # also keeps the columns present when no row survived, so the steps below do
    # not raise KeyError on an empty frame.
    labelled = pd.DataFrame(
        [labelling(result) for result in df['sentiment']],
        columns = ['positive', 'neutral', 'negative', 'sentiment'],
        index = df.index,
    )
    for score_col in ['positive', 'neutral', 'negative']:
        df[score_col] = labelled[score_col].astype(float)
    # Replace the raw pipeline output with the winning label.
    df['sentiment'] = labelled['sentiment']
    # Sentiment score column: positive probability minus negative probability.
    df['sentiment_score'] = df['positive'] - df['negative']
    df = df[['datetime', 'username', 'content', 'clean_content', 'sentiment', 'sentiment_score', 'positive', 'neutral', 'negative']]
    # NOTE: if save_path is the same directory as file_path, this overwrites the input file.
    df.to_csv(os.path.join(save_path, s))