## 구글 드라이브 연결

In [35]:
import pandas as pd
from google.colab import drive

drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


## 형태소분석기 설치

In [None]:
!apt-get update
!apt-get install g++ openjdk-8-jdk
!pip install JPype1
!pip install rhinoMorph

## 경로 변경

In [37]:
%cd /content/gdrive/MyDrive/pytest/

/content/gdrive/MyDrive/pytest


In [38]:
!ls

네이버뉴스-생활문화_다중  iris.csv	pytest_position.png  test.csv
aclImdb_v1_small	  kor-eng	ratings_morphed.txt  김소월시.txt
alice.png		  negative.txt	ratings_small.txt    윤동주시.txt
fra-eng			  positive.txt	similarity	     wiki_test.txt


## 데이터 로딩

In [39]:
def read_data(filename, encoding='cp949'): # 읽기 함수 정의
    with open(filename, 'r', encoding=encoding) as f:
        data = [line.split('\t') for line in f.read().splitlines()]
        data = data[1:] # txt 파일의 헤더(id document label)는 제외하기
    return data
def write_data(data, filename, encoding='cp949'): # 쓰기 함수도 정의
    with open(filename, 'w', encoding=encoding) as f:
        f.write(data)

data = read_data('ratings_small.txt') 

In [40]:
import rhinoMorph
rn = rhinoMorph.startRhino()
morphed_data = ''
for data_each in data:
    morphed_data_each = rhinoMorph.onlyMorph_list(rn, data_each[1],
        pos=['NNG', 'NNP', 'VV', 'VA', 'XR', 'IC', 'MM', 'MAG', 'MAJ'])
    joined_data_each = ' '.join(morphed_data_each) # 문자열을 하나로 연결
    if joined_data_each: # 내용이 있는 경우만 저장하게 함
        morphed_data += data_each[0]+"\t"+joined_data_each+"\t"+data_each[2]+"\n"
# 형태소 분석된 파일 저장
write_data(morphed_data, 'ratings_morphed.txt', encoding='cp949')

filepath:  /usr/local/lib/python3.7/dist-packages
classpath:  /usr/local/lib/python3.7/dist-packages/rhinoMorph/lib/rhino.jar
JVM is already started~
RHINO started!


## 데이터 확인

In [41]:
data = read_data('ratings_morphed.txt' , encoding='cp949')
print(len(data))
print(len(data[0])) 
print(data[0])

494
3
['8132799', '디자인 배우 학생 외국 디자이너 일구 전통 통하 발전 문화 산업 부럽 사실 우리나라 그 어렵 시절 끝 열정 지키 노라노 같 전통 있 같 사람 꿈 꾸 이루 나가 있 감사', '1']


## 감정사전 읽기

In [42]:
data_id = [line[0] for line in data]
data_text = [line[1] for line in data]
data_senti = [line[2] for line in data]
print(data_id)
print(data_text)
print(data_senti)

positive = read_data('positive.txt')
negative = read_data('negative.txt')
print(positive)
print(negative)

pos_found = []
neg_found = []

['8132799', '4655635', '9251303', '10067386', '2190435', '9279041', '7865729', '7477618', '9250537', '9730759', '640794', '9537008', '4911311', '6686673', '9034036', '979683', '165498', '8703997', '9468781', '5185638', '10221267', '486781', '7776793', '9694764', '10232169', '8515083', '9758264', '9281669', '7517650', '4339983', '10225238', '7295706', '5161286', '8906022', '8323152', '10124911', '9852390', '1421412', '6809191', '5717439', '8824828', '5247378', '8719068', '1111967', '10142929', '10050104', '10082134', '6825477', '10272889', '8715095', '87993', '3454102', '8667840', '8619006', '1923466', '9985813', '10151722', '1516658', '8255656', '4469803', '6212210', '1364440', '9049119', '2549508', '3017342', '6784667', '9611384', '10092383', '5784847', '9421346', '9814476', '9334812', '8377531', '2736470', '9637948', '1987338', '9668399', '9247479', '8499726', '8620641', '7360402', '9384295', '8550276', '9339372', '8095702', '3812917', '8418558', '7933171', '9097219', '9984439', '460

## 감정단어 파악

In [54]:
def cntWordInLine(data, senti):
    senti_found = []
    for onedata in data:
        oneline_word = onedata.split(' ')
        senti_temp = 0
        for sentiword in senti:
            if sentiword[0] in oneline_word:
                senti_temp += 1
        senti_found.append(senti_temp)
    return senti_found
data_senti_poscnt = cntWordInLine(data_text, positive)
data_senti_negnet = cntWordInLine(data_text, negative)

print(data_senti_poscnt[:20])
print(data_senti_negnet[:20])

[5, 1, 0, 0, 2, 1, 0, 0, 0, 1, 1, 1, 0, 1, 2, 0, 1, 0, 1, 0]
[1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1]


## 감정점수 계산

In [59]:
import pandas as pd
newdata = pd.DataFrame({'id':data_id, 'text': data_text, 'original':data_senti,
                        'pos': data_senti_poscnt, 'neg':data_senti_negnet})
senti_score = newdata['pos'] - newdata['neg']
newdata['senti_score'] = senti_score

newdata.loc[newdata.senti_score > 0, 'new'] = 1
newdata.loc[newdata.senti_score <= 0, 'new'] = 0

newdata.info()

# 처음에 기록된 긍부정과 새로 계산된 긍부정이 같은지 여부를 matched 컬럼에 저장
# original 컬럼은 문자로 되어 있으므로 숫자로 변환 뒤 비교
newdata.loc[pd.to_numeric(newdata.original) == newdata.new, 'matched'] = 'True'
newdata.loc[pd.to_numeric(newdata.original) != newdata.new, 'matched'] = 'False'

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 494 entries, 0 to 493
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           494 non-null    object 
 1   text         494 non-null    object 
 2   original     494 non-null    object 
 3   pos          494 non-null    int64  
 4   neg          494 non-null    int64  
 5   senti_score  494 non-null    int64  
 6   new          494 non-null    float64
dtypes: float64(1), int64(3), object(3)
memory usage: 27.1+ KB


## 원점수와 비교 및 저장

In [61]:
score = newdata.matched.str.count('True').sum() / (newdata.matched.str.count('True').sum() 
+newdata.matched.str.count('False').sum()) * 100
print(score)
newdata.to_csv('newfile.csv', sep=',', encoding='cp949', index=False) # csv 저장
newdata.to_csv('newfile2.txt', sep='\t', encoding='cp949', index=False) # 또는 txt 저장

65.58704453441295


## 시그모이드 점수 계산

In [62]:
import math
def sigmoid(x):
    return 1 / (1 + math.exp(-x))
newdata['sigmoid'] = newdata.senti_score.apply(sigmoid)

## 결과 확인

In [63]:
newdata.head()

Unnamed: 0,id,text,original,pos,neg,senti_score,new,matched,sigmoid
0,8132799,디자인 배우 학생 외국 디자이너 일구 전통 통하 발전 문화 산업 부럽 사실 우리나라...,1,5,1,4,1.0,True,0.982014
1,4655635,폴리스스토리 시리즈 뉴 없 최고,1,1,0,1,1.0,True,0.731059
2,9251303,와 연기 진짜 쩔 지루 생각하 몰입 그래 이런 진짜 영화,1,0,1,-1,0.0,False,0.268941
3,10067386,안개 자욱 하 밤하늘 뜨 초승달 같 영화,1,0,0,0,0.0,False,0.5
4,2190435,사랑 해보 사람 처음 끝 웃 있 영화,1,2,0,2,1.0,True,0.880797
