In [35]:
# import modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys, re, json
from konlpy.tag import Okt 
from collections import Counter 

import warnings
warnings.filterwarnings('ignore')

# Load the raw review data from the CSV file.
source = pd.read_csv('반려동물_5.csv')

# Load the Korean stop-word list as a flat list of strings.
# - header=None keeps the first line of the file as data instead of
#   consuming it as a column header.
# - Selecting the first column flattens the table; `.values.tolist()` on the
#   whole frame would produce one-element lists, and a string is never equal
#   to a list, so `word in stopwords` could never match.
stopwords = (
    pd.read_csv(
        "https://raw.githubusercontent.com/yoonkt200/FastCampusDataset/master/korean_stopwords.txt",
        header=None,
    )
    .iloc[:, 0]
    .dropna()
    .astype(str)
    .str.strip()
    .tolist()
)

# Load the sentiment lexicon. The context manager closes the file handle as
# soon as the JSON has been parsed.
with open('SentiWord_info.json', encoding='utf-8-sig', mode='r') as sentiwordsfile:
    sentiwords = json.load(sentiwordsfile)

# Morphological analyzer used for noun extraction.
okt = Okt()

#define functions
def applyRegularExpression(text):
    """Return `text` with every non-Hangul, non-space character removed.

    The negated character class keeps spaces, Hangul compatibility jamo
    (ㄱ-ㅣ) and precomposed Hangul syllables (가-힣); anything else is
    replaced with the empty string.
    """
    return re.sub('[^ ㄱ-ㅣ 가-힣]', '', text)

def countWords(_data, range=None):
    """Count noun frequencies over all reviews in `_data['text']`.

    Parameters
    ----------
    _data : DataFrame-like with a 'text' column
        Missing values in the column are dropped before processing.
    range : int or None, optional
        Maximum number of filtered nouns (in order of appearance) to count.
        None counts every noun.

    Returns
    -------
    collections.Counter
        Mapping of noun -> number of occurrences.
    """
    reviews = _data['text'].dropna(axis=0)

    # Join with a space so the last word of one review cannot fuse with the
    # first word of the next; astype(str) guards against non-string cells.
    corpus = " ".join(reviews.astype(str))

    # Strip everything except Hangul/spaces, then extract nouns.
    nouns = okt.nouns(applyRegularExpression(corpus))

    # Keep nouns longer than one character that are not stop words.
    no_stop_words = [x for x in nouns if len(x) > 1 and x not in stopwords]

    # Slicing with None keeps the whole list.
    return Counter(no_stop_words[0:range])

def countWordsByReview(reviews, range=None):
    """Count items across the first `range` entries of `reviews`.

    Each entry is fed to a single Counter, so an entry that is a list of
    words contributes word counts and an entry that is a string contributes
    character counts (the same per-entry behavior as `Counter(entry)`).

    Parameters
    ----------
    reviews : sliceable sequence of iterables
    range : int or None, optional
        Number of leading entries to process; None processes all of them.

    Returns
    -------
    collections.Counter
        Combined counts; empty when there is nothing to process.
    """
    if range is None:
        range = len(reviews)

    # Accumulate into one Counter so every processed review contributes and
    # an empty input still returns a valid (empty) result.
    result = Counter()
    for row in reviews[0:range]:
        result.update(row)
    return result

def getScoresFromReviews(rawData, range=None):
    """Compute a sentiment score for each review and return it as a new column.

    Parameters
    ----------
    rawData : DataFrame with a 'text' column
    range : int or None, optional
        Number of leading rows of `rawData` to consider; None uses all rows.

    Returns
    -------
    DataFrame
        The selected rows, minus any row containing a missing value in any
        column, with an added integer 'score' column holding the sum of the
        per-word scores of that review's filtered nouns.
    """
    if range is None:
        range = len(rawData)

    # Take the first `range` rows, then drop rows that contain missing values.
    data = rawData[0:range].dropna(axis=0)

    # Each getScoreByWord call scans the sentiment lexicon, so remember the
    # result per distinct word instead of rescanning for every occurrence.
    word_scores = {}
    scores = []

    # Each `row` is the text of one review.
    for row in data['text']:
        # No-op for a plain string; kept so a row that is a sequence of
        # strings still collapses into one string.
        corpus = "".join(row)

        # Strip everything except Hangul/spaces, then extract nouns.
        nouns = okt.nouns(applyRegularExpression(corpus))

        # The score starts from zero for every review.
        score = 0
        for word in nouns:
            # Skip one-character nouns and stop words.
            if len(word) <= 1 or word in stopwords:
                continue
            if word not in word_scores:
                word_scores[word] = int(getScoreByWord(word))
            score += word_scores[word]

        scores.append(score)

    data['score'] = scores

    return data
    
       # if(score != 0):
        # 0 아니면 로그찍기
        #전체리뷰보기
        #print(row + '...', score)
        # 앞 10글자만 보기
       # print(row[0:10] + '...', score)

def getScoreByWord(wordname):
    """Look up the polarity score of `wordname` in the sentiment lexicon.

    Every entry of `sentiwords` is checked with `wordname in entry['word_root']`
    (presumably `word_root` is a string, which makes this a substring test —
    confirm against the lexicon file). If several entries match, the polarity
    of the last matching entry is returned; if none match, the score is 0.
    """
    # Convert the polarity of every matching entry, in lexicon order.
    matched = [
        int(entry['polarity'])
        for entry in sentiwords
        if wordname in entry['word_root']
    ]
    # The last match wins; no match means a neutral score.
    return matched[-1] if matched else 0

In [32]:
# countWords(rawData, 200).most_common(10)    
# finalreviewscore=getScoresFromReviews(rawData, 100)

In [36]:
# Score every review in `source` and write the returned frame (including its
# 'score' column) to ./example.csv.
getScoresFromReviews(source).to_csv('./example.csv')

In [5]:
# Disabled cell: the bag-of-words experiment below is wrapped in a string
# literal, so none of it executes; the cell only echoes the string.
# NOTE(review): `cleaned` is not defined anywhere in the visible notebook —
# confirm where it should come from before re-enabling this code.
'''
from sklearn.feature_extraction.text import CountVectorizer

def text_cleaning(text):
    hangul = re.compile('[^ ㄱ-ㅣ 가-힣]')  # 정규 표현식 처리
    result = hangul.sub('', text)
    okt = Okt()  # 형태소 추출
    nouns = okt.nouns(result)
    nouns = [x for x in nouns if len(x) > 1]  # 한글자 키워드 제거
    nouns = [x for x in nouns if x not in stopwords]  # 불용어 제거
    return nouns

vect = CountVectorizer(tokenizer = lambda x: text_cleaning(x))
bow_vect = vect.fit_transform(cleaned.tolist())
word_list = vect.get_feature_names()
count_list = bow_vect.toarray().sum(axis=0)
'''

"\nfrom sklearn.feature_extraction.text import CountVectorizer\n\ndef text_cleaning(text):\n    hangul = re.compile('[^ ㄱ-ㅣ 가-힣]')  # 정규 표현식 처리\n    result = hangul.sub('', text)\n    okt = Okt()  # 형태소 추출\n    nouns = okt.nouns(result)\n    nouns = [x for x in nouns if len(x) > 1]  # 한글자 키워드 제거\n    nouns = [x for x in nouns if x not in stopwords]  # 불용어 제거\n    return nouns\n\nvect = CountVectorizer(tokenizer = lambda x: text_cleaning(x))\nbow_vect = vect.fit_transform(cleaned.tolist())\nword_list = vect.get_feature_names()\ncount_list = bow_vect.toarray().sum(axis=0)\n"

In [6]:
# Disabled cell: the TF-IDF step below is wrapped in a string literal, so it
# does not execute; the cell only echoes the string.
# NOTE(review): it reads `bow_vect`, which in the visible notebook is only
# assigned inside another string-literal (disabled) cell.
'''from sklearn.feature_extraction.text import TfidfTransformer

#단어의 중요도 찾아내는 알고리즘
tfidf_vectorizer = TfidfTransformer()
tf_idf_vect = tfidf_vectorizer.fit_transform(bow_vect)
'''

'from sklearn.feature_extraction.text import TfidfTransformer\n\n#단어의 중요도 찾아내는 알고리즘\ntfidf_vectorizer = TfidfTransformer()\ntf_idf_vect = tfidf_vectorizer.fit_transform(bow_vect)\n'

In [7]:
#print(tf_idf_vect.shape)