In [1]:
import os
import sqlite3
import pandas as pd
import tqdm
import pickle

In [2]:
db_name = '[Full_TFIDF]Credibility_research_20180906.db' #DB 파일명
# db 생성
con = sqlite3.connect( db_name )
cur = con.cursor()

In [3]:
sql = '''
select p.Post_id,p.Category,p.Title,p.Text,u.Credibility,group_concat(img.Img_id) as img_count
    FROM Post as p
    Left JOIN user as u
        ON p.User_id = u.User_id
    Left JOIN Img as img
        ON p.Post_id = img.Post_id
    GROUP BY p.Post_id;
    '''
cur.execute(sql)
rows = cur.fetchall()

In [4]:
col_name = [i[0] for i in cur.description]

In [5]:
test_df = pd.DataFrame(rows)

In [6]:
test_df.columns = col_name

In [7]:
test_df.columns.tolist()

['Post_id', 'Category', 'Title', 'Text', 'Credibility', 'img_count']

In [8]:
test_df = test_df[test_df['Category'] != "무작위"].iloc[:16304]

In [9]:
# test_df = test_df[test_df['Credibility'] == 1]

In [10]:
test_df.to_csv('text_practice.csv',header=None)

In [11]:
test_df = test_df.dropna()

In [12]:
test_df['img_count'] = test_df['img_count'].apply(lambda x : 0 if type(x)==type(None) else x)
test_df['img_count'] = test_df['img_count'].apply(lambda x : len(x.split(',')) if x != 0 else x)
test_df['Text_len'] = test_df['Text'].apply(lambda x : len(x))

## 18개 Category to Numeric labeling

In [13]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(test_df['Category'])
le.classes_
test_df['Category'] =le.transform(test_df['Category'])
test_df = test_df.dropna()

In [14]:
Category = le.classes_
le.classes_

array(['IT·컴퓨터', '건강·의학', '공연·전시', '교육·학문', '국내여행', '드라마·방송', '등산·낚시·레저',
       '만화·애니', '맛집', '사진', '스포츠', '시사·인문·경제', '어학·외국어', '와인·술', '육아·결혼',
       '자동차', '차·커피·디저트', '패션·뷰티'], dtype=object)

In [15]:
from sklearn.utils import shuffle
test_df = shuffle(test_df)

In [16]:
Title = test_df['Title']
Text = test_df['Text']
y = test_df['Category']

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
#vectorizer = TfidfVectorizer(analyzer='word', sublinear_tf=True,lowercase=True)
vectorizer_Text = TfidfVectorizer(max_features=2000)
tfidf_Text = vectorizer_Text.fit_transform(Text)

In [18]:
pickle.dump(vectorizer_Text,open("tfidf_text_.pkl","wb"))

In [19]:
v_load = pickle.load(open("tfidf_text.pkl", "rb"))

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
#vectorizer = TfidfVectorizer(analyzer='word', sublinear_tf=True,lowercase=True)
vectorizer_Title = TfidfVectorizer(max_features=1000)
tfidf_Title = vectorizer_Title.fit_transform(Title)

In [23]:
vectorizer_Title.get_feature_names()

['01',
 '02',
 '04',
 '05',
 '06',
 '07',
 '08',
 '10',
 '100',
 '10월',
 '11',
 '12',
 '13',
 '14',
 '14년',
 '15',
 '16',
 '17',
 '18',
 '18년',
 '19',
 '1기',
 '1위',
 '1편',
 '1화',
 '20',
 '200mm',
 '2011',
 '2013',
 '2014',
 '2015',
 '2015년',
 '2016',
 '2016년',
 '2017',
 '2017년',
 '2018',
 '2018년',
 '2019',
 '22',
 '24',
 '24시',
 '24시간',
 '25',
 '26',
 '27',
 '28',
 '2기',
 '2편',
 '2화',
 '30',
 '365일',
 '3화',
 '4분기',
 '4월',
 '4차산업혁명',
 '5월',
 '5화',
 '6월',
 '70',
 '7월',
 '8월',
 'a100엔터테인먼트',
 'abonz',
 'all',
 'amg',
 'and',
 'angling',
 'aslan',
 'at',
 'bmw',
 'by',
 'ckcolor',
 'color',
 'comodo',
 'cos',
 'day',
 'de',
 'detailing',
 'dslr',
 'ef',
 'effects',
 'f1',
 'feat',
 'for',
 'gre',
 'hyundai',
 'ii',
 'in',
 'is',
 'it',
 'kbl',
 'lg',
 'live',
 'my',
 'new',
 'nikon',
 'no',
 'nx',
 'of',
 'ost',
 'paris',
 'part',
 'ready',
 'sports',
 'sports카페에서',
 'spring',
 'ssl',
 'story',
 'talk',
 'the',
 'to',
 'today',
 'tv',
 'up',
 'vol',
 'vs',
 'wash',
 'with',
 'wk',
 'wnba',

In [24]:
pickle.dump(vectorizer_Title,open("tfidf_title_.pkl","wb"))

In [25]:
v_load = pickle.load(open("tfidf_title_.pkl", "rb"))

In [None]:
v_load.transform

In [41]:
Text_len = test_df['Text_len'].reset_index(drop=True)
Img_cnt = test_df['img_count'].reset_index(drop=True)

In [42]:
Text_result = pd.DataFrame(tfidf_Text.toarray())
Title_result = pd.DataFrame(tfidf_Title.toarray())

In [43]:
x = pd.concat([Text_result,Title_result],axis=1)
y = y.reset_index(drop=True)

In [44]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x , y , test_size=0.1 , random_state=42)

In [45]:
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((13621, 3000), (13621,), (1514, 3000), (1514,))

# LogisticRegression

In [48]:
from sklearn import linear_model
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import classification_report
logreg = linear_model.LogisticRegression(C=2.0,random_state=42,solver='sag',multi_class='multinomial',warm_start=True)
logreg.fit(x_train, y_train)

LogisticRegression(C=2.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=42, solver='sag',
          tol=0.0001, verbose=0, warm_start=True)

In [49]:
import pickle
# model_save
model_name = 'Category_model.pkl'
pickle.dump(logreg, open(model_name, 'wb'))
# model_load
logreg = pickle.load(open(model_name, 'rb'))
accuracy_score(logreg.predict(x_test), y_test)
import pickle
logreg = pickle.load(open(model_name, 'rb'))
accuracy_score(logreg.predict(x_test), y_test)

0.8117569352708058

In [50]:
from sklearn.metrics import precision_score
precision_score(logreg.predict(x_test), y_test,average=None)

array([0.85869565, 0.86466165, 0.81318681, 0.83739837, 0.57142857,
       0.79365079, 0.30434783, 0.76363636, 0.92857143, 0.84393064,
       0.69333333, 0.609375  , 0.66666667, 0.61538462, 0.59259259,
       0.82608696, 0.87323944, 0.93142857])

In [51]:
from sklearn.metrics import classification_report
print(classification_report(y_test, logreg.predict(x_test)))

             precision    recall  f1-score   support

          0       0.79      0.86      0.82        92
          1       0.86      0.86      0.86       133
          2       0.83      0.81      0.82        91
          3       0.90      0.84      0.87       123
          4       0.72      0.57      0.64        63
          5       0.91      0.79      0.85        63
          6       0.64      0.30      0.41        23
          7       0.82      0.76      0.79        55
          8       0.84      0.93      0.88       154
          9       0.67      0.84      0.74       173
         10       0.68      0.69      0.69        75
         11       0.66      0.61      0.63        64
         12       0.86      0.67      0.75        27
         13       0.89      0.62      0.73        13
         14       0.84      0.59      0.70        27
         15       0.94      0.83      0.88        92
         16       0.94      0.87      0.91        71
         17       0.86      0.93      0.89   

# 실제 맛집 Test

In [52]:
class textclass:
    def Extract_structure_and_tag(User_id, Post_id):
        url = "http://blog.naver.com/PostView.nhn?blogId=" + User_id + "&logNo=" + Post_id + "&redirect=Dlog&widgetTypeCall=true"
        r = requests.get(url)
        bs = BeautifulSoup(re.sub('&nbsp;', ' ', r.text).encode("utf-8"), "html.parser")
        # structure
        structure = bs.find("div", {"id": "postViewArea"})
        if structure == None:
            structure = bs.find("div", {"class", "se_component_wrap sect_dsc __se_component_area"})

        structure_p_img_tag = structure.find_all(['p', 'img'])
        structure_dict = {'structure': structure, 'structure_p_img_tag': structure_p_img_tag}
        # structure_p_img_tag : p,img tag만 extract
        # structure : 모든 tag 가져오기
        return structure_dict

    # Extract_structure_and_tag 함수의 'structure_p_img_tag' 값을 가져와야함.

    def HTML_preprocessing(structure_p_img_tag):
        # only tag & text extract
        tag_list = []
        text_list = []
        for i in structure_p_img_tag:
            # p_tag만 불러오기
            if "<p" in (str(i)):
                tag_list.append('<p>')
                # img만 있을 때

                if '<img' in str(i):
                    for j in i:
                        try:
                            if len(j.text) > 1:
                                tag_list.append('<br>')
                                text_list.append(j.text)
                        except:
                            pass

                # img가 아닌 경우 span tag가 더 있을 때
                elif '<span' in str(i):
                    for j in i:
                        if '<br' in str(j):
                            text_list.append(j.text)
                            # br_tag가 2개 이상 있을 때

                            if len(j.findAll('br')) > 2:
                                for _ in range(0, len(j.findAll('br'))):
                                    tag_list.append('<br>')

                            # br_tag가 1개 있을 때
                            else:
                                tag_list.append('<br>')

                        # span은 있지만 br tag가 없을 때
                        else:
                            try:
                                text_list.append(j.text)
                            except:
                                pass

                # 그냥 p_tag만 있을 때 br_tag 추가
                else:
                    # 글이 있을 때
                    if len(i.text) > 1:
                        text_list.append(i.text)

                    # 글 없이 br tag만 있을 때
                    else:
                        tag_list.append('<br>')
                        text_list.append(i.text)

                # P_tag 끝맽음
                tag_list.append('</p>')

            else:
                tag_list.append('<img>')

        text_list = list(map(remove_odd, text_list))
        filter_text = list(filter(lambda x: len(x) > 1, text_list))

        Text = " ".join(list(filter(lambda x: len(x) > 1, map(lambda x: x.strip(), text_list))))
        Text = re.sub('\n', '', Text)
        Text = re.sub('\t', '', Text)
        Space_text = " ".join(Spacing_text(filter_text))
        Count_space_mistake = len(Space_text) - len(Text)

        # only tag
        Structure_only_tag = "|".join(tag_list)
        Structure_only_tag_df = pd.DataFrame({'text': [Structure_only_tag]})
        array_temp = Structure_only_tag_df['text'].apply(
            lambda x: " img ".join(list(map(lambda x: 'text' if len(x) > 3 else '', x.split('<img>')))).strip().replace(
                '  ', ' ')).values
        refined_structure = ''.join(array_temp)

        HTML_preprocessing = {'Text': Text, 'refined_structure': refined_structure,
                              'Count_space_mistake': Count_space_mistake}

        return HTML_preprocessing

    def sentimental_analysis(text):
        pos_word_list = []
        neg_word_list = []
        
        pos_ratio = 0.000000001
        neg_ratio = 0.000000001
        subjectivity = 0.000000001
        polarity = 0.000000001
        senti_diffs_per_ref = 0.000000001

        if text == '':
            sentiment_dict = {'pos_ratio': pos_ratio, 'neg_ratio': neg_ratio, 'subjectivity': subjectivity,
                              'polarity': polarity, 'senti_diffs_per_ref': senti_diffs_per_ref}
            return sentiment_dict, pos_word_list, neg_word_list
        else:
            pos = 0
            neg = 0
            text = text.split(' ')
            n = len(text)
            for i in text:
                i = remove_odd(i)
                pre = kkma.pos(i)
                test = ';'.join(['/'.join(i) for i in pre])
                if test in word_list:
                    if label[word_list.index(test)] == 'POS':
                        pos += 1
                        pos_word_list.append(test)
                    elif label[word_list.index(test)] == 'NEG':
                        neg += 1
                        neg_word_list.append(test)
            try:
                pos_ratio = pos / n
            except:
                pass
            try:
                neg_ratio = neg / n
            except:
                pass
            try:
                subjectivity = (neg + pos) / n
            except:
                pass
            try:
                polarity = (neg - pos) / (neg + pos)
            except:
                pass
            try:
                senti_diffs_per_ref = (pos - neg) / n
            except:
                pass

            sentiment_dict = {'pos_ratio': pos_ratio, 'neg_ratio': neg_ratio, 'subjectivity': subjectivity,
                              'polarity': polarity, 'senti_diffs_per_ref': senti_diffs_per_ref}
            return sentiment_dict, pos_word_list, neg_word_list

    def check_First_second(Text):
        first_person = ['나/NP', '저/NP', '내/NP', '제/NP', '저희/NP', '우리/NP']
        second_person = ['너/NP', '자네/NP', '당신/NP', '그대/NP', '그쪽/NP', '너희/NP', '자기/NP']
        First = 0
        Second = 0
        if Text == '':
            check_First_second_dict = {'First': First, 'Second': Second}
            return check_First_second_dict
        else:
            text = kkma.pos(Text)
            for i in text:
                temp = "/".join(i)
                if temp in first_person:
                    First += len(temp.split('/')[0])
                if temp in second_person:
                    Second += len(temp.split('/')[0])
            check_First_second_dict = {'First_ratio': First/len(Text), 'Second_ratio': Second/len(Text)}
            return check_First_second_dict

In [53]:
def Spacing_text(text_list):
    spacing_list = []
    for i in text_list:
        if len(i) < 197:
            spacing_list.append(spacing(i))
        else:
            iteration = int(len(i) / 197)
            mod = len(i) % 197
            start = 0
            end = 197
            check = 0
            while True:
                # 시행횟수 < 몫
                if check < iteration:
                    spacing_list.append(spacing(i[start:end]))
                    start += 197
                    end += 197
                    check += 1
                else:
                    # 마지막 횟수 + 나머지 더 slice
                    spacing_list.append(spacing(i[iteration * 197:(iteration * 197) + mod]))
                    break
    return spacing_list

def remove_odd(x):
    x = re.sub("nbsp", " ", x)
    x = re.sub("\xa0", "", x)
    x = re.sub("\u200b", "", x)
    x = re.sub("\n", "", x)
    x = re.sub("\t", "", x)
    x = re.sub('   ', ' ', x)
    return x

def tfidf_vectorizer(Text):
    try:
        return v_load.transform([Text]).toarray().flatten()
    except:
        return ''

In [54]:
# ex1 https://blog.naver.com/you-n-mi?Redirect=Log&logNo=221352751222
# ex2 https://blog.naver.com/sky_sea11?Redirect=Log&logNo=221216249472
# ex3 https://blog.naver.com/jhforever48/221154182850
# ex4 https://blog.naver.com/soundbross?Redirect=Log&logNo=221403690848

In [None]:
User_id = ['you-n-mi','sky_sea11','jhforever48','soundbross','senti54','0105114a']
Post_id = ['221352751222','221216249472','221154182850','221403690848','221401308842','221269157353']

In [56]:
from bs4 import BeautifulSoup 
import re
import requests
from selenium import webdriver
from pykospacing import spacing
import pandas as pd
from konlpy.tag import Kkma
import numpy as np
import pickle

Using TensorFlow backend.


Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead


In [None]:
text_list = []
title_list = []
for User_id,Post_id in list(zip(User_id, Post_id)):
    url = "http://blog.naver.com/PostView.nhn?blogId=" + User_id + "&logNo=" + Post_id + "&redirect=Dlog&widgetTypeCall=true"
    mobile_url = "http://m.blog.naver.com/PostView.nhn?blogId="+ User_id
    opening_url = 'http://blog.naver.com/profile/intro.nhn?blogId='+ User_id
    structure = textclass.Extract_structure_and_tag(User_id,Post_id)
    all_tag = structure['structure']
    p_img_tag = structure['structure_p_img_tag']
    HTML_preprocessing = textclass.HTML_preprocessing(p_img_tag)
    text = HTML_preprocessing['Text']
    text_list.append(text)

In [None]:
url = "http://blog.naver.com/PostView.nhn?blogId=" + User_id + "&logNo=" + Post_id + "&redirect=Dlog&widgetTypeCall=true"
r = requests.get(url)
bs = BeautifulSoup(re.sub('&nbsp;', ' ', r.text).encode("utf-8"), "html.parser")

# Title

In [None]:
title = bs.find("h3", {"class": "se_textarea"})

# 스마트에디터3 타이틀 제거 임시 적용 (클래스가 다름)
if (title == None):
    title = post.find("span", {"class": "pcol1 itemSubjectBoldfont"})
if (title != None):
    title = title.text.strip()
else:
    title = "TITLE ERROR"

In [None]:
title = title

In [None]:
text = text_list[-1]

In [None]:
#Tfidf_model로 교체
text_vec = vectorizer_Text.transform([text])
title_vec = vectorizer_Title.transform([title])
text_vec = pd.DataFrame(text_vec.toarray())
title_vec = pd.DataFrame(title_vec.toarray())
x = pd.concat([text_vec,title_vec],axis=1)

In [None]:
#prediction class
Category[logreg.predict(x)[0]]

In [57]:
User_id = 'newpark314'
Post_id = '221387605004'

url = "http://blog.naver.com/PostView.nhn?blogId=" + User_id + "&logNo=" + Post_id + "&redirect=Dlog&widgetTypeCall=true"
mobile_url = "http://m.blog.naver.com/PostView.nhn?blogId="+ User_id
opening_url = 'http://blog.naver.com/profile/intro.nhn?blogId='+ User_id
structure = textclass.Extract_structure_and_tag(User_id,Post_id)
all_tag = structure['structure']
p_img_tag = structure['structure_p_img_tag']
HTML_preprocessing = textclass.HTML_preprocessing(p_img_tag)
text = HTML_preprocessing['Text']

In [68]:
vectorizer_Text = pickle.load(open('tfidf_text.pkl', 'rb'))

In [69]:
vectorizer_Text

<15135x2000 sparse matrix of type '<class 'numpy.float64'>'
	with 972880 stored elements in Compressed Sparse Row format>