In [120]:
import pickle
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Board name -> path of the pickle holding that board's records.
fileList = {
    "freeboard": "pickles/freeboard_db.pkl",
    "humordata": "pickles/humordata_6047_db.pkl",
    "star": "pickles/star_db.pkl",
    "sisa": "pickles/sisa_db.pkl",
    "best": "pickles/humorbest_db.pkl",
    "bestofbest": "pickles/bestofbest_db.pkl",
}

# One DataFrame per board, indexed by contentID.
# NOTE(review): pickle.load can execute arbitrary code; only load files you trust.
dfs = {}
for board, pkl_path in fileList.items():
    with open(pkl_path, "rb") as fh:
        records = pickle.load(fh)
    dfs[board] = pd.DataFrame.from_dict(records).set_index('contentID')



In [116]:
# Show the board names that have a pickle path configured.
fileList.keys()

dict_keys(['freeboard', 'humordata', 'best', 'bestofbest', 'sisa', 'star'])

In [134]:
# Stack the per-board frames into a single table.
stacked = pd.concat(list(dfs.values()), axis=0)

# Keep only rows whose writerVisitingCount does not contain '-'.
# na=True makes rows with a missing value count as "contains", so they are
# dropped as well.
has_dash = stacked['writerVisitingCount'].str.contains('-', na=True)
df1 = stacked[~has_dash]

# Remove duplicate articles: keep the first row seen for each contentID.
is_repeat = df1.index.duplicated(keep='first')
df = df1[~is_repeat].copy()

In [135]:
# Row index of the combined frame after the '-' filter, before de-duplication.
df1.index

Index(['freeboard_1377351', 'freeboard_1309879', 'freeboard_1331187',
       'freeboard_1255886', 'freeboard_1240142', 'freeboard_1346887',
       'freeboard_1345544', 'freeboard_1256458', 'freeboard_1249492',
       'freeboard_1341092',
       ...
       'star_343818', 'star_359453', 'star_357915', 'star_369784',
       'star_375886', 'star_351841', 'star_384592', 'star_385072',
       'star_388428', 'star_379388'],
      dtype='object', name='contentID', length=47983)

In [136]:
# Row index after repeated contentIDs have been dropped.
df.index

Index(['freeboard_1377351', 'freeboard_1309879', 'freeboard_1331187',
       'freeboard_1255886', 'freeboard_1240142', 'freeboard_1346887',
       'freeboard_1345544', 'freeboard_1256458', 'freeboard_1249492',
       'freeboard_1341092',
       ...
       'star_377869', 'star_351447', 'star_355807', 'star_359453',
       'star_357915', 'star_369784', 'star_351841', 'star_385072',
       'star_388428', 'star_379388'],
      dtype='object', name='contentID', length=45471)

In [137]:
from datetime import datetime
import operator
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model
from scipy.stats.stats import pearsonr
from operator import truediv
from sklearn import preprocessing
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

# Reference list of column names; nothing in this cell reads it.
Index = ['contentID', 'writerName', 'writerSignInDate', 'writerVisitingCount',
       'recommendCount', 'viewCount', 'memoCount', 'postTime',
       'normalPostCount', 'bestPostCount', 'BOBPostCount', 'board', 'title',
       'changjakOption', 'permOption', 'prohibitBestOption',
       'prohibitBOBOption', 'prohibitBoninOption', 'prohibitOutsidePermOption',
       'imgCount', 'videoCount', 'youtubeCount', 'textLineCount', 'bestTime',
       'BoBTime', 'okTime', 'memoTime', 'okIn3600s', 'bad', 'good', 'rcmd',
       'memoIn3600s']


def _count_within(offsets, limit=900):
    """Return how many values in `offsets` are less than or equal to `limit`."""
    return sum(1 for value in offsets if value <= limit)


def _split_votes(text):
    """Parse a recommendCount string into integer (good, bad) counts.

    'G/B' yields (G, B); a value without '/' is read as G with zero bad votes.
    Raises ValueError if a part is not an integer literal.
    """
    pieces = text.split('/')
    good = int(pieces[0])
    bad = int(pieces[1]) if len(pieces) > 1 else 0
    return good, bad


def _time_zone(hour):
    """Map an hour of day (str or int) to 'morning', 'noon', 'evening' or 'night'."""
    hour = int(hour)
    if 3 <= hour < 9:
        return "morning"
    if 9 <= hour < 15:
        return "noon"
    if 15 <= hour < 21:
        return "evening"
    return "night"


# Clean up dtypes: make sure the count columns are numeric.
for col in ['writerVisitingCount', 'viewCount', 'memoCount', 'normalPostCount', 'bestPostCount', 'BOBPostCount']:
    df[col] = pd.to_numeric(df[col])


######### Add new features
# Share of the writer's posts that reached BOB.
# The previous version also built an unused `X_BOBratio` list from
# `X_BOBPostCount` / `X_normalPostCount`, which are not defined in the visible
# notebook and raised NameError on a fresh kernel; the ratio lives in this column.
df['BOBratio'] = df.BOBPostCount / df.normalPostCount
# BOB label: True wherever BoBTime is anything other than 0.
df['isBOB'] = df.BoBTime.ne(0)
# Recommendations and comments with a time value of at most 900
# (presumably seconds after posting, i.e. the first 15 minutes - TODO confirm).
df['okIn900s'] = df.okTime.apply(_count_within)
df['memoIn900s'] = df.memoTime.apply(_count_within)
# Split recommendCount into integer up-votes, down-votes and their difference,
# so the three columns have a single numeric dtype instead of mixed str/int.
# recommendCount itself is kept.
votes = [_split_votes(text) for text in df.recommendCount]
df['bad'] = [bad for _, bad in votes]
df['good'] = [good for good, _ in votes]
df['rcmd'] = df['good'] - df['bad']
# Bucket the posting hour into four time-of-day zones and one-hot encode them.
df['postHour'] = df.postTime.apply(lambda stamp: stamp.split(' ')[1].split(':')[0])
df['timeZone'] = df.postHour.apply(_time_zone)
df_timeZone = pd.get_dummies(df['timeZone'])
# Drop dummy columns left over from an earlier run so that re-running the cell
# does not append a second copy of them.
df = pd.concat([df.drop(list(df_timeZone.columns), axis=1, errors='ignore'), df_timeZone], axis=1)

######## Normalisation: standardise each numeric feature column in place.
# NOTE(review): the scaler is fit on the whole frame and raw columns are
# overwritten, so re-running this cell rescales already-scaled values; restart
# from the de-duplication cell when re-running.
scaler = preprocessing.StandardScaler()

normalizing_cols = ['okIn900s', 'BOBratio', 'imgCount', 'youtubeCount', 'videoCount', 'writerVisitingCount', 'memoIn900s',
                    'normalPostCount', 'bestPostCount', 'BOBPostCount', 'textLineCount',
                    'viewCount', 'memoCount']
integerize_cols = ['isBOB', 'changjakOption', 'permOption', 'prohibitBestOption', 'prohibitBOBOption', 'prohibitBoninOption', 'prohibitOutsidePermOption']

for col in normalizing_cols:
    # StandardScaler expects a 2-D input, so pass a one-column frame and
    # flatten the result back to one dimension.
    df[col] = scaler.fit_transform(df[[col]]).ravel()

for col in integerize_cols:
    # 0 for values equal to False, 1 for everything else.
    df[col] = df[col].ne(False).astype('int64')

In [138]:
# Keep only the first occurrence of every column label, then list the dtypes.
first_occurrence = ~df.columns.duplicated()
df = df.loc[:, first_occurrence]
df.dtypes

writerName                    object
writerSignInDate              object
writerVisitingCount          float64
recommendCount                object
viewCount                    float64
memoCount                    float64
postTime                      object
normalPostCount              float64
bestPostCount                float64
BOBPostCount                 float64
board                         object
title                         object
changjakOption                 int64
permOption                     int64
prohibitBestOption             int64
prohibitBOBOption              int64
prohibitBoninOption            int64
prohibitOutsidePermOption      int64
imgCount                     float64
videoCount                   float64
youtubeCount                 float64
textLineCount                float64
bestTime                      object
BoBTime                       object
okTime                        object
memoTime                      object
BOBratio                     float64
i

In [139]:
# Inspect the row index of the processed frame.
df.index

Index(['freeboard_1377351', 'freeboard_1309879', 'freeboard_1331187',
       'freeboard_1255886', 'freeboard_1240142', 'freeboard_1346887',
       'freeboard_1345544', 'freeboard_1256458', 'freeboard_1249492',
       'freeboard_1341092',
       ...
       'star_377869', 'star_351447', 'star_355807', 'star_359453',
       'star_357915', 'star_369784', 'star_351841', 'star_385072',
       'star_388428', 'star_379388'],
      dtype='object', name='contentID', length=45471)

In [140]:
# Persist the processed frame to 'df_test.pkl' in the working directory.
df.to_pickle('df_test.pkl')