In [None]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
plt.rcParams['font.family'] = 'NanumGothic'
# mpl.font_manager.fontManager.ttflist

from glob import glob
from collections import Counter

import konlpy
from konlpy.tag import Mecab

import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

def show_table(df):
    """Print a short summary of ``df`` and render its first and last rows.

    Prints the frame's shape and the total number of missing cells, then
    passes ``df.head(3)`` and ``df.tail(3)`` to ``display`` in that order.
    """
    print('>>> shape :', df.shape)
    n_missing = df.isna().sum().sum()  # per-column NA counts, summed over all columns
    print('>>> number of NA :', n_missing)
    for preview in (df.head(3), df.tail(3)):
        display(preview)


# Root directory under which the notebook looks for its data files.
data_dir = '/home/jovyan/WIDE_HOME/'

# Print the library versions the notebook was originally written with, followed by
# the versions installed in the current kernel, so version drift is easy to spot.
original_ver_line = '>>> original ver - pd : 1.0.3  |  np : 1.18.4  |  sns : 0.10.1  |  mpl : 3.3.2  |  konlpy : 0.5.2  |  sklearn : 0.23.1'
present_ver_line = f'>>>  present ver - pd : {pd.__version__}  |  np : {np.__version__}  |  sns : {sns.__version__}  |  mpl : {mpl.__version__}  |  konlpy : {konlpy.__version__}  |  sklearn : {sklearn.__version__}'
print(original_ver_line, present_ver_line, sep='\n')

## 데이터 로드 - 114개 적요 데이터

In [None]:
%%time

# 114개 전체 품목적요데이터 로드
memo_df = pd.read_csv(data_dir + '과제2_sam/품목적요데이터_통합_sam.csv')

# 데이터 전처리 - 적요 결측 데이터 제외(2128건)
memo_df = memo_df[memo_df['적요_parsed'].notna()]

# 결과 확인
show_table(memo_df)
print('\n')
memo_df.info()
print()

## 탐색적 분석

In [None]:
# Distribution of the purchase/sale flag column
col = '구매/판매구분'
tmp_df = memo_df[col].value_counts()

# One bar per flag value, ordered by frequency (most frequent first).
sns.countplot(data=memo_df, x=col, order=tmp_df.index, palette=['tomato', 'cornflowerblue'])

# Write the count (in units of 10,000) and its share of all rows inside each bar.
# The label height is fixed at y = 2e5 regardless of the bar's own height.
for idx, val in enumerate(tmp_df):
    bar_label = f'{val/10000:.2f}만\n\n({val/len(memo_df)*100:.1f}%)'
    plt.text(idx-0.2, 2e5, s=bar_label, color='white', weight='bold', size=15)

plt.title(f'{col} 분포', size=20)
plt.xlabel(None)
plt.xticks(size=15)
plt.ylabel('빈도', size=20)
plt.yticks(size=12)

# Plain (non-scientific) tick labels on the y axis, and a y label moved left of the axis.
plt.ticklabel_format(style='plain', axis='y')
ax = plt.gca()
ax.yaxis.set_label_coords(-0.2, 0.5)

plt.show()

In [None]:
# Distribution of the seller/buyer industry classification columns
plot_specs = [
    # (column, quantile of the counts used as label height, horizontal label offset)
    ('판매구매자_업종대분류', 0.9, 0.3),
    ('판매구매자_업종대분류 - 중분류', 0.94, 0.2),
    ('판매구매자_업종대분류 - 중분류 - 소분류', 0.94, 0.2),
]

for col, tmp_q, x_diff in plot_specs:
    top_k = 3  # number of leading bars that get the highlight colour and bold labels

    # The first column is plotted in full; the other two keep only their 15 most frequent values.
    tmp_df = memo_df[col].value_counts()
    if col != '판매구매자_업종대분류':
        tmp_df = tmp_df.head(15)

    bar_colors = ['tomato']*top_k + ['cornflowerblue']*(len(tmp_df)-top_k)

    plt.figure(figsize=(30, 6))
    sns.countplot(data=memo_df, x=col, order=tmp_df.index, palette=bar_colors)

    # Label heights depend only on the count distribution, so compute them once per plot.
    top_y = tmp_df.quantile(tmp_q)
    rest_y = tmp_df.quantile(tmp_q-0.05)
    for idx, val in enumerate(tmp_df.values):
        if idx < top_k:
            # Highlighted bars: whole units of 10,000, bold.
            y_val, weight = top_y, 'bold'
            msg = f'  {val/10000:.0f}만\n({val/len(memo_df)*100:.1f}%)'
        else:
            # Remaining bars: two decimals, regular weight, slightly lower position.
            y_val, weight = rest_y, None
            msg = f'{val/10000:.2f}만\n ({val/len(memo_df)*100:.1f}%)'
        plt.text(idx-x_diff, y_val, s=msg, size=15, weight=weight)

    plt.title(f'{col} 분포', size=25)
    plt.grid(axis='x', linestyle=':')
    plt.xlabel(None)
    plt.xticks(rotation=45, ha='right', size=17, weight='bold')
    plt.ylabel('빈도', size=20)
    plt.yticks(size=12)

    plt.ticklabel_format(style='plain', axis='y')
    plt.show()

    print('\n\n')

## 탐색적 분석 - 토큰별 빈도 

In [None]:
%%time
tokens_list = ' '.join(memo_df['적요_parsed']).split()
tokens_list[:10]

In [None]:
%%time
# 최빈도 적요 확인
pd.options.display.max_columns = 200  # default 20
txt_count_ser = memo_df['적요'].value_counts(dropna=False)

print('>>> 원본 적요별 빈도 테이블')
display(txt_count_ser.head(200).to_frame().T)
display(txt_count_ser.head(200).apply(lambda x : f'{x/len(memo_df)*100:.3f}%').to_frame().T)

In [None]:
%%time
# 최빈도 토큰 확인
token_count_ser = pd.Series(tokens_list).value_counts(dropna=False)

print('\n>>> 전체 토큰별 빈도 테이블')
display(token_count_ser.head(200).to_frame().T)
display(token_count_ser.head(200).apply(lambda x : f'{x/len(memo_df)*100:.3f}%').to_frame().T)

In [None]:
%%time
# 최빈도 토큰 확인 (길이 1 이상)
long_token_count_ser = token_count_ser[token_count_ser.index.str.len()>1]

print('\n>>> 길이 1 이상인 토큰별 빈도 테이블')
display(long_token_count_ser.head(200).to_frame().T)
display(long_token_count_ser.head(200).apply(lambda x : f'{x/len(memo_df)*100:.3f}%').to_frame().T)

In [None]:
# Collect the per-item memo CSV files under data_dir (the same root the merged file is read from).
# glob() returns paths in arbitrary, filesystem-dependent order, so the list is sorted to keep
# positional lookups such as file_list[61] stable across runs and machines.
file_list = sorted(glob(data_dir + 'Competition/data/경진대회데이터/품목적요데이터/*.csv'))

# Show the file names (without directories) of the first ten entries.
print([path.split('/')[-1] for path in file_list[:10]])

In [None]:
pd.options.display.max_rows=500

# NOTE: file_list[61] is a positional pick, so the file loaded depends on the order of file_list.
tmp_df = pd.read_csv(file_list[61])

# Rows whose memo mentions any of the keywords below.
# str.contains(..., na=False) treats a missing memo as "no match". The previous
# `str.find(kw) != -1` test evaluated NaN != -1 as True, so every row with a missing
# memo was counted as a hit and inflated the match ratio.
keywords = ['공사', '수수료', '매출', '설치', '카드']  # plain words, no regex metacharacters
slicing_conditions = tmp_df['적요'].str.contains('|'.join(keywords), regex=True, na=False)

matched_df = tmp_df.loc[slicing_conditions]
# .loc[:10] is end-inclusive on the fresh 0..n index, i.e. it shows up to 11 rows.
display(matched_df.reset_index(drop=True).loc[:10])
# Total shape, matched shape, and the matched share of all rows in percent.
print(tmp_df.shape, matched_df.shape, f'{matched_df.shape[0]/tmp_df.shape[0]*100:.2f}%')

In [None]:
# Percentage share of each purchase/sale flag value in tmp_df, rounded to two decimals.
flag_share = tmp_df['구매/판매구분'].value_counts(normalize=True)
(flag_share * 100).round(2)

In [None]:
# For every item file, print the percentage of rows whose memo mentions one of the
# keywords, ten values per output line.
keyword_pattern = '|'.join(['공사', '수수료', '매출', '설치', '카드'])  # plain words, no regex metacharacters

for idx, file in enumerate(file_list):
    if idx%10==0:
        print()
    tmp_df = pd.read_csv(file)

    # na=False: a missing memo is not a match. The previous `str.find(kw) != -1` test
    # evaluated NaN != -1 as True and therefore counted missing memos as hits.
    keyword_mask = tmp_df['적요'].str.contains(keyword_pattern, regex=True, na=False)

    # An empty file previously raised ZeroDivisionError; report NaN for it instead.
    n_rows = len(tmp_df)
    match_pct = int(keyword_mask.sum())/n_rows*100 if n_rows else float('nan')
    print(round(match_pct, 3), end='\t\t')

In [None]:
# Concatenate every memo into one string with spaces removed.
# Missing memos are skipped: NaN is a float with no .replace(), so the previous
# per-row lambda raised AttributeError on them.
letter_list = ''.join(memo_df['적요'].dropna().str.replace(' ', '', regex=False))

# Show only a preview: the full string spans every memo in memo_df and can be far
# too large to render as cell output.
letter_list[:1000]

In [None]:
# Distinct lower-cased characters that occur in the first 100 memos.
sample_chars = set()
for memo in memo_df['적요'].head(100):
    sample_chars.update(memo.lower())
print(sample_chars)

In [None]:
# Rows of tmp_df whose memo contains the character '왯'.
# na=False excludes missing memos; the previous `str.find('왯') != -1` test returned
# them as matches because NaN != -1 evaluates to True.
tmp_df[tmp_df['적요'].str.contains('왯', regex=False, na=False)]

In [None]:
# Character frequency over all memos in tmp_df: the 100 most common characters, shown
# as a one-row table. dropna() skips missing memos, which str.join cannot concatenate
# (a float NaN in the sequence raises TypeError).
char_counts = Counter(''.join(tmp_df['적요'].dropna()))
pd.Series(char_counts).sort_values(ascending=False).head(100).to_frame().T