# 태그추천 매트릭스 생성 파일 호출

In [1]:
# 패키지 호출
import os
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


## 태그 전처리(지역정보제거)
def location(cat):
  for i in range(len(cat)):
    try:
      cat['keyword'][i] = str(cat['keyword'][i]).replace('대전','')
      cat['keyword'][i] = str(cat['keyword'][i]).replace('서울','')
      cat['keyword'][i] = str(cat['keyword'][i]).replace('대구','')
      cat['keyword'][i] = str(cat['keyword'][i]).replace('울산','')
      cat['keyword'][i] = str(cat['keyword'][i]).replace('부산','')
      cat['keyword'][i] = str(cat['keyword'][i]).replace('경기도','')
      cat['keyword'][i] = str(cat['keyword'][i]).replace('수지','')
      cat['keyword'][i] = str(cat['keyword'][i]).replace('NaN','')
    except: pass
  return(cat)

## 데이터 merge
def merge_df(df):
    df.fillna(0,inplace=True)
    df.replace(0,'',inplace=True)
    df['merge'] = df['title'] + ' '+df['cat1'] #+ df['cat2'] + df['cat3']
    df = df[['merge','keyword']]
    df.rename(columns={'merge':'feature', 'keyword' : 'label'}, inplace=True)
    return df

## 정규표현식 전처리(카테고리별 따로 지정)
def proccessing(df):
  for i in range(len(df)):
    ## 노트북 한정
    if '인치' in df.feature[i]:
      df.feature[i] = re.sub('\W+',' ', df.feature[i].lower())
    else : 
      df.feature[i] = re.sub('\W+',' ', df.feature[i].lower())
      df.feature[i] = re.sub('[0-9]',' ', df.feature[i].lower())

  # 한글자 처리(상황따라 변동)
  for info in range(len(df)):
    title = df.feature[info].split(' ')
    temp =''
    for i in title: 
      if len(i) > 1:
        temp = temp + i + ' '
    df.feature[info] = temp
  return(df)


## 태그만 모아두는 리스트 생성 및 10번이상 노출된 태그 list 추출
def tag(cat):
  test = list(cat['label'])
  label_unique=''
  for i in range(len(test)):
    try : 
      label_unique += test[i]
    except : 
      pass
  label_unique = label_unique.split(',')
  label_unique = [v for v in label_unique if v]
  
  df = pd.DataFrame({'label_unique':label_unique, 'count':0}).groupby(['label_unique']).count().reset_index()
  df = df[df['count']>10].sort_values(by='count',ascending=False).reset_index()[['label_unique','count']]
  top = list(df['label_unique'])

  return(top)

## 태그 분배 함수
def split_label(cat):
  for i in range(1,6):
    cat[f'label{i}'] = cat.label.str.split(',').str[i]
  cat = cat.fillna(0)
  return(cat)  


## 문서 유사도 계산 함수
def tfidf(proccessed_cat):
  vect = CountVectorizer()
  countvect = vect.fit_transform(proccessed_cat)

  countvect_df = pd.DataFrame(countvect.toarray(), columns = sorted(vect.vocabulary_))
  
  vect = TfidfVectorizer(max_features = 10000)
  tfvect = vect.fit(proccessed_cat)

  tfidv_df = pd.DataFrame(tfvect.transform(proccessed_cat).toarray(), columns = sorted(vect.vocabulary_))
  return(tfidv_df)

## 태그별 문서 유사도 계산
def make_matrix(tag_,cat):
  total_lab = pd.DataFrame()
  for label in tqdm(tag_):
    temp = pd.DataFrame()
    for i in range(len(cat)):
        if cat['label1'][i] == label:
            temp = pd.concat([temp, cat.iloc[[i]]])
        elif cat['label2'][i] == label:
            temp = pd.concat([temp, cat.iloc[[i]]])
        elif cat['label3'][i] == label:
            temp = pd.concat([temp, cat.iloc[[i]]])
        elif cat['label4'][i] == label:
            temp = pd.concat([temp, cat.iloc[[i]]])
        elif cat['label5'][i] == label:
            temp = pd.concat([temp, cat.iloc[[i]]])
    try:
      temp_tfidf = tfidf(temp['feature'])
      temp_tfidf['label'] = label
      temp_tfidf = temp_tfidf.groupby('label').mean().reset_index()
      total_lab = pd.concat([total_lab,temp_tfidf])
    except: pass
  total_lab = total_lab.fillna(0)

  return(total_lab)

## 입력한 제목 전처리 및 태그출력
# title = [input('제목을 입력해주세요 : ')]
def find_tag(title, cat_matrix):
  # test = proccessed_input(list(title))
  test = title[0].split(' ')
  for i, j in enumerate(test):
    i = re.sub('\W+',' ', j)
  test_input = []
  for i in test:
    if i in cat_matrix.columns:
      test_input.append(i)
  test_input.append('label')
  # 해당 카테고리에 적용
  test_matrix = cat_matrix[test_input]
  test_matrix
  # 상위 top5 태그 추출
  test_matrix['target'] = 0
  for i in range(len(test_input)-1):
    test_matrix['target'] += test_matrix[test_input[i]] 
  test_matrix['target'] = test_matrix['target'] / (len(test_input)-1)
  test_matrix = test_matrix[['label','target']].sort_values(by='target', ascending=False)[:5]
  return(test_matrix)

# !!cd /Users/ppangppang/Desktop/ssac/Final_project/3.Recommen_model/
# %run tag_recommend.py
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

# 평가지표 생성을 위한 DF 생성

In [4]:
# 작업 목록 df로 불러오기
query_list = ['갤럭시','노트북','마우스','아이폰','에어팟','키보드']

total_df = []
for i in range(len(query_list)):
  # 경로지정
  total_df.append(pd.read_csv(f'/Users/ppangppang/Desktop/ssac/Final_project/2.Processed/info/{query_list[i]}.csv'))

## 기본적인 전처리 및 모델 생성을 위한 DF 생성

In [41]:
# 카테고리 상태에 따라 cat1, cat2, cat3 미리 좀 거르기
lab = total_df[1]
lab = lab[(lab['cat1']=='디지털/가전')]
lab = lab[(lab['cat2']=='PC/노트북')]
lab = lab[(lab['cat3']=='노트북/넷북') | (lab['cat3']=='기타(PC/노트북)') | (lab['cat3']=='0')]
lab

Unnamed: 0,product_id,title,keyword,cat1,cat2,cat3,view,image
0,174961627,엘지울트라북 (인텔10세대 ),"울트라북,엘지노트북,15u50n,인강용노트북,사무용노트북",디지털/가전,PC/노트북,노트북/넷북,68,https://media.bunjang.co.kr/product/174961627_...
1,171378783,마지막 가격인하 한성컴퓨터 게이밍랩탑 FH57bossmonster,,디지털/가전,PC/노트북,노트북/넷북,272,https://media.bunjang.co.kr/product/171378783_...
2,174969894,삼성 노트북,,디지털/가전,PC/노트북,노트북/넷북,35,https://media.bunjang.co.kr/product/174969894_...
3,174969935,삼성 노트북,,디지털/가전,PC/노트북,노트북/넷북,56,https://media.bunjang.co.kr/product/174969935_...
4,174587767,맥북 에어 13인치,"맥북,맥북에어",디지털/가전,PC/노트북,노트북/넷북,107,https://media.bunjang.co.kr/product/174587767_...
...,...,...,...,...,...,...,...,...
9995,146857634,노트북 삼성 i5-2410M SSD교체 램8GB GT520M,"노트북,삼성,i5,SSD",디지털/가전,PC/노트북,노트북/넷북,408,https://media.bunjang.co.kr/product/146857634_...
9996,170569713,[미개봉] 갤럭시북PRO 360 / 16GB / nvme 1TB 팝니다,갤럭시북360,디지털/가전,PC/노트북,노트북/넷북,118,https://media.bunjang.co.kr/product/170569713_...
9997,172769065,맥북 에어 i3 기본형 골드 판매합니다,,디지털/가전,PC/노트북,노트북/넷북,31,https://media.bunjang.co.kr/product/172769065_...
9998,172768339,맥북에어 2020 13인치 256기가 기스없음 매직마우스 포함,"맥북,맥북에어2020,#파름신오신날,광주,광주노트북",디지털/가전,PC/노트북,노트북/넷북,35,https://media.bunjang.co.kr/product/172768339_...


In [42]:
lab = location(lab) # 지역정보 걸러야 할 경우만 시행
lab = merge_df(lab) # feature, label로 만들기
lab = lab[(lab.label != 'nan') &(lab.label != '')] # label 빈거 지우기
lab.index=range(len(lab))
lab = proccessing(lab) # 특수문자, 한글자 전처리
lab

Unnamed: 0,feature,label
0,엘지울트라북 인텔 세대 디지털 가전,"울트라북,엘지노트북,15u50n,인강용노트북,사무용노트북"
1,맥북 에어 13인치 디지털 가전,"맥북,맥북에어"
2,엘지노트북 세대 무게 kg 디지털 가전,"파름신오신날,엘지노트북,울트라북,13UD50N,사무용노트북"
3,최저가 인기상품 삼성아티브북 핑크 천송이노트북 핑크색상 문의폭주 디지털 가전,"천송이노트북,별에서온그대,아티브북9,삼성,노트북"
4,맥북에어 스그 풀박스 상태 좋아요 디지털 가전,"맥북,맥북에어,노트북,애플"
...,...,...
7071,노트북 lenovo ssd교체 디지털 가전,"노트북,SSD"
7072,노트북 삼성 ssd교체 gb gt 디지털 가전,"노트북,삼성,i5,SSD"
7073,미개봉 갤럭시북pro gb nvme tb 팝니다 디지털 가전,갤럭시북360
7074,맥북에어 2020 13인치 256기가 기스없음 매직마우스 포함 디지털 가전,"맥북,맥북에어2020,#파름신오신날,광주,광주노트북"


## Train,Test DF생성 및 라벨 분류

In [26]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(lab['feature'],lab['label'], test_size=0.2,shuffle=True, random_state=42)

In [30]:
lab_train = pd.DataFrame()
lab_train['feature'] = X_train
lab_train['label'] = y_train
lab_train.index=range(len(lab_train))

lab_test = pd.DataFrame()
lab_test['feature'] = X_test
lab_test['label'] = y_test
lab_test.index=range(len(lab_test))

In [31]:
lab_train.head(3)

Unnamed: 0,feature,label
0,주연테크 아이언 울트라북 디지털 가전,"울트라북,주연테크,노트북"
1,맥북에어 13인치 2010년 ssd128g 2g 지포스 중고노트북 디지털 가전,"맥북,맥북에어,깔끔한중고,깨끗한중고,가성비노트북"
2,tg 노트북 삼성ssd 디지털 가전,"그램,삼성,lg"


In [32]:
lab_test.head(3)

Unnamed: 0,feature,label
0,맥북프로 13인치 2019 판매합니다 디지털 가전,맥북프로
1,삼성notebook 디지털 가전,"삼성,노트북,laptop,notebook"
2,삼성 15인치 ssd128gb 교체완료된 노트북 nt700z4a s58 디지털 가전,"삼성노트북,노트북,15인치노트북,15인치,ssd"


# Train으로 태그 추천 모델 생성

In [12]:
# 모델링
lab_train = split_label(lab_train)
tag_list = tag(lab_train)
lab_matrix = make_matrix(tag_list,lab_train)
lab_matrix

  0%|          | 0/150 [00:00<?, ?it/s]

Unnamed: 0,label,10,100만,10세대,10프로,11,11세대,11인치,12,120g,...,idea,pad,새제품입니다,am,fn,ftc,ia,um,hdd1t,한성노트북fhd
0,노트북,0.00036,0.000331,0.001690,0.000351,0.000780,0.001171,0.000941,0.001547,0.000661,...,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000
0,중고노트북,0.00000,0.000000,0.001075,0.000000,0.000000,0.001667,0.000000,0.005718,0.001939,...,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000
0,삼성노트북,0.00000,0.000000,0.000969,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000
0,맥북프로,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000
0,맥북,0.00000,0.000000,0.000000,0.000000,0.002284,0.000000,0.003538,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,레노버씽크패드,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000
0,레노버아이디어패드,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.033635,0.033635,0.033635,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000
0,맥북에어13인치,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000
0,젠북,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.022671,0.04483,0.073851,0.022671,0.022671,0.000000,0.000000


## 관련 없는 feature list 정리해서 train의 feature 전처리

In [266]:
# 필요없는 정보 담아서 지우기
# title_list =[]
# for i in title_list:
#     lab_train['feature'] = lab_train['feature'].str.replace(i, '')

## 모델 재성성

In [None]:
# lab_matrix = make_matrix(tag_list,lab_train)
# lab_matrix

# Test-set 처리

## test 태그 predict

In [14]:
lab_test['pred']=''
for i in range(len(lab_test)):
    lab_test['pred'][i] = ', '.join(find_tag(lab_test.feature.iloc[i].split(' '), lab_matrix).label)
lab_test

Unnamed: 0,feature,label,pred
0,lg게이밍 노트북 pa hk 디지털 가전,"게이밍,노트북 15u78","게임, lg, 엘지노트북, 게이밍, 노트북매입"
1,맥북 프로 기본형 실버 색상 s급 애케플 판매 구성품 많음 디지털 가전,"맥북프로,맥북프로m1","맥북프로2015, 에어, 맥북에어2020, 맥북m1, 맥북에어m1"
2,gr hk lg gram 저렴하게 판매합니다 디지털 가전,"엘지그램,그램노트북,노트북,lg그램,엘지노트북","아수스노트북, LG그램, 엘지그램, lg그램, lg노트북"
3,lg 키패드글자판 부분교체용 터치패드 디지털 가전,"엘지노트북,14u36,키패드,터치패드","gram, LG, lg, LG노트북, lg노트북"
4,맥북 삽니다 디지털 가전,"맥북,#애플브랜덕데이","맥북프로2015, 에어, 맥북에어2020, 맥북m1, 맥북에어m1"
...,...,...,...
1411,노트북 삼성 ssd교체 gb gt 디지털 가전,"노트북,삼성,i5,SSD","맥북노트북, 램8, 고성능, 업무용, 동영상"
1412,미개봉 갤럭시북pro gb nvme tb 팝니다 디지털 가전,갤럭시북360,"gram, 미개봉, 갤럭시북프로360, 사무, 갤럭시북프로"
1413,맥북 에어 기본형 골드 판매합니다 디지털 가전,,"맥북프로2015, 에어, 맥북에어2020, 맥북m1, 맥북에어m1"
1414,맥북에어 2020 13인치 256기가 기스없음 매직마우스 포함 디지털 가전,"맥북,맥북에어2020,#파름신오신날,광주,광주노트북","맥북노트북, 맥북에어13인치, 에어, m1맥북, 맥북에어m1"


## f1-score 계산

In [15]:
lab_test['TP'] = ''
for idx in range(len(lab_test)):
    count = 0
    for i in lab_test.label.iloc[idx].split(','):
        for j in lab_test.pred.iloc[idx].split(','):
            if i in j:
                count +=1
                break
            elif j in i:
                count +=1
                break
    lab_test['TP'][idx] = count
lab_test

Unnamed: 0,feature,label,pred,TP
0,lg게이밍 노트북 pa hk 디지털 가전,"게이밍,노트북 15u78","게임, lg, 엘지노트북, 게이밍, 노트북매입",1
1,맥북 프로 기본형 실버 색상 s급 애케플 판매 구성품 많음 디지털 가전,"맥북프로,맥북프로m1","맥북프로2015, 에어, 맥북에어2020, 맥북m1, 맥북에어m1",1
2,gr hk lg gram 저렴하게 판매합니다 디지털 가전,"엘지그램,그램노트북,노트북,lg그램,엘지노트북","아수스노트북, LG그램, 엘지그램, lg그램, lg노트북",3
3,lg 키패드글자판 부분교체용 터치패드 디지털 가전,"엘지노트북,14u36,키패드,터치패드","gram, LG, lg, LG노트북, lg노트북",0
4,맥북 삽니다 디지털 가전,"맥북,#애플브랜덕데이","맥북프로2015, 에어, 맥북에어2020, 맥북m1, 맥북에어m1",1
...,...,...,...,...
1411,노트북 삼성 ssd교체 gb gt 디지털 가전,"노트북,삼성,i5,SSD","맥북노트북, 램8, 고성능, 업무용, 동영상",1
1412,미개봉 갤럭시북pro gb nvme tb 팝니다 디지털 가전,갤럭시북360,"gram, 미개봉, 갤럭시북프로360, 사무, 갤럭시북프로",0
1413,맥북 에어 기본형 골드 판매합니다 디지털 가전,,"맥북프로2015, 에어, 맥북에어2020, 맥북m1, 맥북에어m1",1
1414,맥북에어 2020 13인치 256기가 기스없음 매직마우스 포함 디지털 가전,"맥북,맥북에어2020,#파름신오신날,광주,광주노트북","맥북노트북, 맥북에어13인치, 에어, m1맥북, 맥북에어m1",1


In [16]:
# tp값 분포(계산 잘 됐나 확인)
lab_test.TP.value_counts()

1    544
0    356
2    317
3    155
4     38
5      6
Name: TP, dtype: int64

In [20]:
# 전체 tag 개수 파악
full_tag = 0
for i in range(len(lab_test)):
    full_tag += len(lab_test.label.iloc[i].split(','))

tp = lab_test.TP.sum()
fn = full_tag-tp
fp = len(lab_test)*5 - tp

precision = tp/(tp+fp)
recall = tp/(tp+fn)
f1_score = round(2*(precision*recall)/(precision+recall),2)

print('precision : ',precision)
print('recall : ',recall)
print('F1_score : ',f1_score)

precision :  0.2577683615819209
recall :  0.3598186119873817
F1_score :  0.3


# 모델 저장

In [21]:
lab_matrix.to_csv('/Users/ppangppang/Desktop/ssac/Final_project/3.Recommen_model/tag_model/lab_model.csv',index=False)

# 실제 값 넣어서 확인해보기
- 아래 식만 사용하면 됨

In [37]:
# 패키지 호출
import os
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import re


## 입력한 제목 전처리 및 태그출력
# title = [input('제목을 입력해주세요 : ')]
def find_tag(title, cat_matrix):
    test = title[0].split(' ')
    for i, j in enumerate(test):
        i = re.sub('\W+',' ', j)
    test_input = []
    for i in test:
        if i in cat_matrix.columns:
            test_input.append(i)
    test_input.append('label')
    # 해당 카테고리에 적용
    test_matrix = cat_matrix[test_input]
    test_matrix
    # 상위 top5 태그 추출
    test_matrix['target'] = 0
    for i in range(len(test_input)-1):
        test_matrix['target'] += test_matrix[test_input[i]] 
    test_matrix['target'] = test_matrix['target'] / (len(test_input)-1)
    test_matrix = test_matrix[['label','target']].sort_values(by='target', ascending=False)[:5]
    return(test_matrix)

In [39]:
lab_matrix = pd.read_csv('/Users/ppangppang/Desktop/ssac/Final_project/3.Recommen_model/tag_model/lab_model.csv')

In [40]:
title = [input('제목을 입력해주세요 : ')]
find_tag(title, lab_matrix)

제목을 입력해주세요 : lg gram 17 팝니다


Unnamed: 0,label,target
52,gram,0.061612
29,LG,0.054122
22,lg노트북,0.051819
37,lg,0.050771
128,동영상,0.049198
