## 1. Example of calculating the similarity between words using pre-trained wiki word vectors

In [8]:
from gensim.models.keyedvectors import KeyedVectors
# Load the pre-trained word vectors stored in word2vec text format.
corpus=KeyedVectors.load_word2vec_format("wiki.en.vec")
# Show the nearest neighbours of "man" together with their similarity scores.
corpus.most_similar("man")




[('woman', 0.6511167287826538),
 ('man—', 0.5554498434066772),
 ('man/one', 0.5516811609268188),
 ('stranger', 0.5505865216255188),
 ('boy', 0.5456836223602295),
 ('woman—a', 0.5385604500770569),
 ('spider', 0.5372462272644043),
 ('girl', 0.5341853499412537),
 ('man—a', 0.5321135520935059),
 ('man—and', 0.5311824083328247)]

In [68]:
# Check which gensim class the loaded vectors are held in.
type(corpus)

gensim.models.keyedvectors.Word2VecKeyedVectors

In [24]:
# Similarity score for a pair of unrelated words.
corpus.similarity('vegetable','horse')

0.23583984923717163

In [29]:
# Similarity score for a pair of closely related words.
corpus.similarity('donkey','horse')

0.5242442005632283

## 2. Classification of the HScode.xlsx file according to the size of item categories

Find a code by narrowing down from the broadest category to the most specific one.

In [1]:
import pandas as pd
import numpy as np

# Read the HS code book sheet; both columns are forced to str so the codes
# are not parsed as numbers.
hs = pd.read_excel(
    '2016(hsk)_data_1.1.xlsx',
    sheet_name='HS코드집',
    converters={'HS코드': str, '코드명(품명)': str},
).sort_index()
hs.columns = ['HS코드', '코드명(품명)']

print(hs.shape)

hs.head()

(17966, 2)


Unnamed: 0,HS코드,코드명(품명)
0,101,"Live horses, asses, mules and hinnies."
1,1012,Horses :
2,10121,Pure-bred breeding animals
3,101211000,For farm breeding
4,101219000,Other


In [19]:
# Split the HS code table into two parallel arrays:
# column 0 holds the HS codes, column 1 the code names (item names).
arr = np.array(hs)
hscode, item = arr[:, 0], arr[:, 1]

# Inspect the Python type of a single item name.
type(item[0])


str

In [20]:
# Read the sheet of declarations whose stated item name did not match the
# HS code book; the declared item name column is forced to str.
unmat=pd.read_excel('2016(hsk)_data_1.1.xlsx',sheet_name='비매칭신고사례',converters={'신고서상의 기재품명':str})
unmat=unmat.sort_index()

arr2=np.array(unmat)
# Column 1 of this sheet is the item name as written on the declaration.
# It has to be sliced from arr2: `arr` is the HS code book table, so the
# previous `arr[:,1]` silently returned the code book names instead.
unmat_item=arr2[:,1]

print(unmat.shape)

unmat[:5]

(52, 3)


Unnamed: 0,HS코드,신고서상의 기재품명,HS코드명(품명)
0,3921191010,SEPARATOR; RF5041;; 15㎛*66.6㎜,"Separator, for manufacturing secondary battery"
1,8517701029,"Camera Module(1/5”3.7M), CM370RF05SN930F",Other
2,8481801030,PVM VALVE (02-432051-00),Other automatically controlled
3,2104101000,Broths preparations of meat; KS ORGANIC CHICKE...,Of meat
4,1602509000,JACK LINK'S TENDER CUTS ; Traditional flavor ;...,Other


In [3]:
def split_by_n(string,offset):
    """Return the leading `offset` characters of `string`.

    Plain slicing semantics apply, so a string shorter than `offset` is
    returned unchanged.
    """
    head = string[0:offset]
    return head

In [4]:
def getcategory(codelst,offset):
    """Collect one item name per new `offset`-character code prefix.

    Walks `codelst` in order. Whenever a code of at least `offset` characters
    has a prefix that compares greater (as a string) than the last prefix
    kept, the name stored for that code in the module-level `hscode`/`item`
    arrays is appended and the prefix becomes the new reference value.

    Parameters
    ----------
    codelst : iterable of str
        Codes to scan. Every accepted code must occur in `hscode`.
    offset : int
        Prefix length that defines the category level.

    Returns
    -------
    list
        The collected item names, or an empty list when no code in
        `codelst` is exactly `offset` characters long.

    Raises
    ------
    ValueError
        If an accepted code does not occur in `hscode`.
    """
    # Position of the first occurrence of every code, built once.  The
    # previous `list(hscode).index(code)` rebuilt the list and scanned it for
    # every accepted code, which made the loop quadratic.
    first_index = {}
    for position, known_code in enumerate(hscode):
        first_index.setdefault(known_code, position)

    hold = '0' * offset      # last accepted prefix; starts at all zeros
    itemlist = []
    isValid = False          # True once a code of exactly `offset` characters is seen

    for code in codelst:
        prefix = split_by_n(code, offset)
        if hold < prefix and len(code) >= offset:
            if code not in first_index:
                # Same exception type that list.index raised before.
                raise ValueError("%r is not in list" % (code,))
            itemlist.append(item[first_index[code]])
            hold = prefix
        if len(code) == offset:
            isValid = True

    # Without a code of exactly this length the level is treated as absent.
    if not isValid:
        itemlist = []

    return itemlist

#getcategory(hscode,4)

#print(itemlist) #: prints the items that belong to the requested category level.
#e.g. the item names of the broadest-category HS codes 0101, 0102, 0103 ... 0204, 020500, 0206 ...

## 3. Example of calculating the similarity between phrases using nltk and tf-idf

In [5]:
import nltk, string
from sklearn.feature_extraction.text import TfidfVectorizer

# Tokenizer data needed by nltk.word_tokenize.  Recent NLTK releases load the
# 'punkt_tab' resource instead of the pickled 'punkt' models, so both are
# requested; a package that is already up to date is not fetched again.
nltk.download('punkt')
nltk.download('punkt_tab')

# Stemming strips the suffixes and endings that words take on for various
# reasons, so that morphemes with the same meaning end up in the same form.
stemmer = nltk.stem.porter.PorterStemmer()

# Table for str.translate that deletes every character in string.punctuation.
remove_punctuation_map = {ord(char): None for char in string.punctuation}

def stem_tokens(tokens):
    """Return the stem of every token, in input order, using the shared `stemmer`."""
    return list(map(stemmer.stem, tokens))

def normalize(text):
    """Lowercase `text`, remove punctuation, tokenize it and stem every token."""
    lowered = text.lower()
    without_punctuation = lowered.translate(remove_punctuation_map)
    tokens = nltk.word_tokenize(without_punctuation)
    return stem_tokens(tokens)

# TF-IDF vectorizer that tokenizes with `normalize` and drops the built-in
# English stop-word list.
vectorizer = TfidfVectorizer(tokenizer=normalize, stop_words='english')

def cosine_sim(text1, text2):
    """Return the TF-IDF cosine similarity of two texts.

    The shared `vectorizer` is re-fitted on just these two texts on every
    call.  TfidfVectorizer L2-normalises each row by default, so the product
    of the matrix with its transpose holds the pairwise cosine similarities.

    Parameters
    ----------
    text1, text2 : str
        The texts to compare.

    Returns
    -------
    float
        Entry [0, 1] of the pairwise similarity matrix.
    """
    tfidf = vectorizer.fit_transform([text1, text2])
    # `@` is a matrix product for every sparse container, and `.toarray()` is
    # the supported spelling of the deprecated `.A` shorthand.
    return (tfidf @ tfidf.T).toarray()[0, 1]


# Compare one reference phrase with an identical, an extended and an
# unrelated phrase.
for other in ('a little bird', 'a little bird chirps', 'a big dog barks'):
    print(cosine_sim('a little bird', other))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
0.9999999999999998
0.7092972666062738
0.0


## 4. HS code modification

In [6]:
from pprint import pprint

In [7]:
def subclass(lst,head):
    """Return the codes in `lst` that begin with `head`, in their original order.

    The result is the sub-category list: every code whose first `len(head)`
    characters equal `head`.
    """
    prefix_len = len(head)
    return [code for code in lst if code[:prefix_len] == head]

In [17]:
import operator

def dictOfsim(unmat_str):
    """Select an HS code for `unmat_str` by descending the code hierarchy.

    For every prefix length from 4 to 10 the current candidate codes are
    reduced to one item name per category with `getcategory`, each name is
    scored against `unmat_str` with `cosine_sim`, and the candidates are then
    narrowed with `subclass` to the codes that start with the selected code.
    The descent stops once the selected code is longer than 9 characters.

    The current offset and the ranked (similarity, name) pairs are printed
    for every level.

    Parameters
    ----------
    unmat_str : str
        Item description to classify.

    Returns
    -------
    tuple
        `(final_code, final_name)`: the last code selected (0 if no code was
        ever selected) and the best-scoring names of the visited levels
        joined with ', ', most recent level first.
    """
    # Name of the first occurrence of every code, built once.  The previous
    # `item[list(hscode).index(j)]` rebuilt and scanned the whole list for
    # every candidate code at every level.
    name_by_code = {}
    for known_code, known_name in zip(hscode, item):
        name_by_code.setdefault(known_code, known_name)

    final_name = ''
    final_code = 0          # sentinel: no code selected yet
    codelst = hscode

    for offset in range(4, 11):
        print("\n\noffset: ",offset)
        # Item names of the categories identified by `offset`-character prefixes.
        itemlst = getcategory(codelst, offset)
        if not itemlst:
            continue        # no category at this level, try the next length

        # Keyed by similarity, so names with an identical score overwrite each
        # other and the last one wins (author's note: when all similarities
        # are equal the result is classified as "Other").
        d = {}
        for name in itemlst:
            d[cosine_sim(unmat_str, name)] = name

        sorted_d = sorted(d.items(), key=operator.itemgetter(0), reverse=True)
        pprint(sorted_d)

        best_name = sorted_d[0][1]
        zero_tail = '0' * (10 - offset)
        # A code is selected when it carries the best name and is either
        # exactly `offset` long or padded with zeros after `offset`; the last
        # such code in the candidate list wins.
        for j in codelst:
            if name_by_code[j] == best_name and (len(j) == offset or j[offset:] == zero_tail):
                final_code = j

        final_name = best_name + ', ' + final_name

        if final_code == 0:
            # Nothing has been selected so far.  Keep the full candidate list
            # instead of failing on len(0), which mirrors what already happens
            # when a later level selects no new code.
            continue
        if len(final_code) > 9:
            break

        codelst = subclass(codelst, final_code)

    # Drop the trailing ', ' left by the concatenation above.
    final_name = final_name[:-2]
    return final_code, final_name

#dictOfsim('Vegetable Saps and Extract(Ginseng Leaves Extract) ; Acceleris ; CHINA')        

In [61]:
#print(hscode[list(item).index(sorted_d[0][1])]) # code number of the most similar item

#print(sorted_d[0][1]) # name of the most similar item

#pprint(sorted_d) # list of (similarity, name) pairs, in descending order of similarity