In [1]:
import numpy as np
import pandas as pd
import re
from nltk import sent_tokenize
import nltk

# Fetch the NLTK resources this notebook needs so that it also runs on a
# fresh environment: "punkt" backs word_tokenize, and
# "averaged_perceptron_tagger" backs nltk.pos_tag (used in the POS tagging
# section). Both calls are no-ops when the package is already up to date.
# NOTE(review): newer NLTK releases name these resources "punkt_tab" and
# "averaged_perceptron_tagger_eng" - confirm against the installed version.
nltk.download("punkt")
nltk.download("averaged_perceptron_tagger")
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords  

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

## 1) CountVectorizer를 사용하였을 때, doc['text']의 500번째 문서와 가장 유사한 문서 추출 (Cosine Similarity 기반)
## 2) TfidfVectorizer를 사용하였을 때, doc['text']의 500번째 문서와 가장 유사한 문서 추출 (Cosine Similarity 기반)

In [3]:
# Load the dataset from a CSV located next to the notebook. The preview
# shown in the next cell lists an "Unnamed: 0", a "text" and a "category" column.
doc = pd.read_csv('./tfidf_dataset.csv')

In [4]:
# Display the loaded frame as the cell output.
doc

Unnamed: 0,text,category
0,musicians tackle red tape musicians groups tac...,entertainment
1,u2 desire number u2 won prestigious grammy awa...,entertainment
2,rocker doherty stage fight rock singer pete do...,entertainment
3,snicket tops box office chart film adaptation ...,entertainment
4,ocean raids box office ocean crime caper seque...,entertainment
...,...,...
2220,norway upholds napster ruling norwegian studen...,tech
2221,warning windows word files writing microsoft w...,tech
2222,fast lifts record books high speed lifts world...,tech
2223,nintendo adds media playing ds nintendo releas...,tech


In [5]:
def find_most_similar_doc(index, bow, corpus):
    """Return the entry of ``corpus`` most similar to document ``index``.

    Similarity is the cosine similarity between row ``index`` of ``bow`` and
    every row of ``bow``.

    Parameters
    ----------
    index : int
        Row position of the query document inside ``bow``.
    bow : array-like or sparse matrix
        Document-term matrix with one row per document, as accepted by
        ``cosine_similarity``.
    corpus : sequence or pandas.Series
        Documents aligned row-by-row with ``bow``.

    Returns
    -------
    The element of ``corpus`` at the position of the best-matching row,
    never the query document itself.

    Raises
    ------
    IndexError
        If ``bow`` holds fewer than two documents, so no other document
        exists to compare against.
    """
    similarities = cosine_similarity(bow[index], bow)[0]
    if similarities.shape[0] < 2:
        raise IndexError("need at least two documents to find a neighbour")

    # Mask the query row explicitly. Picking "rank 1" of a sort assumes the
    # query sorts first, which fails when another document ties with it
    # (e.g. an exact duplicate) and the query would then be returned.
    similarities[index] = float("-inf")
    idx = similarities.argmax()

    # `idx` is a row position, so look it up positionally when `corpus` is a
    # pandas object; plain sequences are already positional.
    if hasattr(corpus, "iloc"):
        return corpus.iloc[idx]
    return corpus[idx]

## Text Cleaning

In [6]:
def clean_text(text):
    """Normalise one raw document into lowercase, space-separated words.

    The steps run in this order: Hangul characters, e-mail addresses, numbers,
    HTML tags, line breaks and remaining punctuation are each replaced by a
    space; runs of whitespace are then collapsed to a single space and the
    result is lowercased. Leading/trailing single spaces are not stripped.

    Note that the number step also removes digits glued to letters
    (``"u2"`` becomes ``"u"``).

    Parameters
    ----------
    text : str or None or float('nan')
        Raw document. Missing values (``None`` / NaN) yield ``''``.

    Returns
    -------
    str
        The cleaned text.
    """
    # Missing cells coming from pandas arrive as None/NaN; re.sub would raise
    # TypeError on them, so treat them as empty documents.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Remove Hangul (stray Korean characters appeared in the data, possibly
    # an encoding artifact).
    text = re.sub('[\u3131-\u3163\uac00-\ud7a3]+', ' ', text)

    # Remove e-mail addresses. The character classes must be real classes:
    # escaping the brackets makes the pattern match literal "[" / "]" only.
    text = re.sub(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9.-]+', ' ', text)

    # Remove numbers, including an optional leading non-word character and
    # simple separators such as "1,234.50" or "1)".
    text = re.sub(r'([^\w]?\d+\.?\,?\)?\d*)+', ' ', text)

    # Remove HTML tags.
    text = re.sub(r'<[^>]*>', ' ', text)

    # Remove carriage returns and newlines.
    text = re.sub(r'[\r\n]', ' ', text)

    # Remove every remaining character that is neither a word character nor
    # whitespace (punctuation, symbols).
    text = re.sub(r'[^\w\s]', ' ', text)

    # Collapse runs of whitespace into one space.
    text = re.sub(r'\s+', ' ', text)

    return text.lower()

In [7]:
# Run clean_text over every raw article; the resulting Series is shown as
# the cell output.
data = doc['text'].map(clean_text)
data

0       musicians tackle red tape musicians groups tac...
1       u desire number u won prestigious grammy award...
2       rocker doherty stage fight rock singer pete do...
3       snicket tops box office chart film adaptation ...
4       ocean raids box office ocean crime caper seque...
                              ...                        
2220    norway upholds napster ruling norwegian studen...
2222    fast lifts record books high speed lifts world...
2223    nintendo adds media playing ds nintendo releas...
2224    fast moving phone viruses appear security firm...
Name: text, Length: 2225, dtype: object

## Tokenizing

In [8]:
# Split each cleaned document into a list of word tokens and display it.
word_tokenize_result = data.map(word_tokenize)
word_tokenize_result

0       [musicians, tackle, red, tape, musicians, grou...
1       [u, desire, number, u, won, prestigious, gramm...
2       [rocker, doherty, stage, fight, rock, singer, ...
3       [snicket, tops, box, office, chart, film, adap...
4       [ocean, raids, box, office, ocean, crime, cape...
                              ...                        
2220    [norway, upholds, napster, ruling, norwegian, ...
2222    [fast, lifts, record, books, high, speed, lift...
2223    [nintendo, adds, media, playing, ds, nintendo,...
2224    [fast, moving, phone, viruses, appear, securit...
Name: text, Length: 2225, dtype: object

## POS tagging

In [9]:
# Tag all documents with a single pos_tag_sents call instead of calling
# nltk.pos_tag once per document, so the tagger is set up once for the whole
# corpus. The result is wrapped back into a Series that keeps the index and
# name of the token Series, so downstream cells see the same structure:
# one list of (word, tag) tuples per document.
pos_result = pd.Series(
    nltk.pos_tag_sents(word_tokenize_result.tolist()),
    index=word_tokenize_result.index,
    name=word_tokenize_result.name,
    dtype=object,
)
print(pos_result)

0       [(musicians, NNS), (tackle, VBP), (red, JJ), (...
1       [(u, JJ), (desire, NN), (number, NN), (u, JJ),...
2       [(rocker, NN), (doherty, NN), (stage, NN), (fi...
3       [(snicket, NN), (tops, NNS), (box, JJ), (offic...
4       [(ocean, JJ), (raids, NNS), (box, NN), (office...
                              ...                        
2220    [(norway, RB), (upholds, JJ), (napster, RB), (...
2222    [(fast, JJ), (lifts, NN), (record, NN), (books...
2223    [(nintendo, NN), (adds, VBZ), (media, NNS), (p...
2224    [(fast, RB), (moving, VBG), (phone, NN), (viru...
Name: text, Length: 2225, dtype: object


* 명사만 추출하기

In [10]:
def only_nouns(token_pos):
    """Return the words whose POS tag contains ``'NN'``, in original order.

    ``token_pos`` is an iterable of ``(word, tag)`` pairs; every pair whose
    tag string includes the substring ``'NN'`` contributes its word to the
    returned list.
    """
    return [token for token, tag in token_pos if 'NN' in tag]

In [11]:
# Reduce every tagged document to just its noun tokens, then print the Series.
nouns = pos_result.map(only_nouns)
print(nouns)

0       [musicians, tape, musicians, groups, regulatio...
1       [desire, number, grammy, awards, status, world...
2       [rocker, doherty, stage, fight, rock, singer, ...
3       [snicket, tops, office, chart, film, adaptatio...
4       [raids, box, office, ocean, crime, caper, sequ...
                              ...                        
2220    [student, mp, files, compensation, country, co...
2221    [windows, word, word, document, business, docu...
2222    [lifts, record, books, speed, lifts, world, bu...
2223    [nintendo, media, ds, handheld, play, music, v...
2224    [phone, viruses, security, firms, phone, virus...
Name: text, Length: 2225, dtype: object


## 리스트에 문서 넣기

In [12]:
# Build one space-joined string of nouns per document, in row order.
# Iterating the Series directly walks it positionally, so this keeps working
# even if the index is not the default 0..n-1 (label lookup via nouns[i]
# would raise KeyError or pick the wrong row in that case).
total_docs = [' '.join(doc_nouns) for doc_nouns in nouns]

## 1) CountVectorizer 사용

In [13]:
# Bag-of-words (raw term count) vectorizer.
vectorizer = CountVectorizer(min_df=1,stop_words = 'english') # stop_words='english' drops English stop words

In [14]:
# Fit the vectorizer on the noun-only documents and transform them in one
# step; row i of `bow` corresponds to total_docs[i].
bow = vectorizer.fit_transform(total_docs) 

In [15]:
# The banner is double-quoted so that doc['text'] prints with its quotes:
# inside a single-quoted literal, ''text'' ends and restarts the string
# (implicit concatenation) and the quotes vanish from the output. The class
# name is also spelled correctly (CountVectorizer).
# NOTE(review): 500 is a zero-based row position, i.e. the 501st row - confirm
# this is what "500th document" is meant to be.
print("★CountVectorizer 을 사용하였을때, doc['text']의 500번째 문서와 가장 유사한 문서★\n")
print(find_most_similar_doc(500, bow, doc['text']))

★CounterVectorizer 을 사용하였을때, doc[text]의 500번째 문서와 가장 유사한 문서★

germany nears 1990 jobless level german unemployment rose 11th consecutive month december making average jobless total highest reunification seasonally adjusted jobless total rose higher expected 17 4 483 million bundesbank allowing changes calculating statistics average people work highest 1990 rate bad weather sluggish economy blamed rise increase primarily onstart winter labour office chief frank juergen weise unadjusted figures showed unemployment rose 206 900 4 64 million sectors construction laying workers amid bad weather years stagnation german economy came 2004 upturn strong boost labour market weise added news rise came welfare reforms came force expected unemployment swell coming months hartz iv changes previous tier system benefits support long term unemployed replaced flat rate payout turn means people classified looking work driving official figures higher prepared nasty figure january million unemployed non se

## 2) TfidfVectorizer 사용 (TF-IDF)

In [16]:
# TF-IDF weighted vectorizer, configured like the count vectorizer above.
tfidfvect = TfidfVectorizer(min_df=1, stop_words = 'english')# stop_words='english' drops English stop words

In [17]:
# Fit the TF-IDF vectorizer on the noun-only documents and transform them;
# row i of `tfidf_m` corresponds to total_docs[i].
tfidf_m = tfidfvect.fit_transform(total_docs)

In [18]:
# The banner is double-quoted so that doc['text'] prints with its quotes:
# inside a single-quoted literal, ''text'' ends and restarts the string
# (implicit concatenation) and the quotes vanish from the output.
# NOTE(review): 500 is a zero-based row position, i.e. the 501st row - confirm
# this is what "500th document" is meant to be.
print("★TfidfVectorizer 을 사용하였을때, doc['text']의 500번째 문서와 가장 유사한 문서★\n")
print(find_most_similar_doc(500, tfidf_m, doc['text']))

★TfidfVectorizer 을 사용하였을때, doc[text]의 500번째 문서와 가장 유사한 문서★

brazil jobless rate hits low brazil unemployment rate fell lowest level years december according government brazilian institute geography statistics ibge fell 9 6 december 10 6 november 10 9 december 2003 ibge average monthly salaries grew 1 9 december 2004 december 2003 average monthly wages fell 1 december 895 4 reais 332 179 3 november tuesday represent unemployment rate fallen single digit measurement rules introduced 2001 unemployment rate falling gradually april 2004 reached peak 13 1 jobless rate average 2004 11 5 3 2003 ibge improvement attributed country economic growth economy registering growth 5 2004 government economy grow 4 president luiz inacio lula da silva promised reduce unemployment elected years ago analysts unemployment increase data favourable jobs temporary christmas holiday season slightly higher joblessness february julio hegedus economist lopes filho associates consultancy rio de janeir reuters agency