In [20]:
import nltk
import math
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [25]:
class TextSimilarityExample:
    # 생성자 정의
    def __init__(self):
        self.statements = [ #샘플 문장 정의
            'ruled india',
            'Chalukyas ruled Badami',
            'So many kingdoms ruled India',
            'Lalbagh is a bontanical garden in India'
        ]
    #모든 단어의 TF 정의
    def TF(self,sentence):
        words = nltk.word_tokenize(sentence.lower())
        freq = nltk.FreqDist(words)
        dictionary = {}
        for key in freq.keys():
            norm = freq[key]/float(len(words))
            dictionary[key] = norm
        return dictionary
    #문서내 모든 단어에대한  IDF 값을 찾는 IDF 정의
    def IDF(self):
        def idf(TotalNumberOfDocuments,NumberOfDocumentsWithThisWord):
            return 1.0 + math.log(TotalNumberOfDocuments/NumberOfDocumentsWithThisWord)
        numDocuments = len(self.statements)
        uniqueWords = {}
        idfValues = {}
        
        for sentence in self.statements:
            for word in nltk.word_tokenize(sentence.lower()):
                if word not in uniqueWords:
                    uniqueWords[word] =1
                else:
                    uniqueWords[word] +=1
        for word in uniqueWords:
            idfValues[word] = idf(numDocuments,uniqueWords[word])
        return idfValues
    
    #주어진 검색 문자열에 대해 모든 문서에 대한 TF_IDF를 정의
    def TF_IDF(self,query):
        words = nltk.word_tokenize(query.lower())
        idf = self.IDF()
        vectors = {}
        for sentence in self.statements:
            tf = self.TF(sentence)
            for word in words:
                tfv = tf[word] if word in tf else 0.0
                idfv = idf[word] if word in idf else 0.0
                mul = tfv * idfv
                if word not in vectors:
                    vectors[word] =[]
                vectors[word].append(mul)
            return vectors
    #벡터의 내용을 화면에 표시
    def displayVectors(self,vectors):
        print(self.statements)
        for word in vectors:
            print("{} -> {}".format(word,vectors[word]))
    
    #scikit를 사용 -> 코사인 유사도 찾기
    def cosineSimilarity(self):
        vec = TfidfVectorizer()
        matrix = vec.fit_transform(self.statements) #TF-IDF값의 행렬 작성.
        for j in range(1,5):
            i = j -1
            print("\tsimilarity of document {} with others".format(i))
            similarity = cosine_similarity(matrix[i:j],matrix)
            print(similarity)
      
    def demo(self):
        inputQuery = self.statements[0]
        vectors = self.TF_IDF(inputQuery) #TF_IDF사용 => 벡터 생성
        self.displayVectors(vectors) # 화면 출력
        self.cosineSimilarity() #코사인 유사도 출력,
            

In [26]:
similarity = TextSimilarityExample()
similarity.demo()

['ruled india', 'Chalukyas ruled Badami', 'So many kingdoms ruled India', 'Lalbagh is a bontanical garden in India']
ruled -> [0.6438410362258904]
india -> [0.6438410362258904]
	similarity of document 0 with others
[[1.         0.29088811 0.46216171 0.19409143]]
	similarity of document 1 with others
[[0.29088811 1.         0.13443735 0.        ]]
	similarity of document 2 with others
[[0.46216171 0.13443735 1.         0.08970163]]
	similarity of document 3 with others
[[0.19409143 0.         0.08970163 1.        ]]
