In [1]:
import pandas as pd
from sklearn import preprocessing

class2 = pd.read_csv("../data/nlp/class2.csv")

label_encoder = preprocessing.LabelEncoder()
onehot_encoder = preprocessing.OneHotEncoder()

train_x = label_encoder.fit_transform(class2['class2'])

train_x

array([2, 2, 1, 0, 1, 0])

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

corpus = [
    "This is last chance.",
    "and if you do not have this change.",
    "you will never get any change.",
    "will you do get this one?",
    "please, get this change"
]

vect = CountVectorizer()
vect.fit(corpus)
vect.vocabulary_

{'this': 14,
 'is': 8,
 'last': 9,
 'chance': 2,
 'and': 0,
 'if': 7,
 'you': 16,
 'do': 4,
 'not': 11,
 'have': 6,
 'change': 3,
 'will': 15,
 'never': 10,
 'get': 5,
 'any': 1,
 'one': 12,
 'please': 13}

In [4]:
vect.transform(["you will never get any change."]).toarray()

array([[0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1]], dtype=int64)

In [5]:
vect = CountVectorizer(stop_words=["and", "or", "please", "this"]).fit(corpus)
vect.vocabulary_

{'is': 7,
 'last': 8,
 'chance': 1,
 'if': 6,
 'you': 13,
 'do': 3,
 'not': 10,
 'have': 5,
 'change': 2,
 'will': 12,
 'never': 9,
 'get': 4,
 'any': 0,
 'one': 11}

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

doc = [
    "I like machine learning",
    "I love deep learning",
    "I run everyday"
]

t_vect = TfidfVectorizer(min_df=1)
t_matrix = t_vect.fit_transform(doc)

doc_distance = (t_matrix * t_matrix.T)
print("유사도를 위한", str(doc_distance.get_shape()[0]), "x", str(doc_distance.get_shape()[1]), "행렬을 만들었습니다.")
doc_distance.toarray()

유사도를 위한 3 x 3 행렬을 만들었습니다.


array([[1.      , 0.224325, 0.      ],
       [0.224325, 1.      , 0.      ],
       [0.      , 0.      , 1.      ]])

In [3]:
from nltk.tokenize import sent_tokenize, word_tokenize

from gensim.models import Word2Vec

import warnings

warnings.filterwarnings(action='ignore')

sample = open("../data/nlp/peter.txt", "r", encoding="utf-8")
s = sample.read()

f = s.replace("\n", " ")
data = []

for i in sent_tokenize(f):
    tmp = []
    for j in word_tokenize(i):
        tmp.append(j.lower())
    data.append(tmp)

data

[['once',
  'upon',
  'a',
  'time',
  'in',
  'london',
  ',',
  'the',
  'darlings',
  'went',
  'out',
  'to',
  'a',
  'dinner',
  'party',
  'leaving',
  'their',
  'three',
  'children',
  'wendy',
  ',',
  'jhon',
  ',',
  'and',
  'michael',
  'at',
  'home',
  '.'],
 ['after',
  'wendy',
  'had',
  'tucked',
  'her',
  'younger',
  'brothers',
  'jhon',
  'and',
  'michael',
  'to',
  'bed',
  ',',
  'she',
  'went',
  'to',
  'read',
  'a',
  'book',
  '.'],
 ['she', 'heard', 'a', 'boy', 'sobbing', 'outside', 'her', 'window', '.'],
 ['he', 'was', 'flying', '.'],
 ['there', 'was', 'little', 'fairy', 'fluttering', 'around', 'him', '.'],
 ['wendy', 'opened', 'the', 'window', 'to', 'talk', 'to', 'him', '.'],
 ['“', 'hello', '!'],
 ['who', 'are', 'you', '?'],
 ['why', 'are', 'you', 'crying', '”', ',', 'wendy', 'asked', 'him', '.'],
 ['“', 'my', 'name', 'is', 'peter', 'pan', '.'],
 ['my',
  'shadow',
  'wouldn',
  '’',
  't',
  'stock',
  'to',
  'me.',
  '”',
  ',',
  'he',
  'rep

In [7]:
model1 = Word2Vec(data, min_count=1, vector_size=100, window=5, sg=0)

print("Cosine similarity between 'peter' 'wendy' - CBOW : ", model1.similarity('peter', 'wendy'))

Cosine similarity between 'peter' 'wendy' - CBOW :  0.074393824
Cosine similarity between 'peter' 'wendy' - Skip Gram :  0.40088683


In [8]:
model2 = Word2Vec(data, min_count=1, vector_size=100, window=5, sg=1)
print("Cosine similarity between 'peter' 'wendy' - Skip Gram : ", model2.similarity('peter', 'wendy'))

[('scienc', 0.35739946365356445), ('science', 0.2760446071624756), ('think', 0.217257559299469), ('client', 0.19491459429264069), ('part', 0.19050131738185883), ('util', 0.18476800620555878), ('analsis', 0.1737504005432129), ('intern', 0.15601569414138794), ('domain', 0.15585266053676605), ('group', 0.14942587912082672)]


In [1]:
from gensim.models.fasttext import FastText as ft_gensim

stemmed = ['database', 'science', 'scientist', 'mgmt', 'microsoft', 'hire', 'develop', 'mentor', 'team', 'data',
           'scientist', 'define', 'dataloader', 'scienc', 'priority', 'deep', 'understand', 'learn', 'goal', 'collabor',
           'across', 'triple', 'group', 'set', 'team', 'shortterm', 'longterm', 'goal', 'act', 'strait', 'advisor',
           'leadership', 'influenc', 'future', 'direct', 'strategy', 'define', 'partnership', 'align', 'effect',
           'broad', 'analyt', 'effort', 'analyticsdata', 'team', 'drive', 'part', 'datadog', 'scienc', 'bi', 'common',
           'disciplin', 'microsoftprior', 'experi', 'hire', 'manage', 'runner', 'team', 'data', 'scientist', 'busi',
           'domain', 'experi', 'usage', 'analyt', 'must', 'experi', 'across', 'sever', 'relev', 'busi', 'domain',
           'util', 'critic', 'think', 'skill', 'concept', 'complex', 'busi', 'problem', 'salt', 'use', 'advanc',
           'analsis', 'large', 'scale', 'realworld', 'busi', 'data', 'set', 'candid', 'must', 'abl', 'independ',
           'execut', 'analyt', 'project', 'help', 'intern', 'client', 'understand']


def gen_words(stemmed):
    yield stemmed


model = ft_gensim(window=5, min_count=1, workers=4, sg=1)
model.build_vocab(gen_words(stemmed))

model.train(gen_words(stemmed), total_examples=model.corpus_count, epochs=model.iter)

model.wv.most_similar(positive=['scientist'])

AttributeError: 'FastText' object has no attribute 'iter'