In [6]:
import multiprocessing
from gensim.models.doc2vec import TaggedDocument
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from collections import Counter

cores = multiprocessing.cpu_count()

In [3]:
from load_data import load_data
data1_path = "../data/actor_words.csv"
data2_path = "../data/actors_70s.csv"
df_1000p = load_data(data1_path, data2_path)
df_1000p.head()

Unnamed: 0,actor_id,actor_name,words
0,nm0005211,Danica McKellar,danica mckellarmckellar 2018u. . nation book f...
1,nm0005576,Drea de Matteo,drea de matteod matteo 2005bornandrea donna de...
2,nm0028846,Shawn Andrews,"american footbal guard tackl actor , see shawn..."
3,nm0036571,Monica,"look monica monica wiktionari , free dictionar..."
4,nm0038875,John Asher,thi biographi live person need addit citat ver...


In [7]:
assigned_cluster_datasets = df_1000p.copy()
# Limit the max_features 
vectorizer = TfidfVectorizer(max_features=1000)
X_8 = vectorizer.fit_transform(df_1000p['words'])
features = vectorizer.get_feature_names()

# clusters = default 8
kmeans_n8 = KMeans()
kmeans_n8.fit(X_8)
top_centroids = kmeans_n8.cluster_centers_.argsort()[:,-1:-11:-1]
assigned_cluster = kmeans_n8.transform(X_8).argmin(axis=1)
assigned_cluster_datasets['assigned_cluster'] = assigned_cluster

In [46]:
assigned_cluster_datasets.head()

Unnamed: 0,actor_id,actor_name,words,assigned_cluster
0,nm0005211,Danica McKellar,danica mckellarmckellar 2018u. . nation book f...,7
1,nm0005576,Drea de Matteo,drea de matteod matteo 2005bornandrea donna de...,4
2,nm0028846,Shawn Andrews,"american footbal guard tackl actor , see shawn...",3
3,nm0036571,Monica,"look monica monica wiktionari , free dictionar...",5
4,nm0038875,John Asher,thi biographi live person need addit citat ver...,6


In [9]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(assigned_cluster_datasets, test_size=0.3, random_state=42)

In [16]:
import nltk
from nltk.corpus import stopwords
def tokenize_text(text):
#     tokens = []
#     for sent in nltk.sent_tokenize(text):
#         for word in nltk.word_tokenize(sent):
#             if len(word) < 2:
#                 continue
#             tokens.append(word.lower())
    return text

train_tagged = train.apply(
    lambda r: TaggedDocument(words=tokenize_text(r['words']), tags=[r.assigned_cluster]), axis=1)
test_tagged = test.apply(
    lambda r: TaggedDocument(words=tokenize_text(r['words']), tags=[r.assigned_cluster]), axis=1)

In [17]:
from gensim.models import Doc2Vec
from tqdm import tqdm
model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, hs=0, min_count=2, sample = 0, workers=cores)
model_dbow.build_vocab([x for x in tqdm(train_tagged.values)])

100%|██████████| 700/700 [00:00<00:00, 83764.02it/s]


In [20]:
from sklearn import utils

for epoch in range(30):
    model_dbow.train(utils.shuffle([x for x in tqdm(train_tagged.values)]), total_examples=len(train_tagged.values), epochs=1)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha

100%|██████████| 700/700 [00:00<00:00, 223457.86it/s]
100%|██████████| 700/700 [00:00<00:00, 889700.85it/s]
100%|██████████| 700/700 [00:00<00:00, 884874.26it/s]
100%|██████████| 700/700 [00:00<00:00, 911239.23it/s]
100%|██████████| 700/700 [00:00<00:00, 710898.98it/s]
100%|██████████| 700/700 [00:00<00:00, 781270.04it/s]
100%|██████████| 700/700 [00:00<00:00, 422204.89it/s]
100%|██████████| 700/700 [00:00<00:00, 1005828.30it/s]
100%|██████████| 700/700 [00:00<00:00, 817830.86it/s]
100%|██████████| 700/700 [00:00<00:00, 840782.59it/s]
100%|██████████| 700/700 [00:00<00:00, 833147.79it/s]
100%|██████████| 700/700 [00:00<00:00, 1008592.51it/s]
100%|██████████| 700/700 [00:00<00:00, 865825.07it/s]
100%|██████████| 700/700 [00:00<00:00, 930295.56it/s]
100%|██████████| 700/700 [00:00<00:00, 1075068.77it/s]
100%|██████████| 700/700 [00:00<00:00, 1003422.01it/s]
100%|██████████| 700/700 [00:00<00:00, 938623.02it/s]
100%|██████████| 700/700 [00:00<00:00, 1031265.47it/s]
100%|██████████| 700/70

In [32]:
from sklearn.linear_model import LogisticRegression
def vec_for_learning(model, tagged_docs):
    sents = tagged_docs.values
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words.split(' '), steps=20)) for doc in sents])
    return targets, regressors 

# def vec_for_learning(model, tagged_docs):
#     sents = tagged_docs.values
#     targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20)) for doc in sents])
#     return targets, regressors

In [33]:
y_train, X_train = vec_for_learning(model_dbow, train_tagged)
y_test, X_test = vec_for_learning(model_dbow, test_tagged)

logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

from sklearn.metrics import accuracy_score, f1_score

print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))



Testing accuracy 0.35
Testing F1 score: 0.3383414976955238


In [34]:
model_dmm = Doc2Vec(dm=1, dm_mean=1, vector_size=300, window=10, negative=5, min_count=1, workers=5, alpha=0.065, min_alpha=0.065)
model_dmm.build_vocab([x for x in tqdm(train_tagged.values)])

100%|██████████| 700/700 [00:00<00:00, 92519.47it/s]


In [35]:
for epoch in range(30):
    model_dmm.train(utils.shuffle([x for x in tqdm(train_tagged.values)]), total_examples=len(train_tagged.values), epochs=1)
    model_dmm.alpha -= 0.002
    model_dmm.min_alpha = model_dmm.alpha

100%|██████████| 700/700 [00:00<00:00, 272914.37it/s]
100%|██████████| 700/700 [00:00<00:00, 926771.72it/s]
100%|██████████| 700/700 [00:00<00:00, 238138.76it/s]
100%|██████████| 700/700 [00:00<00:00, 614743.05it/s]
100%|██████████| 700/700 [00:00<00:00, 849295.00it/s]
100%|██████████| 700/700 [00:00<00:00, 540702.17it/s]
100%|██████████| 700/700 [00:00<00:00, 963574.93it/s]
100%|██████████| 700/700 [00:00<00:00, 1069196.21it/s]
100%|██████████| 700/700 [00:00<00:00, 1119333.89it/s]
100%|██████████| 700/700 [00:00<00:00, 878256.90it/s]
100%|██████████| 700/700 [00:00<00:00, 714706.13it/s]
100%|██████████| 700/700 [00:00<00:00, 1072712.02it/s]
100%|██████████| 700/700 [00:00<00:00, 645419.39it/s]
100%|██████████| 700/700 [00:00<00:00, 347169.54it/s]
100%|██████████| 700/700 [00:00<00:00, 847334.14it/s]
100%|██████████| 700/700 [00:00<00:00, 1091048.98it/s]
100%|██████████| 700/700 [00:00<00:00, 1000686.03it/s]
100%|██████████| 700/700 [00:00<00:00, 572992.35it/s]
100%|██████████| 700/70

In [36]:
y_train, X_train = vec_for_learning(model_dmm, train_tagged)
y_test, X_test = vec_for_learning(model_dmm, test_tagged)

logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))



Testing accuracy 0.6233333333333333
Testing F1 score: 0.6090011593217127




In [37]:
model_dbow.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
model_dmm.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

In [41]:
from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
new_model = ConcatenatedDoc2Vec([model_dbow, model_dmm])

In [44]:
def get_vectors(model, tagged_docs):
    sents = tagged_docs.values
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words.split(" "), steps=20)) for doc in sents])
    return targets, regressors

In [45]:
y_train, X_train = get_vectors(new_model, train_tagged)
y_test, X_test = get_vectors(new_model, test_tagged)

logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))



Testing accuracy 0.5733333333333334
Testing F1 score: 0.5711116084140309


