# Imports

In [3]:
import re
import pandas as pd
import numpy as np
import logging
from string import punctuation, digits, ascii_lowercase
from nltk.corpus import stopwords
from gensim.models import Word2Vec

In [4]:
# Optional: uncomment to stream INFO-level progress messages (such as gensim's
# vocabulary/training logs) into the cell outputs below.
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Data

In [5]:
# Read the bio and caption tables from CSV files in the working directory.
bios_df, caps_df = (pd.read_csv(csv_name) for csv_name in ('bio.csv', 'caption.csv'))

In [6]:
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# .values yields the same 2-D NumPy array on both old and new pandas.
bios = bios_df.values

In [7]:
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# .values yields the same 2-D NumPy array on both old and new pandas.
caps = caps_df.values

# Cleaning Text

In [8]:
# Control characters chr(1)..chr(31) (tab, newline, carriage return, ...).
escapes = ''.join(map(chr, range(1, 32)))
# Characters that parse_text deletes outright: control characters and digits.
removeables = escapes + digits
# Instagram mention / hashtag markers.
ig_adds = ['@', '#']
# Extra stop words specific to this corpus.
ig_stops = ['com']
# Full stop list: NLTK English stop words, every single lowercase letter,
# and the corpus-specific additions above.
stops = list(map(str, stopwords.words('english')))
stops.extend(ascii_lowercase)
stops.extend(ig_stops)

In [21]:
def parse_text(text):
    ''' Clean a raw text string (bio or caption) for Word2Vec training.

        Steps, in order:
          1. every http/https link is replaced by a space;
          2. punctuation (which includes '@' and '#') and control characters
             are replaced by a space, so "foo,bar" becomes two words;
          3. digits are deleted outright, so "b2b" becomes "bb";
          4. each remaining word is lowercased and dropped if it is in the
             module-level `stops` list.

        No stemming is performed.

        Parameters: text -- the string to clean (must be a str).
        Returns: the surviving lowercase words joined by single spaces;
                 an empty string if nothing survives.'''

    text = re.sub(r"http\S+", " ", text)
    text = re.sub('[%s]' % re.escape(punctuation + escapes), ' ', text)
    text = text.translate(str.maketrans('', '', removeables))

    # Build the lookup set once per call rather than once per word.
    stop_words = set(stops)
    lowered = (word.lower() for word in text.split())
    return ' '.join(word for word in lowered if word not in stop_words)

In [22]:
# Tokenise every bio and caption. This cell must run: the model-training
# cells below read bios_split and caps_split, so leaving it commented out
# breaks a fresh Restart & Run All.
# Assumes the text is in the first column of each CSV -- TODO confirm.
# .split() (not .split(' ')) so that a text cleaned down to '' yields an empty
# token list instead of [''], which would add an empty-string "word".
bios_split = [parse_text(bio[0]).split() for bio in bios]
caps_split = [parse_text(cap[0]).split() for cap in caps]

# Model Training

In [24]:
# Single training corpus: all bio token lists followed by all caption token lists.
sentences = [*bios_split, *caps_split]

In [25]:
# Untrained model: 100-dimensional vectors, 5 passes over the corpus,
# 4 worker threads; words seen fewer than 20 times are left out of the vocabulary.
# NOTE(review): these keyword names match the pre-4.0 gensim this notebook was
# run with; gensim 4.0 renamed `size` -> `vector_size` and `iter` -> `epochs`.
model = Word2Vec(
    size=100,
    min_count=20,
    iter=5,
    workers=4,
)

In [26]:
# First pass over the corpus: count words and build the vocabulary.
# Must run before model.train().
model.build_vocab(sentences)

2017-03-05 16:27:06,044 : INFO : collecting all words and their counts
2017-03-05 16:27:06,046 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-03-05 16:27:06,094 : INFO : PROGRESS: at sentence #10000, processed 82692 words, keeping 29053 word types
2017-03-05 16:27:06,147 : INFO : PROGRESS: at sentence #20000, processed 164583 words, keeping 48363 word types
2017-03-05 16:27:06,195 : INFO : PROGRESS: at sentence #30000, processed 245994 words, keeping 64154 word types
2017-03-05 16:27:06,243 : INFO : PROGRESS: at sentence #40000, processed 328138 words, keeping 81052 word types
2017-03-05 16:27:06,312 : INFO : PROGRESS: at sentence #50000, processed 419591 words, keeping 97537 word types
2017-03-05 16:27:06,357 : INFO : PROGRESS: at sentence #60000, processed 518413 words, keeping 112024 word types
2017-03-05 16:27:06,399 : INFO : PROGRESS: at sentence #70000, processed 610840 words, keeping 124793 word types
2017-03-05 16:27:06,462 : INFO : PROGRESS: at

In [27]:
# Train the word vectors on the same corpus the vocabulary was built from.
# NOTE(review): gensim >= 2.0 requires explicit `total_examples` and `epochs`
# arguments here; this bare call only works on the older gensim used for this run.
model.train(sentences)

2017-03-05 16:27:21,487 : INFO : training model with 4 workers on 51279 vocabulary and 100 features, using sg=0 hs=0 sample=0.001 negative=5 window=5
2017-03-05 16:27:21,488 : INFO : expecting 1086736 sentences, matching count from corpus used for vocabulary survey
2017-03-05 16:27:22,511 : INFO : PROGRESS: at 2.39% examples, 1055982 words/s, in_qsize 7, out_qsize 0
2017-03-05 16:27:23,518 : INFO : PROGRESS: at 4.38% examples, 1023008 words/s, in_qsize 6, out_qsize 1
2017-03-05 16:27:24,532 : INFO : PROGRESS: at 6.49% examples, 1041377 words/s, in_qsize 7, out_qsize 0
2017-03-05 16:27:25,534 : INFO : PROGRESS: at 8.86% examples, 1064728 words/s, in_qsize 7, out_qsize 0
2017-03-05 16:27:26,538 : INFO : PROGRESS: at 11.05% examples, 1089224 words/s, in_qsize 7, out_qsize 0
2017-03-05 16:27:27,540 : INFO : PROGRESS: at 13.44% examples, 1094842 words/s, in_qsize 7, out_qsize 0
2017-03-05 16:27:28,546 : INFO : PROGRESS: at 15.78% examples, 1101409 words/s, in_qsize 7, out_qsize 0
2017-03-05

49612243

In [28]:
# Sanity check: nearest neighbours of a probe word in the learned vector space.
# NOTE(review): newer gensim exposes this as model.wv.most_similar -- confirm
# against the installed version before re-running.
model.most_similar(positive=['sexy'])

2017-03-05 16:28:30,618 : INFO : precomputing L2-norms of word weight vectors


[('curves', 0.6837259531021118),
 ('diva', 0.674258828163147),
 ('underwear', 0.6699062585830688),
 ('hunk', 0.6642636060714722),
 ('fierce', 0.6609104871749878),
 ('curvy', 0.650741696357727),
 ('sassy', 0.6392788290977478),
 ('girly', 0.6227083802223206),
 ('brunette', 0.6218582391738892),
 ('skinny', 0.615222692489624)]

In [29]:
# Persist the trained model to the working directory for later reuse.
model.save('instagram_language.model')

2017-03-05 16:29:06,401 : INFO : saving Word2Vec object under instagram_language.model, separately None
2017-03-05 16:29:06,403 : INFO : not storing attribute syn0norm
2017-03-05 16:29:06,404 : INFO : not storing attribute cum_table
2017-03-05 16:29:07,288 : INFO : saved instagram_language.model
