In [56]:
import multiprocessing
import re
from os import path
import pandas as pd
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
import re
from nltk.corpus import stopwords
from nltk.corpus import words
import string
import unidecode

In [57]:
nltk.download('words')

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\anas_\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [105]:
pip freeze > requirements.txt

Note: you may need to restart the kernel to use updated packages.




In [59]:
# IMPORT DATA
data = pd.read_csv( 'data/tweets.csv')

# Drop duplicates
data = data.drop_duplicates(subset=['id'])

data = data.reset_index().drop(columns=['index', 'id', 'Unnamed: 0'])
data.head()

#init
ps = PorterStemmer()

In [85]:
def preprocessText(corpus):
    
    # Lower
    corpus = corpus.lower()

    # Remove pictures
    regex = r'pic\.twitter\.com.*'
    corpus = re.sub(regex, '', corpus, 0, re.MULTILINE)
    preprocessed = list()
    stopset = stopwords.words('english') + list(string.punctuation)
    # remove stop words and punctuations from string.
    # word_tokenize is used to tokenize the input corpus in word tokens.
    corpus = " ".join([i for i in word_tokenize(corpus) if i not in stopset])
    tok_doc = word_tokenize("".join(corpus))    
    stemmed_doc = [ps.stem(word) for word in tok_doc]
    preprocessed.append(" ".join(stemmed_doc))
    
    return preprocessed

def preprocessList(listText):
    preprocessed = list()
    for text in listText:
        preprocList = preprocessText(text)
        preprocessed.append(preprocList)
    return preprocessed

In [86]:
preprocessList(data.text)


[['statement'],
 ['realli america terribl'],
 ['media establish want race badli never drop race never let support maga'],
 ['certainli interest 24 hour'],
 ['debat poll look great thank maga americafirst'],
 ['say clinton campaign ’ anti-cathol bigotri http //bit.ly/2dcbtvkcrook'],
 ['thank maga americafirst'],
 ['cincinnati ohio tomorrow night 7:30pm- join ohiovotesearli votetrumppence16 ticket http //www.donaldjtrump.com/schedule/register/cincinnati-oh2/ …'],
 ['littl pick-up dishonest media incred inform provid wikileak dishonest rig system'],
 ['thank florida- movement never seen never seen let get votetrumppence16 11/8 maga'],
 ['foul mouth sen. john mccain beg support primari gave drop locker room remark'],
 ["disloy r 's far difficult crook hillari come side ’ know win teach"],
 ['except cheat berni nom dem alway proven far loyal republican'],
 ['nice shackl taken fight america way want'],
 ['weak ineffect leader paul ryan bad confer call member went wild disloyalti'],
 ['despit

In [90]:
def main():
    # get tokens
    preprocessed = []
    data.loc[:, 'tokens'] = preprocessList(data.text)
    

    sentences = []
    for ind in data.index:
        tweet_tokens = data['tokens'][ind]
        sentences.append(TaggedDocument(tweet_tokens, [ind]))

   

    #MODEL
    model = Doc2Vec(
        documents=sentences,
        min_count=1,  
        max_vocab_size=None,
        window=50,  # the # of words before and after to be used as context
        size=300,  # is the dimensionality of the feature vector
        workers=multiprocessing.cpu_count(),
        iter=200  # number of iterations (epochs) over the corpus)
    )

    model.save("model")

In [91]:
main()

In [101]:
def get_tweet_close(user_text):
    tokens = preprocessText(user_text)
    vector = model.infer_vector(tokens)

    result = []
    for tweet_id, confidence in model.docvecs.most_similar([vector], topn=20):
        tweet = data.iloc[tweet_id].to_dict()
        result.append({**tweet, 'confidence': confidence, 'retweet': bool(tweet['retweet'])})

    return result


In [102]:
def load_model():
        return Doc2Vec.load("model")

In [104]:
model = load_model()

get_tweet_close("Obama")

[{'date': '28 Feb 2013',
  'link': '/realDonaldTrump/status/307149567198101506',
  'retweet': False,
  'text': '"Being the best requires full-time attention and application.” – Midas Touch',
  'author': 'DonaldTrump',
  'tokens': '`` best requir full-tim attent applic . ” – mida touch',
  'confidence': 0.16125935316085815},
 {'date': '28 Nov 2012',
  'link': '/realDonaldTrump/status/273839299907170304',
  'retweet': False,
  'text': '"You can benefit from others’ wisdom. Not just their mistakes but the good decisions and insight they have to offer.” -- The Way To The Top',
  'author': 'DonaldTrump',
  'tokens': '`` benefit other ’ wisdom mistak good decis insight offer . ” -- way top',
  'confidence': 0.15264302492141724},
 {'date': '9 Feb 2015',
  'link': '/realDonaldTrump/status/564956187826081792',
  'retweet': True,
  'text': '"@auggerdogger:  @realDonaldTrump my son didn\'t know who Meatloaf was so showed him "Paradise by Dashboard lights" http://youtu.be/9NTDHjVKQyo\xa0"',
  'aut