In [12]:
import pandas as pd
import numpy as np
import pickle
import nltk
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [13]:
df = pd.read_csv('../Data/Interim/hydrated_Tweet200316.csv', index_col=0)

In [14]:
# Clean tweets
df['CleanTweet'] = [re.sub("[^a-zA-Z0-9\s,.-_´&%'\":€$£!?']", '',  re.sub(' http\S+', '', re.sub('\s',' ', tw))).replace(u'\xa0', u' ').lower() if isinstance(tw, str) else '' for tw in df.tweet]

# Get bigrams
bigrams_tweets = [list(nltk.bigrams(tw)) for tw in df['CleanTweet']]

In [157]:
with open('../Data/Processed/Bigram_tweets.pickle', 'wb') as handle:
    pickle.dump(bigrams_tweets, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [110]:
unique_chars = sorted(''.join(set(''.join(df['CleanTweet'])))) # Assume all characters are in tweets from df1

unique_bigrams = [x+y for x in unique_chars for y in unique_chars]

bigram_mapper = dict(zip(unique_bigrams, range(len(unique_bigrams))))

In [117]:
X = vectorizer.fit_transform(df['CleanTweet'])

In [118]:
X.shape

(487972, 3481)

In [121]:
lda = LatentDirichletAllocation(n_components=5, verbose = 2, n_jobs = 3)
lda.fit(X[:10000])

[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    6.7s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
iteration: 1 of max_iter: 10
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    4.5s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
iteration: 2 of max_iter: 10
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    4.8s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
iteration: 3 of max_iter: 10
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    4.8s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
iteration: 4 of max_iter: 10
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    4.4s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
iteration: 5 of max_iter: 10
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:   

LatentDirichletAllocation(n_components=5, n_jobs=3, verbose=2)

In [136]:
topics = lda.transform(X[:10000]).argmax(axis = 1)

[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    4.2s finished


In [141]:
y = np.bincount(topics)
ii = np.nonzero(y)[0]
dict(zip(ii,y[ii]))

{0: 424, 1: 891, 2: 4264, 3: 2179, 4: 2242}

In [142]:
lda.transform(X[:10000])

[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    3.5s finished


array([[0.16133972, 0.13859811, 0.69638492, 0.00183662, 0.00184063],
       [0.58946017, 0.01020377, 0.01020963, 0.01012878, 0.37999766],
       [0.00727604, 0.0072855 , 0.97082498, 0.00733381, 0.00727967],
       ...,
       [0.04489188, 0.00181722, 0.34178056, 0.39818723, 0.21332312],
       [0.12547465, 0.37938139, 0.12883567, 0.00448361, 0.36182468],
       [0.00400016, 0.11669848, 0.00396122, 0.18567844, 0.6896617 ]])

In [144]:
lda.score(X[:10000])

[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    3.4s finished


-4636467.336107308

In [146]:
vectorizerTfidf = TfidfVectorizer(vocabulary = bigram_mapper, ngram_range = (2,2), analyzer = "char")

In [147]:
X = vectorizerTfidf.fit_transform(df['CleanTweet'])

In [148]:
lda = LatentDirichletAllocation(n_components=5, verbose = 2, n_jobs = 3)
lda.fit(X[:10000])

[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    4.1s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
iteration: 1 of max_iter: 10
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    1.8s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
iteration: 2 of max_iter: 10
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    1.7s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
iteration: 3 of max_iter: 10
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    1.6s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
iteration: 4 of max_iter: 10
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    1.7s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
iteration: 5 of max_iter: 10
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:   

LatentDirichletAllocation(n_components=5, n_jobs=3, verbose=2)

In [149]:
lda.score(X[:10000])

[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    0.7s finished


-455997.85389304534

In [3]:
import fasttext

In [27]:
!pip install gensim

Collecting gensim
  Downloading gensim-3.8.3-cp37-cp37m-macosx_10_9_x86_64.whl (24.2 MB)
[K     |████████████████████████████████| 24.2 MB 12.6 MB/s 
[?25hCollecting smart-open>=1.8.1
  Downloading smart_open-3.0.0.tar.gz (113 kB)
[K     |████████████████████████████████| 113 kB 10.6 MB/s 
Building wheels for collected packages: smart-open
  Building wheel for smart-open (setup.py) ... [?25ldone
[?25h  Created wheel for smart-open: filename=smart_open-3.0.0-py3-none-any.whl size=107095 sha256=0fc7f64dbb397e51a7e1259e5732567068a88d978f385569b46af4a207ef1a78
  Stored in directory: /Users/toke/Library/Caches/pip/wheels/83/a6/12/bf3c1a667bde4251be5b7a3368b2d604c9af2105b5c1cb1870
Successfully built smart-open
Installing collected packages: smart-open, gensim
Successfully installed gensim-3.8.3 smart-open-3.0.0


In [5]:
import pickle
with open('../Data/Processed/200316.pkl', 'rb') as handle:
    b = pickle.load(handle)

In [24]:
fasttext.train_unsupervised(" ".join(df['CleanTweet'][:10]),  model='cbow', minCount = 1)

ValueError: largescale testing for covid19 and wide availability of testing on demand would seem a nobrainer in early detect stories on covid_19  smart. more of this, please. not going to wear a bra for a month.   socialisolation coronavirus a covid19 coronavirus update from concerned physicians via @kevinmd start thinking about holy week and easter... white house urges americans not to hoard as coronavirus death toll hits 62 do any of you get smithsonian channel? search tonight 'america's hidden storiespandemic 1918' rod i love you bro but i promise i thought this was you   top brazilian fintwit influencers march 413, the coronavirus selloff: tier 8 cont. @brolo_rodrigo cannot be opened for training!

In [25]:
text = " ".join(df['CleanTweet'])
f = open('test', 'w')
f.write(text.encode('utf8'))
f.close()

TypeError: write() argument must be str, not bytes

In [2]:
import gensim.downloader as api

In [3]:
info = api.info()

In [4]:
model = api.load("glove-twitter-200")

In [5]:
model.most_similar("trump")

[('donald', 0.6552975177764893),
 ('clinton', 0.5108779668807983),
 ('biden', 0.5093722939491272),
 ('romney', 0.5069374442100525),
 ('warren', 0.4898340106010437),
 ('birther', 0.4887540936470032),
 ('judd', 0.47777795791625977),
 ('bloomberg', 0.4759959876537323),
 ('ivanka', 0.4735480546951294),
 ('reid', 0.4727659821510315)]

In [7]:
model2 = api.load("word2vec-google-news-300")



In [8]:
model2.most_similar("trump")

[('trumps', 0.7198434472084045),
 ('trumping', 0.5805853009223938),
 ('supersede', 0.5600423216819763),
 ('trumped', 0.5497317910194397),
 ('supercede', 0.5309919118881226),
 ('prevail', 0.487763375043869),
 ('outweigh', 0.4785327911376953),
 ('trample', 0.4714253544807434),
 ('overshadow', 0.47011539340019226),
 ('dictate', 0.46754559874534607)]

In [26]:
print(model.similarity('white','criminal'))
print(model.similarity('black','criminal'))
print(model.similarity('nigger','criminal'))

0.29550153
0.29352903
0.13284826


In [25]:
print(model2.similarity('white','criminal'))
print(model2.similarity('black','criminal'))
print(model2.similarity('nigger','criminal'))

0.04107806
0.08380792
0.15167528


In [30]:
model.most_similar("electiom")

KeyError: "word 'electiom' not in vocabulary"

In [42]:
model['corona']

array([-1.1476  ,  0.24372 , -0.44205 , -0.36565 , -0.012016, -0.31824 ,
       -0.9102  ,  0.55994 , -0.20795 , -0.41888 , -0.48952 , -0.28316 ,
       -0.5951  , -0.18151 ,  0.61523 , -0.28698 , -0.19773 , -0.023651,
        0.0575  ,  0.5552  ,  0.051025, -0.11355 ,  0.42244 ,  0.2664  ,
       -0.23722 , -1.5907  ,  0.23997 , -0.43092 ,  0.031012, -0.31035 ,
       -0.20378 , -0.14473 , -0.080509, -0.088075,  0.11065 ,  0.48304 ,
        0.42847 ,  0.48379 , -0.11828 ,  0.17017 ,  1.0133  ,  0.36426 ,
       -0.36842 , -0.24873 ,  0.12573 , -0.10963 ,  0.23704 ,  0.1857  ,
       -0.21525 ,  0.21762 ,  0.0513  , -0.33661 ,  0.40283 , -0.19627 ,
       -0.10017 , -0.057906,  0.12301 , -0.16951 , -0.10937 , -0.48194 ,
        0.026439, -0.53014 , -0.56541 , -0.54095 ,  0.5213  , -0.38697 ,
       -0.017976, -0.66467 ,  0.20409 , -0.72905 ,  0.26052 , -0.43418 ,
       -0.21438 ,  0.41414 ,  0.34679 ,  0.28401 , -1.0981  ,  0.042206,
        0.13505 , -0.13117 ,  0.43902 , -0.21741 , 