In [4]:
import numpy as np
import pandas as pd

import pickle

from collections import defaultdict

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.corpus import stopwords
from textblob import TextBlob

import gensim
from gensim import corpora, models, similarities, matutils


import time
from time import sleep, time
from timeit import timeit

import re
from re import findall
import string

import matplotlib
from matplotlib import pyplot as plt
from matplotlib.offsetbox import OffsetImage, AnnotationBbox

import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.manifold import MDS
import sklearn.datasets as dt
from sklearn.metrics.pairwise import manhattan_distances, euclidean_distances
from sklearn.decomposition import PCA



import random


# Show complete frames when displaying them: no row or column truncation.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

# NLTK's English stop words plus two tweet leftovers: 'u' and 'nt'
# ("n't" loses its apostrophe when remove_noise strips punctuation).
stop_words = stopwords.words('english') + ['u', 'nt']


# Adapted from https://www.digitalocean.com/community/tutorials/
# how-to-perform-sentiment-analysis-in-python-3-using-the-natural-language-toolkit-nltk

# Hand-written token normalisations applied by special_cases():
# each key is replaced by its value so related word forms share one token.
cases_dict = {
    # 'u': 'you',  (disabled)
    'ca': 'california',
    'bloody': 'blood',
    'burning': 'burn',
    'derailment': 'derailment',
    'explosion': 'explode',
    'evacuation': 'evacuate',
    'die': 'death',
    'dead': 'death',
    'collision': 'collide',
    'bomber': 'bomb',
    'destruction': 'destroy',
    'fatality': 'fatal',
    'gon': 'gone',
    'terrorist': 'terrorism',
    'thunder': 'thunderstorm',
    'wreck': 'wreckage',
    'wildfire': 'fire',
    'cop': 'police',
}




def special_cases(tokens):
    """Apply the hand-written replacements in ``cases_dict``.

    Args:
        tokens: either a list of token strings or a single token string.

    Returns:
        For a list: the same list object, with every element that is a key
        of ``cases_dict`` replaced in place by its mapped value.
        For a string: the mapped value if the whole string is a key of
        ``cases_dict``, otherwise the string unchanged.
    """
    # remove_noise() passes one token at a time. Indexing into a str would
    # compare single characters against the dictionary and never match a
    # whole word, so handle the single-token case explicitly.
    if isinstance(tokens, str):
        return cases_dict.get(tokens, tokens)

    for position, token in enumerate(tokens):
        if token in cases_dict:
            tokens[position] = cases_dict[token]
    return tokens

def unicodify(to_uni):
    """Spell out every character of ``to_uni`` as a literal ``\\uXXXX`` escape.

    Each character becomes a backslash, the letter ``u`` and its code point
    in upper-case hexadecimal, zero-padded to at least four digits.
    """
    escaped = []
    for character in to_uni:
        escaped.append(r'\u{:04X}'.format(ord(character)))
    return ''.join(escaped)



def listify(to_listify, uni=False):
    """Extract every single-quoted item from a string such as ``"['a', 'b']"``.

    Args:
        to_listify: text containing items wrapped in single quotes.
        uni: when true, each extracted item is passed through ``unicodify``.

    Returns:
        list of the quoted items (without the quotes), in order of appearance.
    """
    quoted_items = findall("'(.+?)'", to_listify)
    if not uni:
        return quoted_items
    return [unicodify(item) for item in quoted_items]



def lemmatize_sentence(tokens):
    """Lemmatize a token list using each token's part-of-speech tag.

    Tags starting with 'NN' are lemmatized as nouns ('n'), tags starting
    with 'VB' as verbs ('v'), and everything else as adjectives ('a').

    Returns:
        list of lemmatized tokens, in the same order as the input.
    """
    lemmatizer = WordNetLemmatizer()

    def _wordnet_pos(tag):
        # Collapse the tag returned by pos_tag to the three classes used here.
        if tag.startswith('NN'):
            return 'n'
        if tag.startswith('VB'):
            return 'v'
        return 'a'

    return [lemmatizer.lemmatize(word, _wordnet_pos(tag))
            for word, tag in pos_tag(tokens)]



def remove_noise(tweet_tokens, stop_words = stop_words):
    """Clean, lemmatize and filter the tokens of one tweet.

    For every token: URLs, ``t.co`` short links, @-mentions, apostrophes,
    commas, full stops and digits are removed; the token is passed through
    ``special_cases``; and it is lemmatized according to its POS tag
    (noun, verb, or adjective as fallback; 'JJ' tokens are left as they are).
    Tokens that end up empty, that are punctuation, or whose lower-case
    form is a stop word are discarded.

    Args:
        tweet_tokens: list of token strings.
        stop_words: collection of lower-case words to drop. Defaults to the
            module-level ``stop_words`` list.

    Returns:
        list of the surviving tokens, lower-cased, in their original order.

    NOTE(review): the original code also tested ``tag.startswith('NNP')``
    (unreachable, because 'NNP' already matches the 'NN' branch) and
    ``pos != 'adj'`` (always true, because ``pos`` was never 'adj'). This
    looks like an unfinished attempt to drop adjectives/proper nouns. Those
    tokens have always been kept, and that behaviour is preserved here.
    """
    # Raw strings keep the regex escapes (e.g. ``\(``) from being treated as
    # invalid string escapes. Patterns are compiled once instead of per token.
    url_pattern = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    # The dot is escaped so only the literal "t.co" domain matches. Unescaped,
    # it also swallowed ordinary words such as "tacoma" or "tycoon".
    short_link_pattern = re.compile(r't\.co[/a-z]+')
    mention_pattern = re.compile(r"(@[A-Za-z0-9_]+)")
    punct_digit_pattern = re.compile(r"[',.0-9]")

    # One lemmatizer for the whole call rather than one per token.
    lemmatizer = WordNetLemmatizer()
    cleaned_tokens = []

    for token, tag in pos_tag(tweet_tokens):
        token = url_pattern.sub('', token)
        token = short_link_pattern.sub('', token)
        token = mention_pattern.sub('', token)
        token = punct_digit_pattern.sub('', token)
        token = special_cases(token)

        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        elif tag.startswith('JJ'):
            pos = None  # tokens tagged as adjectives are kept un-lemmatized
        else:
            pos = 'a'

        if pos is not None:
            token = lemmatizer.lemmatize(token, pos)

        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens

def get_all_words(cleaned_tokens_list):
    """Lazily yield every token from every token list, in order."""
    for token_group in cleaned_tokens_list:
        yield from token_group

In [2]:
# Set to True to rebuild the cleaned/tokenized dataset from train.csv.
RECREATE_DATASET=False

if RECREATE_DATASET:
    train=pd.read_csv('train.csv')

    # Clean/tokenize text.
    # A token is discarded when it contains any of these substrings
    # (link fragments, HTML entities, mis-encoded characters, profanity).
    hyperlink_blocker=['http','//','amp','yr','gt','mh','na','th','nt','\x89ûª','\x89ûªs','\x89ûò','\x89û','fuck','shit']

    # Extend the list with one entry per line of blacklist.txt. Blank lines
    # are skipped: the empty string is a substring of every token and would
    # block the whole vocabulary.
    with open('blacklist.txt') as blacklist_file:
        for line in blacklist_file:
            entry=line.replace('\n','')
            if entry:
                hyperlink_blocker.append(entry)

    tweet_list=[]
    tweet_bigrams=[]

    for raw_text in train.text.tolist():
        t=word_tokenize(raw_text)
        t=lemmatize_sentence(t)
        t=remove_noise(t,stop_words)
        t=special_cases(t)

        # Drop tokens shorter than 3 characters or containing a blocked
        # substring. Filtering in one pass replaces the placeholder-and-remove
        # loop, which could count one token several times and then fail in
        # list.remove().
        t=[tok for tok in t
           if len(tok)>=3 and not any(item in tok for item in hyperlink_blocker)]

        tweet_list.append(t)
        # Bigrams of adjacent surviving tokens, joined with '_' (trigrams are disabled).
        tweet_bigrams.append([first+'_'+second for first, second in zip(t, t[1:])])

    train['tokens']=tweet_list
    train['tokens2']=tweet_bigrams
    train['cleaned']=[" ".join(tokens) for tokens in tweet_list]

    train=train.fillna('N/A')

    # Keep only tweets with more than 4 tokens.
    train['delet']=[len(tokens)<=4 for tokens in train['tokens']]
    train=train[train['delet']!=True].reset_index(drop=True)

    # TextBlob sentiment is (polarity, subjectivity); compute it once per tweet.
    sentiments=[TextBlob(text).sentiment for text in train.text]
    train['pol']=[sentiment[0] for sentiment in sentiments]
    train['sub']=[sentiment[1] for sentiment in sentiments]

    # Unigrams followed by bigrams for each tweet.
    train['tokens_all']=[unigrams+bigrams
                         for unigrams, bigrams in zip(train['tokens'], train['tokens2'])]

    # Round trip through CSV: the list-valued columns come back as their
    # string representation, which the later cells work with (string
    # concatenation of tokens/tokens2 and regex parsing via listify).
    train.to_csv('flattened_train.csv')
    train=pd.read_csv('flattened_train.csv').drop(['Unnamed: 0'],axis=1)

    train.drop_duplicates(subset='tokens',inplace=True)

# Checkpoint after cleaning/tokenizing

In [None]:
# Flip to True to write the current `train` frame to train_clean.csv,
# the checkpoint file that the next cell reads back.
save_train=False
if save_train:
    train.to_csv('train_clean.csv')

In [5]:
# Load the cleaned checkpoint, drop the index column that to_csv wrote
# ('Unnamed: 0') and replace missing values with the string 'N/A'.
train = (
    pd.read_csv('train_clean.csv')
    .drop(columns=['Unnamed: 0'])
    .fillna('N/A')
)

In [None]:
# Preview the rows labelled target == 1; the previous row labels are kept in an 'index' column.
train.loc[train['target'] == 1].reset_index()

# Run K-Means Clustering

In [None]:
from sklearn.cluster import KMeans

In [None]:
# K-Means experiment, disabled by default. With RERUN_MODEL=False nothing below runs.
# NOTE(review): if enabled on a fresh kernel this cell cannot run top-to-bottom:
# `xtfa1` and `train1` are only assigned in the LDA cell further down, and
# `xtfa1_md` appears only in commented-out MDS code.
RERUN_MODEL=False

if RERUN_MODEL:

    # X Clusters
    # KMeans() is constructed with library defaults (no explicit cluster count or seed).
    KM=KMeans()
    KM.fit(xtfa1)

    cluster_nums=KM.predict(xtfa1)

    plt.figure(figsize=[16,16])
    # The boolean mask `cluster_nums==0` is passed as alpha, so only cluster 0
    # gets a non-zero alpha value.
    plt.scatter(xtfa1_md[:,0],xtfa1_md[:,1],alpha=cluster_nums==0,c=cluster_nums)

    train1['cluster']=cluster_nums
    # Bare expression inside the `if`: the filtered frame is built but neither displayed nor stored.
    train1[train1['cluster']==2]

    # NOTE(review): no visible cell creates a 'cluster_20' column, so this drop
    # presumably raises KeyError unless the column was left over from an earlier session.
    train1.drop(['cluster_20'],axis=1,inplace=True)

    # Reload the cleaned checkpoint, replacing whatever `train` held before.
    train=pd.read_csv('train_clean.csv').drop(['Unnamed: 0'],axis=1)
    train.fillna('N/A', inplace=True)

    # NOTE(review): `i` and `xtfa` are not defined in this cell; looks like leftover debugging.
    xtfa[i].any()

### I compared K-Means to LDA and LDA seems to be the better option here.

# LDA

In [6]:
RERUN_MODEL=True

if RERUN_MODEL:
    # Document-frequency bounds (as proportions of documents) used by both vectorizers.
    MAXDF=0.075
    MINDF=0.005

    # Drop tweets whose term vector is completely empty, then refit, until
    # every remaining tweet has at least one in-vocabulary term. MAXDF/MINDF
    # are proportions, so removing rows can change the vocabulary and a
    # single pass is not necessarily enough.
    # (The previous `while len(deleter)!=last` test compared a value with
    # itself after the first pass, so the loop always ran exactly once.)
    while True:
        # `tokens` and `tokens2` hold string representations of the unigram
        # and bigram lists (they are read from CSV), so `+` concatenates text.
        tokens_all=train.tokens+train.tokens2

        tf=TfidfVectorizer(stop_words=stop_words, max_df=MAXDF, min_df=MINDF,use_idf=False)
        xtf=tf.fit_transform(tokens_all)
        xtfa=xtf.toarray()

        # True for rows without a single non-zero term weight.
        empty_rows=~xtfa.any(axis=1)
        train['delet']=empty_rows
        if not empty_rows.any():
            break
        train=train[~train['delet']].reset_index(drop=True)

    # Disaster tweets only; reset_index() keeps each row's position in `train`
    # as an 'index' column.
    train1=train[train['target']==1].reset_index()

    tokens_1=train1.tokens+train1.tokens2

    tf1=TfidfVectorizer(stop_words=stop_words, max_df=MAXDF, min_df=MINDF,use_idf=False)
    xtf1=tf1.fit_transform(tokens_1)

    xtfa1=xtf1.toarray()
    # get_feature_names() was removed in scikit-learn 1.2. Prefer
    # get_feature_names_out() when it exists and keep a plain list, because
    # later cells use `in` and `.index()` on xtffn1.
    _feature_names=getattr(tf1, 'get_feature_names_out', None) or tf1.get_feature_names
    xtffn1=list(_feature_names())


In [7]:
# Number of vocabulary terms kept by the disaster-tweet vectorizer (tf1).
len(xtffn1)

307

In [None]:

if RERUN_MODEL:
    cluster_topics=16
    # Fixed seed so topic numbers are reproducible between runs; the
    # topic-number -> name table in cluster_tweet() depends on them.
    LDA_SEED=0

    # gensim expects a terms x documents sparse matrix.
    # The model is trained on the disaster tweets (xtf1, produced by tf1 from
    # tokens_1). Previously tf1.transform(tokens_all) was used, which fed in
    # every tweet via a variable left over from the filtering cell, although
    # the vocabulary and the clustering below are disaster-only.
    doc_word = xtf1.transpose()

    corpus = matutils.Sparse2Corpus(doc_word)
    id2word = dict((v, k) for k, v in tf1.vocabulary_.items())

    lda1 = models.LdaModel(corpus=corpus, num_topics=cluster_topics, id2word=id2word,
                           passes=8, random_state=LDA_SEED)

    # One row per topic, one column per vocabulary term.
    topic_matrix1=lda1.get_topics()

    # Column position of every vocabulary term (replaces repeated list.index scans).
    feature_index={term: position for position, term in enumerate(xtffn1)}

    # For each tweet, add up every topic's weight over the tweet's in-vocabulary tokens.
    cluster_scores=[]
    for token_repr in train1['tokens_all']:
        cluster_score=defaultdict(lambda:0)
        for item in listify(token_repr):
            tok_num=feature_index.get(item)
            if tok_num is None:
                continue
            for topic in range(len(topic_matrix1)):
                cluster_score[topic]+=topic_matrix1[topic][tok_num]
        cluster_scores.append(cluster_score)

    # Hard assignment: the highest-scoring topic (ties go to the larger topic
    # number). Tweets without any in-vocabulary token fall back to topic 0.
    hard_cluster=[max(zip(scores.values(), scores.keys()))[1] if scores else 0
                  for scores in cluster_scores]

    train1=train1.reset_index(drop=True)

    train1['lda_cnum']=hard_cluster

In [None]:
#lda1.show_topics(16)

In [None]:
#train1[train1['lda_cnum']==15].text

In [None]:
# Topics:

# Fire
# Storms/Flooding
# Injury
# Police
# Accidents
# Terrorism
# War
# Misc

In [None]:
#train1.to_csv('train1.csv')

#f = open('topic_matrix1', 'wb')
#pickle.dump(topic_matrix1, f)
#f.close()

# f = open('xtffn1', 'wb')
# pickle.dump(xtffn1, f)
# f.close()


In [None]:
# Load the saved disaster-tweet frame, topic/term matrix and vocabulary.
# These assignments replace any values computed by the cells above.
train1=pd.read_csv('train1.csv')

# NOTE: pickle.load can execute arbitrary code from the file; only load
# artefacts that were written by this notebook.
with open('topic_matrix1', 'rb') as f:
    topic_matrix1=pickle.load(f)

with open('xtffn1', 'rb') as f:
    xtffn1=pickle.load(f)

In [None]:
def cluster_tweet(tweet):
    """Score a tweet against the LDA topics and report percentages per named cluster.

    The tweet is tokenized, lemmatized, lower-cased and passed through
    ``special_cases``. For every token found in the vocabulary ``xtffn1``,
    the matching column of ``topic_matrix1`` is added to each topic's total.
    Totals are rescaled to sum to 100 and accumulated under the cluster name
    assigned to each topic number.

    Args:
        tweet: raw tweet text.

    Returns:
        A ``defaultdict`` mapping cluster name to its percentage share, or the
        string ``'Error: Unable to sort into clusters.'`` when no token of the
        tweet is in the vocabulary.

    Side effects:
        Prints the original tweet, the processed tokens and, on failure,
        an error message.
    """
    # Topic number -> human-readable cluster name (several topics share a name).
    cluster_names={0:'Misc',
                1:'Fire',
                2:'Injury',
                3:'Police',
                4:'Fire',
                5:'Accidents',
                6:'Misc',
                7:'Injury',
                8:'Terrorism',
                9:'Storms/Flooding',
                10:'Police',
                11:'Misc',
                12:'Terrorism',
                13:'Storms/Flooding',
                14:'Misc',
                15:'War'}
    print(f'Original tweet:{tweet}\n')
    t=word_tokenize(tweet)
    t=lemmatize_sentence(t)
    # The training pipeline lower-cases every token (see remove_noise), so the
    # vocabulary is lower-case; without this, capitalised words never matched.
    t=[token.lower() for token in t]
    t=special_cases(t)
    print(f'Tokenized/Lemmatized sentence:{t}\n')

    # Vocabulary column of every in-vocabulary token (repeats count repeatedly).
    matched_columns=[xtffn1.index(item) for item in t if item in xtffn1]
    if not matched_columns or len(topic_matrix1)==0:
        print('Error: Unable to sort into clusters. Try a different tweet.')
        return('Error: Unable to sort into clusters.')

    # Per-topic sum of term weights over the matched tokens.
    topic_totals=[sum(topic_row[column] for column in matched_columns)
                  for topic_row in topic_matrix1]

    # Rescale so that all topics together add up to 100 percent.
    scale_factor=100/sum(topic_totals)

    out_dict=defaultdict(lambda:0)
    for idx, total in enumerate(topic_totals):
        out_dict[cluster_names[idx]]+=total*scale_factor

    return(out_dict)

In [None]:
def pick_random_tweet():
    """Return the text of one uniformly chosen row of ``train1``."""
    # randrange excludes the upper bound. random.randint(0, len(...)) could
    # return len(...) itself, which is one past the last valid iloc position.
    return(train1.iloc[random.randrange(len(train1.text))].text)

In [None]:
def get_tweet_index(idx):
    """Return the ``text`` value stored at position ``idx`` of ``train1``."""
    tweet_texts = train1['text']
    return tweet_texts.iloc[idx]

In [None]:
# [(0,
#   '0.111*"miss" + 0.107*"long" + 0.101*"survive" + 0.071*"shoot" + 0.070*"deluge" + 0.064*"follow" + 0.063*"calgary" + 0.062*"major" + 0.059*"food" + 0.043*"flame"'),
#  (1,
#   '0.134*"burn" + 0.090*"mass" + 0.089*"high" + 0.087*"school" + 0.084*"whole" + 0.076*"build" + 0.070*"move" + 0.055*"refugee" + 0.044*"affect" + 0.043*"abc"'),
#  (2,
#   '0.181*"wound" + 0.092*"movie" + 0.077*"fatal" + 0.077*"leave" + 0.069*"woman" + 0.061*"nuclear" + 0.058*"old" + 0.049*"well" + 0.046*"release" + 0.039*"end"'),
#  (3,
#   '0.135*"officer" + 0.130*"collapse" + 0.087*"late" + 0.085*"aug" + 0.071*"hour" + 0.066*"line" + 0.062*"casualty" + 0.044*"murderer" + 0.040*"crime" + 0.039*"bridge"'),
#  (4,
#   '0.152*"see" + 0.126*"wild" + 0.093*"wild_fire" + 0.076*"car" + 0.076*"let" + 0.072*"never" + 0.064*"call" + 0.047*"sign" + 0.036*"send" + 0.036*"catch"'),
#  (5,
#   '0.133*"help" + 0.102*"top" + 0.094*"near" + 0.083*"keep" + 0.070*"collide" + 0.054*"person" + 0.050*"early" + 0.043*"rise" + 0.039*"oil" + 0.035*"deal"'),
#  (6,
#   '0.207*"trauma" + 0.093*"much" + 0.083*"give" + 0.081*"big" + 0.067*"scream" + 0.065*"terrorism" + 0.061*"week" + 0.059*"riot" + 0.058*"hope" + 0.053*"happen"'),
#  (7,
#   '0.099*"blood" + 0.086*"really" + 0.084*"building" + 0.069*"crash" + 0.069*"photo" + 0.067*"save" + 0.060*"half" + 0.049*"lose" + 0.041*"drive" + 0.041*"fight"'),
#  (8,
#   '0.112*"man" + 0.097*"bomb" + 0.093*"tragedy" + 0.069*"city" + 0.053*"set" + 0.043*"suicide" + 0.039*"island" + 0.039*"plane" + 0.033*"suicide_bomb" + 0.033*"malaysia"'),
#  (9,
#   '0.247*"wreckage" + 0.178*"weapon" + 0.054*"typhoon" + 0.047*"wind" + 0.045*"hiroshima" + 0.040*"nuclear" + 0.038*"soudelor" + 0.035*"typhoon_soudelor" + 0.035*"breaking" + 0.033*"hear"'),
#  (10,
#   '0.177*"police" + 0.111*"home" + 0.071*"ever" + 0.068*"suspect" + 0.049*"area" + 0.049*"police_officer" + 0.047*"inside" + 0.041*"summer" + 0.040*"murder" + 0.038*"battle"'),
#  (11,
#   '0.145*"need" + 0.116*"body" + 0.086*"many" + 0.081*"evacuate" + 0.057*"family" + 0.052*"group" + 0.052*"tonight" + 0.051*"blow" + 0.048*"force" + 0.047*"land"'),
#  (12,
#   '0.118*"attack" + 0.102*"head" + 0.087*"fall" + 0.083*"train" + 0.074*"flood" + 0.062*"change" + 0.058*"road" + 0.052*"rain" + 0.039*"kill" + 0.034*"india"'),
#  (13,
#   '0.110*"even" + 0.076*"injury" + 0.066*"red" + 0.058*"rescue" + 0.057*"military" + 0.054*"damage" + 0.050*"fear" + 0.047*"victim" + 0.044*"plan" + 0.043*"severe"'),
#  (14,
#   '0.115*"great" + 0.095*"life" + 0.088*"explode" + 0.080*"live" + 0.070*"night" + 0.070*"hit" + 0.051*"heat" + 0.045*"water" + 0.042*"place" + 0.042*"wave"'),
#  (15,
#   '0.179*"war" + 0.116*"emergency" + 0.083*"kill" + 0.055*"talk" + 0.055*"loud" + 0.050*"service" + 0.044*"large" + 0.035*"across" + 0.034*"morning" + 0.032*"survivor"')]
  
  
  
  
  
#   cluster_names={0:'Misc',
#                 1:'Fire',
#                 2:'Injury',
#                 3:'Police',
#                 4:'Fire',
#                 5:'Accidents',
#                 6:'Misc',
#                 7:'Injury',
#                 8:'Terrorism',
#                 9:'Storms/Flooding',
#                 10:'Police',
#                 11:'Misc',
#                 12:'Terrorism',
#                 13:'Storms/Flooding',
#                 14:'Misc',
#                 15:'War'}

# Display subjectivity/polarity

In [None]:
# blue='#1111CC'
# orange='#ED8811'

# plt.figure(figsize=[16,10])
# plt.title('Sentiment of Tweets')
# plt.xlabel('Polarity')
# plt.ylabel('Subjectivity')
# plt.scatter(train0['pol'],train0['sub'],alpha=0.25,color=blue)
# plt.scatter(train1['pol'],train1['sub'],alpha=0.25,color=orange)
# plt.legend(['"Safe"','"Disaster"'])

# pol0=train0['pol'].mean()
# pol1=train1['pol'].mean()
# sub0=train0['sub'].mean()
# sub1=train1['sub'].mean()

In [None]:
#plt.hist(train0['pol'])
#plt.hist(train1['pol'],color='orange')

#plt.hist(train0['sub'])
#plt.hist(train1['sub'],color='orange')

In [None]:
# plt.bar('Polarity of "Safe" Tweets',pol0,color=blue)
# plt.bar('Polarity of "Disaster" Tweets',pol1,color=orange)

In [None]:
# plt.bar('Subjectivity of "Safe" Tweets',sub0,color=blue)
# plt.bar('Subjectivity of "Disaster" Tweets',sub1,color=orange)

In [None]:
# from wordcloud import WordCloud
# from re import findall


In [None]:
# toks=[]
# for item in range(len(train1.tokens)):
#     s=train1.tokens.iloc[item]+train1.tokens2.iloc[item]
#     for tok in findall("'(.+?)'",train1.tokens.iloc[item]):
#         toks.append(tok)

# wordcloud = WordCloud(background_color="white", max_words=99999, contour_width=3, contour_color='steelblue')
# wordcloud.generate(','.join(toks))
# wordcloud.to_image()

In [None]:
# from scipy.io import mmread

# term_doc = mmread('../bbc/bbc.mtx')  # term-document matrix
# doc_term = term_doc.T  # document-term matrix


# from sklearn.decomposition import NMF

# nmf2 = NMF(n_components=2)
# doc_topic = nmf2.fit_transform(train.text)

# Appendix

In [None]:
#Old tokenization functions
# def pd_tokenize(pandas_series, blacklist=default_blacklist):
#     tokenized=[]
#     for _ in pandas_series:
#         for chara in blacklist:
#             _=_.replace(chara,' ')
#         tokens=word_tokenize(_.lower())
#         hyperlink_blocked=[]
#         for i in range(len(tokens)):
#             if len(re.findall('http',tokens[i]))>0:
#                 hyperlink_blocked.append(i)
#         hyperlink_blocked.reverse()
#         for i in hyperlink_blocked:
#             tokens.pop(i)
#         tokenized.append([lmt.lemmatize(word) for word in tokens])
#     return(pd.Series(tokenized))

# def pd_clean(pandas_series, blacklist=default_blacklist):
#     tokenized=[]
#     for _ in pandas_series:
#         for chara in blacklist:
#             _=_.replace(chara,' ')
#         tokens=word_tokenize(_.lower())
#         hyperlink_blocked=[]
#         for i in range(len(tokens)):
#             if len(re.findall('http',tokens[i]))>0:
#                 hyperlink_blocked.append(i)
#         hyperlink_blocked.reverse()
#         for i in hyperlink_blocked:
#             tokens.pop(i)
#         tokenized.append(" ".join([lmt.lemmatize(word) for word in tokens]))
#     return(pd.Series(tokenized))

In [None]:
#Used to count total importance of words
# count_importance=False
# if count_importance:
#     tweet_series_0_tf=pd.Series(name="count", data=0.0, index=tf0.get_feature_names())
#     for tnum in range(0,len(xtfa0)):
#         s=pd.Series(name="count", data=xtfa0[tnum], index=tf0.get_feature_names())
#         s=s[s>0]
#         s=pd.DataFrame(s)
#         s.reset_index(inplace=True)
#         for il in range(len(s)):
#             tweet_series_0_tf.at[s['index'].iloc[il]]+=s.iloc[il]['count']


#     tweet_series_1_tf=pd.Series(name="count", data=0.0, index=tf1.get_feature_names())
#     for tnum in range(0,len(xtfa1)):
#         s=pd.Series(name="count", data=xtfa1[tnum], index=tf1.get_feature_names())
#         s=s[s>0]
#         s=pd.DataFrame(s)
#         s.reset_index(inplace=True)
#         for il in range(len(s)):
#             tweet_series_1_tf.at[s['index'].iloc[il]]+=s.iloc[il]['count']


#     ts0=tweet_series_0_tf.copy()
#     ts0.sort_values(ascending=False, inplace=True)
#     ts0=pd.DataFrame(ts0)
#     ts0.reset_index(inplace=True)

#     ts1=tweet_series_1_tf.copy()
#     ts1.sort_values(ascending=False, inplace=True)
#     ts1=pd.DataFrame(ts1)
#     ts1.reset_index(inplace=True)

#     f=open('disaster_1000_words_tf.csv','w')
#     f.write('word,total_importance,importance_proportion,\n')
#     for _ in range(1000):
#         f.write(ts1.iloc[_]['index']+','+str(ts1.iloc[_]['count'])+','+str(ts1.iloc[_]['count']/len(ts1))[0:6]+',\n')
#     f.close()

#     f=open('safe_1000_words_tf.csv','w')
#     f.write('word,total_importance,importance_proportion,\n')
#     for _ in range(1000):
#         f.write(ts0.iloc[_]['index']+','+str(ts0.iloc[_]['count'])+','+str(ts0.iloc[_]['count']/len(ts0))[0:6]+',\n')

#     f.close()

In [None]:
#Used to count total importance of words
# count_importance=False
# if count_importance:
#     tweet_series_tf=pd.Series(name="count", data=0.0, index=tf.get_feature_names())
#     for tnum in range(0,len(xtfa)):
#         s=pd.Series(name="count", data=xtfa[tnum], index=tf.get_feature_names())
#         s=s[s>0]
#         s=pd.DataFrame(s)
#         s.reset_index(inplace=True)
#         for il in range(len(s)):
#             tweet_series_tf.at[s['index'].iloc[il]]+=s.iloc[il]['count']


#     ts=tweet_series_tf.copy()
#     ts.sort_values(ascending=False, inplace=True)
#     ts=pd.DataFrame(ts)
#     ts.reset_index(inplace=True)


#     f=open('500_words_tf.csv','w')
#     f.write('word,total_importance,importance_proportion,\n')
#     for _ in range(500):
#         f.write(ts.iloc[_]['index']+','+str(ts.iloc[_]['count'])+','+str(ts.iloc[_]['count']/len(ts))[0:6]+',\n')
#     f.close()

In [None]:
#I was going to take a random sample before modeling, but I decided to just wait the 20 minutes :/
# import random

# xtfa_rows=[]

# for i in range(len(xtfa)):
#     random.seed(i*10)
#     if random.random()<0.2:
#         xtfa_rows.append(True)
#     else:
#         xtfa_rows.append(False)
        
# xtfa_s=xtfa[xtfa_rows]
# xtfa_s.shape

In [None]:
# num_topics=36

# text_list=train['tokens'].tolist()
# for i in range(len(text_list)):
#     text_list[i]=listify(text_list[i])

# dictionary=corpora.Dictionary(text_list)
# doc_term_matrix = []

# for doc in train['tokens']:
#     doc_term_matrix.append(dictionary.doc2bow(listify(doc,True)))

# doc_word = tf.transform(tokens_all).transpose()
# pd.DataFrame(doc_word.toarray(), tf.get_feature_names()).head()

# corpus = matutils.Sparse2Corpus(doc_word)
# dictionary = corpora.Dictionary()
# id2word = dict((v, k) for k, v in tf.vocabulary_.items())


# lda = models.LdaModel(corpus=corpus, num_topics=num_topics, id2word=id2word, passes=8)

In [None]:
# %%time


# # %%time

# # mds = MDS(random_state=0)
# # xtfa_md = mds.fit_transform(xtfa)
# # print(xtfa_md)
# # stress = mds.stress_
# # print(stress)

# # #Holy crap MDS does NOT scale well at ALL. ~7000 datapoints = 20 minutes.
# # #Okay I'm just gonna pickle these because I'm NOT doing that again.

# # f = open('xtfa_md', 'wb')
# # pickle.dump(xtfa_md, f)
# # f.close()

# # f = open('mds_s5', 'wb') 
# # pickle.dump(mds, f)
# # f.close()




# # Repeat for Disaster Tweets only


# mds1 = MDS(random_state=0)
# xtfa1_md = mds1.fit_transform(xtfa1)
# print(xtfa1_md)
# stress = mds1.stress_
# print(stress)


# f = open('xtfa1_md', 'wb')
# pickle.dump(xtfa1_md, f)
# f.close()

# f = open('mds1_s5', 'wb') 
# pickle.dump(mds1, f)
# f.close()



# # Repeat for Safe Tweets only


# mds0 = MDS(random_state=0)
# xtfa0_md = mds0.fit_transform(xtfa0)
# print(xtfa0_md)
# stress = mds0.stress_
# print(stress)


# f = open('xtfa0_md', 'wb')
# pickle.dump(xtfa0_md, f)
# f.close()

# f = open('mds0_s5', 'wb') 
# pickle.dump(mds0, f)
# f.close()

# def display_mds(xtfa_any,xtfa_mds, cnums, clusternum='all'):


#     plt.figure(figsize=[10,10])
#     if clusternum=='all':
#         plt.scatter(xtfa_md[:,0],xtfa_md[:,1],c=cluster_nums)
#     else:
#         plt.scatter(xtfa_md[:,0],xtfa_md[:,1],alpha=cluster_nums==clusternum,c=cluster_nums)