In [6]:
import pandas as pd
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation,TruncatedSVD
from sklearn.metrics.pairwise import euclidean_distances

In [8]:
#Data import and preprocessing
df=pd.read_csv('twitter_data.txt', sep = "\n", header=None, names=["content"],error_bad_lines=False)


data=df.content.values.tolist()

#Basic cleaning
# Remove Emails
data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]

# Remove new line characters
data = [re.sub('\s+', ' ', sent) for sent in data]

# Remove distracting single quotes
data_words = [re.sub("\'", "", sent) for sent in data]

print(data_words[:1])

stop_words = stopwords.words('english')
stop_words.extend(['que','los','don','htt','https','por','la','al','se','le','el','30','10','del','amp'])

cleaned_data=[data for data in data_words if not data in stop_words]
print(cleaned_data[:1])

['Thank you Louisville, Kentucky- on my way! #MAGA🇺🇸 -']
['Thank you Louisville, Kentucky- on my way! #MAGA🇺🇸 -']


In [10]:
#Modelling
no_features=100
NUM_TOPICS=8

# LDA can only use raw term counts for LDA because it is a probabilistic graphical model

count_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
count_vectorized = count_vectorizer.fit_transform(cleaned_data)
count_feature_names = count_vectorizer.get_feature_names()


# NMF can use both countvector or  tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tfidf_vectorized = tfidf_vectorizer.fit_transform(cleaned_data)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()


 
# Build a Latent Dirichlet Allocation Model
lda_model = LatentDirichletAllocation(n_topics=NUM_TOPICS, max_iter=10, learning_method='online')
lda_Z = lda_model.fit_transform(count_vectorized)
print(lda_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)
 
# Build a Non-Negative Matrix Factorization Model
nmf_model = NMF(n_components=NUM_TOPICS)
nmf_Z = nmf_model.fit_transform(tfidf_vectorized)
print(nmf_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)
 
# Build a Latent Semantic Indexing Model
lsi_model = TruncatedSVD(n_components=NUM_TOPICS)
lsi_Z = lsi_model.fit_transform(count_vectorized)
print(lsi_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)
 




(33107, 8)
(33107, 8)
(33107, 8)


In [11]:
#first document in the corpus looks like in different topic spaces
print(lda_Z[0])
print(nmf_Z[0])
print(lsi_Z[0])

[0.0625 0.0625 0.0625 0.0625 0.5625 0.0625 0.0625 0.0625]
[0.00012822 0.00019631 0.00023452 0.00092445 0.00170599 0.00022699
 0.00077469 0.00319967]
[ 0.00716199 -0.00019003 -0.00231519  0.01191622  0.0128451  -0.00113862
 -0.00405776 -0.01139345]


In [19]:
#Display topic word distribution
def display_topics(model, vectorizer, top_n=5):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-top_n - 1:-1]])

In [20]:
print("LDA Model:")
display_topics(lda_model, count_vectorizer)
print("=" * 15)

print("NMF Model:")
display_topics(nmf_model, tfidf_vectorizer)
print("=" * 15)
 
print("LSI Model:")
display_topics(lsi_model, count_vectorizer)
print("=" * 15)

LDA Model:
Topic 0:
[('amp', 1773.6932405424664), ('rt', 1515.6760970453167), ('new', 1194.3209124155285), ('york', 289.6567359642981), ('music', 269.4999420674596)]
Topic 1:
[('https', 15951.753959375315), ('time', 420.9189787443214), ('day', 397.4318379919141), ('news', 376.3236005449662), ('american', 330.16749123842317)]
Topic 2:
[('https', 1031.283954465345), ('rt', 456.13435829626764), ('great', 352.6649312973796), ('trump', 338.04208682701324), ('love', 337.7197282626997)]
Topic 3:
[('https', 1635.634019949097), ('la', 975.3336418581357), ('rt', 916.6038888066778), ('en', 724.3644596128928), ('el', 465.3504368534968)]
Topic 4:
[('https', 1230.1250002952183), ('just', 531.1078133861898), ('today', 408.53077748280657), ('john', 343.2986805685746), ('good', 293.4135604796725)]
Topic 5:
[('rt', 8232.056848697353), ('big', 217.15915451860909), ('going', 184.75444047397775), ('party', 183.65864880712726), ('11', 168.80534627135776)]
Topic 6:
[('like', 557.3336371004631), ('gt', 392.39

In [23]:
#Transforming an unseen document
text = "The economy is working better than ever"
x = nmf_model.transform(tfidf_vectorizer.transform([text]))[0]
print(x)

[0. 0. 0. 0. 0. 0. 0. 0.]


In [24]:
#similarity functionality like gensim
 
def most_similar(x, Z, top_n=5):
    dists = euclidean_distances(x.reshape(1, -1), Z)
    pairs = enumerate(dists[0])
    most_similar = sorted(pairs, key=lambda item: item[1])[:top_n]
    return most_similar
 
similarities = most_similar(x, nmf_Z)
document_id, similarity = similarities[0]
print(data[document_id][:1000])

Florida 16-8


In [26]:
# visualization
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda_model, count_vectorized, count_vectorizer, mds='tsne')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))



Perplexity:  -10.435327106471307

Coherence Score:  0.4757935386133725


Num Topics = 2  has Coherence Value of 0.4938
Num Topics = 4  has Coherence Value of 0.4483
Num Topics = 6  has Coherence Value of 0.4489
Num Topics = 8  has Coherence Value of 0.4244


"Well i'm not sure about the story nad it did seem biased. What\nI disagree with is your statement that the U.S. Media is out to\nruin Israels reputation. That is rediculous. The U.S. media is\nthe most pro-israeli media in the world. Having lived in Europe\nI realize that incidences such as the one described in the\nletter have occured. The U.S. media as a whole seem to try to\nignore them. The U.S. is subsidizing Israels existance and the\nEuropeans are not (at least not to the same degree). So I think\nthat might be a reason they report more clearly on the\natrocities.\n\tWhat is a shame is that in Austria, daily reports of\nthe inhuman acts commited by Israeli soldiers and the blessing\nreceived from the Government makes some of the Holocaust guilt\ngo away. After all, look how the Jews are treating other races\nwhen they got power. It is unfortunate.\n"

'Thank you Louisville, Kentucky- on my way! #MAGA🇺🇸 -'

NameError: name 'X' is not defined

NameError: name 'lda_model' is not defined

NameError: name 'x' is not defined

NameError: name 'nmf_model' is not defined