In [None]:
import tensorflow as tf

# Stop immediately unless TensorFlow reports the expected GPU device name.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
  print('Found GPU at: {}'.format(device_name))
else:
  raise SystemError('GPU device not found')

# Mount Google Drive at /content/drive; the CSV paths used later live under it.
from google.colab import drive
drive.mount("/content/drive")

Found GPU at: /device:GPU:0
Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd
import string
from gensim import matutils, models
import scipy.sparse
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
def process_text(text):
    """Join a collection of text entries into one space-separated string.

    Args:
        text: iterable of entries, e.g. a DataFrame column converted with
            ``.tolist()``.

    Returns:
        A single ``str`` containing every string entry separated by one
        space, or ``''`` when there is nothing to join.

    Entries that are not ``str`` (``None``, or the float ``NaN`` that pandas
    uses for empty cells) are skipped, because ``str.join`` raises
    ``TypeError`` on them.
    """
    return ' '.join(entry for entry in text if isinstance(entry, str))

In [None]:
import re
import nltk
# Fetch the NLTK stop-word corpus; clean_text reads it via stopwords.words('english').
nltk.download('stopwords')
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

def clean_text(text):
  """Normalise a raw string into lower-cased, stop-word-free Porter stems.

  The text is processed in this order: every character outside a-z / A-Z is
  replaced by a space, the result is lower-cased and split on whitespace,
  NLTK English stop words are dropped, and each remaining token is stemmed.

  Args:
    text: the raw string to clean.

  Returns:
    A one-element list holding the cleaned tokens joined by single spaces.
  """
  porter_stemmer = PorterStemmer()
  # Build the stop-word set once; doing it inside the filter would call
  # stopwords.words('english') again for every single token.
  english_stopwords = set(stopwords.words('english'))
  # Non-letters (digits, punctuation, ...) all become spaces here, so no
  # separate punctuation-stripping pass is needed.
  tokens = re.sub('[^a-zA-Z]', ' ', text).lower().split()
  stemmed_result = [
      porter_stemmer.stem(token)
      for token in tokens
      if token not in english_stopwords
  ]
  return [" ".join(stemmed_result)]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Source X: read the CSV, merge every article into one string, then clean it.
data_x = pd.read_csv('/content/drive/My Drive/data/dataset.csv', delimiter=',')
text_x = data_x['text'].tolist()
text_combined_x = process_text(text_x)
text_cleaned_x = clean_text(text_combined_x)

# Source Y: same pipeline on the second CSV.
data_y = pd.read_csv('/content/drive/My Drive/data/news_dataset.csv', delimiter=',')
text_y = data_y['text'].tolist()
text_combined_y = process_text(text_y)
text_cleaned_y = clean_text(text_combined_y)

# One row per source, holding that source's cleaned text in the 'text' column.
combined_clean_text_dict = {
    "source_x": text_cleaned_x,
    "source_y": text_cleaned_y,
}
data_df = pd.DataFrame.from_dict(
    combined_clean_text_dict,
    orient='index',
    columns=['text'],
)
data_df

Unnamed: 0,text
source_x,smell hillari fear daniel greenfield shillman ...
source_y,muslim bust stole million gov benefit print pa...


### Topic modeling for dataset 1 (real vs. fake)

In [None]:
# Reload dataset 1 and separate its articles by the 'label' column.
data_x = pd.read_csv('/content/drive/My Drive/data/dataset.csv', delimiter=',')

# Rows with label == 1 feed the "real" document.
text_x_real = data_x.loc[data_x['label'] == 1, 'text'].tolist()
text_combined_x_real = process_text(text_x_real)
text_cleaned_x_real = clean_text(text_combined_x_real)

# Rows with label == 0 feed the "fake" document.
text_x_fake = data_x.loc[data_x['label'] == 0, 'text'].tolist()
text_combined_x_fake = process_text(text_x_fake)
text_cleaned_x_fake = clean_text(text_combined_x_fake)

# Two-row frame (real_x, fake_x) with the cleaned text in the 'text' column.
combined_clean_text_dict = {
    "real_x": text_cleaned_x_real,
    "fake_x": text_cleaned_x_fake,
}
data_df = pd.DataFrame.from_dict(
    combined_clean_text_dict,
    orient='index',
    columns=['text'],
)
data_df

Unnamed: 0,text
real_x,kerri go pari gestur sympathi u secretari stat...
real_y,smell hillari fear daniel greenfield shillman ...


In [None]:
# Document-term matrix: one row per entry of data_df, one column per vocabulary term.
cv = CountVectorizer(stop_words='english')
data_cv = cv.fit_transform(data_df.text)

# get_feature_names() was removed in scikit-learn 1.2. Prefer
# get_feature_names_out() and fall back to the old name on older releases.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

data_dtm = pd.DataFrame(data_cv.toarray(), columns=feature_names)
data_dtm.index = data_df.index
data_dtm

Unnamed: 0,aa,aaa,aaaaaaaand,aaaaackkk,aaaaapkfhk,aaaahhhh,aaaand,aaaarrgh,aaab,aaahhh,aaarf,aab,aaba,aabfsv,aabg,aabo,aaccord,aachen,aacnr,aadhaar,aadhar,aadl,aadmi,aae,aaeeb,aaf,aafn,aag,aagi,aagxdwkrjpq,aah,aahd,aahi,aahwuhvvnh,aai,aaib,aaingr,aaj,aaja,aakar,...,zye,zyf,zygot,zyi,zyifjwyfdh,zyj,zyjdjtknan,zyklon,zymo,zynga,zyomso,zypri,zyri,zytnwhvr,zyuganov,zyz,zz,zzbluecomet,zzg,zzgaahg,zzi,zzjjpdaivn,zzll,zzlo,zzn,zzomtmd,zzpkpmpp,zzpx,zzpxelb,zzqvyk,zzsg,zztain,zztw,zzucqevt,zzuml,zzywyr,zzz,zzzzaaaacccchhh,zzzzzzzz,zzzzzzzzzzzzz
real_x,24,9,0,0,0,0,0,0,1,0,0,5,1,1,0,2,0,2,0,30,2,0,0,0,0,10,0,0,0,0,1,4,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3,0,5,1,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
real_y,57,11,1,1,1,1,1,1,0,1,1,10,0,0,4,0,1,3,1,0,1,6,2,2,5,2,2,1,1,1,0,0,1,1,0,2,1,5,1,1,...,1,1,2,2,2,1,2,1,1,2,1,0,0,2,2,1,7,1,2,1,2,1,1,1,1,1,1,1,1,2,1,1,1,1,3,2,1,3,1,1


In [None]:
# Flip the document-term matrix so that terms become rows and documents become columns.
tdm = data_dtm.T
tdm.head()

Unnamed: 0,real_x,real_y
aa,24,57
aaa,9,11
aaaaaaaand,0,1
aaaaackkk,0,1
aaaaapkfhk,0,1


In [None]:
# Invert CountVectorizer's term -> column-index mapping into index -> term.
id2word = {index: term for term, index in cv.vocabulary_.items()}

# Convert the term-document counts to a sparse matrix and wrap it as a gensim corpus.
sparse_counts = scipy.sparse.csr_matrix(tdm)
corpus = matutils.Sparse2Corpus(sparse_counts)

In [None]:
# Fit a 2-topic LDA model with 20 passes over the corpus.
# random_state seeds gensim's RNG so topic ids and weights are the same on every run;
# without it the two topics can swap ids between executions.
lda = models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=2,
    passes=20,
    random_state=42,
)
lda.print_topics()

[(0,
  '0.015*"trump" + 0.006*"said" + 0.005*"presid" + 0.005*"peopl" + 0.005*"state" + 0.004*"clinton" + 0.004*"like" + 0.004*"say" + 0.004*"time" + 0.004*"obama"'),
 (1,
  '0.019*"said" + 0.012*"trump" + 0.008*"state" + 0.006*"presid" + 0.005*"republican" + 0.005*"reuter" + 0.005*"year" + 0.004*"say" + 0.004*"govern" + 0.004*"new"')]

In [None]:
# Refit the 2-topic LDA model with 80 passes; this is the model used for the topic assignment below.
# random_state seeds gensim's RNG so topic ids and weights are the same on every run;
# without it the two topics can swap ids between executions.
lda = models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=2,
    passes=80,
    random_state=42,
)
lda.print_topics()

[(0,
  '0.019*"said" + 0.012*"trump" + 0.008*"state" + 0.006*"presid" + 0.005*"republican" + 0.005*"reuter" + 0.005*"year" + 0.004*"say" + 0.004*"govern" + 0.004*"new"'),
 (1,
  '0.015*"trump" + 0.005*"said" + 0.005*"presid" + 0.005*"peopl" + 0.005*"state" + 0.004*"clinton" + 0.004*"like" + 0.004*"say" + 0.004*"time" + 0.004*"obama"')]

In [None]:
# Topic mixture of every document under the fitted model.
corpus_transformed = lda[corpus]

# gensim leaves out topics whose probability is below the model's minimum_probability,
# so a document can come back with a single (topic_id, probability) pair. Index the
# first pair instead of unpacking exactly two, which would raise ValueError in that case.
first_topic_per_doc = [doc_topics[0] for doc_topics in corpus_transformed]
list(zip(first_topic_per_doc, data_dtm.index))

[((0, 0.9812643), 'real_x'), ((0, 0.032224685), 'real_y')]

### Topic modeling for dataset 2 (real vs. fake)

In [None]:
# Reload dataset 2 and separate its articles by the 'label' column.
data_y = pd.read_csv('/content/drive/My Drive/data/news_dataset.csv', delimiter=',')

# Rows with label == 1 feed the "real" document.
text_y_real = data_y.loc[data_y['label'] == 1, 'text'].tolist()
text_combined_y_real = process_text(text_y_real)
text_cleaned_y_real = clean_text(text_combined_y_real)

# Rows with label == 0 feed the "fake" document.
text_y_fake = data_y.loc[data_y['label'] == 0, 'text'].tolist()
text_combined_y_fake = process_text(text_y_fake)
text_cleaned_y_fake = clean_text(text_combined_y_fake)

# Two-row frame (real_y, fake_y) with the cleaned text in the 'text' column.
combined_clean_text_dict = {
    "real_y": text_cleaned_y_real,
    "fake_y": text_cleaned_y_fake,
}
data_df = pd.DataFrame.from_dict(
    combined_clean_text_dict,
    orient='index',
    columns=['text'],
)
data_df

Unnamed: 0,text
real_y,muslim bust stole million gov benefit print pa...
fake_y,muslim bust stole million gov benefit print pa...


In [None]:
# Document-term matrix: one row per entry of data_df, one column per vocabulary term.
cv = CountVectorizer(stop_words='english')
data_cv = cv.fit_transform(data_df.text)

# get_feature_names() was removed in scikit-learn 1.2. Prefer
# get_feature_names_out() and fall back to the old name on older releases.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

data_dtm = pd.DataFrame(data_cv.toarray(), columns=feature_names)
data_dtm.index = data_df.index
data_dtm

Unnamed: 0,aa,aaa,aaaaaaaaahhhhhhhhhhhhhhh,aaaaalmost,aaaaggg,aaaah,aaaahhh,aaaand,aaah,aaahhh,aaajam,aaajiao,aaargh,aab,aabb,aabd,aabenraa,aaboyl,aac,aachen,aadhar,aadi,aadmi,aae,aaeeb,aaerion,aaf,aagaard,aah,aaib,aaii,aaingr,aaj,aaja,aakar,aakhri,aakr,aalia,aalipour,aaliya,...,zwolenkiewicz,zwolennici,zwolennikami,zwozdiak,zwroci,zwrotu,zwvgl,zwxqtixrl,zwyci,zwzqj,zx,zxh,zxpw,zxr,zxua,zxycmwjdxj,zy,zyaira,zyanna,zybvwqhlbctginctv,zydeco,zydedco,zyfin,zygankow,zygar,zyklon,zylinderkopfdichtung,zyma,zynga,zytiga,zytsov,zyudeheim,zyudheim,zyuganov,zyxel,zyz,zz,zzk,zzz,zzzz
real_y,71,34,1,2,1,1,1,1,2,1,1,1,1,11,1,2,1,1,1,4,1,1,2,2,5,1,2,1,3,2,1,5,5,4,1,1,3,1,1,4,...,3,1,1,1,1,1,5,1,1,1,2,1,1,1,5,1,6,5,4,5,8,1,1,3,3,1,1,1,11,1,1,1,17,12,1,2,10,1,1,1
fake_y,56,13,0,0,1,0,0,0,2,1,0,0,1,11,0,0,1,0,1,3,1,0,2,2,5,0,1,0,1,2,1,5,5,1,1,1,0,0,0,4,...,0,1,1,1,1,1,5,1,1,1,2,1,1,1,5,1,6,0,0,5,0,0,0,3,3,0,1,0,0,0,1,0,0,9,0,2,5,0,1,0


In [None]:
# Flip the document-term matrix so that terms become rows and documents become columns.
tdm = data_dtm.T
tdm.head()

Unnamed: 0,real_y,fake_y
aa,71,56
aaa,34,13
aaaaaaaaahhhhhhhhhhhhhhh,1,0
aaaaalmost,2,0
aaaaggg,1,1


In [None]:
# Invert CountVectorizer's term -> column-index mapping into index -> term.
id2word = {index: term for term, index in cv.vocabulary_.items()}

# Convert the term-document counts to a sparse matrix and wrap it as a gensim corpus.
sparse_counts = scipy.sparse.csr_matrix(tdm)
corpus = matutils.Sparse2Corpus(sparse_counts)

In [None]:
# Fit a 2-topic LDA model with 20 passes over the corpus.
# random_state seeds gensim's RNG so topic ids and weights are the same on every run;
# without it the two topics can swap ids between executions.
lda = models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=2,
    passes=20,
    random_state=42,
)
lda.print_topics()

[(0,
  '0.008*"trump" + 0.005*"clinton" + 0.005*"state" + 0.004*"peopl" + 0.004*"year" + 0.004*"time" + 0.004*"said" + 0.004*"elect" + 0.004*"like" + 0.003*"new"'),
 (1,
  '0.009*"said" + 0.009*"trump" + 0.005*"peopl" + 0.005*"clinton" + 0.004*"state" + 0.004*"like" + 0.004*"say" + 0.004*"year" + 0.004*"new" + 0.004*"time"')]

In [None]:
# Refit the 2-topic LDA model with 80 passes; this is the model used for the topic assignment below.
# random_state seeds gensim's RNG so topic ids and weights are the same on every run;
# without it the two topics can swap ids between executions.
lda = models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=2,
    passes=80,
    random_state=42,
)
lda.print_topics()

[(0,
  '0.010*"trump" + 0.009*"said" + 0.005*"peopl" + 0.005*"state" + 0.005*"year" + 0.004*"like" + 0.004*"say" + 0.004*"time" + 0.004*"clinton" + 0.004*"new"'),
 (1,
  '0.006*"clinton" + 0.006*"trump" + 0.005*"peopl" + 0.005*"state" + 0.004*"hillari" + 0.004*"time" + 0.004*"elect" + 0.004*"like" + 0.003*"year" + 0.003*"american"')]

In [None]:
# Topic mixture of every document under the fitted model.
corpus_transformed = lda[corpus]

# gensim leaves out topics whose probability is below the model's minimum_probability,
# so a document can come back with a single (topic_id, probability) pair. Index the
# first pair instead of unpacking exactly two, which would raise ValueError in that case.
first_topic_per_doc = [doc_topics[0] for doc_topics in corpus_transformed]
list(zip(first_topic_per_doc, data_dtm.index))

[((0, 0.8155475), 'real_y'), ((0, 0.106390886), 'fake_y')]

### Topic modeling across the two sources

In [None]:
# By this point data_df has been rebound to the real_y/fake_y frame by the dataset 2
# section, so a top-to-bottom run would vectorise the wrong documents. Rebuild the
# per-source frame from the cleaned texts produced when both CSVs were first loaded.
data_df = pd.DataFrame.from_dict(
    {"source_x": text_cleaned_x, "source_y": text_cleaned_y},
    orient='index',
    columns=['text'],
)

# Document-term matrix: one row per source, one column per vocabulary term.
cv = CountVectorizer(stop_words='english')
data_cv = cv.fit_transform(data_df.text)

# get_feature_names() was removed in scikit-learn 1.2. Prefer
# get_feature_names_out() and fall back to the old name on older releases.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

data_dtm = pd.DataFrame(data_cv.toarray(), columns=feature_names)
data_dtm.index = data_df.index
data_dtm

Unnamed: 0,aa,aaa,aaaaaaaaahhhhhhhhhhhhhhh,aaaaaaaand,aaaaackkk,aaaaalmost,aaaaapkfhk,aaaaggg,aaaah,aaaahhh,aaaahhhh,aaaand,aaaarrgh,aaab,aaah,aaahhh,aaajam,aaajiao,aaarf,aaargh,aab,aaba,aabb,aabd,aabenraa,aabfsv,aabg,aabo,aaboyl,aac,aaccord,aachen,aacnr,aadhaar,aadhar,aadi,aadl,aadmi,aae,aaeeb,...,zyma,zymo,zynga,zyomso,zypri,zyri,zytiga,zytnwhvr,zytsov,zyudeheim,zyudheim,zyuganov,zyxel,zyz,zz,zzbluecomet,zzg,zzgaahg,zzi,zzjjpdaivn,zzk,zzll,zzlo,zzn,zzomtmd,zzpkpmpp,zzpx,zzpxelb,zzqvyk,zzsg,zztain,zztw,zzucqevt,zzuml,zzywyr,zzz,zzzz,zzzzaaaacccchhh,zzzzzzzz,zzzzzzzzzzzzz
source_x,81,20,0,1,1,0,1,0,0,0,1,1,1,1,0,1,0,0,1,0,15,1,0,0,0,1,4,2,0,0,1,5,1,30,3,0,6,2,2,5,...,0,1,5,1,5,1,0,2,0,0,0,9,0,1,7,1,2,1,2,1,0,1,1,1,1,1,1,1,2,1,1,1,1,3,2,1,0,3,1,1
source_y,127,47,1,0,0,2,0,2,1,1,0,1,0,0,4,2,1,1,0,2,22,0,1,2,2,0,0,0,1,2,0,7,0,0,2,1,0,4,4,10,...,1,0,11,0,0,0,1,0,2,1,17,21,1,4,15,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,1,0,0,0


In [None]:
# Flip the document-term matrix so that terms become rows and documents become columns.
tdm = data_dtm.T
tdm.head()

Unnamed: 0,source_x,source_y
aa,81,127
aaa,20,47
aaaaaaaaahhhhhhhhhhhhhhh,0,1
aaaaaaaand,1,0
aaaaackkk,1,0


In [None]:
# Invert CountVectorizer's term -> column-index mapping into index -> term.
id2word = {index: term for term, index in cv.vocabulary_.items()}

# Convert the term-document counts to a sparse matrix and wrap it as a gensim corpus.
sparse_counts = scipy.sparse.csr_matrix(tdm)
corpus = matutils.Sparse2Corpus(sparse_counts)

In [None]:
# Fit a 2-topic LDA model with 20 passes over the corpus.
# random_state seeds gensim's RNG so topic ids and weights are the same on every run;
# without it the two topics can swap ids between executions.
lda = models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=2,
    passes=20,
    random_state=42,
)
lda.print_topics()

[(0,
  '0.014*"trump" + 0.012*"said" + 0.007*"state" + 0.006*"presid" + 0.004*"peopl" + 0.004*"republican" + 0.004*"year" + 0.004*"say" + 0.004*"clinton" + 0.003*"new"'),
 (1,
  '0.008*"trump" + 0.006*"said" + 0.005*"clinton" + 0.005*"peopl" + 0.005*"state" + 0.004*"like" + 0.004*"year" + 0.004*"time" + 0.004*"new" + 0.003*"say"')]

In [None]:
# Refit the 2-topic LDA model with 80 passes; this is the model used for the topic assignment below.
# random_state seeds gensim's RNG so topic ids and weights are the same on every run;
# without it the two topics can swap ids between executions.
lda = models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=2,
    passes=80,
    random_state=42,
)
lda.print_topics()

[(0,
  '0.014*"trump" + 0.013*"said" + 0.007*"state" + 0.006*"presid" + 0.004*"peopl" + 0.004*"republican" + 0.004*"year" + 0.004*"say" + 0.004*"clinton" + 0.003*"new"'),
 (1,
  '0.008*"trump" + 0.006*"said" + 0.005*"clinton" + 0.005*"peopl" + 0.005*"state" + 0.004*"like" + 0.004*"year" + 0.004*"time" + 0.004*"new" + 0.003*"say"')]

In [None]:
# Topic mixture of every document under the fitted model.
corpus_transformed = lda[corpus]

# gensim leaves out topics whose probability is below the model's minimum_probability,
# so a document can come back with a single (topic_id, probability) pair. Index the
# first pair instead of unpacking exactly two, which would raise ValueError in that case.
first_topic_per_doc = [doc_topics[0] for doc_topics in corpus_transformed]
list(zip(first_topic_per_doc, data_dtm.index))

[((0, 0.9419708), 'source_x'), ((0, 0.0850779), 'source_y')]