In [2]:
import numpy as np
import pandas as pd

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import spacy
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en.stop_words import STOP_WORDS
# import en_core_web_lg

from tqdm import tqdm_notebook as tqdm
from pprint import pprint

In [11]:
df = pd.read_csv('tweets.csv')
docs = df['tweets']

In [12]:
docs

0        ENGLISH TRANSLATION: 'A MESSAGE TO THE TRUTHFU...
1        ENGLISH TRANSLATION: SHEIKH FATIH AL JAWLANI '...
2        ENGLISH TRANSLATION: FIRST AUDIO MEETING WITH ...
3        ENGLISH TRANSLATION: SHEIKH NASIR AL WUHAYSHI ...
4        ENGLISH TRANSLATION: AQAP: 'RESPONSE TO SHEIKH...
5        THE SECOND CLIP IN A DA'WAH SERIES BY A SOLDIE...
6        ENGLISH TRANSCRIPT : OH MURABIT! : http://t.co...
7        ENGLISH TRANSLATION: 'A COLLECTION OF THE WORD...
8        Aslm Please share our new account after the pr...
9        ENGLISH TRANSLATION: AQAP STATEMENT REGARDING ...
10       @KhalidMaghrebi @seifulmaslul123 @CheerLeadUni...
11       NEW LINK, AFTER PREVIOUS ONE TAKEN DOWN:AQAP-'...
12       ENGLISH TRANSLATION- SHEIKH ABU HASAN AL KUWAI...
13       SHEIKH ABU HASSAN AL KUWAITI (HA): ADVICE OF N...
14       @IbnNabih1 @MuwMedia @Dawlat_islam7 Not transl...
15       Aslm, anybody translating the new JN video? Wi...
16       @IbnNabih1 @KhalidMaghrebi_ @MuwMedia @Polder_.

In [87]:
import re
def label_lng (row):
    if "ENGLISH TRANSLATION:" in row['tweets']:
        return 'ar'
    elif re.match (r'[\u0621-\u064A]', row['tweets']):
        return 'ar'
    else:
        return 'en'
    
def check_readable (row):
    if len(row["txt"]) > 2 and row["lng"] == "en":
        return 1
    else: return 0
    
    
df['lng'] = df.apply (lambda row: label_lng(row), axis=1)
df['txt'] = df['tweets'].str.replace('http\S+|www.\S+', '', case=False)
df['txt'] = df['txt'].str.replace('(\#|\@)([\w]+)', '', case=False, regex=True)

df['txt'] = df['txt'].apply(lambda x: x.replace("ENGLISH TRANSLATION: ", ''))

df['txt'] = df['txt'].str.replace('\n', ' ', case=False, regex=True)
df['txt'] = df['txt'].str.replace('\s+', ' ', case=False, regex=True)
df['rb'] = df.apply (lambda row: check_readable(row), axis=1)

In [89]:
df.iloc[500]

name                                                  The Witnesser
username                                              Bajwa47online
description       The Dust Will Never Settle Down  \n           ...
location                                                        NaN
followers                                                       427
numberstatuses                                                 1953
time                                               10/25/2015 17:10
tweets            @Jihad67a akhi, ya'ni now, after punishment in...
lng                                                              en
txt                akhi, ya'ni now, after punishment in dunya, t...
rb                                                                1
Name: 500, dtype: object

In [90]:
nlp= spacy.load("en")

In [91]:
from spacy.lang.en.stop_words import STOP_WORDS
stop_list = []

nlp.Defaults.stop_words.update(stop_list)


# # Iterates over the words in the stop words list and resets the "is_stop" flag.
# for word in STOP_WORDS:
#     lexeme = nlp.vocab[word]
#     lexeme.is_stop = True
for word in STOP_WORDS:
    lexeme = nlp.vocab[word]
    lexeme.is_stop = True

In [92]:
def lemmatizer(doc):
    # This takes in a doc of tokens from the NER and lemmatizes them. 
    # Pronouns (like "I" and "you" get lemmatized to '-PRON-', so I'm removing those.
    doc = [token.lemma_ for token in doc if token.lemma_ != '-PRON-']
    doc = u' '.join(doc)
    return nlp.make_doc(doc)
    
def remove_stopwords(doc):
    # This will remove stopwords and punctuation.
    # Use token.text to return strings, which we'll need for Gensim.
    doc = [token.text for token in doc if token.is_stop != True and token.is_punct != True]
    return doc

# The add_pipe function appends our functions to the default pipeline.
nlp.add_pipe(lemmatizer,name='lemmatizer',after='ner')
nlp.add_pipe(remove_stopwords, name="stopwords", last=True)

In [93]:
sel = df.loc[df['rb'] == 1, 'txt']

In [94]:
doc_list = []
# Iterates through each article in the corpus.
for doc in tqdm(sel):
    # Passes that article through the pipeline and adds to a new list.
    pr = nlp(doc)
    doc_list.append(pr)

HBox(children=(IntProgress(value=0, max=16803), HTML(value='')))




In [95]:
# 17222 texts
# 16 803
doc_list[9]

['  ', 'half', 'way', 'day', 'inshallah', 'busy']

In [96]:
words = corpora.Dictionary(doc_list)
corpus = [words.doc2bow(doc) for doc in doc_list]

In [97]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=words,
                                           num_topics=10, 
                                           random_state=2,
                                           update_every=1,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [99]:
pprint(lda_model.print_topics(num_words=10))

[(0,
  '0.029*"|" + 0.026*"في" + 0.020*"من" + 0.015*"IED" + 0.014*"📷" + '
  '0.013*"egyptian" + 0.011*"على" + 0.010*"  " + 0.008*"▪" + 0.007*"الدولة"'),
 (1,
  '0.046*"Al" + 0.019*"state" + 0.018*"Nusra" + 0.018*"islamic" + 0.015*"ally" '
  '+ 0.015*"Ahrar" + 0.013*"Sham" + 0.010*"Abu" + 0.008*"al" + 0.008*"||"'),
 (2,
  '0.018*"position" + 0.018*"east" + 0.016*"south" + 0.015*"😂" + '
  '0.014*"kurdish" + 0.013*"eastern" + 0.013*"Northern" + 0.012*"dozen" + '
  '0.012*"Fallujah" + 0.012*"militia"'),
 (3,
  '0.017*"release" + 0.015*"Abu" + 0.014*"Al" + 0.013*"apostate" + '
  '0.011*"member" + 0.010*"Qaeda" + 0.009*"link" + 0.008*"pro" + '
  '0.008*"gathering" + 0.008*"lol"'),
 (4,
  '0.027*"RT" + 0.012*"الله" + 0.009*"body" + 0.009*"Shia" + 0.009*"10" + '
  '0.009*"account" + 0.008*"massive" + 0.007*"government" + 0.007*"ask" + '
  '0.007*"footage"'),
 (5,
  '0.126*"  " + 0.026*"ISIS" + 0.026*"RT" + 0.016*"State" + 0.016*"Islamic" + '
  '0.014*"Assad" + 0.013*"amp" + 0.013*"fight" + 0.0

In [104]:
def get_topic(intext):
    bow = nlp(intext)
    lis = lda_model.get_document_topics(words.doc2bow(bow))
    res  = max(lis, key=lambda x:x[1])
    return res[0]
    

In [105]:
# sel
get_topic(sel.iloc[10])

5

In [106]:
df['topic'] = df.apply (lambda row: get_topic(row["txt"]), axis=1)

In [108]:
df[['username', 'txt', 'rb', 'topic']] 

Unnamed: 0,username,txt,rb,topic
0,GunsandCoffee70,'A MESSAGE TO THE TRUTHFUL IN SYRIA - SHEIKH A...,0,1
1,GunsandCoffee70,SHEIKH FATIH AL JAWLANI 'FOR THE PEOPLE OF INT...,0,1
2,GunsandCoffee70,FIRST AUDIO MEETING WITH SHEIKH FATIH AL JAWLA...,0,1
3,GunsandCoffee70,"SHEIKH NASIR AL WUHAYSHI (HA), LEADER OF AQAP:...",0,1
4,GunsandCoffee70,AQAP: 'RESPONSE TO SHEIKH BAGHDADIS STATEMENT ...,0,1
5,GunsandCoffee70,THE SECOND CLIP IN A DA'WAH SERIES BY A SOLDIE...,1,9
6,GunsandCoffee70,ENGLISH TRANSCRIPT : OH MURABIT! :,1,3
7,GunsandCoffee70,'A COLLECTION OF THE WORDS OF THE U'LAMA REGAR...,0,5
8,GunsandCoffee70,Aslm Please share our new account after the pr...,1,4
9,GunsandCoffee70,AQAP STATEMENT REGARDING THE BLESSED RAID IN F...,0,6


In [109]:
df.to_csv('result.csv')