In [1]:
import os
import sys
import re
from convokit import Corpus
from tqdm.notebook import tqdm, trange
import time
from collections import defaultdict, Counter
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
import string
import contractions
from spacy.lang.en.stop_words import STOP_WORDS
from nltk.stem import WordNetLemmatizer 
import multiprocessing
# NOTE(review): num_cores is not referenced by any cell visible in this notebook — confirm it is still needed.
num_cores = 10

import spacy
# Load the small English spaCy pipeline with the 'parser' and 'ner' components disabled.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
# Raise the pipeline's maximum accepted text length (spaCy measures this in characters).
nlp.max_length = 1800000
# Directory that holds the "wikiconv-2018" corpus loaded below.
# NOTE(review): both paths are absolute and machine-specific; they must be edited to run elsewhere.
home_dir = "/Volumes/Extrema/wiki/conv_data"
# Directory where cleaned text files and LDA artifacts are written and read.
model_dir = "/Users/Lara/Documents/Stanford/Research/Network/processed_data/lda_models"


In [2]:
# Load the ConvoKit corpus stored at <home_dir>/wikiconv-2018; every later cell reads from wiki_corpus.
wiki_corpus = Corpus(filename=os.path.join(home_dir, "wikiconv-2018"))



## Preparing Text for Topic Modeling

In [3]:
# Tokenizer and lemmatizer used by the NLTK code path of clean_text.
tokenizer = TweetTokenizer()
lemmatizer = WordNetLemmatizer()

# spaCy's English stop words extended with terms that are ubiquitous on Wikipedia talk pages.
custom_stop_words = ['wiki', 'wikipedia', 'hi', 'page', 'talk', 'article']
stop_words = set(STOP_WORDS).union(custom_stop_words)

# string.punctuation does not contain these dash, ellipsis, curly-quote and bullet
# characters, and no better predefined list exists, so they are appended by hand.
punctuation = string.punctuation + '–...…’“”•'

In [5]:
import pandas as pd
# Read the bot accounts from CSV; the identifiers live in the column labelled '0'.
bots_df = pd.read_csv('/Users/Lara/Documents/Stanford/Research/Network/processed_data/all_bots.csv')
# Plain Python list of bot identifiers, compared against utt.speaker.id in parse_by_id.
# NOTE(review): membership tests on a list are linear; a set would be faster if nothing relies on list order.
bots = bots_df['0'].to_list()

LDA expects documents to be in a list format, where each element is a document. Given the short nature of comments, a reasonable unit of document segmentation is either a page or a single conversation. The former tells us roughly what kinds of conversations take place on talk pages, while the latter captures the minute details of each conversation. So let's go with the former first. We'll create two LDA models, one for user talk pages and one for talk pages, for each year.

In [4]:
def clean_text(text, package):
    """
    Expand contractions, lower-case, tokenize and lemmatize a piece of text.

    Only tokens made up solely of the letters a-z are kept. A token is dropped
    when either its surface form or its lemma is in ``stop_words``, so that
    inflected forms of a stop word (e.g. "pages" -> "page") are removed too.

    Parameters
    ----------
    text : str
        Text to be tokenized and lemmatized
    package : str
        One of "nltk" or "spacy". Specifies which package to use for lemmatization.
        NLTK is faster, spaCy is more accurate and takes POS into account.
    Returns
    -------
    cleaned_toks : list of str
        A list of cleaned (lemmatized) tokens
    Raises
    ------
    ValueError
        If ``package`` is neither "nltk" nor "spacy".
    """
    # Fail fast: previously an unknown package fell through both branches and
    # surfaced as an UnboundLocalError on the return statement.
    if package not in ('spacy', 'nltk'):
        raise ValueError('package must be "nltk" or "spacy", got %r' % (package,))

    try:
        text = contractions.fix(text)
    except IndexError as e:
        # Best effort: contractions.fix raises IndexError on some inputs;
        # report it and continue with the unexpanded text.
        print("%s led to an IndexError" % text)
        print(e)
    text = text.lower().strip()

    is_word = re.compile('^[a-z]+$').match
    cleaned_toks = []
    if package == 'spacy':
        for tok in nlp(text):
            surface = tok.text
            if not is_word(surface) or surface in stop_words:
                continue
            lemma = tok.lemma_
            if lemma not in stop_words:
                cleaned_toks.append(lemma)
    else:
        try:
            for surface in tokenizer.tokenize(text):
                if not is_word(surface) or surface in stop_words:
                    continue
                lemma = lemmatizer.lemmatize(surface)
                if lemma not in stop_words:
                    cleaned_toks.append(lemma)
        except Exception as e:
            # Deliberate best effort: report the problem and return the tokens
            # collected so far instead of aborting a long corpus pass.
            print(e)
    return cleaned_toks

In [17]:
def parse_by_id(id_type, remove_bots):
    """
    Clean and separate all utterances into documents for topic modeling

    Every utterance in ``wiki_corpus`` is cleaned with ``clean_text`` (spaCy
    lemmatization) and its tokens are appended to the document it belongs to.

    Parameters
    ----------
    id_type : str
        The type of identifier to be used for document separation, one of
        page_id or conversation_id
    remove_bots : bool
        If true, remove comments made by bots (speaker ids listed in ``bots``)
    Returns
    -------
    id2texts : dict of {str : list of str}
        A dictionary mapping document IDs, whose type is specified by id_type,
        to a list of tokens
    Raises
    ------
    NotImplementedError
        If ``id_type`` is not one of the supported identifier types.
    """
    # Validate before the loop so a typo is reported immediately, not after the
    # first utterance has already been run through spaCy.
    if id_type not in ('page_id', 'conversation_id'):
        raise NotImplementedError("Document ID type not recognized")

    # Build the lookup once: set membership is O(1), whereas the original list
    # scan ran once per utterance. ``bots`` is only touched when needed.
    bot_ids = set(bots) if remove_bots else frozenset()

    # Materialized as a list so tqdm knows the total and can show progress.
    utterances = list(wiki_corpus.iter_utterances())
    id2texts = defaultdict(list)
    for utt in tqdm(utterances):
        if remove_bots and utt.speaker.id in bot_ids:
            continue
        tokens = clean_text(utt.text, 'spacy')
        if not tokens:
            # Nothing survived cleaning; do not create an empty document entry.
            continue
        conversation = utt.get_conversation()
        if id_type == 'page_id':
            doc_id = conversation.meta[id_type]
        else:
            doc_id = conversation.id
        id2texts[doc_id].extend(tokens)
    return id2texts

def separate_by_type(id2texts, id_type):
    """
    Split documents into groups according to the talk-page type they come from.

    Parameters
    ----------
    id2texts : dict of {str : list of str}
        Mapping from document ID to the document's list of tokens
    id_type : str
        The identifier type the keys of id2texts use, one of
        page_id or conversation_id
    Returns
    -------
    (usertalks, talks, projtalks) : Tuple of (list of list of str, list of list of str, list of list of str)
        Three lists holding, in this order, the user talk documents, the talk
        documents and the project talk documents. Documents whose page type is
        none of these are left out.
    """
    # Pick how a conversation is turned into a document ID.
    if id_type == 'page_id':
        def doc_key(conv):
            return conv.meta[id_type]
    elif id_type == 'conversation_id':
        def doc_key(conv):
            return conv.id
    else:
        raise NotImplementedError("Document ID type not recognized")

    # The first conversation seen for a document ID decides its page type.
    page_type_of = {}
    for conv in tqdm(wiki_corpus.iter_conversations()):
        key = doc_key(conv)
        if key not in page_type_of:
            page_type_of[key] = conv.meta['page_type']

    buckets = {'user_talk': [], 'talk': [], 'wikipedia_talk': []}
    for doc_id, tokens in id2texts.items():
        bucket = buckets.get(page_type_of[doc_id])
        if bucket is not None:
            bucket.append(tokens)

    return (buckets['user_talk'], buckets['talk'], buckets['wikipedia_talk'])

## Processing by page

In [None]:
# Build one token list per page (bot comments excluded), then split the pages by talk-page type.
pageid2texts = parse_by_id('page_id', remove_bots=True)
usertalks, talks, projtalks = separate_by_type(pageid2texts, 'page_id')

In [None]:
# Persist the page-level documents: one file per page type, one
# space-joined document per line.
for out_name, documents in (
    ('cleaned_talks_human.txt', talks),
    ('cleaned_user_talks_human.txt', usertalks),
    ('cleaned_proj_talks_human.txt', projtalks),
):
    with open(os.path.join(model_dir, out_name), 'w') as f:
        for doc in documents:
            f.write(' '.join(doc) + '\n')


## Processing by conversation

In [20]:
# Build one token list per conversation (bot comments excluded), then split by talk-page type.
# NOTE(review): despite its name, pageid2texts is keyed by conversation ID here, and this cell
# overwrites the page-level pageid2texts / usertalks / talks / projtalks produced earlier.
pageid2texts = parse_by_id('conversation_id', remove_bots = True)
usertalks, talks, projtalks = separate_by_type(pageid2texts, 'conversation_id')

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5002011.0), HTML(value='')))

İlk WP maddemi açtım: Kurtuluş belgesel filmi! Lütfen biraz yardım et bana. Öncelikle maddeyi arayanların kolay bulması için birşeyler yapalım (Kurtuluş-Kurtulus vb). Çok ilginçtir, gemiyi ararken bile "Kurtuluş gemisi" yazınca çıkmıyor, SS yazmak ve "ş" kullanmak gerekiyor! ''Yani o madde de yardım bekliyor''. Hadi beni kutlamak yerine '''develop this article'''. Thanks in advance.  
Hayırlı olsun ) Aynı isimde başka madde yoksa parantez kullanımı gereksiz madde adlarında. Dolayısıyla parantezli kısmın kaldırılması gerek madde adından. Yine film, İngilizce gösterim adıyla adlandırılmalı. Bu film İngilizce olarak vizyona girdi mi? IMDb'de farklı bir adlandırılma yapılmış mesela. Aksi bir durum yoksa filmin orijinal adı olan Türkçe adı tercih edilmeli. Gemi maddesine gelirsek, sadece "Kurtuluş" adı alabilecek iki bir madde var burada: İstanbul'daki Kurtuluş semti ve Silifke'nin köyü olan Kurtuluş. Aklımıza ilk semt geldiğinden sade ad olan "Kurtuluş" adı, semti anlatan maddeye verilmiş,

 Draft:HİNDARX ŞƏHİDLƏRİ, a page which you created or substantially contributed to, has been nominated for deletion. Your opinions on the matter are welcome; you may participate in the discussion by adding your comments at Wikipedia:Miscellany for deletion/Draft:HİNDARX ŞƏHİDLƏRİ and please be sure to sign your comments with four tildes (~~~~). You are free to edit the content of Draft:HİNDARX ŞƏHİDLƏRİ during the discussion but should not remove the miscellany for deletion template from the top of the page; such a removal will not end the deletion discussion. Thank you.   led to an IndexError
string index out of range
Evlidir,üç qızı bir  oğlu, altı  nəvəsi var(İlham,Kənan,Murad,Mələk,Ziya,Elmir).
Xanımı Məmmədova Rəna İbrahim qızı orta məktəbdə müəllimədir.Böyük qızı Məmmədova (Mayılova) Aygün ixtisasca həkimdir ,alimlik dərəcəsi müdafiə etmişdir tibb üzrə fəlsəfə doktorudur;2-ci qızı Məmmədova Ülkər ixtisasca stomatoloqdur,tibb kollecində müəllimə kimi calışır;3-cü qızı Məmmədova (A

HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




In [21]:
model_dir = "/Users/Lara/Documents/Stanford/Research/Network/processed_data/lda_models"

# Persist the conversation-level documents: one file per page type, one
# space-joined document per line.
for out_name, documents in (
    ('talks_by_conv_human.txt', talks),
    ('user_talks_by_conv_human.txt', usertalks),
    ('proj_talks_by_conv_human.txt', projtalks),
):
    with open(os.path.join(model_dir, out_name), 'w') as f:
        for doc in documents:
            f.write(' '.join(doc) + '\n')


### Efficient Lemmatizing Using Batch SpaCy NLP Pipeline

## Running Topic Models

In [9]:
def load_texts(model_dir, talk_fn, user_talk_fn, proj_talk_fn):
    """
    Read previously cleaned documents back from disk so the corpus does not
    have to be processed again.

    Each file holds one document per line with whitespace-separated tokens.

    Parameters
    ----------
    model_dir : str
        Directory that contains the three text files
    talk_fn : str
        File name (relative to model_dir) of the cleaned talk pages
    user_talk_fn : str
        File name (relative to model_dir) of the cleaned user talk pages
    proj_talk_fn : str
        File name (relative to model_dir) of the cleaned project talk pages
    Returns
    -------
    (talks, usertalks, projtalks) : Tuple of lists
        Three lists of documents, each document being a list of tokens.
        A blank line yields an empty document.
    """
    def _read_docs(filename):
        # One document per line; split() tokenizes on any whitespace.
        with open(os.path.join(model_dir, filename)) as handle:
            return [line.split() for line in handle]

    return (_read_docs(talk_fn), _read_docs(user_talk_fn), _read_docs(proj_talk_fn))

# Reload the conversation-level documents from the files written above; this replaces the in-memory lists.
talks, usertalks, projtalks = load_texts(model_dir, "talks_by_conv_human.txt", "user_talks_by_conv_human.txt", "proj_talks_by_conv_human.txt")


In [22]:
def generate_lda(docs, num_topics, passes, fn, model_dir, workers=9, random_state=None):
    """
    Workhorse function for generating LDA models.

    Builds a gensim dictionary and bag-of-words corpus from ``docs``, trains an
    ``LdaMulticore`` model, and writes three files to ``model_dir``:
    ``<fn>_corpus.pkl``, ``<fn>_dictionary.gensim`` and ``<fn>_model.gensim``.

    Parameters
    ----------
    docs : list of list of str
        A list of documents, where each item is a document given as a list of tokens
    num_topics : int
        The number of topics to train
    passes : int
        The number of times to iterate over the corpus
    fn : str
        The file prefix used for saving all corresponding files produced
        in the generation of topic models
    model_dir : str
        Path to output directory for LDA models and other auxiliary files
    workers : int, optional
        Number of worker processes passed to LdaMulticore (default 9, the
        previously hard-coded value)
    random_state : int or None, optional
        Seed passed to LdaMulticore; set it to make training reproducible
        (default None keeps the previous unseeded behavior)
    """
    dictionary = corpora.Dictionary(docs)
    corpus = [dictionary.doc2bow(doc) for doc in docs]
    # Use a context manager so the pickle file is flushed and closed
    # deterministically instead of leaking the handle.
    with open(os.path.join(model_dir, fn + '_corpus.pkl'), 'wb') as corpus_file:
        pickle.dump(corpus, corpus_file)
    dictionary.save(os.path.join(model_dir, fn + '_dictionary.gensim'))
    model = gensim.models.LdaMulticore(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=passes,
        workers=workers,
        random_state=random_state,
    )
    model.save(os.path.join(model_dir, fn + '_model.gensim'))
    

In [23]:
# generate_lda looks up gensim, corpora and pickle at call time, so this cell must run before any training cell.
import gensim
from gensim import corpora
import pickle
import jupyternotify
# Register jupyternotify's magics on the running IPython shell so the %%notify cell magic is available.
ip = get_ipython()
ip.register_magics(jupyternotify.JupyterNotifyMagics)

# Number of topics used for the user-talk and talk models below.
NUM_TOPICS = 15

In [27]:
%%notify
# Train and save the user-talk model (files prefixed 'usertalk_by_conv_human' in model_dir).
generate_lda(docs=usertalks, num_topics=NUM_TOPICS, passes=10, fn='usertalk_by_conv_human', model_dir=model_dir)


In [26]:
# Train and save the talk-page model (files prefixed 'talk_by_conv_human' in model_dir).
generate_lda(docs=talks, num_topics=NUM_TOPICS, passes=10, fn='talk_by_conv_human', model_dir=model_dir)


In [28]:
# Train and save the project-talk model (files prefixed 'projtalk_by_conv_human' in model_dir).
# NOTE(review): uses 5 topics and 5 passes instead of NUM_TOPICS and 10 as for the other page types —
# presumably because this corpus is smaller; confirm and consider naming the constants.
generate_lda(docs=projtalks, num_topics=5, passes=5, fn='projtalk_by_conv_human', model_dir=model_dir)