# MODELING

In [4]:
from tqdm import tqdm
from collections import defaultdict
import gensim
from gensim.corpora import Dictionary
from gensim.models import Phrases
from gensim.models import LdaModel
import pyLDAvis.gensim
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from pymystem3 import Mystem
import pandas as pd
import geopandas as gpd
import folium
from folium.plugins import HeatMap
import seaborn as sns
%matplotlib inline

  regargs, varargs, varkwargs, defaults = inspect.getargspec(func)


# Select the neighborhood and load its data

In [5]:
# Neighborhood to analyse; the same string is used as folder name and file-name prefix.
name = 'Horoshevo-Mnevniki'

# District geometry (GeoJSON) of the selected neighborhood.
district_file = '../Data dive/dd2/{}/{}_districts.geojson'.format(name, name)
district = gpd.read_file(district_file)

# VK posts collected for the selected neighborhood.
posts_file = 'social_media/{}/vk.csv'.format(name)
neigh_posts = pd.read_csv(posts_file)

In [6]:
neigh_posts.head()

Unnamed: 0.1,Unnamed: 0,lat,lon,userId,date,text,place_id,hour,minutes,day,weekday,geometry
0,136248,55.764696,37.51426,160030952,1464539888,#Moscow #city @ Moscow City,6653,68400,38,29,6,POINT (37.514260355691 55.764695793995)
1,136877,55.769094,37.511651,30483977,1464506017,Отрабатываю новый конфиг) @ Swap Laboratories,6876,36000,13,29,6,POINT (37.511650971025 55.769094498953)
2,136878,55.769094,37.511651,46902271,1463880480,Активация пешего режима выполнена ✅<br>#дрифтг...,6876,14400,28,22,6,POINT (37.511650971025 55.769094498953)
3,156531,55.765229,37.471997,278967427,1464747833,"С днём рождения, Ночные волки!😊",10285,18000,23,1,2,POINT (37.471997378282 55.765228942948)
4,156532,55.765229,37.471997,2796905,1464736786,#27_лет_НВ,10285,7200,19,31,1,POINT (37.471997378282 55.765228942948)


In [7]:
neigh_posts.shape

(35251, 12)

# Group posts by user

In [8]:
def _join_user_posts(user_posts):
    """Concatenate every `text` value of one user's posts, separated by single spaces."""
    return ' '.join(user_posts['text'])


# One row per user: `userId` plus a `post` column holding all of that user's texts.
posts_by_user = neigh_posts.groupby('userId').apply(_join_user_posts)
vk_users = posts_by_user.to_frame('post').reset_index()

# How many users are there?

In [25]:
# Answer the question in the heading with an explicit count, then preview a
# few rows instead of rendering the whole frame.
print('Number of users: {}'.format(len(vk_users)))
vk_users.head(10)

Unnamed: 0,userId,post
0,321,December sun. Here you come. @ Zhivopisny Bridge
1,1271,У хороших людей частенько характеры не сахар. ...
2,1973,:)
3,2639,Ползут по дому. Один с воздушными шариками #Mo...
4,4503,#красивоерядом #москва #живописныймост @ Живоп...
5,6141,"В прошлом году не успели прилететь, в этом - о..."
6,6217,💜🐥💜 мои курочки [id7915541|Надежда Имильбаева]...
7,8006,12 июня - День принятия декларации о государст...
8,8463,"Давно хотела там побывать и вот, время пришло...."
9,9709,"Уставшая, но пятница ж, поехали на кальян) @ ..."


In [26]:
# Additional stop words: process_docs appends this list to NLTK's Russian and
# English stop-word lists. The comparison happens after lowercasing and
# lemmatization, so every entry has to be a lower-case lemma.
extra_words = ['http','br','id','com','www', 'instagram', 'vsco', 'https', 'instasize','repost',
              'whatsapp', 'вотсап', 'repostapp','маникюр', 'бровь', 'ресница', 'губа', 'instacollage', 'опубликовывать',
                'фото', 'москва', 'moscow']
def process_docs(docs):
    """
    Turn raw texts into cleaned, lemmatized token lists.

    Steps, in order:

    1. Lowercase each text and split it into runs of word characters.
    2. Drop every token that contains a digit.
    3. Lemmatize each remaining token with Mystem (first lemma returned).
    4. Drop stop words (NLTK Russian + English lists plus `extra_words`).
    5. Drop lemmas of three characters or fewer.

    The input sequence is not modified, so the calling cell can be re-run.

    Args:
    ----------
    docs: sequence of raw text strings, one per document.

    Returns:
    -------
    docs: list of token lists, one per input document, in input order.
    """
    m = Mystem()
    # Split the documents into tokens.
    tokenizer = RegexpTokenizer(r'\w+')
    # A set gives a constant-time membership test; the original scanned a list
    # once for every token.
    stops = set(stopwords.words('russian') + stopwords.words('english') + extra_words)

    # Lowercase and tokenize into a NEW list. The original wrote the token
    # lists back into `docs`, which overwrote the caller's array and made a
    # second run fail on `.lower()` of a list.
    docs = [tokenizer.tokenize(doc.lower()) for doc in tqdm(docs)]

    # Remove tokens that contain a digit.
    # NOTE(review): the second clause is only False for a token containing
    # 'id', 'club' AND 'ru' at once, so it filters almost nothing. It is kept
    # unchanged to preserve results; VK mention tokens such as id123 / club123
    # are already removed by the digit test. Confirm the intent before
    # tightening it ('and' would also drop words like "bridge").
    docs = [[token for token in doc
             if not any(c.isdigit() for c in token)
             and ('id' not in token or 'club' not in token or 'ru' not in token)]
            for doc in tqdm(docs)]

    # Lemmatize words. Each token is lemmatized on its own, so the result per
    # distinct token is cached and Mystem is called once per distinct token.
    lemma_cache = {}

    def lemma_of(token):
        if token not in lemma_cache:
            lemma_cache[token] = m.lemmatize(token)[0]
        return lemma_cache[token]

    docs = [[lemma_of(token) for token in doc] for doc in tqdm(docs)]
    # Remove stopwords
    docs = [[token for token in doc if token not in stops] for doc in tqdm(docs)]
    # Remove lemmas of three characters or fewer.
    docs = [[token for token in doc if len(token) > 3] for doc in tqdm(docs)]
    return docs
    
def get_corpus(docs, min_token_count=4, bigram_min_count=20):

    """Add bigrams to docs and create corpus and dictionary for training

    Args:
        docs: list of tokenized and cleaned texts (lists of string tokens);
            the lists themselves are not modified.
        min_token_count: a token is kept only if it occurs at least this many
            times across all docs (default 4, the original `> 3` rule).
        bigram_min_count: `min_count` passed to gensim `Phrases` (default 20).
    Returns:
        corpus: one bag-of-words per document, each a list of tuples where the
        first element is a token id and the second is the number of times that
        token occurs in that document
        dictionary: gensim.corpora.dictionary.Dictionary built from the
        filtered texts with the bigram tokens appended

    """

    # Count token occurrences over the whole collection.
    frequency = defaultdict(int)
    for text in tqdm(docs):
        for token in text:
            frequency[token] += 1

    # Drop rare tokens; this builds new lists, leaving `docs` untouched.
    texts = [[token for token in text if frequency[token] >= min_token_count] for text in tqdm(docs)]

    # Append tokens containing '_' from the Phrases output to each document,
    # so a document keeps its unigrams and also gains the bigram tokens.
    bigram = Phrases(texts, min_count=bigram_min_count)
    for idx in tqdm(range(len(texts))):
        # Collect first, then extend, so the list is never grown while the
        # Phrases output for it is still being read.
        joined_tokens = [token for token in bigram[texts[idx]] if '_' in token]
        texts[idx].extend(joined_tokens)

    # Create a dictionary representation of the documents.
    dictionary = Dictionary(texts)
    # Document-frequency pruning is left disabled, as in the original cell:
    #dictionary.filter_extremes(no_below=20, no_above=0.5)
    corpus = [dictionary.doc2bow(doc) for doc in tqdm(texts)]

    print('Number of unique tokens: {}'.format(len(dictionary)))
    print('Number of documents: {}'.format(len(corpus)))

    return corpus, dictionary

# Process texts to be ready for modeling

In [23]:
texts = vk_users['post'].copy()

In [24]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/korolevalarisa/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [27]:
docs = process_docs(texts.values)

100%|██████████| 13100/13100 [00:00<00:00, 35702.98it/s]
100%|██████████| 13100/13100 [00:00<00:00, 14781.72it/s]
100%|██████████| 13100/13100 [00:52<00:00, 249.83it/s]
100%|██████████| 13100/13100 [00:02<00:00, 5388.19it/s]
100%|██████████| 13100/13100 [00:00<00:00, 199664.17it/s]


In [28]:
corpus, dictionary = get_corpus(docs)

100%|██████████| 13100/13100 [00:00<00:00, 67880.99it/s]
100%|██████████| 13100/13100 [00:00<00:00, 103777.05it/s]
100%|██████████| 13100/13100 [00:01<00:00, 9987.72it/s] 
100%|██████████| 13100/13100 [00:00<00:00, 35443.56it/s]

Number of unique tokens: 8686
Number of documents: 13100





In [32]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


#The training model - we use online LDA model which allows to update the model 
#and the following parameters should be defined
num_topics = 10 # number of topics
chunksize = 1000 
passes = 10
iterations = 400
eval_every = 10  #evaluate model perplexity.

# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

%time model = LdaModel(corpus=corpus, id2word=id2word, chunksize=chunksize, \
                       alpha=0.001, update_every = 1, \
                       num_topics=num_topics,\
                       eval_every=eval_every, passes = passes)

2017-05-11 13:46:52,432 : INFO : using symmetric eta at 0.00011512779184895233
2017-05-11 13:46:52,435 : INFO : using serial LDA version on this node
2017-05-11 13:46:52,986 : INFO : running online LDA training, 10 topics, 10 passes over the supplied corpus of 13100 documents, updating model once every 1000 documents, evaluating perplexity every 10000 documents, iterating 50x with a convergence threshold of 0.001000
2017-05-11 13:46:52,987 : INFO : PROGRESS: pass 0, at document #1000/13100
2017-05-11 13:46:54,114 : INFO : merging changes from 1000 documents into a model of 13100 documents
2017-05-11 13:46:54,198 : INFO : topic #3 (0.001): 0.033*"серебряный" + 0.021*"живописный" + 0.012*"хомячок" + 0.012*"улица" + 0.012*"клетка_хомячок" + 0.011*"клетка" + 0.011*"живописный_улица" + 0.010*"zhivopisny_bridge" + 0.010*"zhivopisny" + 0.010*"bridge"
2017-05-11 13:46:54,200 : INFO : topic #0 (0.001): 0.026*"bridge" + 0.022*"zhivopisny_bridge" + 0.021*"zhivopisny" + 0.014*"улица" + 0.012*"живо

CPU times: user 1min 31s, sys: 1.62 s, total: 1min 32s
Wall time: 1min 33s


In [None]:
# import nltk
# nltk.download()

In [33]:
data = pyLDAvis.gensim.prepare(model, corpus, dictionary); # visualize lda topics
pyLDAvis.display(data)

# Assign topics to users

In [18]:
# model[corpus] yields, per document, a list of (topic_id, probability) pairs.
# Materialise it as a plain list so the column assignment below receives
# exactly one entry per user row.
docTopicProbMat = list(model[corpus])
lda_users = vk_users.copy()
lda_users['topics'] = docTopicProbMat
# Dominant topic = the pair with the highest probability. The original took
# x[0][0], i.e. the first listed topic regardless of its probability.
vk_users['topic'] = lda_users['topics'].apply(lambda x: max(x, key=lambda pair: pair[1])[0])

NameError: name 'model' is not defined

# Assign topics to their posts

In [14]:
# Build a userId -> topic lookup once and map every post through it. The
# original filtered the whole vk_users frame once per post.
user_topic = vk_users.set_index('userId')['topic']
neigh_posts['topic'] = neigh_posts['userId'].map(user_topic)

# Plot histograms of the topic distribution per post and per user

# Mapping

In [15]:
# Centroid of the first district geometry, computed once; x is the longitude
# and y the latitude that centre the maps below.
district_center = district.centroid[0]
center_lon, center_lat = district_center.x, district_center.y

In [16]:
def build_topic_map(topic_ids):
    """Build a folium map with one toggleable heat-map layer per topic.

    Reads the notebook globals `center_lat`, `center_lon`, `name`, `district`
    and `neigh_posts` (which must already have `lat`, `lon` and `topic` columns).

    Args:
        topic_ids: iterable of topic ids to draw; ids without any post are skipped.
    Returns:
        folium.Map holding the houses layer, the district boundary, the topic
        heat maps, one colour legend and a layer control.
    """
    topic_map = folium.Map([center_lat, center_lon], tiles='Stamen Toner', zoom_start=14,control_scale=True)

    #Define style for geojson objects
    style_function = lambda feature: dict(fillColor='#AECCAE',
                                          color='#AECCAE',
                                          weight=1,
                                          opacity=0.3)

    # Adding Houses
    # NOTE(review): the extension is spelled '.geogson' here but '.geojson' for
    # the districts file - confirm the file name on disk.
    houses = gpd.read_file('../Data dive/dd2/{}/{}_chruchevki.geogson'.format(name, name))
    points = folium.features.GeoJson(houses,name='Khurshevki houses')
    topic_map.add_child(points,name='Khurshevki houses')

    #Adding district
    polygon = folium.features.GeoJson(district, style_function=style_function,name='district boundary')
    topic_map.add_child(polygon,name='district boundary')

    # Gradient stops (fraction of maximum intensity -> colour), shared by the
    # heat maps and the legend.
    colormap_dict = {0.0: 'pink', 0.3: 'blue', 0.5: 'green',  1.0: 'red'}

    #Adding topics heatmaps
    drawn_topics = []
    for topic_id in topic_ids:
        # Filter once per topic instead of once per coordinate column.
        topic_posts = neigh_posts[neigh_posts.topic == topic_id]
        topic_coords = list(zip(topic_posts.lat, topic_posts.lon))
        if not topic_coords:
            # No posts for this topic: do not add an empty layer.
            continue

        HeatMap(topic_coords,
                name='Topic: {}'.format(topic_id),
                radius=10,
                min_opacity=0.8,
                gradient=colormap_dict).add_to(topic_map)
        drawn_topics.append(topic_id)

    # The original built a colormap per topic but never attached it to the map.
    # Attach one legend, with colours ordered by their gradient stop rather
    # than by dict iteration order.
    stops = sorted(colormap_dict)
    colormap = folium.LinearColormap(colors=[colormap_dict[stop] for stop in stops],
                                     index=stops, vmin=stops[0], vmax=stops[-1])
    colormap.caption = 'Topic: {}'.format(', '.join(str(topic_id) for topic_id in drawn_topics))
    colormap.add_to(topic_map)

    #Switch between layers
    folium.LayerControl().add_to(topic_map)
    return topic_map


map_places = build_topic_map([0])
map_places

In [21]:
map_places.save('topics.html')

NameError: name 'map_places' is not defined

In [23]:
# NOTE(review): this cell repeats the map-building code of the other map cells
# with a different topic list; a shared helper taking the topic ids would
# remove the duplication.
map_places = folium.Map([center_lat, center_lon], tiles='Stamen Toner', zoom_start=14,control_scale=True)

#Define style for geojson objects
style_function = lambda feature: dict(fillColor='#AECCAE',
                                      color='#AECCAE',
                                      weight=1,
                                      opacity=0.3)

# Adding Houses
# NOTE(review): the extension is spelled '.geogson' here but '.geojson' for the
# districts file - confirm the file name on disk.
houses = gpd.read_file('../Data dive/dd2/{}/{}_chruchevki.geogson'.format(name, name))
points = folium.features.GeoJson(houses,name='Khurshevki houses')
map_places.add_child(points,name='Khurshevki houses')

#Adding district
polygon = folium.features.GeoJson(district, style_function=style_function,name='district boundary')
map_places.add_child(polygon,name='district boundary')

# Gradient stops (fraction of maximum intensity -> colour), shared by the heat
# maps and the legend.
colormap_dict = {0.0: 'pink', 0.3: 'blue', 0.5: 'green',  1.0: 'red'}

#Adding topics heatmaps
drawn_topics = []
for topic_id in [4, 5, 6]:
    # Filter once per topic instead of once per coordinate column.
    topic_posts = neigh_posts[neigh_posts.topic == topic_id]
    topic_coords = list(zip(topic_posts.lat, topic_posts.lon))
    if not topic_coords:
        # No posts for this topic: do not add an empty layer.
        continue

    HeatMap(topic_coords,
            name='Topic: {}'.format(topic_id),
            radius=10,
            min_opacity=0.8,
            gradient=colormap_dict).add_to(map_places)
    drawn_topics.append(topic_id)

# The original built a colormap per topic but never attached it to the map.
# Attach one legend, with colours ordered by their gradient stop rather than by
# dict iteration order.
stops = sorted(colormap_dict)
colormap = folium.LinearColormap(colors=[colormap_dict[stop] for stop in stops],
                                 index=stops, vmin=stops[0], vmax=stops[-1])
colormap.caption = 'Topic: {}'.format(', '.join(str(topic_id) for topic_id in drawn_topics))
colormap.add_to(map_places)

#Switch between layers
folium.LayerControl().add_to(map_places)
map_places

In [25]:
# NOTE(review): this cell repeats the map-building code of the other map cells
# with a different topic list; a shared helper taking the topic ids would
# remove the duplication.
map_places = folium.Map([center_lat, center_lon], tiles='Stamen Toner', zoom_start=14,control_scale=True)

#Define style for geojson objects
style_function = lambda feature: dict(fillColor='#AECCAE',
                                      color='#AECCAE',
                                      weight=1,
                                      opacity=0.3)

# Adding Houses
# NOTE(review): the extension is spelled '.geogson' here but '.geojson' for the
# districts file - confirm the file name on disk.
houses = gpd.read_file('../Data dive/dd2/{}/{}_chruchevki.geogson'.format(name, name))
points = folium.features.GeoJson(houses,name='Khurshevki houses')
map_places.add_child(points,name='Khurshevki houses')

#Adding district
polygon = folium.features.GeoJson(district, style_function=style_function,name='district boundary')
map_places.add_child(polygon,name='district boundary')

# Gradient stops (fraction of maximum intensity -> colour), shared by the heat
# maps and the legend.
colormap_dict = {0.0: 'pink', 0.3: 'blue', 0.5: 'green',  1.0: 'red'}

#Adding topics heatmaps
drawn_topics = []
for topic_id in [7, 8, 9]:
    # Filter once per topic instead of once per coordinate column.
    topic_posts = neigh_posts[neigh_posts.topic == topic_id]
    topic_coords = list(zip(topic_posts.lat, topic_posts.lon))
    if not topic_coords:
        # No posts for this topic: do not add an empty layer.
        continue

    HeatMap(topic_coords,
            name='Topic: {}'.format(topic_id),
            radius=10,
            min_opacity=0.8,
            gradient=colormap_dict).add_to(map_places)
    drawn_topics.append(topic_id)

# The original built a colormap per topic but never attached it to the map.
# Attach one legend, with colours ordered by their gradient stop rather than by
# dict iteration order.
stops = sorted(colormap_dict)
colormap = folium.LinearColormap(colors=[colormap_dict[stop] for stop in stops],
                                 index=stops, vmin=stops[0], vmax=stops[-1])
colormap.caption = 'Topic: {}'.format(', '.join(str(topic_id) for topic_id in drawn_topics))
colormap.add_to(map_places)

#Switch between layers
folium.LayerControl().add_to(map_places)
map_places

In [26]:
# NOTE(review): this cell repeats the map-building code of the other map cells
# with a different topic list; a shared helper taking the topic ids would
# remove the duplication.
map_places = folium.Map([center_lat, center_lon], tiles='Stamen Toner', zoom_start=14,control_scale=True)

#Define style for geojson objects
style_function = lambda feature: dict(fillColor='#AECCAE',
                                      color='#AECCAE',
                                      weight=1,
                                      opacity=0.3)

# Adding Houses
# NOTE(review): the extension is spelled '.geogson' here but '.geojson' for the
# districts file - confirm the file name on disk.
houses = gpd.read_file('../Data dive/dd2/{}/{}_chruchevki.geogson'.format(name, name))
points = folium.features.GeoJson(houses,name='Khurshevki houses')
map_places.add_child(points,name='Khurshevki houses')

#Adding district
polygon = folium.features.GeoJson(district, style_function=style_function,name='district boundary')
map_places.add_child(polygon,name='district boundary')

# Gradient stops (fraction of maximum intensity -> colour), shared by the heat
# maps and the legend.
colormap_dict = {0.0: 'pink', 0.3: 'blue', 0.5: 'green',  1.0: 'red'}

#Adding topics heatmaps
# NOTE(review): the model above is trained with num_topics = 10, so topic 10
# presumably never occurs (ids would run 0-9) and topic 9 is already drawn in
# the previous map cell - confirm which topics this cell was meant to show.
drawn_topics = []
for topic_id in [9, 10]:
    # Filter once per topic instead of once per coordinate column.
    topic_posts = neigh_posts[neigh_posts.topic == topic_id]
    topic_coords = list(zip(topic_posts.lat, topic_posts.lon))
    if not topic_coords:
        # No posts for this topic: do not add an empty layer.
        continue

    HeatMap(topic_coords,
            name='Topic: {}'.format(topic_id),
            radius=10,
            min_opacity=0.8,
            gradient=colormap_dict).add_to(map_places)
    drawn_topics.append(topic_id)

# The original built a colormap per topic but never attached it to the map.
# Attach one legend, with colours ordered by their gradient stop rather than by
# dict iteration order.
stops = sorted(colormap_dict)
colormap = folium.LinearColormap(colors=[colormap_dict[stop] for stop in stops],
                                 index=stops, vmin=stops[0], vmax=stops[-1])
colormap.caption = 'Topic: {}'.format(', '.join(str(topic_id) for topic_id in drawn_topics))
colormap.add_to(map_places)

#Switch between layers
folium.LayerControl().add_to(map_places)
map_places