In [2]:
import re
import numpy as np
import pandas as pd
from pprint import pprint
import nltk; 
nltk.download('stopwords')

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  
import matplotlib.pyplot as plt

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!




In [3]:
%matplotlib inline

In [4]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

In [5]:
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [6]:
from nltk.corpus import stopwords
# NLTK's English stop-word list plus a few extra filler tokens to discard.
stop_words = stopwords.words('english') + ['from', 'subject', 're', 'edu', 'use']

In [7]:
# Load the dataset; the free-text responses live in the `Narrative` column.
df = pd.read_json('English_data.json')

# Print the distinct narratives, then preview the first rows of the frame.
print(df['Narrative'].unique())
df.head()

['I have never experienced awe, I only ever have that effect on other people. I am an awe giver, not a reciever. People drop to there knees completely agog at my awe-somness, I get anything done. It is hard being this cool'
 'a feeling of overwhelmness and a feeling of spiritual sensation, a feeling of calm and serenity and being the only being in the whole universe'
 'A beautiful landscape on a fine day looking across at old vintage planes flying through the air'
 'The day my father was arrested by the police. It was an unpleasant experience. He was an alcoholic. He had verbally threatened to kill my mother repeatedly, I had no choice but to call n the authorities. My brothers were all busy with their own lives, working. In the end, I had to restrain my father to insert a handcuff on him. It was a good lesson.'
 'Graduated from university'
 'Seeing my mother hooked up to tubes after an operation for cancer. My mother had a life of pain, setbacks & hurt. She was born premature in 1931,

 'WHEN I SAW THIS BEAUTIFUL GIRL I WAS ABOUT TO BANG.  I WAS AT HOUSE APARTMENT AND SHE HAD A DECENT FACE AND A PHAT ASS.  AFTER WE TOOK OFF OUR CLOTHES WE BEGIN TO HAVE SEX.  IT WAS ONE OF THE BEST EXPERIENCES OF MY LIFE.']


Unnamed: 0,Unnamed: 1,Age,AgeNow,AgeThen,BirthPlace,Body_X1,Body_X2,Body_X3,Body_X4,Body_X5,...,X.6,X.7,X.8,X.9,country,gc,number,opp,rid,term
0,1,30,30,4,28,74.0,67.0,88.0,85.0,79.0,...,,30,,,UK,1,106,Qual961-0923Countries,1317082082,
1,2,55,55,5,28,92.0,83.0,92.0,294.0,285.0,...,,55,,,UK,1,107,Qual961-0923Countries,1309002101,
2,4,58,58,6,5,290.0,90.0,287.0,287.0,287.0,...,,58,,,Australia,1,110,Qual961-0923Countries,1316625902,
3,5,25,25,2,22,266.0,306.0,113.0,37.0,145.0,...,,25,,,Singapore,1,111,Qual961-0923Countries,1320724474,
4,6,35,35,4,22,88.169998,290.79,82.37,296.05,78.16,...,,35,,,Singapore,1,112,Qual961-0923Countries,1184419179,


In [8]:
# Convert the Narrative column to a plain Python list
data = df.Narrative.values.tolist()

# Remove e-mail-like tokens (any non-space run containing "@") and one
# trailing whitespace character. Raw strings are used so that \S and \s are
# passed to the regex engine instead of being parsed as invalid string escapes.
data = [re.sub(r'\S*@\S*\s?', '', sent) for sent in data]

# Collapse every run of whitespace (newlines, tabs, repeated spaces) into a single space
data = [re.sub(r'\s+', ' ', sent) for sent in data]

# Remove single quotes
data = [re.sub("'", "", sent) for sent in data]

pprint(data[:1])

['I have never experienced awe, I only ever have that effect on other people. '
 'I am an awe giver, not a reciever. People drop to there knees completely '
 'agog at my awe-somness, I get anything done. It is hard being this cool']


In [9]:
def sent_to_words(sentences):
    """Lazily tokenise each item of ``sentences``.

    Every item is converted with ``str()`` and passed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True``. One token list is
    yielded per input item, in input order.
    """
    for raw in sentences:
        # deacc=True strips accent marks from tokens; punctuation is already
        # discarded by the tokeniser itself.
        tokens = gensim.utils.simple_preprocess(str(raw), deacc=True)
        yield tokens

# Materialise the generator so the token lists can be reused by later cells.
data_words = [tokens for tokens in sent_to_words(data)]

print(data_words[:1])

[['have', 'never', 'experienced', 'awe', 'only', 'ever', 'have', 'that', 'effect', 'on', 'other', 'people', 'am', 'an', 'awe', 'giver', 'not', 'reciever', 'people', 'drop', 'to', 'there', 'knees', 'completely', 'agog', 'at', 'my', 'awe', 'somness', 'get', 'anything', 'done', 'it', 'is', 'hard', 'being', 'this', 'cool']]


In [10]:
# Bigram model fitted on the tokenised documents, plus its phraser wrapper.
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
bigram_mod = gensim.models.phrases.Phraser(bigram)

# Trigram model fitted on the bigram-transformed documents, plus its phraser wrapper.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# Sanity check: run the first document through both phrasers.
print(trigram_mod[bigram_mod[data_words[0]]])



['have', 'never', 'experienced', 'awe', 'only', 'ever', 'have', 'that', 'effect', 'on', 'other', 'people', 'am', 'an', 'awe', 'giver', 'not', 'reciever', 'people', 'drop', 'to', 'there', 'knees', 'completely', 'agog', 'at', 'my', 'awe', 'somness', 'get', 'anything', 'done', 'it', 'is', 'hard', 'being', 'this', 'cool']


In [11]:
def remove_stopwords(texts):
    """Tokenise each document and drop every token listed in ``stop_words``.

    Each element of ``texts`` is converted with ``str()`` and tokenised with
    ``simple_preprocess`` before filtering.

    Returns a list with one list of kept tokens per document, in input order.
    """
    # Build the lookup set once per call: testing membership against the
    # module-level ``stop_words`` list is a linear scan for every token.
    stop_set = set(stop_words)
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_set]
            for doc in texts]

def make_bigrams(texts):
    """Apply the module-level ``bigram_mod`` phraser to every document in ``texts``.

    Returns a list with one transformed document per input document.
    """
    merged = []
    for tokens in texts:
        merged.append(bigram_mod[tokens])
    return merged

def make_trigrams(texts):
    """Apply ``bigram_mod`` and then ``trigram_mod`` to every document in ``texts``.

    Returns a list with one transformed document per input document.
    """
    def _merge(tokens):
        return trigram_mod[bigram_mod[tokens]]

    return list(map(_merge, texts))

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatise tokenised documents, keeping only selected parts of speech.

    Each document's tokens are joined with single spaces and run through the
    module-level pipeline ``nlp``. The ``lemma_`` of every resulting token
    whose ``pos_`` is in ``allowed_postags`` is kept.

    Returns a list with one list of lemmas per input document.
    Tag reference: https://spacy.io/api/annotation
    """
    def _lemmas(tokens):
        parsed = nlp(" ".join(tokens))
        return [tok.lemma_ for tok in parsed if tok.pos_ in allowed_postags]

    return [_lemmas(tokens) for tokens in texts]

In [12]:
# Remove stop words
data_words_nostops = remove_stopwords(data_words)

# Apply the bigram phraser
data_words_bigrams = make_bigrams(data_words_nostops)

# Load the English pipeline by its full package name: the 'en' shortcut link
# was removed in spaCy 3, so spacy.load('en') fails there.
# NOTE(review): assumes the 'en' link pointed at en_core_web_sm — confirm.
# Parser and NER are disabled because lemmatization() only reads pos_ and lemma_.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Lemmatise, keeping only nouns, adjectives, verbs and adverbs
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])

[['never', 'experience', 'awe', 'ever', 'effect', 'people', 'awe', 'giver', 'reciever', 'people', 'drop', 'knee', 'completely', 'agog', 'awe', 'somness', 'get', 'anything', 'do', 'hard', 'cool']]


In [13]:
# Lemmatised documents are the input for both the dictionary and the corpus.
texts = data_lemmatized

# Dictionary built from the lemmatised documents.
id2word = corpora.Dictionary(texts)

# Convert every document with the dictionary's doc2bow.
corpus = list(map(id2word.doc2bow, texts))

print(corpus[:1])

[[(0, 1), (1, 1), (2, 3), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 2), (16, 1), (17, 1)]]


In [14]:
# Train the LDA topic model on the bag-of-words corpus with a fixed random_state.
lda_model = gensim.models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=50,
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=False,
)

In [15]:
# Result of indexing the model with the whole corpus.
doc_lda = lda_model[corpus]

# Pretty-print the topics returned by print_topics().
pprint(lda_model.print_topics())

[(0,
  '0.043*"rocket" + 0.042*"shoot" + 0.036*"desire" + 0.036*"reaction" + '
  '0.036*"apollo" + 0.034*"officer" + 0.030*"kenny" + 0.030*"oserv" + '
  '0.030*"smallness" + 0.030*"facial"'),
 (2,
  '0.039*"power" + 0.037*"continue" + 0.035*"select" + 0.032*"vacation" + '
  '0.032*"hill" + 0.027*"bone" + 0.021*"approach" + 0.020*"marrow" + '
  '0.019*"harbour" + 0.016*"experienced"'),
 (29,
  '0.081*"alone" + 0.072*"animal" + 0.034*"home" + 0.030*"lady" + '
  '0.025*"asleep" + 0.020*"native" + 0.017*"read" + 0.016*"comment" + '
  '0.016*"language" + 0.014*"computer"'),
 (6,
  '0.131*"dinner" + 0.100*"plan" + 0.054*"bill" + 0.038*"celebrate" + '
  '0.032*"infidelity" + 0.032*"heartfelt" + 0.032*"deed" + 0.023*"city" + '
  '0.019*"airport" + 0.019*"degree"'),
 (7,
  '0.093*"young" + 0.074*"age" + 0.059*"license" + 0.032*"man" + '
  '0.030*"automatic" + 0.030*"alosssssssssssg" + 0.030*"conquer" + '
  '0.030*"transmission" + 0.030*"aster" + 0.026*"nothing"'),
 (23,
  '0.061*"ground" + 0.04

In [21]:
# NOTE(review): this cell's execution count (21) is out of sequence with its
# neighbours (15 and 17) — re-run the notebook top to bottom before sharing.
pyLDAvis.enable_notebook()
# Prepare the visualisation from the trained model, the corpus and the dictionary.
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
# Bare last expression so the notebook displays the prepared object.
vis

In [17]:
def format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data):
    """Build a table holding the dominant topic of every document.

    Parameters
    ----------
    ldamodel
        Trained topic model. ``ldamodel[corpus]`` must yield, per document, a
        list of ``(topic_id, probability)`` pairs, and ``show_topic(topic_id)``
        must return ``(word, weight)`` pairs.
    corpus
        The documents to score, in the form accepted by ``ldamodel[...]``.
    texts
        The original texts, in the same order as ``corpus``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Dominant_Topic``, ``Perc_Contribution`` and
        ``Topic_Keywords``, followed by an unnamed column (label ``0``)
        holding ``texts``.

    Notes
    -----
    Rows are collected in a list and converted to a frame once, instead of
    calling ``DataFrame.append`` per document (quadratic, and removed in
    pandas 2.0). As a consequence ``Dominant_Topic`` keeps its integer dtype
    when every document has a topic. A document for which the model reports
    no topic gets a NaN/empty row so that later rows stay aligned with
    ``texts``.
    """
    keywords_by_topic = {}  # topic id -> comma-separated keyword string
    records = []

    for topic_dist in ldamodel[corpus]:
        if not topic_dist:
            # Keep a placeholder so row i still corresponds to texts[i].
            records.append((float('nan'), float('nan'), ''))
            continue

        # Highest-probability topic; ties resolve to the first pair listed.
        topic_num, prop_topic = max(topic_dist, key=lambda pair: pair[1])
        topic_num = int(topic_num)

        if topic_num not in keywords_by_topic:
            wp = ldamodel.show_topic(topic_num)
            keywords_by_topic[topic_num] = ", ".join([word for word, prop in wp])

        records.append((topic_num, round(float(prop_topic), 4), keywords_by_topic[topic_num]))

    sent_topics_df = pd.DataFrame(
        records, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'])

    # Append the original texts as the last column.
    contents = pd.Series(texts)
    return pd.concat([sent_topics_df, contents], axis=1)


# Dominant topic, its contribution and keywords for every document.
df_topic_sents_keywords = format_topics_sentences(lda_model, corpus, data)

# Expose the row position as an explicit column and use display-friendly headers.
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = [
    'Index',
    'Dominant Topic',
    'Topic Contribution',
    'Keywords',
    'Text',
]

df_dominant_topic.head(10)

Unnamed: 0,Index,Dominant Topic,Topic Contribution,Keywords,Text
0,0,12.0,0.4866,"land, vet, take, anything, result, manner, sli...","I have never experienced awe, I only ever have..."
1,1,18.0,0.3369,"house, move, may, wedding, clothe, film, decen...",a feeling of overwhelmness and a feeling of sp...
2,2,17.0,0.52,"watch, sit, show, come, plane, dream, get, hop...",A beautiful landscape on a fine day looking ac...
3,3,14.0,0.5822,"father, busy, call, cousin, win, mantle, micke...",The day my father was arrested by the police. ...
4,4,27.0,0.3022,"not, do, be, sister, lot, remember, maybe, leg...",Graduated from university
5,5,34.0,0.5715,"family, school, come, brother, die, mother, wo...",Seeing my mother hooked up to tubes after an o...
6,6,17.0,0.2817,"watch, sit, show, come, plane, dream, get, hop...",i went to Elvis Presley home in memphis.I coul...
7,7,32.0,0.5893,"small, last, surprise, afterwards, scene, summ...",I went to a local slate quarry that has a lago...
8,8,47.0,0.6277,"year, work, hard, bar, complete, get, next, co...",my first job in which I had been in for about ...
9,9,48.0,0.6891,"new, look, never, immediately, stop, right, go...",I was shopping in a garden centre with my moth...


In [18]:
# For each dominant topic, keep the single document with the highest contribution.
sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

# Collect the top row of every group and concatenate once, rather than
# growing a DataFrame with pd.concat inside the loop from an empty frame.
sent_topics_sorteddf_mallet = pd.concat(
    [grp.sort_values(['Perc_Contribution'], ascending=False).head(1)
     for _, grp in sent_topics_outdf_grpd],
    axis=0,
).reset_index(drop=True)

sent_topics_sorteddf_mallet.columns = ['Topic Index', "Topic Contribution", "Keywords", "Text"]

sent_topics_sorteddf_mallet.head(10)

Unnamed: 0,Topic Index,Topic Contribution,Keywords,Text
0,0.0,0.4464,"rocket, shoot, desire, reaction, apollo, offic...",I went to see the Foo Fighter at Milton Keynes...
1,1.0,0.5007,"star, night, tiny, clear, earth, planet, cold,...",As a child I was interested in astronomy. One ...
2,2.0,0.5232,"power, continue, select, vacation, hill, bone,...","I was in Thassos greece, I was approaching it ..."
3,3.0,0.5852,"try, money, even, take, almost, happy, pay, st...",I had resigned from my teaching job due to unf...
4,4.0,0.3597,"talk, receive, vehicle, cancer, awed, sadness,...",I can remember last year when the young lad di...
5,5.0,0.4243,"many, sun, future, rock, ray, photo, pm, cut, ...","Sunrise at Uluru (aka Ayers Rock). Alone, not ..."
6,6.0,0.2592,"dinner, plan, bill, celebrate, infidelity, hea...",i was with my boyfriend at the time he came ov...
7,7.0,0.5679,"young, age, license, man, automatic, alossssss...",when a very young man about my age new the bib...
8,8.0,0.7484,"realize, drive, view, camp, witness, point, va...",A magnificent view of the Wye Valley suddenly ...
9,9.0,0.5822,"amazed, believe, first, slide, anger, accord, ...",It was a few years back when I made my first t...
