In [18]:
import nltk
# Fetch the WordNet corpus that the lemma helpers below look words up in.
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
def get_lemma(word):
    """Return the WordNet base form of ``word``, or ``word`` itself when ``wn.morphy`` finds none."""
    base_form = wn.morphy(word)
    return word if base_form is None else base_form
    
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    """Lemmatize ``word`` with a freshly created ``WordNetLemmatizer``."""
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word)
# Fetch the stop-word lists, then keep the English one as a set for fast membership tests.
nltk.download('stopwords')
en_stop = {stop_word for stop_word in nltk.corpus.stopwords.words('english')}

[nltk_data] Downloading package wordnet to C:\Users\Ryan
[nltk_data]     Hoff\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Ryan
[nltk_data]     Hoff\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [19]:
import numpy as np
import pandas as pd
import string
import re

In [20]:
# Load the raw export; the cells below read and clean its 'comments' column.
df = pd.read_csv('2018-09-comments with ents.csv')

In [21]:
## Text Pre-proc

### 1. Remove rows whose comment is just the [removed] or [deleted] placeholder

# Drop rows whose comment is exactly the "[removed]" placeholder.
mask_removed = df['comments'].eq('[removed]')
df = df[~mask_removed].reset_index(drop=True)

# Drop rows whose comment is exactly the "[deleted]" placeholder.
mask_deleted = df['comments'].eq('[deleted]')
df = df[~mask_deleted].reset_index(drop=True)

### 2. Convert to lower case

# Lower-case every comment; the patterns used in later steps are all lower-case.
df['comments'] = df['comments'].map(lambda text: text.lower())

### 3. Replace newline and carriage return characters.

import re

def replace_newline(s):
    """Return ``s`` with every newline and carriage return turned into a single space."""
    return s.replace('\n', ' ').replace('\r', ' ')

# Apply the line-break cleanup to every comment.
df['comments'] = df['comments'].map(replace_newline)

### 4. Expand common English contractions into full words

def replace_slang(s):
    '''Expand common English contractions in ``s`` (e.g. "i've" -> "i have").

    Matching is case-sensitive, all patterns are lower-case, and each pattern
    is anchored on word boundaries. Returns the expanded string.
    '''
    # Each contraction sits next to its expansion so the two can never drift
    # out of step. The previous parallel lists were missing the commas after
    # "we're" and "that's"; implicit string concatenation merged neighbouring
    # entries and shifted every later expansion (e.g. "i've" became
    # "they are", "didn't" became "they have").
    expansions = (
        ("i'd", "i would"),
        ("you'd", "you would"),
        ("we'd", "we would"),
        ("they'd", "they would"),
        ("i'll", "i will"),
        ("you'll", "you will"),
        ("we'll", "we will"),
        ("they'll", "they will"),
        ("i'm", "i am"),
        ("y'all", "you all"),
        ("you're", "you are"),
        ("we're", "we are"),
        ("they're", "they are"),
        ("i've", "i have"),
        ("you've", "you have"),
        ("we've", "we have"),
        ("they've", "they have"),
        ("didn't", "did not"),
        ("shouldn't", "should not"),
        ("couldn't", "could not"),
        ("wouldn't", "would not"),
        ("won't", "will not"),
        ("can't", "cannot"),
        ("mustn't", "must not"),
        ("ain't", "am not"),
        ("isn't", "is not"),
        ("that's", "that is"),
        ("doesn't", "does not"),
        ("it'd", "it would"),
    )

    for contraction, expansion in expansions:
        s = re.sub(r"\b%s\b" % contraction, expansion, s)

    return s

# Expand contractions in every comment.
df['comments'] = df['comments'].map(replace_slang)

### 4. Remove urls

# remove websites from comments
import re

def remove_urls(s):
    """Strip URLs and bare domain names from ``s`` and collapse runs of spaces.

    Two kinds of text are replaced by a single space:
      * everything from ``http`` up to the next whitespace (this already
        covers ``https``), and
      * any whitespace-delimited chunk containing ``.com``, ``.us`` or
        ``.co.uk`` as a complete suffix (i.e. followed by a word boundary).

    Returns the cleaned string. Leading/trailing spaces are not stripped.
    """
    # Raw strings throughout: "\s" inside a normal literal is an invalid
    # escape sequence.
    s = re.sub(r"http\S+", " ", s)
    # Differences from the earlier three separate patterns:
    #   - trailing `\S*` instead of `\S+`, so a bare "example.com" is removed
    #     too (previously something had to follow the suffix);
    #   - `\b` after the suffix, so ordinary text such as "helps.using" or
    #     "this.computer" is no longer swallowed;
    #   - the dot in "co\.uk" is escaped instead of matching any character.
    s = re.sub(r"\S+\.(?:com|us|co\.uk)\b\S*", " ", s)
    s = re.sub(r" +", " ", s)
    return s

# Strip URLs from every comment.
df['comments'] = df['comments'].map(remove_urls)

### 5. Drop non-ASCII characters

def encode_decode(s):
    """Return ``s`` with every character that cannot be encoded as ASCII dropped."""
    ascii_bytes = s.encode('ascii', errors='ignore')
    return ascii_bytes.decode('utf-8')

# drop the characters that cannot be encoded as ASCII from every comment
df['comments'] = df['comments'].map(encode_decode)

### 6. Remove apostrophes and double quotes, turn stars (*) into spaces, and trim extra spaces.

# remove apostrophes
def remove_apostrophe(s):
    """Delete quote characters from ``s``, blank out stars and tidy the spacing.

    Apostrophes and double quotes are removed outright, every ``*`` becomes a
    space, runs of spaces collapse to one, and leading/trailing whitespace is
    stripped. Returns the cleaned string.
    """
    # Literal characters are handled with str.replace; the previous non-raw
    # pattern "\*" was an invalid escape sequence.
    s = s.replace("'", "").replace('"', "").replace("*", " ")
    s = re.sub(r" +", " ", s)
    return s.strip()

# strip apostrophes, double quotes and stars from every comment
df['comments'] = df['comments'].map(remove_apostrophe)

### 7. Expand contractions that slipped through step 4 and are now written without apostrophes

def replace_slang_again(s):
    '''Expand apostrophe-less contractions in ``s`` (e.g. "youre" -> "you are").

    Matching is case-sensitive, all patterns are lower-case, and each pattern
    is anchored on word boundaries. Returns the expanded string.

    Note that "id", "ill", "wont" and "cant" are also ordinary English words;
    they are expanded regardless, exactly as before.
    '''
    # Contraction/expansion pairs, applied in this order. "theyll" is new: the
    # list previously only had "they'll", which cannot occur once apostrophes
    # have been stripped, so "theyll" was never expanded. "they'll" is kept
    # for callers that pass text which still contains apostrophes.
    expansions = (
        ("id", "i would"),
        ("youd", "you would"),
        ("theyd", "they would"),
        ("youll", "you will"),
        ("they'll", "they will"),
        ("theyll", "they will"),
        ("im", "i am"),
        ("yall", "you all"),
        ("youre", "you are"),
        ("theyre", "they are"),
        ("ive", "i have"),
        ("youve", "you have"),
        ("weve", "we have"),
        ("theyve", "they have"),
        ("didnt", "did not"),
        ("shouldnt", "should not"),
        ("couldnt", "could not"),
        ("wouldnt", "would not"),
        ("wont", "will not"),
        ("cant", "cannot"),
        ("mustnt", "must not"),
        ("aint", "am not"),
        ("isnt", "is not"),
        ("thats", "that is"),
        ("doesnt", "does not"),
        ("itd", "it would"),
        ("ill", "i will"),
    )

    for contraction, expansion in expansions:
        s = re.sub(r"\b%s\b" % contraction, expansion, s)

    return s

# Expand the apostrophe-less contractions in every comment.
df['comments'] = df['comments'].map(replace_slang_again)

### 8. Remove parentheses, square brackets and pipes.

# remove parentheses, square brackets and pipe characters
def remove_parantheses(s):
    """Replace brackets and pipes in ``s`` with spaces and tidy the spacing.

    Every ``(``, ``)``, ``[``, ``]`` and ``|`` becomes a space, runs of spaces
    collapse to one, and leading/trailing whitespace is stripped. Returns the
    cleaned string.
    """
    # One raw-string character class replaces five separate passes; the old
    # non-raw patterns "\(", "\[", "\|" and "\)" were invalid escape sequences.
    s = re.sub(r"[()\[\]|]", " ", s)
    s = re.sub(r" +", " ", s)
    return s.strip()

# strip parentheses, square brackets and pipes from every comment
df['comments'] = df['comments'].map(remove_parantheses)

### 8. Add a space after a full stop or comma that is directly followed by text (not a space, a digit or another full stop).

# Many comments have full stops not followed by any spaces. Lets correct this
def fullstop_space(s):
    """Insert a space after a full stop or comma that runs straight into text.

    No space is added when the next character is whitespace, another full
    stop (so ellipses stay intact) or a digit (so "3.14" and "1,000" stay
    intact). Returns the adjusted string.
    """
    # The previous lookahead `[^\s\.[0-9]]` parsed as the character set
    # `[^\s\.[0-9]` followed by a literal `]`, so it only matched when the
    # character after the punctuation was itself followed by "]" - which made
    # this step a no-op on real comments.
    s = re.sub(r'(?<=[.,])(?=[^\s.0-9])', r' ', s)
    return s

# Apply the punctuation-spacing pass to every comment.
df['comments'] = df['comments'].map(fullstop_space)

### 9. Remove empty comments
#These are comments that had just an unknown character in them

# Flag comments that ended up as the empty string, then keep everything else.
mask_empty = df['comments'].eq('')
df = df[~mask_empty]

### 10. Remove short comments (10 words or fewer)

def flag_one_word(s, max_words=10):
    """Return True when ``s`` is short enough to be discarded.

    ``s`` is split on single spaces and flagged when that yields
    ``max_words`` pieces or fewer. Despite the function's name, the default
    threshold is 10 pieces, not one.

    Parameters
    ----------
    s : str
        The comment text.
    max_words : int, optional
        Largest piece count that is still flagged (default 10).
    """
    return len(s.split(' ')) <= max_words

# Flag the comments that are too short and keep the rest.
mask_one_word = df['comments'].map(flag_one_word)

df = df.loc[~mask_one_word, :]
# DataFrame.drop returns a new frame; the result used to be discarded, so the
# first column was never actually removed.
df = df.drop(df.columns[[0]], axis=1)
# Written as "simple.csv" (previously "simple") because that is the file name
# the sampling cell passes to siege().
df.to_csv("simple.csv")

In [22]:
import spacy
from spacy.lang.en import English
# Only the blank English pipeline is used for tokenizing. The earlier
# `spacy.load('en')` call loaded a full model and threw the result away, so it
# has been dropped. NOTE(review): the 'en' shortcut name is also understood to
# be unavailable in spaCy v3 - confirm against the installed version.
parser = English()
def tokenize(text):
    """Split ``text`` into lower-cased token strings using the module-level ``parser``.

    Whitespace-only tokens are skipped, and tokens flagged ``like_url`` are
    replaced by the literal placeholder 'URL'.
    """
    return [
        'URL' if token.like_url else token.lower_
        for token in parser(text)
        if not token.orth_.isspace()
    ]
def prepare_text_for_lda(text):
    """Turn raw ``text`` into the list of lemmas used for topic modelling.

    Tokens of four characters or fewer and tokens found in ``en_stop`` are
    discarded; the survivors are passed through ``get_lemma``.
    """
    return [
        get_lemma(token)
        for token in tokenize(text)
        if len(token) > 4 and token not in en_stop
    ]

In [23]:
import random
def siege(f, seed=None):
    """Tokenize a random sample (about 1%) of the lines in text file ``f``.

    Parameters
    ----------
    f : str
        Path of a UTF-8 text file; every line is treated as one document.
    seed : optional
        When given, sampling uses a private ``random.Random(seed)`` so the
        same lines are picked on every run. The default (``None``) draws from
        the module-level ``random`` generator, as before.

    Returns
    -------
    list
        One ``prepare_text_for_lda`` result per sampled line, in file order.
    """
    draw = random.random if seed is None else random.Random(seed).random
    text_data = []
    # The handle gets its own name; it used to rebind the parameter `f`.
    with open(f, encoding="utf8") as lines:
        for line in lines:
            # Draw first, tokenize second. There is still exactly one draw per
            # line, so the selection is unchanged, but the ~99% of lines that
            # are not selected no longer get tokenized and thrown away.
            if draw() > .99:
                text_data.append(prepare_text_for_lda(line))
    return text_data

In [24]:
# Sample and tokenize lines of simple.csv; the result feeds the dictionary/corpus cell below.
text_data = siege('simple.csv')

In [25]:
from gensim import corpora
import pickle

# Build the token dictionary, then encode each sampled document as a bag of words.
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]

# Persist both artefacts. The `with` block closes corpus.pkl once it is
# written; the file object passed straight to pickle.dump was never closed.
with open('corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)
dictionary.save('dictionary.gensim')

In [30]:
import gensim
# Fit an LDA model with `i` topics, save it, and keep the second element of
# every print_topics entry in Topics_1.
i = 6
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=i,
    id2word=dictionary,
    passes=15,
)
ldamodel.save('model5.gensim')
topics = ldamodel.print_topics(num_words=100)
Topics_1 = [entry[1] for entry in topics]


In [31]:
# One row per entry of Topics_1, in a single column named 'col', written to CSV.
# Note: this rebinds `df`, which held the comments frame until now.
df = pd.DataFrame(Topics_1, columns=['col'])
df.to_csv("Topics_List.csv")

In [60]:
# Reload the saved dictionary, corpus and model, then render the pyLDAvis view.
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
# pickle.load can execute arbitrary code, so only open a corpus.pkl written by
# this notebook. The `with` block closes the file, which the bare
# open(...) passed to pickle.load never did.
with open('corpus.pkl', 'rb') as corpus_file:
    corpus = pickle.load(corpus_file)
lda = gensim.models.ldamodel.LdaModel.load('model5.gensim')
import pyLDAvis.gensim
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [None]:
# def prep_frame(df):
#     """Formats dataframe by cutting everything but comment body"""
#     df = df.drop(['header', 'entities_labels', 'Unnamed: 0'], axis=1)
#     df.columns = ['body']
#     df['body'] = df['body'].astype(str)
#     # get and remove rows where comment has [removed]
#     mask_removed = df['body'] == '[removed]'
#     df = df.loc[~mask_removed,:].reset_index(drop = True)
    
#     # get and remove rows where comment has [deleted]
#     mask_deleted = df['body'] == '[deleted]'
#     df = df.loc[~mask_deleted,:].reset_index(drop = True)
    
#     # convert to string 
#     #l = df.values.T.tolist()
#     #s = ' '.join(str(f) for f in l) 
#     return df
# text_data = prep_frame(text_data)
# text_data.head(n=10)




# def prep_string(s):
#     '''Replace commonly used phrases with the full thing and cleave excess'''
#     s = re.sub('\n', ' ',s)
#     s = re.sub("\r", ' ',s)
#     s = re.sub(r'[^\w\s]','',s)
#     to_replace = ["i'd", "you'd", "we'd","they'd", "i'll", "you'll", "we'll", "they'll", "i'm",
#                   "y'all", "you're", "we're" "they're", "i've", "you've", "we've", "they've", "didn't", "shouldn't",
#                  "couldn't", "wouldn't", "won't", "can't", "mustn't", "ain't", "isn't", "that's" "doesn't", "it'd"]
    
#     replace_with = ["i would", "you would", "we would", "they would", "i will", "you will", "we will", "they will",
#                    "i am", "you all", "you are", "we are", "they are", "i have", "you have", "we have", "they have", 
#                     "did not", "should not", "could not", "would not", "will not", "cannot", "must not", "am not", "is not",
#                    "that is", "does not", "it would"]
#     for i in range(len(to_replace)):
#             s = re.sub(r"\b%s\b" %to_replace[i], replace_with[i], s) 
#     return s
#     s = re.sub("'", '',s)
#     s = re.sub("\"", "", s)
#     #s = re.sub("\*", " ", s)
#     s = re.sub(" +", " ", s)
#     s = s.rstrip().lstrip()
#     return s
# def apply_drop(df):
#     '''Applies prep_strings and then replaces the coloumn body'''
#     df['new'] = df['body'].apply(prep_string)
#     df = df.drop(['body'], axis=1)
#     df.columns = ['body']
#     return df
# text_data = apply_drop(text_data).to_csv('simple.csv')