In [1]:
import pandas as pd
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(400)

In [None]:
import nltk
nltk.download('wordnet') # might not be needed once run once

In [None]:
nltk.download('omw-1.4')

In [2]:
# Load the source data from a CSV file (pd.read_csv). This frame is kept as the
# unedited original; a separate working copy is taken from it for manipulation.
# NOTE(review): absolute local path - consider a path relative to a configurable data directory.
og_df = pd.read_csv("/Users/file.csv")

In [3]:
df = og_df.copy()

In [4]:
def jupyter_display_max(max_rows):
    """Widen the pandas display limits used when rendering frames in the notebook.

    After the call every column is shown, up to ``max_rows`` rows are shown,
    and up to 1000 characters of each cell's contents are shown.

    Args:
        max_rows: value to store in the ``display.max_rows`` option.
    """
    display_settings = {
        'display.max_columns': None,
        'display.max_rows': max_rows,
        'display.max_colwidth': 1000,
    }
    for option_name, option_value in display_settings.items():
        pd.set_option(option_name, option_value)

In [5]:
def drop_col_by_labels(df, column_names):
    """Return ``df`` without the listed columns, ignoring labels it does not have.

    Args:
        df: the DataFrame to trim; it is not modified.
        column_names: iterable of column labels to remove.

    Returns:
        A DataFrame lacking every requested column that was present. When
        none of the labels is present, ``df`` itself is returned.
    """
    # dict.fromkeys de-duplicates the labels while keeping their order
    present = [label for label in dict.fromkeys(column_names) if label in df.columns]
    if not present:
        return df
    return df.drop(columns=present)

In [6]:
def cut_df_down(df):
    """Trim ``df`` to the columns used for the unsupervised NLP work.

    Every column that is not one of 'Ref', 'Description', 'Dev_Type',
    'DecDate' or 'RefusalReasons' is handed to ``drop_col_by_labels`` for
    removal.

    Args:
        df: the DataFrame to trim.

    Returns:
        The DataFrame returned by ``drop_col_by_labels``.
    """
    # attribute columns holding data about the permission itself
    permission_info = ['Unnamed: 0', 'Address', 'docfragment', 'file_ext', 'filename',
                       'Decision', 'Link', 'FromSearch', 'ToSearch', 'AppTypeFrag']

    # columns retained for the NLP work
    nlp_columns = ['Ref', 'Description', 'Dev_Type', 'DecDate', 'RefusalReasons']

    # whatever is left over - these will be the columns already manually categorised
    already_categorised = []
    for name in df.columns:
        if name in permission_info or name in nlp_columns:
            continue
        already_categorised.append(name)

    # the unsupervised model needs neither the info columns nor the manual categories
    return drop_col_by_labels(df, permission_info + already_categorised)

In [7]:
def strip_numbers(mess):
    """Return ``mess`` with every character that is not a letter or a space removed.

    Digits, punctuation and whitespace other than the plain space character
    (tabs, newlines) are deleted; nothing is inserted in their place.

    Args:
        mess: the text to clean. A non-string value - e.g. the ``None``/NaN
            that pandas uses for an empty cell - is treated as empty text
            instead of raising ``TypeError``.

    Returns:
        The cleaned string ("" for non-string input).
    """
    # missing cells arrive as float NaN / None, which cannot be iterated
    if not isinstance(mess, str):
        return ""

    # spaces are kept in the filter, so the survivors are joined with nothing
    return "".join(char for char in mess if char.isalpha() or char == " ")

In [8]:
# Common / unwanted / low-value text and phrases to delete from the refusal reasons.
# Matching is done on lower-cased text, so the case used here does not matter.
_TEXT_TO_REMOVE = (
    "Cornwall Local Plan Strategic Policies 2010 - 2030",
    "Cornwall Local Plan Strategic Policies 2010-2030",
    "Cornwall Local Plan Strategic Policies",
    "Cornwall Local Plan",
    "National Plan Policy Framework",
    "Neighbourhood Development Plan",
    "National Planning Policy Framework",
    "NPPF",
    "Planning",
    "Cornwall",
    "local",
    "Plan",
    "Development",
    "CLP",
    "policies",
    "contrary",
    "development",
    "Proposal",
    "Application",
    "Policy",
    "Policies",
    "paragraph",
    "paragraphs",
)

# Lower-cased, de-duplicated and ordered longest first, so a short entry
# ("paragraph") can never eat part of a longer one ("paragraphs") and leave a
# stray fragment behind. Built once rather than on every call.
_PHRASES_LONGEST_FIRST = sorted(
    {phrase.lower() for phrase in _TEXT_TO_REMOVE},
    key=lambda phrase: (-len(phrase), phrase),
)


def strip_specific_text(mess):
    """Lower-case ``mess`` and delete every phrase listed in ``_TEXT_TO_REMOVE``.

    Phrases are removed one after another, longest first, each by plain
    substring replacement with the empty string (surrounding spaces are left
    in place).

    NOTE(review): because matching is by substring, an entry such as "plan"
    is also cut out of longer words that contain it (e.g. "plant" -> "t").
    Switch to word-boundary matching if that is not wanted.

    Args:
        mess: the text to clean. A non-string value (e.g. None/NaN for an
            empty cell) is treated as empty text.

    Returns:
        The lower-cased text with the phrases removed ("" for non-string input).
    """
    if not isinstance(mess, str):
        return ""

    mess = mess.lower()
    for phrase in _PHRASES_LONGEST_FIRST:
        mess = mess.replace(phrase, "")

    return mess

In [9]:
def lemmatize_stemming(text):
    """Reduce a single token to its stem.

    The token is lemmatized as a verb with ``WordNetLemmatizer`` and the
    resulting lemma is then stemmed with the English ``SnowballStemmer``.

    Args:
        text: the token to normalise.

    Returns:
        The stemmed lemma.
    """
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    english_stemmer = SnowballStemmer("english")
    return english_stemmer.stem(verb_lemma)

In [10]:
# Tokenize and lemmatize

def stopwords(text):
    """Tokenize ``text`` and return the normalised form of every token worth keeping.

    Tokens come from gensim's ``simple_preprocess``. A token is kept only when
    it is longer than three characters and is not in gensim's ``STOPWORDS``;
    each kept token is passed through ``lemmatize_stemming``.

    Args:
        text: the text to tokenize.

    Returns:
        List of lemmatized and stemmed tokens, in the order they were produced.
    """
    ignored_words = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(word)
        for word in gensim.utils.simple_preprocess(text)
        if len(word) > 3 and word not in ignored_words
    ]

In [11]:
# set JN to display full extent of data
jupyter_display_max(df.shape[0])

In [12]:
# chop the df down to only the bits you will use
df = cut_df_down(df)

In [16]:
# show how many records there are for each development category
df.groupby('Dev_Type')['Dev_Type'].describe().sort_values(by='freq', ascending=False)

Unnamed: 0_level_0,count,unique,top,freq
Dev_Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Minor - Dwelling,1048,1,Minor - Dwelling,1048
Householder,257,1,Householder,257
All other minor developments,154,1,All other minor developments,154
CLEUD/CLOPED,144,1,CLEUD/CLOPED,144
Changes of Use,103,1,Changes of Use,103
TPO applications,78,1,TPO applications,78
Smallscale Major Dwellings,66,1,Smallscale Major Dwellings,66
Minor - Dwelling - PIP apps only,64,1,Minor - Dwelling - PIP apps only,64
Listed Building Consent (alter/extend),48,1,Listed Building Consent (alter/extend),48
All other small scale major developments,21,1,All other small scale major developments,21


In [17]:
# return a set of all the different categories (all categories represented only once)
all_types_list = set(df['Dev_Type'].tolist())

In [18]:
# materialise the set of categories as a plain list
all_types_list = list(all_types_list)

In [None]:
'''NOT SURE IF I WANT TO DO THIS YET - THIS CREATES A DATAFRAME FOR EVERY CATEGORY IN THE LIST
    ALLOWS YOU TO EXPLORE THE DATA A LITTLE EASIER TO SEE WHAT TO INCLUDE/EXCLUDE'''
# one sub-frame per development type, keyed by the type's name
df_dict = {
    dev_type: df[df['Dev_Type'] == dev_type]
    for dev_type in all_types_list
}

In [None]:
df['cleaned'] = df['RefusalReasons'].apply(strip_numbers)

In [None]:
df['cleaned'] = df['cleaned'].apply(strip_specific_text)

In [None]:
df['cleaned'] = df['cleaned'].apply(stopwords)

In [None]:
df.head()

In [None]:
"""print(WordNetLemmatizer().lemmatize('went', pos = 'v')) # past tense to present tense"""

In [None]:
# NOTE(review): disabled scratch cell - the whole body below is a single string literal,
# so none of it runs. If it is re-enabled, the `preprocess` call needs replacing:
# no cell in this notebook defines a function of that name.
"""'''
Preview a document after preprocessing
'''
document_num = 50
doc_sample = 'This disk has failed many times. I would like to get it replaced.'

print("Original document: ")
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print("\n\nTokenized and lemmatized document: ")
print(preprocess(doc_sample))"""

In [None]:
# collect the token list of every row of the 'cleaned' column, in row order
processed_docs = list(df['cleaned'])

In [None]:
'''Create dict from 'processed_docs' containing the number of times a word appears 
in the training set using gensim.corpora.Dictionary
'''
dictionary = gensim.corpora.Dictionary(processed_docs)

In [None]:
'''Checking dictionary created'''
# print the first 11 (id, token) pairs held in the dictionary
for position, (token_id, token) in enumerate(dictionary.iteritems()):
    print(token_id, token)
    if position >= 10:
        break

**Gensim doc2bow**

doc2bow(document)

Convert document (a list of words) into the bag-of-words format = list of (token_id, token_count) 2-tuples. Each word is assumed to be a tokenized and normalized string (either unicode or utf8-encoded). No further preprocessing is done on the words in document; apply tokenization, stemming etc. before calling this method.

In [None]:
'''
OPTIONAL STEP
Remove very rare and very common words:
- words appearing less than 15 times
- words appearing in more than 30% of all documents
'''
# Correction to the note above: in gensim both thresholds are document frequencies.
# `no_below=15` drops tokens found in fewer than 15 *documents* (not fewer than 15
# occurrences); `no_above=0.3` drops tokens found in more than 30% of documents.
# `keep_n` then caps the vocabulary at the 100000 most frequent remaining tokens.
dictionary.filter_extremes(no_below=15, no_above=0.3, keep_n= 100000)

In [None]:
'''
Create the Bag-of-words model for each document i.e for each document we create a dictionary reporting how many
words and how many times those words appear. Save this to 'bow_corpus'
'''
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

In [None]:
'''Preview BOW for our sample preprocessed document'''

document_num = 20

bow_doc_x = bow_corpus[document_num]

# each entry of the bag-of-words vector is a (token id, count) pair
for token_id, token_count in bow_doc_x:
    print("Word {} (\"{}\") appears {} time.".format(token_id,
                                                     dictionary[token_id],
                                                     token_count))

Step 4: Running LDA using Bag of Words
We are going for 8 topics in the document corpus (the training cell below sets `num_topics = 8`).

**We will be running LDA with `gensim.models.LdaMulticore`, which parallelizes model training across worker processes to speed it up (the training cell below uses `workers = 2`).**

Some of the parameters we will be tweaking are:

num_topics is the number of requested latent topics to be extracted from the training corpus.

id2word is a mapping from word ids (integers) to words (strings). It is used to determine the vocabulary size, as well as for debugging and topic printing.

workers is the number of extra processes to use for parallelization. Uses all available cores by default.

alpha and eta are hyperparameters that affect the sparsity of the document-topic (theta) and topic-word (lambda) distributions. We will leave both at their default values for now (the default value is 1/num_topics).

Alpha is the prior on the per-document topic distribution.

High alpha: every document has a mixture of all topics (documents appear similar to each other).
Low alpha: every document has a mixture of very few topics.

Eta is the prior on the per-topic word distribution.

High eta: each topic has a mixture of most words (topics appear similar to each other).
Low eta: each topic has a mixture of few words.
**passes** is the number of training passes through the corpus. For example, if the training corpus has 50,000 documents, chunksize is 10,000 and passes is 2, then online training is done in 10 updates:

#1 documents 0-9,999
#2 documents 10,000-19,999
#3 documents 20,000-29,999
#4 documents 30,000-39,999
#5 documents 40,000-49,999
#6 documents 0-9,999
#7 documents 10,000-19,999
#8 documents 20,000-29,999
#9 documents 30,000-39,999
#10 documents 40,000-49,999

In [None]:
'''Train your lda model using gensim.models.LdaMulticore and save it to 'lda_model'''
lda_model =  gensim.models.LdaMulticore(bow_corpus, 
                                   num_topics = 8, # number of latent topics to extract from the training corpus
                                   id2word = dictionary, # token id -> word mapping; sets the vocabulary size and makes topics printable
                                   passes = 50, # number of training passes through the whole corpus
                                   workers = 2) # number of worker processes used to parallelize training

In [None]:
'''For each topic, explore the words occuring in that topic and its relative weight'''
for idx, topic in lda_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic ))
    print("\n")

Step 6: Testing model on unseen document

In [None]:
'''NOT SURE WHAT THE BELOW CODE ACTUALLY DOES'''

In [None]:
# NOTE(review): `newsgroups_test` is not defined by any cell in this notebook (it looks
# like a left-over from a tutorial - confirm), so this cell raises NameError on a fresh kernel.
# TODO: take the unseen document from held-out refusal-reason text instead.
num = 100
unseen_document = newsgroups_test.data[num]
print(unseen_document)

In [None]:
# Data preprocessing step for the unseen document.
# The document goes through the same cleaning pipeline as the training text
# (strip_numbers -> strip_specific_text -> stopwords) so that its tokens line up
# with the entries of `dictionary`.
unseen_tokens = stopwords(strip_specific_text(strip_numbers(unseen_document)))
bow_vector = dictionary.doc2bow(unseen_tokens)

# list the topics found in the document, highest score first, each with its top 5 words
for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

In [None]:
# NOTE(review): `newsgroups_test` is not defined by any cell in this notebook, so this
# line raises NameError on a fresh kernel.
print(newsgroups_test.target[num])

# TODO: placeholder conclusion - fill in the actual probability and category once the cell runs.
#The model correctly classifies the unseen document with 'x'% probability to the X category.