In [1]:
# keep a list of specific words/phrases you want to remove so as to focus on topic modelling
# convert Area of Outstanding Natural Beauty to AONB, similar for others

text_to_remove = [
        "National Planning Policy Framework",
        "demonstrate",
        "section",
        "work",
        "intentions",
        "november",
        "adopted",
        "adoption",
        "represent",
        "representing",
        "settlement",
        "NPPF",
        "Planning",
        "Cornwall",
        "local",
        "Plan",
        "Development",
        "policies",
        "contrary",
        "development",
        "Proposal",
        "Application",
        "Policy",
        "Policies",
        "paragraph",
        "paragraphs",
        "permitted",
        "development",
        "area",
        "dwelling",
        "proposed",
        "national",
        "site",
        "house",
        "special",
        "justify",
        "town",
        "constitute",
        "sited",
        "siting",
        "site",
        "guidance",
        "benefit",
        "justification",
        "point",
        "raise",
        "factor",
        "balance",
        "house",
        "identify",
        "fall",
        "land ",
        "provide",
        "circumstances",
        "considerable"
        # land, fall, house, identify
        ]

In [2]:
"""
(0) Load df, trim columns. Remove unwanted record categories (trees, lawfulness etc)
(1) Extract useful data like NPD references and permissions references and add as columns
(1) Preprocess text (remove particular words, remove stopwords, lemmatize)
(2) Create dictionary with an int ID for each remaining word
(3) Filter the dictionary down if desired, removing very rare or very common words
(4) Create BOW for each doc - a list of tuples for each record:
    [[(int ref from the dictionary, word frequency)],[(int ref from the dictionary, word frequency)]]
(5) Train the lda model, providing the dictionary, no of topics, no of passes etc.
(6) Adjust preprocessing steps or variables in (5) to tweak topic model output to meaningful topics
(7) Topic model returns a list of topics, with each topic containing tuples of the word id and the probabilities
that the word will feature in a text of that topic
(8) Create a topic probabilities dataframe, with each topic in columns and values containing probabilities
HOW DOES THIS GET TO A SINGLE PROBABILITY PER RECORD, RATHER THAN PROBABILITIES BY WORD???
(9) Add the topic probabilites to original dataframe


TO DO:

10) Add further columns turning probabilities into boolean values at a given cut off point
11) Add logging feature to track outputs with given variables to improve tweaking performance

"""

'\n(0) Load df, trim columns. Remove unwanted record categories (trees, lawfulness etc)\n(1) Extract useful data like NPD references and permissions references and add as columns\n(1) Preprocess text (remove particular words, remove stopwords, lemmatize)\n(2) Create dictionary with an int ID for each remaining word\n(3) Filter the dictionary down if desired, removing very rare or very common words\n(4) Create BOW for each doc - a list of tuples for each record:\n    [[(int ref from the dictionary, word frequency)],[(int ref from the dictionary, word frequency)]]\n(5) Train the lda model, providing the dictionary, no of topics, no of passes etc.\n(6) Adjust preprocessing steps or variables in (5) to tweak topic model output to meaningful topics\n(7) Topic model returns a list of topics, with each topic containing tuples of the word id and the probabilities\nthat the word will feature in a text of that topic\n(8) Create a topic probabilities dataframe, with each topic in columns and valu

In [3]:
import pandas as pd
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(400)
from datetime import date

In [4]:
#import nltk
#nltk.download('wordnet') # might not be needed once run once
#nltk.download('omw-1.4')

In [5]:
# load the dataframe from an excel file. For now keep an unedited copy and one to manipulate
og_df = pd.read_csv("/Users/GlassShark1/Python/Refusals Data/All Refusals 2019-2022 w devtypes.csv")

In [6]:
df = og_df.copy()

In [7]:
def jupyter_display_max(max_rows):
    # set display so you can see all columns, all rows and all cell contents (up to 1k characters)
    pd.options.display.max_columns = None
    pd.set_option('display.max_rows', max_rows)
    pd.options.display.max_colwidth = 1000

In [8]:
def drop_col_by_labels(df,column_names):
    for col in column_names:
        if col in df.columns:
            df = df.drop(col, axis=1)
    return df

In [9]:
def cut_df_down(df):
    
    #Define attribute columns with data on the permission itself
    info_cols = ['Unnamed: 0', 'Address', 'docfragment', 'file_ext', 'filename', 'Decision', 'Link','FromSearch', 'ToSearch', 'AppTypeFrag']
    
    # Define columns to keep for the NLP work
    keep_cols = ['Ref', 'Description', 'Dev_Type','DecDate', 'RefusalReasons']
    
    # find any other columns not in the above categories to remove - these will be those already manually categorised
    x_train_cols = [col for col in df.columns if col not in info_cols and col not in keep_cols]
    
    # for unsupervised model, remove unneeded cols + manually categorised data
    remove_cols = info_cols + x_train_cols
    
    # remove unwanted columns
    df = drop_col_by_labels(df,remove_cols)
    
    return df

In [10]:
def strip_numbers(text):
    
    # create a list of characters if the character is a letter or a space
    strip_numbers = [char for char in text if char.isalpha() or char == " "]
    
    # join the characters again with 'nothing' - as spaces are included above
    strip_numbers = "".join(strip_numbers)
    
    # return the string
    return strip_numbers

In [11]:
def strip_specific_text(text):
    
    text = text.lower()
    # removes common/unwanted/not valuable text and phrases from string
    
    # for each thing you want to remove from the text
    for phrase in master_list:
        # if the lower case version is in the lower case version of the text, replace it with nothing (delete)
        if phrase.lower() in text.lower():
            #print(phrase, " in text")
            text = text.replace(phrase.lower(), "")
    
    return text

In [12]:
# simple func to return length. Used to apply to a list of values
def leng_func(x):
    return len(x)

In [13]:
def Perm_regex(text):
    
    # Identify permission references
    matchtype = r'(?i)(?:PA)?\d{2}[/|_]\d{5}' # e.g. PA12/12345
    matches = re.findall(matchtype, text)
    
    # sort the list by highest number of characters first. This means CLPSP will be replaced before CLP, 
    # so you aren't left with instances of SP on their own
    matches.sort(reverse = True,key=leng_func)
    
    # Don't return anything if it's an empty list / no matches
    if matches ==[]:
        matches = ""
    
    # remove dups by set and turn into a single comma seperated string
    matches = set(matches)
    matches = ",".join(matches)

    return matches

In [14]:
def CLP_regex(text):
    matchtype = r'(Cornwall Local Plan Strategic Policies|Cornwall Local Plan|CLP) (?:\d{4}\s*-\s*\d{4}|\d{4}?)?'
    #matchtype = r'(Cornwall Local Plan Strategic Policies|Cornwall Local Plan|CLP)\s*((\(\d{4}(?:\s*-\s*\d{4})?\))|\d{4}\s*-\s*\d{4}|\d{4})?'
    matches = re.findall(matchtype, text)
    
    # sort the list by highest number of characters first. This means CLPSP will be replaced before CLP, 
    # so you aren't left with instances of SP on their own
    matches.sort(reverse = True,key=leng_func)
    
    # Don't return anything if it's an empty list / no matches
    if matches ==[]:
        matches = ""
        
    # remove dups by set and turn into a single comma seperated string
    matches = set(matches)
    matches = ",".join(matches)
        
    return matches

In [15]:
def NPD_regex(text):
    matchtype = r'\b(?:[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\s+(?:N(?:eighbourhood)?\s*(?:D(?:evelopment)?\s*)?P(?:lan)?|NDP)\b(?:\s+\([^\)]+\))?(?:\s+\d{4}(?:\s*(?:to|-)\s*\d{4})?)?'    
    matches = re.findall(matchtype, text)
    
    # sort the list by highest number of characters first. This means CLPSP will be replaced before CLP, 
    # so you aren't left with instances of SP on their own
    matches.sort(reverse = True,key=leng_func)
    
    # Don't return anything if it's an empty list / no matches
    if matches ==[]:
        matches = ""
        
    # remove dups by set and turn into a single comma seperated string
    matches = set(matches)
    matches = ",".join(matches)
        
    return matches

In [16]:
def saints_rule(text):
    # removes full stop in St. before extracting parish names so it is not read as a full stop
    if 'St. ' in text:
        text.replace('St. ', 'St ')
    return text

In [17]:
def stem(text):
    stemmer = SnowballStemmer("english")
    return stemmer.stem(text)

In [18]:
def lemmatize(text):
    return WordNetLemmatizer().lemmatize(text, pos='v')

In [19]:
def stopwords_and_preprocess(text):
    # creates a list of all words passed in if they are not stopwords or v.sml and returns lemmatized version
    result=[]
    for token in gensim.utils.simple_preprocess(text) :
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            # specifiy preprocessing steps here - stemming / lemmatization, both
            # result.append(lemmatize_stemming(token))
            # result.append(stemming(token))
            result.append(lemmatize(token))
    return result

In [20]:
def list_all_values_in_col(col):
    # return a set of all the different values (all values represented only once)
    all_list = set(df[col].tolist())
    # turn the set into a list
    all_list = [item for item in all_list]
    
    # from the list that can contain sublists, flatten into single list
    new_list = []
    for item in all_list:
        if item == '':
            continue
        sublist = item.split(',')
        for item in sublist:
            new_list.append(item)
    
    return new_list

In [21]:
# set JN to display full extent of data
jupyter_display_max(df.shape[0])

In [22]:
# chop the df down to only the bits you will use
df = cut_df_down(df)

In [23]:
# get rid of confusing 'St.' strings
df["RefusalReasons"] = df["RefusalReasons"].apply(saints_rule)
# create a column containing a string of references to NPDs
df['NDPs Referenced'] = df["RefusalReasons"].apply(NPD_regex)
# create a column containing a string of references to CLP
df['CLPs Referenced'] = df["RefusalReasons"].apply(CLP_regex)
# create a column containing a string of permission references
df['Perms Referenced'] = df["RefusalReasons"].apply(Perm_regex)

In [24]:
# from an initial list which could contain lists itself, return a single list containing
# all values in the column, cast to a set to remove duplicates
all_types_list = set(list_all_values_in_col('Dev_Type'))
all_perms_list = set(list_all_values_in_col('Perms Referenced'))
all_NPs_list = set(list_all_values_in_col('NDPs Referenced'))
all_CLPs_list = set(list_all_values_in_col('CLPs Referenced'))

# create a list of all these sets
master_list = [all_perms_list, all_NPs_list, all_CLPs_list]
# for each of the lists of sets, add all the values together, then convert back to a list for sorting
master_list = list(set().union(*master_list))
# add the extracted phrases to remove to the more generic phrases you have manually added on exploration
master_list = text_to_remove + master_list
# sort the values by largest number of characters first (so .replace will be greedy)
master_list.sort(reverse = True, key=leng_func)

In [25]:
# show how many records there are for each development category
df.groupby('Dev_Type')['Dev_Type'].describe().sort_values(by='freq', ascending=False)

Unnamed: 0_level_0,count,unique,top,freq
Dev_Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Minor - Dwelling,1048,1,Minor - Dwelling,1048
Householder,257,1,Householder,257
All other minor developments,154,1,All other minor developments,154
CLEUD/CLOPED,144,1,CLEUD/CLOPED,144
Changes of Use,103,1,Changes of Use,103
TPO applications,78,1,TPO applications,78
Smallscale Major Dwellings,66,1,Smallscale Major Dwellings,66
Minor - Dwelling - PIP apps only,64,1,Minor - Dwelling - PIP apps only,64
Listed Building Consent (alter/extend),48,1,Listed Building Consent (alter/extend),48
All other small scale major developments,21,1,All other small scale major developments,21


In [26]:
'''NOT SURE IF I WANT TO DO THIS YET - THIS CREATES A DATAFRAME FOR EVERY CATEGORY IN THE LIST
    ALLOWS YOU TO EXPLORE THE DATA A LITTLE EASIER TO SEE WHAT TO INCLUDE/EXCLUDE'''
df_dict ={}
for item in all_types_list:
    df_dict[item] = df[df['Dev_Type']==item]

In [27]:
#df_dict['Householder']

In [28]:
'''First cut the source data down by exluding categories that don't share common refusal reasons - e.g.
very specific types of development'''

# maybe exclude:
# CLUED - Certificate of Lawful Development - these generally refer to whether the use of something
# is lawful or not, so typically refusal reasons differ here
# TPO/TPA - Tree Protection  - generally refusals are for public amenity reasons
#'Listed Building Consent (alter/extend)' - borderline one because it's to do with alterations/extensions

exclude_cats = ['CLEUD/CLOPED','TPO applications','Not Required - Hedgerow Removal','TCA applications']
# Prior Approval - AF2 type == agricultural stuff?
borderline_cats = ['Listed Building Consent (alter/extend)','Prior Approval - AF2 type','Notification - Rail type']

In [29]:
# filter categories to exclude from the df. '~' negates/gives you the opposite of something
df = df[~df['Dev_Type'].isin(exclude_cats)]

In [30]:
df['cleaned'] = df['RefusalReasons'].apply(strip_specific_text)

In [31]:
df['cleaned'] = df['cleaned'].apply(strip_numbers)

In [32]:
df['cleaned'] = df['cleaned'].apply(stopwords_and_preprocess)

In [33]:
# create a list of lists of the preprocessed docs, where each list is the preprocessed text
processed_docs = df['cleaned'].to_list()

In [34]:
# Create dict from 'processed_docs'; each word and an integer ID. Accessed as usual dictionary[0]
# Later passed to the model for training. Besides the id and word, it also contains frequency info
dictionary = gensim.corpora.Dictionary(processed_docs)

In [35]:
# OPTIONAL STEP Remove very rare and very common words: appearing < 15 times, > 30% of all documents
# keep_n=100000 == only the 100,000 most frequent tokens in the corpus will be kept
#dictionary.filter_extremes(no_below=15, no_above=0.2, keep_n= 100000)

In [36]:
#Create BOW model for each doc i.e for each we create a list of tuples (int_ref, count)
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

In [37]:
"""MANUALLY CHANGE NUMBER OF TOPICS AND PASSES TO REFINE MODEL HERE. WILL BE PASSED TO MODEL IN NEXT STEP"""
num_topics = 7
num_passes = 30

In [38]:
#Train your lda model using gensim.models.LdaMulticore and save it to 'lda_model'
#id2word is a mapping from word ids (integers) to words (strings)
lda_model =  gensim.models.LdaMulticore(bow_corpus, 
                                   num_topics = num_topics, # select no of topics to try and create from samples
                                   id2word = dictionary, # above comment                                 
                                   passes = num_passes, # number of passes the model with make (> passes = more thorough????)
                                   workers = 2) # no of extra processes to use for parallelization. Uses all available cores by default

In [39]:
lemmatize('considerable')

'considerable'

In [40]:
# TWEAKING - For each topic, explore the words occuring in that topic and thier relative weights
for idx, topic in lda_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic))
    print("\n")

Topic: 0 
Words: 0.026*"result" + 0.021*"neighbour" + 0.019*"fail" + 0.017*"form" + 0.017*"impact" + 0.016*"amenity" + 0.016*"design" + 0.015*"reason" + 0.014*"character" + 0.012*"exist"


Topic: 1 
Words: 0.015*"bullet" + 0.014*"guide" + 0.013*"design" + 0.013*"public" + 0.012*"relation" + 0.011*"act" + 0.011*"level" + 0.010*"overlook" + 0.010*"consider" + 0.010*"interestthe"


Topic: 2 
Words: 0.036*"countryside" + 0.029*"service" + 0.029*"facilities" + 0.028*"location" + 0.025*"unsustainable" + 0.024*"private" + 0.020*"sustainable" + 0.017*"open" + 0.017*"vehicle" + 0.015*"rural"


Topic: 3 
Words: 0.028*"build" + 0.027*"harm" + 0.026*"character" + 0.024*"conservation" + 0.021*"heritage" + 0.018*"list" + 0.016*"fail" + 0.016*"design" + 0.015*"consider" + 0.015*"appearance"


Topic: 4 
Words: 0.039*"countryside" + 0.034*"character" + 0.030*"build" + 0.029*"form" + 0.022*"rural" + 0.021*"landscape" + 0.020*"open" + 0.017*"location" + 0.017*"residential" + 0.016*"harm"


Topic: 5 
Word

In [41]:
"""Topic: 0 
Words: 0.026*"result" + 0.021*"neighbour" + 0.019*"fail" + 0.017*"form" + 0.017*"impact" + 0.016*"amenity" + 0.016*"design" + 0.015*"reason" + 0.014*"character" + 0.012*"exist"


Topic: 1 
Words: 0.015*"bullet" + 0.014*"guide" + 0.013*"design" + 0.013*"public" + 0.012*"relation" + 0.011*"act" + 0.011*"level" + 0.010*"overlook" + 0.010*"consider" + 0.010*"interestthe"


Topic: 2 
Words: 0.036*"countryside" + 0.029*"service" + 0.029*"facilities" + 0.028*"location" + 0.025*"unsustainable" + 0.024*"private" + 0.020*"sustainable" + 0.017*"open" + 0.017*"vehicle" + 0.015*"rural"


Topic: 3 
Words: 0.028*"build" + 0.027*"harm" + 0.026*"character" + 0.024*"conservation" + 0.021*"heritage" + 0.018*"list" + 0.016*"fail" + 0.016*"design" + 0.015*"consider" + 0.015*"appearance"


Topic: 4 
Words: 0.039*"countryside" + 0.034*"character" + 0.030*"build" + 0.029*"form" + 0.022*"rural" + 0.021*"landscape" + 0.020*"open" + 0.017*"location" + 0.017*"residential" + 0.016*"harm"


Topic: 5 
Words: 0.033*"access" + 0.020*"flood" + 0.018*"safe" + 0.016*"risk" + 0.016*"fail" + 0.015*"suitable" + 0.014*"conflict" + 0.013*"highway" + 0.012*"visibility" + 0.010*"absence"


Topic: 6 
Words: 0.033*"consider" + 0.018*"house" + 0.015*"conservation" + 0.014*"conflict" + 0.014*"provision" + 0.014*"affordable" + 0.012*"absence" + 0.012*"aim" + 0.012*"impact" + 0.010*"secure"




SyntaxError: EOF while scanning triple-quoted string literal (1945144359.py, line 28)

In [None]:
lemmatize('character')

In [None]:
print('Quit code here - manual data entry required for topic naming')

In [None]:
fgjkgljfdlgjdflgjd

In [None]:
# making sense of the derived topics the model suggests


"""THIS IS A MANUAL EDIT YOU NEED TO MAKE"""


inferred_topics = {
    0: 'AONB',
    1: 'Neighbours and Neighbouring Area',
    2: 'Protected areas, conservation',
    3: 'Conservation',
    4: 'Affordable Housing provision',
    5: 'Impact on neighbours/surrounds',
    6: 'Conservation and heritage',
    7: 'Location, open countryside',
}

In [None]:
# Get the topic probabilities for each document in the corpus
document_topics = [lda_model.get_document_topics(bow) for bow in bow_corpus]

In [None]:
# Initialize a list to contain dictionaries with topic_num, topic probabilities for each document
topic_probs_list = []

# Iterate over the documents in the corpus
for doc in bow_corpus:
    # Get the topic and topic probabilities for the current document
    topic_probs = dict(lda_model.get_document_topics(doc))
    # Add the topic probabilities to the list
    topic_probs_list.append(topic_probs)

# Create a Pandas DataFrame from the list of dictionaries
topic_probs_df = pd.DataFrame(topic_probs_list)

In [None]:
# add the topic probabilities dataframe to the original df by index
df = pd.merge(df, topic_probs_df, left_index=True, right_index=True)

In [None]:
# change the topic numbers in the header to your meaningful topic titles
df = df.rename(columns=inferred_topics)

In [None]:
# get todays date in format (dd_mm_yy)
today = "(" + date.today().strftime("%d_%m_%Y") + ")"

In [None]:
# create filename with variables used in run and the date
filename = "LDA_topics_" + str(num_topics) + "_passes_" + str(num_passes) + today

In [None]:
def export_csv():
    from pathlib import Path
    path = "/Users/GlassShark1/Python/Refusals Data/"
    filepath = Path(path + filename + ".csv") 
    print(filepath)
    #filepath.parent.mkdir(parents=True, exist_ok=True)  
    df.to_csv(filepath) 

In [None]:
export_csv()

In [None]:
#unseen_document = "The proposed development raises direct conflict with the requirements of adopted policy in the Cornwall Local Plan as the site is clearly located in the open countryside where new homes will only be permitted where there are special circumstances, none of which have been identified in this particular case. The proposed development would clearly erode the rural character of this location by introducing further built development that would result in material harm to the character and appearance of the countryside, thus not conserving the landscape character and natural beauty of the Area of Outstanding Natural Beauty (AONB) in this location. The site is not a sustainable or accessible location for a new dwelling and the development would therefore be contrary to development plan policy and cause material environmental harm to the rural character of the area, contrary to policies 1, 2, 3, 7, 12, 23 and 27 in the Cornwall Local Plan Strategic Policies 2010 - 2030 "

In [None]:
"""# Data preprocessing step for the unseen document
# bow_vector = list of tuples with number representing a word in the corpus and the count of those words???
bow_vector = dictionary.doc2bow(stopwords_and_preprocess(unseen_document))

for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    # number at the end seems to return n words and the strength of their relationship to topic????
    print("Score: {}\n Topic: {}\n".format(score, lda_model.print_topic(index, 10)))"""

In [None]:
# set up logging files to keep track of model / topics etc

In [None]:
'''Preview BOW for our sample preprocessed document - this is on the 20th record as an example'''
"""
document_num = 0 #think this is just the 20th record?

bow_doc_x = bow_corpus[document_num]

for i in range(len(bow_doc_x)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_x[i][0], 
                                                     dictionary[bow_doc_x[i][0]], 
                                                     bow_doc_x[i][1]))"""

In [None]:
"""ASK MN: DON'T UNDERSTAND THIS WELL - CAN'T SEE WHERE IT'S APPLIED??

alpha and eta are hyperparameters that affect sparsity of the document-topic (theta) and topic-word (lambda) distributions. We will let these be the default values for now(default value is 1/num_topics)

Alpha is the per document topic distribution.

High alpha: Every document has a mixture of all topics(documents appear similar to each other).
Low alpha: Every document has a mixture of very few topics
Eta is the per topic word distribution.

High eta: Each topic has a mixture of most words(topics appear similar to each other).
Low eta: Each topic has a mixture of few words."""

In [None]:
'''Checking dictionary created'''
"""count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 100:
        break"""