In [1]:
# keep a list of specific words/phrases you want to remove so as to focus on topic modelling
# World Heritage Site (WHS), Special Area of Conservation (SAC), Area of Great Landscape Value


text_to_remove = [
        'The Local Planning Authority has acted positively and proactively in determining this application by identifying matters of concern with this proposal.',
        ' On this occasion, the issues are so fundamental that it is not possible to negotiate a satisfactory way forward due to the harm that has been clearly identified within the reason(s) for refusal.',
        'January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December',
        "National Planning Policy Framework","NPPF",
        "building", "built",
        "special justification",
        "demonstrate",
        "house", "permission", "condition",
        "section",
        "work",
        "intentions",
        "adopted",
        "adoption",
        "represent",
        "representing",
        "settlement",
        "planning",
        "Cornwall",
        "local",
        "plan",
        "development",
        "policies",
        "contrary",
        "proposal",
        "application",
        "policy",
        "policies",
        "paragraph","paragraphs",
        "permitted",
        "development",
        "area",
        "dwelling",
        "proposed",
        "national",
        "house",
        "housing",
        "justify",
        "town",
        "constitute",
        "sited",
        "siting",
        "guidance",
        "benefit",
        "justification",
        "point",
        "raise",
        "factor",
        "balance",
        "house",
        "identify",
        "fall",
        "land ",
        "provide",
        "circumstances",
        "considerable",
        "tree",
        "resulting",
        "site", 
        "result", 
        "reason",
        "Caradon", "consider", "conflict", 'aim', 'affect', 'impact'
        #"form" can't have form because of "information"
        ]

In [2]:
# sort the list so it is greedy when it later removes entries and doesn't take partial words
text_to_remove = sorted(text_to_remove, key=len, reverse = True)

In [3]:
"""
(0) Load df, trim columns. Remove unwanted record categories (trees, lawfulness etc)
(1) Extract useful data like NPD references and permissions references and add as columns
(1) Preprocess text (remove particular words, remove stopwords, lemmatize)
(2) Create dictionary with an int ID for each remaining word
(3) Filter the dictionary down if desired, removing very rare or very common words
(4) Create BOW for each doc - a list of tuples for each record:
    [[(int ref from the dictionary, word frequency)],[(int ref from the dictionary, word frequency)]]
(5) Train the lda model, providing the dictionary, no of topics, no of passes etc.
(6) Adjust preprocessing steps or variables in (5) to tweak topic model output to meaningful topics
(7) Topic model returns a list of topics, with each topic containing tuples of the word id and the probabilities
that the word will feature in a text of that topic
(8) Create a topic probabilities dataframe, with each topic in columns and values containing probabilities
HOW DOES THIS GET TO A SINGLE PROBABILITY PER RECORD, RATHER THAN PROBABILITIES BY WORD???
(9) Add the topic probabilites to original dataframe


TO DO:

10) Add further columns turning probabilities into boolean values at a given cut off point
11) Add logging feature to track outputs with given variables to improve tweaking performance

"""

'\n(0) Load df, trim columns. Remove unwanted record categories (trees, lawfulness etc)\n(1) Extract useful data like NPD references and permissions references and add as columns\n(1) Preprocess text (remove particular words, remove stopwords, lemmatize)\n(2) Create dictionary with an int ID for each remaining word\n(3) Filter the dictionary down if desired, removing very rare or very common words\n(4) Create BOW for each doc - a list of tuples for each record:\n    [[(int ref from the dictionary, word frequency)],[(int ref from the dictionary, word frequency)]]\n(5) Train the lda model, providing the dictionary, no of topics, no of passes etc.\n(6) Adjust preprocessing steps or variables in (5) to tweak topic model output to meaningful topics\n(7) Topic model returns a list of topics, with each topic containing tuples of the word id and the probabilities\nthat the word will feature in a text of that topic\n(8) Create a topic probabilities dataframe, with each topic in columns and valu

In [4]:
import pandas as pd
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(400)
from datetime import date

In [5]:
#import nltk
#nltk.download('wordnet') # might not be needed once run once
#nltk.download('omw-1.4')

In [6]:
# load the dataframe from an excel file. For now keep an unedited copy and one to manipulate
og_df = pd.read_csv("/Users/GlassShark1/Python/Refusals Data/All Refusals 2019-2022 w devtypes.csv")

In [7]:
df = og_df.copy()

In [8]:
def jupyter_display_max(max_rows):
    # set display so you can see all columns, all rows and all cell contents (up to 1k characters)
    pd.options.display.max_columns = None
    pd.set_option('display.max_rows', max_rows)
    pd.options.display.max_colwidth = 1000

In [9]:
def drop_col_by_labels(df,column_names):
    for col in column_names:
        if col in df.columns:
            df = df.drop(col, axis=1)
    return df

In [10]:
def cut_df_down(df):
    
    #Define attribute columns with data on the permission itself
    info_cols = ['Unnamed: 0', 'Address', 'docfragment', 'file_ext', 'filename', 'Decision', 'Link','FromSearch', 'ToSearch', 'AppTypeFrag']
    
    # Define columns to keep for the NLP work
    keep_cols = ['Ref', 'Description', 'Dev_Type','DecDate', 'RefusalReasons']
    
    # find any other columns not in the above categories to remove - these will be those already manually categorised
    x_train_cols = [col for col in df.columns if col not in info_cols and col not in keep_cols]
    
    # for unsupervised model, remove unneeded cols + manually categorised data
    remove_cols = info_cols + x_train_cols
    
    # remove unwanted columns
    df = drop_col_by_labels(df,remove_cols)
    
    return df

In [11]:
def strip_numbers(text):
    
    # create a list of characters if the character is a letter or a space
    strip_numbers = [char for char in text if char.isalpha() or char == " "]
    
    # join the characters again with 'nothing' - as spaces are included above
    strip_numbers = "".join(strip_numbers)
    
    # return the string
    return strip_numbers

In [12]:
def strip_specific_text(text):
    
    text = text.lower()
    # removes common/unwanted/not valuable text and phrases from string
    
    # for each thing you want to remove from the text
    for phrase in master_list:
        # if the lower case version is in the lower case version of the text, replace it with nothing (delete)
        if phrase.lower() in text.lower():
            #print(phrase, " in text")
            text = text.replace(phrase.lower(), "")
    
    return text

In [13]:
# simple func to return length. Used to apply to a list of values
def leng_func(x):
    return len(x)

In [14]:
def Perm_regex(text):
    
    # Identify permission references
    matchtype = r'(?i)(?:PA)?\d{2}[/|_]\d{5}' # e.g. PA12/12345
    matches = re.findall(matchtype, text)
    
    # sort the list by highest number of characters first. This means CLPSP will be replaced before CLP, 
    # so you aren't left with instances of SP on their own
    matches.sort(reverse = True,key=leng_func)
    
    # Don't return anything if it's an empty list / no matches
    if matches ==[]:
        matches = ""
    
    # remove dups by set and turn into a single comma seperated string
    matches = set(matches)
    matches = ",".join(matches)

    return matches

In [15]:
def CLP_regex(text):
    matchtype = r'(Cornwall Local Plan Strategic Policies|Cornwall Local Plan|CLP) (?:\d{4}\s*-\s*\d{4}|\d{4}?)?'
    #matchtype = r'(Cornwall Local Plan Strategic Policies|Cornwall Local Plan|CLP)\s*((\(\d{4}(?:\s*-\s*\d{4})?\))|\d{4}\s*-\s*\d{4}|\d{4})?'
    matches = re.findall(matchtype, text)
    
    # sort the list by highest number of characters first. This means CLPSP will be replaced before CLP, 
    # so you aren't left with instances of SP on their own
    matches.sort(reverse = True,key=leng_func)
    
    # Don't return anything if it's an empty list / no matches
    if matches ==[]:
        matches = ""
        
    # remove dups by set and turn into a single comma seperated string
    matches = set(matches)
    matches = ",".join(matches)
        
    return matches

In [16]:
def AONB_regex(text):
    matchtype = r'(?i)\barea\s*of\s*outstanding\s*natural\s*beauty\b|\bAONB\b|\(AONB\)'
    matches = re.findall(matchtype, text)
    
    # sort the list by highest number of characters first. This means CLPSP will be replaced before CLP, 
    # so you aren't left with instances of SP on their own
    matches.sort(reverse = True,key=leng_func)
    
    # Don't return anything if it's an empty list / no matches
    if matches ==[]:
        matches = ""
        
    # remove dups by set and turn into a single comma seperated string
    matches = set(matches)
    matches = ",".join(matches)
        
    return matches

In [17]:
def NPD_regex(text):
    matchtype = r'\b(?:[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\s+(?:N(?:eighbourhood)?\s*(?:D(?:evelopment)?\s*)?P(?:lan)?|NDP)\b(?:\s+\([^\)]+\))?(?:\s+\d{4}(?:\s*(?:to|-)\s*\d{4})?)?'    
    matches = re.findall(matchtype, text)
    
    # sort the list by highest number of characters first. This means CLPSP will be replaced before CLP, 
    # so you aren't left with instances of SP on their own
    matches.sort(reverse = True,key=leng_func)
    
    # Don't return anything if it's an empty list / no matches
    if matches ==[]:
        matches = ""
        
    # remove dups by set and turn into a single comma seperated string
    matches = set(matches)
    matches = ",".join(matches)
        
    return matches

In [18]:
def saints_rule(text):
    # removes full stop in St. before extracting parish names so it is not read as a full stop
    if 'St. ' in text:
        text.replace('St. ', 'St ')
    #specific case where the word 'form' skews results, but we don't want to lose 'information'
    if ' form ' in text:
        text.replace( ' form ', ' ')
    return text

In [19]:
def stem(text):
    stemmer = SnowballStemmer("english")
    return stemmer.stem(text)

In [20]:
def lemmatize(text):
    return WordNetLemmatizer().lemmatize(text, pos='v')

In [21]:
def stopwords_and_preprocess(text):
    # creates a list of all words passed in if they are not stopwords or v.sml and returns lemmatized version
    result=[]
    for token in gensim.utils.simple_preprocess(text) :
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            # specifiy preprocessing steps here - stemming / lemmatization, both
            # result.append(lemmatize_stemming(token))
            # result.append(stemming(token))
            result.append(lemmatize(token))
    return result

In [22]:
def list_all_values_in_col(col):
    # return a set of all the different values (all values represented only once)
    all_list = set(df[col].tolist())
    # turn the set into a list
    all_list = [item for item in all_list]
    
    # from the list that can contain sublists, flatten into single list
    new_list = []
    for item in all_list:
        if item == '':
            continue
        sublist = item.split(',')
        for item in sublist:
            new_list.append(item)
    
    return new_list

In [23]:
# function that takes a set of values to be replaced with a single value
def replace_text(text, string_set, returnstr):
    for string in string_set:
        text = text.replace(string, returnstr)
    return text

In [24]:
# set JN to display full extent of data
jupyter_display_max(df.shape[0])

In [25]:
# chop the df down to only the bits you will use
df = cut_df_down(df)

In [26]:
# create a lemmatized version of the list
text_to_remove2 = [lemmatize(word) for word in text_to_remove]

In [27]:
# get rid of confusing 'St.' strings
df["RefusalReasons"] = df["RefusalReasons"].apply(saints_rule)
# create a column containing a string of references to NPDs
df['NDPs Referenced'] = df["RefusalReasons"].apply(NPD_regex)
# create a column containing a string of references to CLP
df['CLPs Referenced'] = df["RefusalReasons"].apply(CLP_regex)
# create a column containing a string of permission references
df['Perms Referenced'] = df["RefusalReasons"].apply(Perm_regex)
# create a column containing a string of AONB references
df['AONB Referenced'] = df["RefusalReasons"].apply(AONB_regex)

In [28]:
# from an initial list which could contain lists itself, return a single list containing
# all values in the column, cast to a set to remove duplicates
all_types_list = set(list_all_values_in_col('Dev_Type'))
all_perms_list = set(list_all_values_in_col('Perms Referenced'))
all_NPs_list = set(list_all_values_in_col('NDPs Referenced'))
all_CLPs_list = set(list_all_values_in_col('CLPs Referenced'))
all_AONB_list = set(list_all_values_in_col('AONB Referenced'))

# create a list of all these sets bar Dev_Type (kept) and AONB (cleaned)
master_list = [all_perms_list, all_NPs_list, all_CLPs_list]
# for each of the lists of sets, add all the values together, then convert back to a list for sorting
master_list = list(set().union(*master_list))
# add the extracted phrases to remove to the more generic phrases you have manually added on exploration
master_list = text_to_remove + text_to_remove2 + master_list
# sort the values by largest number of characters first (so .replace will be greedy)
master_list.sort(reverse = True, key=leng_func)

In [29]:
# Tidy the various AONB references to return only 'AONB'
# replace null values with empty string
df['AONB Referenced'].fillna('', inplace=True)
# update non-null values with 'AONB'
df.loc[df['AONB Referenced'] != '', 'AONB Referenced'] = 'AONB'

In [30]:
# replace all iterations of AONB with 'AONB' in the main data
df['RefusalReasons'] = df['RefusalReasons'].apply(replace_text, args=(all_AONB_list, 'AONB'))

In [31]:
# show how many records there are for each development category
df.groupby('Dev_Type')['Dev_Type'].describe().sort_values(by='freq', ascending=False)

Unnamed: 0_level_0,count,unique,top,freq
Dev_Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Minor - Dwelling,1048,1,Minor - Dwelling,1048
Householder,257,1,Householder,257
All other minor developments,154,1,All other minor developments,154
CLEUD/CLOPED,144,1,CLEUD/CLOPED,144
Changes of Use,103,1,Changes of Use,103
TPO applications,78,1,TPO applications,78
Smallscale Major Dwellings,66,1,Smallscale Major Dwellings,66
Minor - Dwelling - PIP apps only,64,1,Minor - Dwelling - PIP apps only,64
Listed Building Consent (alter/extend),48,1,Listed Building Consent (alter/extend),48
All other small scale major developments,21,1,All other small scale major developments,21


In [32]:
'''NOT SURE IF I WANT TO DO THIS YET - THIS CREATES A DATAFRAME FOR EVERY CATEGORY IN THE LIST
    ALLOWS YOU TO EXPLORE THE DATA A LITTLE EASIER TO SEE WHAT TO INCLUDE/EXCLUDE'''
df_dict ={}
for item in all_types_list:
    df_dict[item] = df[df['Dev_Type']==item]

In [33]:
'''First cut the source data down by exluding categories that don't share common refusal reasons - e.g.
very specific types of development'''

# maybe exclude:
# CLUED - Certificate of Lawful Development - these generally refer to whether the use of something
# is lawful or not, so typically refusal reasons differ here
# TPO/TPA - Tree Protection  - generally refusals are for public amenity reasons
#'Listed Building Consent (alter/extend)' - borderline one because it's to do with alterations/extensions

exclude_cats = ['CLEUD/CLOPED','TPO applications','Not Required - Hedgerow Removal','TCA applications']
# Prior Approval - AF2 type == agricultural stuff?
borderline_cats = ['Listed Building Consent (alter/extend)','Prior Approval - AF2 type','Notification - Rail type']

In [34]:
# filter categories to exclude from the df. '~' negates/gives you the opposite of something
df = df[~df['Dev_Type'].isin(exclude_cats)]

In [35]:
# as you've removed entries from the index, they will be out of order, reset to fix
# also required when applying boolean masks later, or when merging on results by index later
df = df.reset_index()

In [36]:
# intital text stripping, including phrases as opposed words and common sentences
# plus stripping of perm references etc, plus stripping of lemmatized versions of text_to_remove
df['cleaned'] = df['RefusalReasons'].apply(strip_specific_text)

In [37]:
df['cleaned'] = df['cleaned'].apply(strip_numbers)

In [38]:
df['cleaned'] = df['cleaned'].apply(stopwords_and_preprocess)

In [39]:
# 'cleaned' is an object datatype, cast as str to allow you to search for substrings for refinement
df['checking col'] = df['cleaned'].astype(str)

In [40]:
# allows you to investigate word fragements and oddities to clean them up in future iterations
#df_topic_dict[5]['cleaned'] = df_topic_dict[5]['cleaned'].astype(str)
#df_topic_dict[5].columns
#df_topic_dict[5][df_topic_dict[5]['cleaned'].str.contains('ination')]
#df[df['checking col'].str.contains('beauty')]
#df[~df['test'].isnull()]

In [41]:
# create a list of lists of the preprocessed docs, where each list is the preprocessed text
processed_docs = df['cleaned'].to_list()

In [42]:
# Create dict from 'processed_docs'; each word and an integer ID. Accessed as usual dictionary[0]
# Later passed to the model for training. Besides the id and word, it also contains frequency info
dictionary = gensim.corpora.Dictionary(processed_docs)

In [43]:
# OPTIONAL STEP Remove very rare and very common words: appearing < 15 times, > 30% of all documents
# keep_n=100000 == only the 100,000 most frequent tokens in the corpus will be kept
#dictionary.filter_extremes(no_below=15, no_above=0.2, keep_n= 100000)

In [44]:
#Create BOW model for each doc i.e for each we create a list of tuples (int_ref, count)
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

In [45]:
"""MANUALLY CHANGE NUMBER OF TOPICS AND PASSES TO REFINE MODEL HERE. WILL BE PASSED TO MODEL IN NEXT STEP"""
num_topics = 8
num_passes = 500
"""
Expected topics might be something like:

AONB,
Character, Location, Visual Harm, Appearance
Flooding/Drainage
Access/Parking
WHS, Historic Environment, Listed Buildings
Conservation/Protected Areas
Design, Scale, overdevelopment
Nieghbouring property and amenity

"""

'\nExpected topics might be something like:\n\nAONB,\nCharacter, Location, Visual Harm, Appearance\nFlooding/Drainage\nAccess/Parking\nWHS, Historic Environment, Listed Buildings\nConservation/Protected Areas\nDesign, Scale, overdevelopment\nNieghbouring property and amenity\n\n'

In [101]:
def train_model():
    #Train your lda model using gensim.models.LdaMulticore and save it to 'lda_model'
    #id2word is a mapping from word ids (integers) to words (strings)
    lda_model =  gensim.models.LdaMulticore(bow_corpus, 
                                       num_topics = num_topics, # select no of topics to try and create from samples
                                       id2word = dictionary, # above comment                                 
                                       passes = num_passes, # number of passes the model with make (> passes = more thorough????)
                                       workers = 2) # no of extra processes to use for parallelization. Uses all available cores by default
    return lda_model

In [85]:
def make_topics_df():
    topic_words = {}
    for topic_num in range(num_topics):
        words_probs = lda_model.show_topic(topic_num, topn=10)
        words = [word for word, _ in words_probs]
        probs = [prob for _, prob in words_probs]
        topic_words[f"Topic {topic_num + 1}"] = words + probs

    topics_df = pd.DataFrame(topic_words)
    
    return topics_df

In [None]:
# just run this if you know the num_topics and num_passes you want to use

#lda_model = train_model()
#topics_df = make_topics_df()

In [98]:
# create a dictionary to house iterations of the model and it's various passes, topic numbers and top 10 words
topics_dict = {}

In [102]:
# for passes ranging from 50 to 450, adding 100 passes each time
#for i in range(50, 450, 100):
for i in range(50,150,50):
    num_passes = i
    # for a number of topics ranging from 5 to 10
    for b in range(5,11):
        num_topics = b
        print("Passes: {}, Topics: {}".format(num_passes, num_topics))
        scenario = "P_" + str(num_passes) + "_T_" + str(num_topics)
        lda_model = train_model()
        topics_dict[scenario] = make_topics_df()

Passes: 50, Topics: 5
Passes: 50, Topics: 6
Passes: 50, Topics: 7
Passes: 50, Topics: 8
Passes: 50, Topics: 9
Passes: 50, Topics: 10
Passes: 100, Topics: 5
Passes: 100, Topics: 6
Passes: 100, Topics: 7
Passes: 100, Topics: 8
Passes: 100, Topics: 9
Passes: 100, Topics: 10


In [106]:
#topics_dict['P_100T_7']

Unnamed: 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7
0,form,conservation,need,property,countryside,aonb,access
1,character,list,affordable,neighbour,character,heritage,absence
2,design,harm,community,overlook,form,landscape,suitable
3,fail,character,market,design,location,harm,safe
4,neighbour,fail,business,noise,rural,character,flood
5,amenity,appearance,fail,occupiers,open,world,fail
6,scale,public,open,increase,residential,management,risk
7,exist,preserve,agricultural,balcony,harm,fail,highway
8,appearance,historic,evidence,guide,special,conserve,provision
9,mass,set,purpose,disturbance,landscape,set,visibility


In [105]:
# get todays date in format (dd_mm_yy)
today = "(" + date.today().strftime("%d_%m_%Y") + ")"
# create filename with variables used in run and the date
filename = "LDA_topics_" + str(num_topics) + "_passes_" + str(num_passes) + today

In [109]:
# create an Excel writer object
writer = pd.ExcelWriter(filename + '.xlsx', engine='xlsxwriter')

# loop over the dictionary and write each dataframe to a separate worksheet
for sheet_name, df in topics_dict.items():
    df.to_excel(writer, sheet_name=sheet_name, index=False)
    
writer.save()

In [66]:
# TWEAKING - For each topic, explore the words occuring in that topic and thier relative weights
for idx, topic in lda_model.print_topics(-1):
    df[topic] = topic
    print("Topic: {} \nWords: {}".format(idx, topic))
    print("\n")

Topic: 0 
Words: 0.071*"access" + 0.028*"safe" + 0.027*"highway" + 0.023*"park" + 0.022*"visibility" + 0.022*"suitable" + 0.019*"vehicles" + 0.018*"road" + 0.016*"fail" + 0.015*"increase"


Topic: 1 
Words: 0.059*"countryside" + 0.039*"form" + 0.030*"special" + 0.028*"character" + 0.026*"absence" + 0.025*"location" + 0.025*"open" + 0.019*"residential" + 0.019*"rural" + 0.017*"unsustainable"


Topic: 2 
Words: 0.020*"need" + 0.017*"affordable" + 0.013*"delivery" + 0.012*"fail" + 0.011*"strategic" + 0.010*"scale" + 0.010*"open" + 0.009*"absence" + 0.009*"suitable" + 0.008*"adverse"


Topic: 3 
Words: 0.030*"character" + 0.028*"design" + 0.022*"fail" + 0.019*"conservation" + 0.019*"appearance" + 0.017*"form" + 0.016*"harm" + 0.015*"neighbour" + 0.013*"scale" + 0.012*"list"


Topic: 4 
Words: 0.020*"provision" + 0.016*"absence" + 0.016*"fail" + 0.015*"secure" + 0.014*"species" + 0.013*"space" + 0.012*"protect" + 0.011*"conservation" + 0.011*"future" + 0.011*"mechanism"


Topic: 5 
Words: 0

In [103]:
#topics_df.sort_values(by=[5],ascending=False)

In [64]:
# TWEAKING - For each topic, explore the words occuring in that topic and thier relative weights
for idx, topic in lda_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic))
    print("\n")

Topic: 0 
Words: 0.071*"access" + 0.028*"safe" + 0.027*"highway" + 0.023*"park" + 0.022*"visibility" + 0.022*"suitable" + 0.019*"vehicles" + 0.018*"road" + 0.016*"fail" + 0.015*"increase"


Topic: 1 
Words: 0.059*"countryside" + 0.039*"form" + 0.030*"special" + 0.028*"character" + 0.026*"absence" + 0.025*"location" + 0.025*"open" + 0.019*"residential" + 0.019*"rural" + 0.017*"unsustainable"


Topic: 2 
Words: 0.020*"need" + 0.017*"affordable" + 0.013*"delivery" + 0.012*"fail" + 0.011*"strategic" + 0.010*"scale" + 0.010*"open" + 0.009*"absence" + 0.009*"suitable" + 0.008*"adverse"


Topic: 3 
Words: 0.030*"character" + 0.028*"design" + 0.022*"fail" + 0.019*"conservation" + 0.019*"appearance" + 0.017*"form" + 0.016*"harm" + 0.015*"neighbour" + 0.013*"scale" + 0.012*"list"


Topic: 4 
Words: 0.020*"provision" + 0.016*"absence" + 0.016*"fail" + 0.015*"secure" + 0.014*"species" + 0.013*"space" + 0.012*"protect" + 0.011*"conservation" + 0.011*"future" + 0.011*"mechanism"


Topic: 5 
Words: 0

In [48]:
def seed_lda_multi():
    
    # THIS VERSION ALLOWS YOU TO WEIGHT SEED WORDS FOR EACH TOPIC TO HELP TO PROCESS
    # Define the seed word lists for each topic
    seed_words_topic1 = ['flood', 'drainage', 'risk']
    seed_words_topic2 = ['access', 'safe', 'highway','visability', 'vehicle']
    seed_words_topic3 = ['countryside', 'open', 'rural', 'location','aonb']
    seed_words_topic4 = ['heritage', 'world', 'historic']
    seed_words_topic5 = ['conservation', 'special', 'enhance','protect']
    seed_words_topic6 = ['neighbour', 'amenities', 'overlook', 'overbear']
    seed_words_topic7 = ['design', 'scale', 'mass']
    # Define more seed word lists for other topics if needed

    num_topics = 7

    # Train the LDA model using the seed words for each topic
    lda_model = gensim.models.LdaModel(bow_corpus, num_topics=num_topics,
                                       id2word=dictionary,
                                       passes=10,
                                       alpha=[0.01]*num_topics,
                                       eta=[[0.8 if word in seed_words_topic1 else 0.001 for word in dictionary.token2id],
                                            [0.8 if word in seed_words_topic2 else 0.001 for word in dictionary.token2id],
                                            [0.8 if word in seed_words_topic3 else 0.001 for word in dictionary.token2id],
                                            [0.8 if word in seed_words_topic4 else 0.001 for word in dictionary.token2id],
                                            [0.8 if word in seed_words_topic5 else 0.001 for word in dictionary.token2id],
                                            [0.8 if word in seed_words_topic6 else 0.001 for word in dictionary.token2id],
                                            [0.8 if word in seed_words_topic7 else 0.001 for word in dictionary.token2id]])

    # Print the topics with their top 10 most important words
    for idx, topic in lda_model.print_topics(num_topics=num_topics, num_words=10):
        print('Topic: {} \nWords: {}'.format(idx, topic))
        print('\n')
        
    return lda_model

" THIS VERSION ALLOWS YOU TO WEIGHT SEED WORDS FOR EACH TOPIC TO HELP TO PROCESS\n# Define the seed word lists for each topic\nseed_words_topic1 = ['flood', 'drainage', 'risk']\nseed_words_topic2 = ['access', 'safe', 'highway','visability', 'vehicle']\nseed_words_topic3 = ['countryside', 'open', 'rural', 'location','aonb']\nseed_words_topic4 = ['heritage', 'world', 'historic']\nseed_words_topic5 = ['conservation', 'special', 'enhance','protect']\nseed_words_topic6 = ['neighbour', 'amenities', 'overlook', 'overbear']\nseed_words_topic7 = ['design', 'scale', 'mass']\n# Define more seed word lists for other topics if needed\n\nnum_topics = 7\n\n# Train the LDA model using the seed words for each topic\nlda_model = gensim.models.LdaModel(bow_corpus, num_topics=num_topics,\n                                   id2word=dictionary,\n                                   passes=10,\n                                   alpha=[0.01]*num_topics,\n                                   eta=[[0.8 if word in 

In [104]:
def seed_lda_single()

    # THIS VERSION ALLOWS YOU TO PROVIDE A LIST OF SEED WORDS TO WEIGHT MORE HIGHLY
    # Specify seed topics that are particularly important in assigning topics
    seed_topic = ['aonb','flood','heritage','conservation','neighbour','countryside', 'affordable']

    # Set the weight of the above seed topics to be high as default - 0.5
    for word in seed_topic:
        for i in range(lda_model.num_topics):
            lda_model.get_topic_terms(i)
            if word in dict(lda_model.get_topic_terms(i)):
                lda_model.get_topic_terms(i)[dict(lda_model.get_topic_terms(i)).get(word)][1] = 1

    # re-train the LDA model with the updated topic distributions
    lda_model.update(bow_corpus)

    # TWEAKING - For each topic, explore the words occuring in that topic and thier relative weights
    for idx, topic in lda_model.print_topics(-1):
        print("Topic: {} \nWords: {}".format(idx, topic))
        print("\n")
    
    return lda_model

SyntaxError: invalid syntax (2968104820.py, line 1)

In [50]:
"""# making sense of the derived topics the model suggests


"""THIS IS A MANUAL EDIT YOU NEED TO MAKE"""


inferred_topics = {
    0: 'Access',
    1: 'Conservation/Heritage/Listed',
    2: 'Unknown',
    3: 'Nieghbours',
    4: 'Countryside',
    5: 'Character,landscape, AONB',
    6: 'Flood/Drainage'
}"""

SyntaxError: invalid syntax (1980341942.py, line 4)

In [None]:
print('Quit code here - manual data entry required for topic naming')

In [None]:
fsds

In [None]:
# Get the topic probabilities for each document in the corpus
document_topics = [lda_model.get_document_topics(bow) for bow in bow_corpus]

In [None]:
# Initialize a list to contain dictionaries with topic_num, topic probabilities for each document
topic_probs_list = []

# Iterate over the documents in the corpus
for doc in bow_corpus:
    # Get the topic and topic probabilities for the current document
    topic_probs = dict(lda_model.get_document_topics(doc))
    # Add the topic probabilities to the list
    topic_probs_list.append(topic_probs)

# Create a Pandas DataFrame from the list of dictionaries
topic_probs_df = pd.DataFrame(topic_probs_list)

In [None]:
# add the topic probabilities dataframe to the original df by index
df = pd.merge(df, topic_probs_df, left_index=True, right_index=True)

In [None]:
# create dictionary of dfs where each df has that topic flag to quickly evaluate the groups
df_topic_dict ={}

In [None]:
for i in range(0,num_topics):
    # drop other columns to make it easier to evaulate
    rem_cols = ['Ref','Description','DecDate','NDPs Referenced','CLPs Referenced','Perms Referenced','AONB Referenced','checking col']
    # sort each topic df by it's topic number (highest probabilities first), after first removing nulls
    df_topic_dict[i] = df[~df[i].isnull()].sort_values(by=[i], ascending=False)
    print(len(df_topic_dict[i]))
    
    # add other topic columns to the list to be removed to make it easier to see
    for col in df_topic_dict[i].columns:
        if df_topic_dict[i][col].dtype in ['int64', 'float64'] and col != i and col !='index':
            rem_cols.append(col)
            df_topic_dict[i]= drop_col_by_labels(df_topic_dict[i],rem_cols)

In [None]:
"""# change the topic numbers in the header to your meaningful topic titles
df = df.rename(columns=inferred_topics)"""

In [None]:
def export_csv():
    from pathlib import Path
    path = "/Users/GlassShark1/Python/Refusals Data/"
    filepath = Path(path + filename + ".csv") 
    print(filepath)
    #filepath.parent.mkdir(parents=True, exist_ok=True)  
    df.to_csv(filepath) 

In [None]:
export_csv()

In [46]:
""" CORE MODEL VERSION
#Train your lda model using gensim.models.LdaMulticore and save it to 'lda_model'
#id2word is a mapping from word ids (integers) to words (strings)
lda_model =  gensim.models.LdaMulticore(bow_corpus, 
                                   num_topics = num_topics, # select no of topics to try and create from samples
                                   id2word = dictionary, # above comment                                 
                                   passes = num_passes, # number of passes the model with make (> passes = more thorough????)
                                   workers = 2) # no of extra processes to use for parallelization. Uses all available cores by default
                                   """

In [None]:
#unseen_document = "The proposed development raises direct conflict with the requirements of adopted policy in the Cornwall Local Plan as the site is clearly located in the open countryside where new homes will only be permitted where there are special circumstances, none of which have been identified in this particular case. The proposed development would clearly erode the rural character of this location by introducing further built development that would result in material harm to the character and appearance of the countryside, thus not conserving the landscape character and natural beauty of the Area of Outstanding Natural Beauty (AONB) in this location. The site is not a sustainable or accessible location for a new dwelling and the development would therefore be contrary to development plan policy and cause material environmental harm to the rural character of the area, contrary to policies 1, 2, 3, 7, 12, 23 and 27 in the Cornwall Local Plan Strategic Policies 2010 - 2030 "

In [None]:
"""# Data preprocessing step for the unseen document
# bow_vector = list of tuples with number representing a word in the corpus and the count of those words???
bow_vector = dictionary.doc2bow(stopwords_and_preprocess(unseen_document))

for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    # number at the end seems to return n words and the strength of their relationship to topic????
    print("Score: {}\n Topic: {}\n".format(score, lda_model.print_topic(index, 10)))"""

In [None]:
# set up logging files to keep track of model / topics etc

In [None]:
'''Preview BOW for our sample preprocessed document - this is on the 20th record as an example'''
"""
document_num = 0 #think this is just the 20th record?

bow_doc_x = bow_corpus[document_num]

for i in range(len(bow_doc_x)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_x[i][0], 
                                                     dictionary[bow_doc_x[i][0]], 
                                                     bow_doc_x[i][1]))"""

In [None]:
"""ASK MN: DON'T UNDERSTAND THIS WELL - CAN'T SEE WHERE IT'S APPLIED??

alpha and eta are hyperparameters that affect sparsity of the document-topic (theta) and topic-word (lambda) distributions. We will let these be the default values for now(default value is 1/num_topics)

Alpha is the per document topic distribution.

High alpha: Every document has a mixture of all topics(documents appear similar to each other).
Low alpha: Every document has a mixture of very few topics
Eta is the per topic word distribution.

High eta: Each topic has a mixture of most words(topics appear similar to each other).
Low eta: Each topic has a mixture of few words."""

In [None]:
'''Checking dictionary created'''
"""count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 100:
        break"""