In [1]:
# keep a list of specific words/phrases you want to remove so as to focus on topic modelling
# World Heritage Site (WHS), Special Area of Conservation (SAC), Area of Great Landscape Value


text_to_remove = [
        'The Local Planning Authority has acted positively and proactively in determining this application by identifying matters of concern with this proposal.',
        ' On this occasion, the issues are so fundamental that it is not possible to negotiate a satisfactory way forward due to the harm that has been clearly identified within the reason(s) for refusal.',
        'January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December',
        "National Planning Policy Framework","NPPF",
        "building", "built",
        "special justification",
        "demonstrate",
        "house", "permission", "condition",
        "section",
        "work",
        "intentions",
        "adopted",
        "adoption",
        "represent",
        "representing",
        "settlement",
        "planning",
        "Cornwall",
        "local",
        "plan",
        "development",
        "policies",
        "contrary",
        "proposal",
        "application",
        "policy",
        "policies",
        "paragraph","paragraphs",
        "permitted",
        "development",
        "area",
        "dwelling",
        "proposed",
        "national",
        "house",
        "housing",
        "justify",
        "town",
        "constitute",
        "sited",
        "siting",
        "guidance",
        "benefit",
        "justification",
        "point",
        "raise",
        "factor",
        "balance",
        "house",
        "identify",
        "fall",
        "land ",
        "provide",
        "circumstances",
        "considerable",
        "tree",
        "resulting",
        "site", 
        "result", 
        "reason",
        "Caradon", "consider", "conflict", 'aim', 'affect', 'impact', 'information','form'
        #"form" can't have form because of "information"
        ]

In [2]:
# sort the list so it is greedy when it later removes entries and doesn't take partial words
text_to_remove = sorted(text_to_remove, key=len, reverse = True)

In [3]:
"""
(0) Load df, trim columns. Remove unwanted record categories (trees, lawfulness etc)
(1) Extract useful data like NPD references and permissions references and add as columns
(1) Preprocess text (remove particular words, remove stopwords, lemmatize)
(2) Create dictionary with an int ID for each remaining word
(3) Filter the dictionary down if desired, removing very rare or very common words
(4) Create BOW for each doc - a list of tuples for each record:
    [[(int ref from the dictionary, word frequency)],[(int ref from the dictionary, word frequency)]]
(5) Train the lda model, providing the dictionary, no of topics, no of passes etc.
(6) Adjust preprocessing steps or variables in (5) to tweak topic model output to meaningful topics
(7) Topic model returns a list of topics, with each topic containing tuples of the word id and the probabilities
that the word will feature in a text of that topic
(8) Create a topic probabilities dataframe, with each topic in columns and values containing probabilities
HOW DOES THIS GET TO A SINGLE PROBABILITY PER RECORD, RATHER THAN PROBABILITIES BY WORD???
(9) Add the topic probabilites to original dataframe


TO DO:

10) Add further columns turning probabilities into boolean values at a given cut off point
11) Add logging feature to track outputs with given variables to improve tweaking performance

"""

'\n(0) Load df, trim columns. Remove unwanted record categories (trees, lawfulness etc)\n(1) Extract useful data like NPD references and permissions references and add as columns\n(1) Preprocess text (remove particular words, remove stopwords, lemmatize)\n(2) Create dictionary with an int ID for each remaining word\n(3) Filter the dictionary down if desired, removing very rare or very common words\n(4) Create BOW for each doc - a list of tuples for each record:\n    [[(int ref from the dictionary, word frequency)],[(int ref from the dictionary, word frequency)]]\n(5) Train the lda model, providing the dictionary, no of topics, no of passes etc.\n(6) Adjust preprocessing steps or variables in (5) to tweak topic model output to meaningful topics\n(7) Topic model returns a list of topics, with each topic containing tuples of the word id and the probabilities\nthat the word will feature in a text of that topic\n(8) Create a topic probabilities dataframe, with each topic in columns and valu

In [4]:
import pandas as pd
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(400)
from datetime import date

In [5]:
#import nltk
#nltk.download('wordnet') # might not be needed once run once
#nltk.download('omw-1.4')

In [6]:
# load the dataframe from an excel file. For now keep an unedited copy and one to manipulate
og_df = pd.read_csv("/Users/GlassShark1/Python/Refusals Data/All Refusals 2019-2022 w devtypes.csv")

In [7]:
df = og_df.copy()

In [8]:
def jupyter_display_max(max_rows):
    # set display so you can see all columns, all rows and all cell contents (up to 1k characters)
    pd.options.display.max_columns = None
    pd.set_option('display.max_rows', max_rows)
    pd.options.display.max_colwidth = 1000

In [9]:
def drop_col_by_labels(df,column_names):
    for col in column_names:
        if col in df.columns:
            df = df.drop(col, axis=1)
    return df

In [10]:
def cut_df_down(df):
    
    #Define attribute columns with data on the permission itself
    info_cols = ['Unnamed: 0', 'Address', 'docfragment', 'file_ext', 'filename', 'Decision', 'Link','FromSearch', 'ToSearch', 'AppTypeFrag']
    
    # Define columns to keep for the NLP work
    keep_cols = ['Ref', 'Description', 'Dev_Type','DecDate', 'RefusalReasons']
    
    # find any other columns not in the above categories to remove - these will be those already manually categorised
    x_train_cols = [col for col in df.columns if col not in info_cols and col not in keep_cols]
    
    # for unsupervised model, remove unneeded cols + manually categorised data
    remove_cols = info_cols + x_train_cols
    
    # remove unwanted columns
    df = drop_col_by_labels(df,remove_cols)
    
    return df

In [11]:
def strip_numbers(text):
    
    # create a list of characters if the character is a letter or a space
    strip_numbers = [char for char in text if char.isalpha() or char == " "]
    
    # join the characters again with 'nothing' - as spaces are included above
    strip_numbers = "".join(strip_numbers)
    
    # return the string
    return strip_numbers

In [12]:
def strip_specific_text(text):
    
    text = text.lower()
    # removes common/unwanted/not valuable text and phrases from string
    
    # for each thing you want to remove from the text
    for phrase in master_list:
        # if the lower case version is in the lower case version of the text, replace it with nothing (delete)
        if phrase.lower() in text.lower():
            #print(phrase, " in text")
            text = text.replace(phrase.lower(), "")
    
    return text

In [13]:
# simple func to return length. Used to apply to a list of values
def leng_func(x):
    return len(x)

In [14]:
def Perm_regex(text):
    
    # Identify permission references
    matchtype = r'(?i)(?:PA)?\d{2}[/|_]\d{5}' # e.g. PA12/12345
    matches = re.findall(matchtype, text)
    
    # sort the list by highest number of characters first. This means CLPSP will be replaced before CLP, 
    # so you aren't left with instances of SP on their own
    matches.sort(reverse = True,key=leng_func)
    
    # Don't return anything if it's an empty list / no matches
    if matches ==[]:
        matches = ""
    
    # remove dups by set and turn into a single comma seperated string
    matches = set(matches)
    matches = ",".join(matches)

    return matches

In [15]:
def CLP_regex(text):
    matchtype = r'(Cornwall Local Plan Strategic Policies|Cornwall Local Plan|CLP) (?:\d{4}\s*-\s*\d{4}|\d{4}?)?'
    #matchtype = r'(Cornwall Local Plan Strategic Policies|Cornwall Local Plan|CLP)\s*((\(\d{4}(?:\s*-\s*\d{4})?\))|\d{4}\s*-\s*\d{4}|\d{4})?'
    matches = re.findall(matchtype, text)
    
    # sort the list by highest number of characters first. This means CLPSP will be replaced before CLP, 
    # so you aren't left with instances of SP on their own
    matches.sort(reverse = True,key=leng_func)
    
    # Don't return anything if it's an empty list / no matches
    if matches ==[]:
        matches = ""
        
    # remove dups by set and turn into a single comma seperated string
    matches = set(matches)
    matches = ",".join(matches)
        
    return matches

In [16]:
def AONB_regex(text):
    matchtype = r'(?i)\barea\s*of\s*outstanding\s*natural\s*beauty\b|\bAONB\b|\(AONB\)'
    matches = re.findall(matchtype, text)
    
    # sort the list by highest number of characters first. This means CLPSP will be replaced before CLP, 
    # so you aren't left with instances of SP on their own
    matches.sort(reverse = True,key=leng_func)
    
    # Don't return anything if it's an empty list / no matches
    if matches ==[]:
        matches = ""
        
    # remove dups by set and turn into a single comma seperated string
    matches = set(matches)
    matches = ",".join(matches)
        
    return matches

In [17]:
def NPD_regex(text):
    matchtype = r'\b(?:[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\s+(?:N(?:eighbourhood)?\s*(?:D(?:evelopment)?\s*)?P(?:lan)?|NDP)\b(?:\s+\([^\)]+\))?(?:\s+\d{4}(?:\s*(?:to|-)\s*\d{4})?)?'    
    matches = re.findall(matchtype, text)
    
    # sort the list by highest number of characters first. This means CLPSP will be replaced before CLP, 
    # so you aren't left with instances of SP on their own
    matches.sort(reverse = True,key=leng_func)
    
    # Don't return anything if it's an empty list / no matches
    if matches ==[]:
        matches = ""
        
    # remove dups by set and turn into a single comma seperated string
    matches = set(matches)
    matches = ",".join(matches)
        
    return matches

In [18]:
def saints_rule(text):
    # removes full stop in St. before extracting parish names so it is not read as a full stop
    if 'St. ' in text:
        text.replace('St. ', 'St ')
    #specific case where the word 'form' skews results, but we don't want to lose 'information'
    if ' form' in text:
        text.replace( ' form', ' ')
    return text

In [19]:
def stem(text):
    stemmer = SnowballStemmer("english")
    return stemmer.stem(text)

In [20]:
def lemmatize(text):
    return WordNetLemmatizer().lemmatize(text, pos='v')

In [21]:
def stopwords_and_preprocess(text):
    # creates a list of all words passed in if they are not stopwords or v.sml and returns lemmatized version
    result=[]
    for token in gensim.utils.simple_preprocess(text) :
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            # specifiy preprocessing steps here - stemming / lemmatization, both
            # result.append(lemmatize_stemming(token))
            # result.append(stemming(token))
            result.append(lemmatize(token))
    return result

In [22]:
def list_all_values_in_col(col):
    # return a set of all the different values (all values represented only once)
    all_list = set(df[col].tolist())
    # turn the set into a list
    all_list = [item for item in all_list]
    
    # from the list that can contain sublists, flatten into single list
    new_list = []
    for item in all_list:
        if item == '':
            continue
        sublist = item.split(',')
        for item in sublist:
            new_list.append(item)
    
    return new_list

In [23]:
# function that takes a set of values to be replaced with a single value
def replace_text(text, string_set, returnstr):
    for string in string_set:
        text = text.replace(string, returnstr)
    return text

In [24]:
def seed_lda_multi():
    
    # THIS VERSION ALLOWS YOU TO WEIGHT SEED WORDS FOR EACH TOPIC TO HELP TO PROCESS
    # Define the seed word lists for each topic
    seed_words_topic1 = ['flood', 'drainage', 'risk']
    seed_words_topic2 = ['access', 'safe', 'highway','visability', 'vehicle']
    seed_words_topic3 = ['countryside', 'open', 'rural', 'location','aonb']
    seed_words_topic4 = ['heritage', 'world', 'historic']
    seed_words_topic5 = ['conservation', 'special', 'enhance','protect']
    seed_words_topic6 = ['neighbour', 'amenities', 'overlook', 'overbear']
    seed_words_topic7 = ['design', 'scale', 'mass']
    # Define more seed word lists for other topics if needed

    num_topics = 7

    # Train the LDA model using the seed words for each topic
    lda_model = gensim.models.LdaModel(bow_corpus, num_topics=num_topics,
                                       id2word=dictionary,
                                       passes=10,
                                       alpha=[0.01]*num_topics,
                                       eta=[[0.8 if word in seed_words_topic1 else 0.001 for word in dictionary.token2id],
                                            [0.8 if word in seed_words_topic2 else 0.001 for word in dictionary.token2id],
                                            [0.8 if word in seed_words_topic3 else 0.001 for word in dictionary.token2id],
                                            [0.8 if word in seed_words_topic4 else 0.001 for word in dictionary.token2id],
                                            [0.8 if word in seed_words_topic5 else 0.001 for word in dictionary.token2id],
                                            [0.8 if word in seed_words_topic6 else 0.001 for word in dictionary.token2id],
                                            [0.8 if word in seed_words_topic7 else 0.001 for word in dictionary.token2id]])

    # Print the topics with their top 10 most important words
    for idx, topic in lda_model.print_topics(num_topics=num_topics, num_words=10):
        print('Topic: {} \nWords: {}'.format(idx, topic))
        print('\n')
        
    return lda_model

In [25]:
def seed_lda_single():

    # THIS VERSION ALLOWS YOU TO PROVIDE A LIST OF SEED WORDS TO WEIGHT MORE HIGHLY
    # Specify seed topics that are particularly important in assigning topics
    seed_topic = ['aonb','flood','heritage','conservation','neighbour','countryside', 'affordable']

    # Set the weight of the above seed topics to be high as default - 0.5
    for word in seed_topic:
        for i in range(lda_model.num_topics):
            lda_model.get_topic_terms(i)
            if word in dict(lda_model.get_topic_terms(i)):
                lda_model.get_topic_terms(i)[dict(lda_model.get_topic_terms(i)).get(word)][1] = 1

    # re-train the LDA model with the updated topic distributions
    lda_model.update(bow_corpus)

    # TWEAKING - For each topic, explore the words occuring in that topic and thier relative weights
    for idx, topic in lda_model.print_topics(-1):
        print("Topic: {} \nWords: {}".format(idx, topic))
        print("\n")
    
    return lda_model

In [26]:
def train_model(num_topics, num_passes):
    #Train your lda model using gensim.models.LdaMulticore and save it to 'lda_model'
    #id2word is a mapping from word ids (integers) to words (strings)
    lda_model =  gensim.models.LdaMulticore(bow_corpus, 
                                       num_topics = num_topics, # select no of topics to try and create from samples
                                       id2word = dictionary, # above comment                                 
                                       passes = num_passes, # number of passes the model with make (> passes = more thorough????)
                                       workers = 2) # no of extra processes to use for parallelization. Uses all available cores by default
    return lda_model

In [27]:
def make_topics_df(lda_model, num_topics, num_passes):
    
    topic_words = {}
    for topic_num in range(num_topics):
        words_probs = lda_model.show_topic(topic_num, topn=10)
        words = [word for word, _ in words_probs]
        probs = [prob for _, prob in words_probs]
        topic_words[f"Topic {topic_num}"] = words + probs

    topics_df = pd.DataFrame(topic_words)
    
    return topics_df

In [28]:
def make_topic_ranges():
    # for passes ranging from 50 to 450, adding 100 passes each time
    #for i in range(50, 450, 100):
    for i in range(100,300,100):
        num_passes = i
        # for a number of topics ranging from 5 to 10
        for b in range(10,15):
            num_topics = b
            print("Passes: {}, Topics: {}".format(num_passes, num_topics))
            scenario = "P_" + str(num_passes) + "_T_" + str(num_topics)
            lda_model = train_model()
            topics_dict[scenario] = make_topics_df(lda_model, num_topics, num_passes)

In [29]:
# set JN to display full extent of data
jupyter_display_max(df.shape[0])

In [30]:
# chop the df down to only the bits you will use
df = cut_df_down(df)

In [31]:
# create a lemmatized version of the list
text_to_remove2 = [lemmatize(word) for word in text_to_remove]

In [32]:
# get rid of confusing 'St.' strings
df["RefusalReasons"] = df["RefusalReasons"].apply(saints_rule)
# create a column containing a string of references to NPDs
df['NDPs Referenced'] = df["RefusalReasons"].apply(NPD_regex)
# create a column containing a string of references to CLP
df['CLPs Referenced'] = df["RefusalReasons"].apply(CLP_regex)
# create a column containing a string of permission references
df['Perms Referenced'] = df["RefusalReasons"].apply(Perm_regex)
# create a column containing a string of AONB references
df['AONB Referenced'] = df["RefusalReasons"].apply(AONB_regex)

In [33]:
# from an initial list which could contain lists itself, return a single list containing
# all values in the column, cast to a set to remove duplicates
all_types_list = set(list_all_values_in_col('Dev_Type'))
all_perms_list = set(list_all_values_in_col('Perms Referenced'))
all_NPs_list = set(list_all_values_in_col('NDPs Referenced'))
all_CLPs_list = set(list_all_values_in_col('CLPs Referenced'))
all_AONB_list = set(list_all_values_in_col('AONB Referenced'))

# create a list of all these sets bar Dev_Type (kept) and AONB (cleaned)
master_list = [all_perms_list, all_NPs_list, all_CLPs_list]
# for each of the lists of sets, add all the values together, then convert back to a list for sorting
master_list = list(set().union(*master_list))
# add the extracted phrases to remove to the more generic phrases you have manually added on exploration
master_list = text_to_remove + text_to_remove2 + master_list
# sort the values by largest number of characters first (so .replace will be greedy)
master_list.sort(reverse = True, key=leng_func)

In [34]:
# Tidy the various AONB references to return only 'AONB'
# replace null values with empty string
df['AONB Referenced'].fillna('', inplace=True)
# update non-null values with 'AONB'
df.loc[df['AONB Referenced'] != '', 'AONB Referenced'] = 'AONB'

In [35]:
# replace all iterations of AONB with 'AONB' in the main data
df['RefusalReasons'] = df['RefusalReasons'].apply(replace_text, args=(all_AONB_list, 'AONB'))

In [36]:
# show how many records there are for each development category
df.groupby('Dev_Type')['Dev_Type'].describe().sort_values(by='freq', ascending=False)

Unnamed: 0_level_0,count,unique,top,freq
Dev_Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Minor - Dwelling,1048,1,Minor - Dwelling,1048
Householder,257,1,Householder,257
All other minor developments,154,1,All other minor developments,154
CLEUD/CLOPED,144,1,CLEUD/CLOPED,144
Changes of Use,103,1,Changes of Use,103
TPO applications,78,1,TPO applications,78
Smallscale Major Dwellings,66,1,Smallscale Major Dwellings,66
Minor - Dwelling - PIP apps only,64,1,Minor - Dwelling - PIP apps only,64
Listed Building Consent (alter/extend),48,1,Listed Building Consent (alter/extend),48
All other small scale major developments,21,1,All other small scale major developments,21


In [37]:
'''NOT SURE IF I WANT TO DO THIS YET - THIS CREATES A DATAFRAME FOR EVERY CATEGORY IN THE LIST
    ALLOWS YOU TO EXPLORE THE DATA A LITTLE EASIER TO SEE WHAT TO INCLUDE/EXCLUDE'''
df_dict ={}
for item in all_types_list:
    df_dict[item] = df[df['Dev_Type']==item]

In [38]:
'''First cut the source data down by exluding categories that don't share common refusal reasons - e.g.
very specific types of development'''

# maybe exclude:
# CLUED - Certificate of Lawful Development - these generally refer to whether the use of something
# is lawful or not, so typically refusal reasons differ here
# TPO/TPA - Tree Protection  - generally refusals are for public amenity reasons
#'Listed Building Consent (alter/extend)' - borderline one because it's to do with alterations/extensions

exclude_cats = ['CLEUD/CLOPED','TPO applications','Not Required - Hedgerow Removal','TCA applications']
# Prior Approval - AF2 type == agricultural stuff?
borderline_cats = ['Listed Building Consent (alter/extend)','Prior Approval - AF2 type','Notification - Rail type']

In [39]:
# filter categories to exclude from the df. '~' negates/gives you the opposite of something
df = df[~df['Dev_Type'].isin(exclude_cats)]

In [40]:
# as you've removed entries from the index, they will be out of order, reset to fix
# also required when applying boolean masks later, or when merging on results by index later
df = df.reset_index()

In [41]:
# intital text stripping, including phrases as opposed words and common sentences
# plus stripping of perm references etc, plus stripping of lemmatized versions of text_to_remove
df['cleaned'] = df['RefusalReasons'].apply(strip_specific_text)

In [42]:
df['cleaned'] = df['cleaned'].apply(strip_numbers)

In [43]:
df['cleaned'] = df['cleaned'].apply(stopwords_and_preprocess)

In [44]:
# 'cleaned' is an object datatype, cast as str to allow you to search for substrings for refinement
df['checking col'] = df['cleaned'].astype(str)

In [45]:
# allows you to investigate word fragements and oddities to clean them up in future iterations
#df_topic_dict[5]['cleaned'] = df_topic_dict[5]['cleaned'].astype(str)
#df_topic_dict[5].columns
#df_topic_dict[5][df_topic_dict[5]['cleaned'].str.contains('ination')]
#df[df['checking col'].str.contains('beauty')]
#df[~df['test'].isnull()]

In [46]:
# create a list of lists of the preprocessed docs, where each list is the preprocessed text
processed_docs = df['cleaned'].to_list()

In [47]:
# Create dict from 'processed_docs'; each word and an integer ID. Accessed as usual dictionary[0]
# Later passed to the model for training. Besides the id and word, it also contains frequency info
dictionary = gensim.corpora.Dictionary(processed_docs)

In [48]:
# OPTIONAL STEP Remove very rare and very common words: appearing < 15 times, > 30% of all documents
# keep_n=100000 == only the 100,000 most frequent tokens in the corpus will be kept
#dictionary.filter_extremes(no_below=15, no_above=0.2, keep_n= 100000)

In [49]:
#Create BOW model for each doc i.e for each we create a list of tuples (int_ref, count)
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

In [50]:
# just run this if you know the num_topics and num_passes you want to use
# best iteration currently 8 topics, 100 passes
num_topics = 20
num_passes = 100
lda_model = train_model(num_topics, num_passes)
topics_df = make_topics_df(lda_model, num_topics, num_passes)

In [51]:
# USE THIS VERSION TO RUN MULTIPLE ITERATIONS OF PASSES/TOPICS
# create a dictionary to house iterations of the model and it's various passes, topic numbers and top 10 words
#topics_dict = {}
#make_topic_ranges()

In [52]:
# get todays date in format (dd_mm_yy)
today = "(" + date.today().strftime("%d_%m_%Y") + ")"
# create filename with variables used in run and the date
filename = "LDA_topics_" + str(num_topics) + "_passes_" + str(num_passes) + today

In [53]:
topics_df

Unnamed: 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Topic 8,Topic 9,Topic 10,Topic 11,Topic 12,Topic 13,Topic 14,Topic 15,Topic 16,Topic 17,Topic 18,Topic 19,Topic 20
0,provision,access,neighbour,residential,aonb,access,reference,harm,character,residential,need,conservation,environment,conservation,species,countryside,heritage,space,flood,countryside
1,secure,safe,overlook,neighbour,landscape,boundary,approve,public,design,open,affordable,harm,harm,special,protect,character,world,amenity,risk,character
2,affordable,highway,property,noise,rural,reverse,exist,substantial,fail,countryside,business,character,natural,integrity,absence,absence,landscape,future,assessment,location
3,mechanism,suitable,design,proximity,set,open,country,open,scale,location,location,list,visual,adversely,biodivery,open,mine,park,zone,unsustainable
4,absence,fail,loss,close,conserve,harm,fail,require,appearance,require,evidence,heritage,location,combination,survey,special,harm,wall,absence,facilities
5,contributions,visibility,privacy,properties,beauty,field,grant,schedule,exist,reliant,fail,appearance,delivery,influence,advice,harm,universal,occupants,fail,service
6,conservation,vehicles,balcony,occupiers,enhance,view,change,ably,surround,unsustainable,purpose,fail,fail,zone,ecological,appearance,outstanding,standard,water,rural
7,financial,road,overbear,disturbance,management,incremental,subject,provision,mass,rural,market,public,limit,project,natural,rural,historic,character,surface,open
8,need,park,rear,amenities,character,north,matter,furthermore,appear,fail,rural,historic,provision,ability,adverse,infill,character,loss,test,sustainable
9,identify,safety,amenity,exist,residential,turn,reserve,exist,cramp,design,essential,set,economic,mitigate,assess,location,value,scheme,community,private


In [54]:
# create an Excel writer object
writer = pd.ExcelWriter(filename + '.xlsx', engine='xlsxwriter')

# if topics_dict exists (e.g. if you ran multiple iterations)
if 'topics_dict' in locals() and isinstance(topics_dict, dict):
    print('topics_dict saved - one sheet per iteration')
    # loop over the dictionary and write each dataframe to a separate worksheet
    for sheet_name, df in topics_dict.items():
        df.to_excel(writer, sheet_name=sheet_name, index=False)
else:
    # if you just ran a single scenario
    scenario = "P_" + str(num_passes) + "_T_" + str(num_topics)
    df.to_excel(writer, sheet_name=scenario, index=False)
    print('single scenario saved')

writer.save()

single scenario saved


In [55]:
scenario

'P_100_T_20'

In [56]:
topics_df

Unnamed: 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Topic 8,Topic 9,Topic 10,Topic 11,Topic 12,Topic 13,Topic 14,Topic 15,Topic 16,Topic 17,Topic 18,Topic 19,Topic 20
0,provision,access,neighbour,residential,aonb,access,reference,harm,character,residential,need,conservation,environment,conservation,species,countryside,heritage,space,flood,countryside
1,secure,safe,overlook,neighbour,landscape,boundary,approve,public,design,open,affordable,harm,harm,special,protect,character,world,amenity,risk,character
2,affordable,highway,property,noise,rural,reverse,exist,substantial,fail,countryside,business,character,natural,integrity,absence,absence,landscape,future,assessment,location
3,mechanism,suitable,design,proximity,set,open,country,open,scale,location,location,list,visual,adversely,biodivery,open,mine,park,zone,unsustainable
4,absence,fail,loss,close,conserve,harm,fail,require,appearance,require,evidence,heritage,location,combination,survey,special,harm,wall,absence,facilities
5,contributions,visibility,privacy,properties,beauty,field,grant,schedule,exist,reliant,fail,appearance,delivery,influence,advice,harm,universal,occupants,fail,service
6,conservation,vehicles,balcony,occupiers,enhance,view,change,ably,surround,unsustainable,purpose,fail,fail,zone,ecological,appearance,outstanding,standard,water,rural
7,financial,road,overbear,disturbance,management,incremental,subject,provision,mass,rural,market,public,limit,project,natural,rural,historic,character,surface,open
8,need,park,rear,amenities,character,north,matter,furthermore,appear,fail,rural,historic,provision,ability,adverse,infill,character,loss,test,sustainable
9,identify,safety,amenity,exist,residential,turn,reserve,exist,cramp,design,essential,set,economic,mitigate,assess,location,value,scheme,community,private


In [57]:
"""# TWEAKING - For each topic, explore the words occuring in that topic and thier relative weights
for idx, topic in lda_model.print_topics(-1):
    df[topic] = topic
    print("Topic: {} \nWords: {}".format(idx, topic))
    print("\n")"""

'# TWEAKING - For each topic, explore the words occuring in that topic and thier relative weights\nfor idx, topic in lda_model.print_topics(-1):\n    df[topic] = topic\n    print("Topic: {} \nWords: {}".format(idx, topic))\n    print("\n")'

In [58]:
#topics_df.sort_values(by=[5],ascending=False)

In [74]:
# set a dictionary to rena

inferred_topics = {
    0: "Affordable Housing",
    1: "Access",
    2: "Neighbours and amenties",
    3: "Noise & Disturbance",
    4: "AONB, Landscape & Appearance",
    5: 5,
    6: 6,
    7: 7,
    8: "Design / Character",
    9: 9,
    10: 10,
    11: "Conservation & harm to Listed Buildings",
    12: 12,
    13: 13,
    14: "Natural Environment",
    15: 15,
    16: "Heritage and History",
    17: "Space and amenities",
    18: "Flooding and Drainage",
    19: "Countryside & Character",
    }

In [70]:
# Save the model
lda_model.save('lda_model')

# To load the saved model
#from gensim.models import LdaModel
#lda_model = LdaModel.load('lda_model')

In [75]:
topics_df

Unnamed: 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Topic 8,Topic 9,Topic 10,Topic 11,Topic 12,Topic 13,Topic 14,Topic 15,Topic 16,Topic 17,Topic 18,Topic 19,Topic 20
0,provision,access,neighbour,residential,aonb,access,reference,harm,character,residential,need,conservation,environment,conservation,species,countryside,heritage,space,flood,countryside
1,secure,safe,overlook,neighbour,landscape,boundary,approve,public,design,open,affordable,harm,harm,special,protect,character,world,amenity,risk,character
2,affordable,highway,property,noise,rural,reverse,exist,substantial,fail,countryside,business,character,natural,integrity,absence,absence,landscape,future,assessment,location
3,mechanism,suitable,design,proximity,set,open,country,open,scale,location,location,list,visual,adversely,biodivery,open,mine,park,zone,unsustainable
4,absence,fail,loss,close,conserve,harm,fail,require,appearance,require,evidence,heritage,location,combination,survey,special,harm,wall,absence,facilities
5,contributions,visibility,privacy,properties,beauty,field,grant,schedule,exist,reliant,fail,appearance,delivery,influence,advice,harm,universal,occupants,fail,service
6,conservation,vehicles,balcony,occupiers,enhance,view,change,ably,surround,unsustainable,purpose,fail,fail,zone,ecological,appearance,outstanding,standard,water,rural
7,financial,road,overbear,disturbance,management,incremental,subject,provision,mass,rural,market,public,limit,project,natural,rural,historic,character,surface,open
8,need,park,rear,amenities,character,north,matter,furthermore,appear,fail,rural,historic,provision,ability,adverse,infill,character,loss,test,sustainable
9,identify,safety,amenity,exist,residential,turn,reserve,exist,cramp,design,essential,set,economic,mitigate,assess,location,value,scheme,community,private


In [81]:
# Get the topic probabilities for each document in the corpus
document_topics = [lda_model.get_document_topics(bow) for bow in bow_corpus]

In [82]:
# Initialize a list to contain dictionaries with topic_num, topic probabilities for each document
topic_probs_list = []

# Iterate over the documents in the corpus
for doc in bow_corpus:
    # Get the topic and topic probabilities for the current document
    topic_probs = dict(lda_model.get_document_topics(doc))
    # Add the topic probabilities to the list
    topic_probs_list.append(topic_probs)

# Create a Pandas DataFrame from the list of dictionaries
topic_probs_df = pd.DataFrame(topic_probs_list)

In [83]:
# add the topic probabilities dataframe to the original df by index
df = pd.merge(df, topic_probs_df, left_index=True, right_index=True)

In [84]:
# create dictionary of dfs where each df has that topic flag to quickly evaluate the groups
df_topic_dict ={}

In [85]:
for i in range(0,num_topics):
    # drop other columns to make it easier to evaulate
    rem_cols = ['Ref','Description','DecDate','NDPs Referenced','CLPs Referenced','Perms Referenced','AONB Referenced','checking col']
    # sort each topic df by it's topic number (highest probabilities first), after first removing nulls
    df_topic_dict[i] = df[~df[i].isnull()].sort_values(by=[i], ascending=False)
    print(len(df_topic_dict[i]))
    
    # add other topic columns to the list to be removed to make it easier to see
    for col in df_topic_dict[i].columns:
        if df_topic_dict[i][col].dtype in ['int64', 'float64'] and col != i and col !='index':
            rem_cols.append(col)
            df_topic_dict[i]= drop_col_by_labels(df_topic_dict[i],rem_cols)

198
351
247
235
519
142
127
161
625
135
258
552
228
252
231
515
241
263
205
589


In [86]:
df.head()

Unnamed: 0,index,Ref,Description,DecDate,RefusalReasons,Dev_Type,NDPs Referenced,CLPs Referenced,Perms Referenced,AONB Referenced,cleaned,checking col,15,19,2,8,14,4,7,9,1,17,6,5,10,18,11,3,16,13,12,0
0,0,PA19/08993,Self Build Single Storey Dwelling,31/10/2019,"The application site is located outside of the established and definable limits of any settlement, and does not represent infill or rounding off as defined in policy 3 of the Cornwall Local Plan Strategic Policies 2010 - 2030. The proposal would represent a residential unit of accommodation in the countryside where no special justification has been satisfactorily demonstrated. As a consequence the proposed development is considered to represent undesirable, unsustainable, sporadic residential development which is beyond any defined development limit eroding the character and appearance of this part of the Area of Great Landscape Value. The development is therefore contrary to policy 3, 7 and 23 of The Cornwall Local Plan and paragraph 79 and Section 15 of the National Planning Policy Framework 2019.",Minor - Dwelling,,"Cornwall Local Plan Strategic Policies,Cornwall Local Plan",,,"[locate, outside, establish, definable, limit, infill, round, define, residential, unit, accommodation, countryside, satisily, consequence, undesirable, unsustainable, sporadic, residential, define, limit, erode, character, appearance, great, landscape, value]","['locate', 'outside', 'establish', 'definable', 'limit', 'infill', 'round', 'define', 'residential', 'unit', 'accommodation', 'countryside', 'satisily', 'consequence', 'undesirable', 'unsustainable', 'sporadic', 'residential', 'define', 'limit', 'erode', 'character', 'appearance', 'great', 'landscape', 'value']",0.619888,0.346771,,,,,,,,,,,,,,,,,,
1,1,PA19/08982,Proposed replacement dwelling with variation of condition 2 in respect of decision PA19/03272 dated 11.09.19,22/10/2019,"1 The proposed amendments to the approved development by reason of the increased scale, height and mass of the dwelling would significantly increase the visual prominence of the dwelling within a sensitive exposed location, with a high level of public experience and as such the dominant visual appearance would be detrimental to the landscape character and visual amenity of the Area of Great Landscape Value and to users of the beach, South West Coast Path and footpath. This is therefore contrary to Policy 2, 12 and 23 of The Cornwall Local Plan Strategic Policies 2010-2030 and paragraph 127 and 170 of the National Planning Policy Framework 2019.",Minor - Dwelling,,Cornwall Local Plan Strategic Policies,,,"[amendments, approve, increase, scale, height, mass, significantly, increase, visual, prominence, senive, expose, location, high, level, public, experience, dominant, visual, appearance, detrimental, landscape, character, visual, amenity, great, landscape, value, users, beach, south, west, coast, path, footpath]","['amendments', 'approve', 'increase', 'scale', 'height', 'mass', 'significantly', 'increase', 'visual', 'prominence', 'senive', 'expose', 'location', 'high', 'level', 'public', 'experience', 'dominant', 'visual', 'appearance', 'detrimental', 'landscape', 'character', 'visual', 'amenity', 'great', 'landscape', 'value', 'users', 'beach', 'south', 'west', 'coast', 'path', 'footpath']",,0.195716,0.170272,0.480105,0.131682,,,,,,,,,,,,,,,
2,2,PA19/09063,Outline planning permission with all matters reserved. Detached single dwelling and garage,11/10/2019,"The proposed development raises direct conflict with the requirements of adopted policy in the Cornwall Local Plan as the site is clearly located in the open countryside where new homes will only be permitted where there are special circumstances, none of which have been identified in this particular case. The proposed development would clearly erode the rural character of this location by introducing further built development that would result in material harm to the character and appearance of the countryside, thus not conserving the landscape character and natural beauty of the AONB AONB in this location. The site is not a sustainable or accessible location for a new dwelling and the development would therefore be contrary to development plan policy and cause material environmental harm to the rural character of the area, contrary to policies 1, 2, 3, 7, 12, 23 and 27 in the Cornwall Local Plan Strategic Policies 2010 - 2030 and paragraphs 79, 108, 110, 170 a...",Minor - Dwelling,,"Cornwall Local Plan Strategic Policies,Cornwall Local Plan",,AONB,"[direct, requirements, clearly, locate, open, countryside, home, special, identify, particular, case, clearly, erode, rural, character, location, introduce, material, harm, character, appearance, countryside, conserve, landscape, character, natural, beauty, aonb, aonb, location, sustainable, accessible, location, cause, material, environmental, harm, rural, character]","['direct', 'requirements', 'clearly', 'locate', 'open', 'countryside', 'home', 'special', 'identify', 'particular', 'case', 'clearly', 'erode', 'rural', 'character', 'location', 'introduce', 'material', 'harm', 'character', 'appearance', 'countryside', 'conserve', 'landscape', 'character', 'natural', 'beauty', 'aonb', 'aonb', 'location', 'sustainable', 'accessible', 'location', 'cause', 'material', 'environmental', 'harm', 'rural', 'character']",0.263592,0.592903,,,,0.122254,,,,,,,,,,,,,,
3,3,PA19/08883,Erection of domestic garage with associated works.,21/10/2019,"The site forms part of a cohesive, high quality, residential development, set back from the road frontage with open front amenity areas. The proposed development by reason of its siting would visually compete with the host dwelling and introduce an inappropriate and overly dominant feature within the street scene, harming the visual amenity of the immediate surroundings. As such, the proposal is contrary to Policy 12 of the Cornwall Local Plan 2010-2030, adopted November 2016, together with the provisions of the National Planning Policy Framework 2019, with particular reference to Paragraph 127.",Householder,,Cornwall Local Plan,,,"[cohesive, high, quality, residential, road, frontage, open, amenity, visually, compete, host, introduce, inappropriate, overly, dominant, feature, scene, harm, visual, amenity, immediate, surround, provision, particular, reference]","['cohesive', 'high', 'quality', 'residential', 'road', 'frontage', 'open', 'amenity', 'visually', 'compete', 'host', 'introduce', 'inappropriate', 'overly', 'dominant', 'feature', 'scene', 'harm', 'visual', 'amenity', 'immediate', 'surround', 'provision', 'particular', 'reference']",,,,0.66524,,,0.225996,0.076068,,,,,,,,,,,,
4,4,PA19/08792,Outline application with some matters reserved for construction of two dwellings to round-off the residential developments of Goonearl,29/10/2019,"The proposal for two dwellings outside any settlement and without any special justification results in an unsustainable form of development in the countryside. The accessibility of the location in accessing modes of transport other than the private car is poor which will result in reliance on private cars for the majority of trips reinforcing the unsustainable location of the development. The proposal is therefore contrary to Cornwall Local Plan 2010-2030 policies 1, 3, 7 and 27, St Agnes Neighbourhood Development Plan policies 1 and 2 and the aims at paragraphs 8 and 79 of the National Planning Policy Framework.",Minor - Dwelling,St Agnes Neighbourhood Development Plan,Cornwall Local Plan,,,"[outside, unsustainable, countryside, accessibility, location, access, modes, transport, private, poor, reliance, private, cars, majority, trip, reinforce, unsustainable, location]","['outside', 'unsustainable', 'countryside', 'accessibility', 'location', 'access', 'modes', 'transport', 'private', 'poor', 'reliance', 'private', 'cars', 'majority', 'trip', 'reinforce', 'unsustainable', 'location']",,0.739189,,,,,,,0.156558,0.0595,,,,,,,,,,


In [87]:
# change the topic numbers in the header to your meaningful topic titles
df = df.rename(columns=inferred_topics)

In [102]:
# remove the topics you are not interested in (left as numbers)
rem_cols = []
for col in df.columns:
    if str(df[col].name).isdigit():
        print(col)
        rem_cols.append(col)
        df= drop_col_by_labels(df,rem_cols)

15
7
9
6
5
10
13
12


In [103]:
df.head()

Unnamed: 0,index,Ref,Description,DecDate,RefusalReasons,Dev_Type,NDPs Referenced,CLPs Referenced,Perms Referenced,AONB Referenced,cleaned,checking col,Countryside & Character,Neighbours and amenties,Design / Character,Natural Environment,"AONB, Landscape & Appearance",Access,Space and amenities,Flooding and Drainage,Conservation & harm to Listed Buildings,Noise & Disturbance,Heritage and History,Affordable Housing
0,0,PA19/08993,Self Build Single Storey Dwelling,31/10/2019,"The application site is located outside of the established and definable limits of any settlement, and does not represent infill or rounding off as defined in policy 3 of the Cornwall Local Plan Strategic Policies 2010 - 2030. The proposal would represent a residential unit of accommodation in the countryside where no special justification has been satisfactorily demonstrated. As a consequence the proposed development is considered to represent undesirable, unsustainable, sporadic residential development which is beyond any defined development limit eroding the character and appearance of this part of the Area of Great Landscape Value. The development is therefore contrary to policy 3, 7 and 23 of The Cornwall Local Plan and paragraph 79 and Section 15 of the National Planning Policy Framework 2019.",Minor - Dwelling,,"Cornwall Local Plan Strategic Policies,Cornwall Local Plan",,,"[locate, outside, establish, definable, limit, infill, round, define, residential, unit, accommodation, countryside, satisily, consequence, undesirable, unsustainable, sporadic, residential, define, limit, erode, character, appearance, great, landscape, value]","['locate', 'outside', 'establish', 'definable', 'limit', 'infill', 'round', 'define', 'residential', 'unit', 'accommodation', 'countryside', 'satisily', 'consequence', 'undesirable', 'unsustainable', 'sporadic', 'residential', 'define', 'limit', 'erode', 'character', 'appearance', 'great', 'landscape', 'value']",0.346771,,,,,,,,,,,
1,1,PA19/08982,Proposed replacement dwelling with variation of condition 2 in respect of decision PA19/03272 dated 11.09.19,22/10/2019,"1 The proposed amendments to the approved development by reason of the increased scale, height and mass of the dwelling would significantly increase the visual prominence of the dwelling within a sensitive exposed location, with a high level of public experience and as such the dominant visual appearance would be detrimental to the landscape character and visual amenity of the Area of Great Landscape Value and to users of the beach, South West Coast Path and footpath. This is therefore contrary to Policy 2, 12 and 23 of The Cornwall Local Plan Strategic Policies 2010-2030 and paragraph 127 and 170 of the National Planning Policy Framework 2019.",Minor - Dwelling,,Cornwall Local Plan Strategic Policies,,,"[amendments, approve, increase, scale, height, mass, significantly, increase, visual, prominence, senive, expose, location, high, level, public, experience, dominant, visual, appearance, detrimental, landscape, character, visual, amenity, great, landscape, value, users, beach, south, west, coast, path, footpath]","['amendments', 'approve', 'increase', 'scale', 'height', 'mass', 'significantly', 'increase', 'visual', 'prominence', 'senive', 'expose', 'location', 'high', 'level', 'public', 'experience', 'dominant', 'visual', 'appearance', 'detrimental', 'landscape', 'character', 'visual', 'amenity', 'great', 'landscape', 'value', 'users', 'beach', 'south', 'west', 'coast', 'path', 'footpath']",0.195716,0.170272,0.480105,0.131682,,,,,,,,
2,2,PA19/09063,Outline planning permission with all matters reserved. Detached single dwelling and garage,11/10/2019,"The proposed development raises direct conflict with the requirements of adopted policy in the Cornwall Local Plan as the site is clearly located in the open countryside where new homes will only be permitted where there are special circumstances, none of which have been identified in this particular case. The proposed development would clearly erode the rural character of this location by introducing further built development that would result in material harm to the character and appearance of the countryside, thus not conserving the landscape character and natural beauty of the AONB AONB in this location. The site is not a sustainable or accessible location for a new dwelling and the development would therefore be contrary to development plan policy and cause material environmental harm to the rural character of the area, contrary to policies 1, 2, 3, 7, 12, 23 and 27 in the Cornwall Local Plan Strategic Policies 2010 - 2030 and paragraphs 79, 108, 110, 170 a...",Minor - Dwelling,,"Cornwall Local Plan Strategic Policies,Cornwall Local Plan",,AONB,"[direct, requirements, clearly, locate, open, countryside, home, special, identify, particular, case, clearly, erode, rural, character, location, introduce, material, harm, character, appearance, countryside, conserve, landscape, character, natural, beauty, aonb, aonb, location, sustainable, accessible, location, cause, material, environmental, harm, rural, character]","['direct', 'requirements', 'clearly', 'locate', 'open', 'countryside', 'home', 'special', 'identify', 'particular', 'case', 'clearly', 'erode', 'rural', 'character', 'location', 'introduce', 'material', 'harm', 'character', 'appearance', 'countryside', 'conserve', 'landscape', 'character', 'natural', 'beauty', 'aonb', 'aonb', 'location', 'sustainable', 'accessible', 'location', 'cause', 'material', 'environmental', 'harm', 'rural', 'character']",0.592903,,,,0.122254,,,,,,,
3,3,PA19/08883,Erection of domestic garage with associated works.,21/10/2019,"The site forms part of a cohesive, high quality, residential development, set back from the road frontage with open front amenity areas. The proposed development by reason of its siting would visually compete with the host dwelling and introduce an inappropriate and overly dominant feature within the street scene, harming the visual amenity of the immediate surroundings. As such, the proposal is contrary to Policy 12 of the Cornwall Local Plan 2010-2030, adopted November 2016, together with the provisions of the National Planning Policy Framework 2019, with particular reference to Paragraph 127.",Householder,,Cornwall Local Plan,,,"[cohesive, high, quality, residential, road, frontage, open, amenity, visually, compete, host, introduce, inappropriate, overly, dominant, feature, scene, harm, visual, amenity, immediate, surround, provision, particular, reference]","['cohesive', 'high', 'quality', 'residential', 'road', 'frontage', 'open', 'amenity', 'visually', 'compete', 'host', 'introduce', 'inappropriate', 'overly', 'dominant', 'feature', 'scene', 'harm', 'visual', 'amenity', 'immediate', 'surround', 'provision', 'particular', 'reference']",,,0.66524,,,,,,,,,
4,4,PA19/08792,Outline application with some matters reserved for construction of two dwellings to round-off the residential developments of Goonearl,29/10/2019,"The proposal for two dwellings outside any settlement and without any special justification results in an unsustainable form of development in the countryside. The accessibility of the location in accessing modes of transport other than the private car is poor which will result in reliance on private cars for the majority of trips reinforcing the unsustainable location of the development. The proposal is therefore contrary to Cornwall Local Plan 2010-2030 policies 1, 3, 7 and 27, St Agnes Neighbourhood Development Plan policies 1 and 2 and the aims at paragraphs 8 and 79 of the National Planning Policy Framework.",Minor - Dwelling,St Agnes Neighbourhood Development Plan,Cornwall Local Plan,,,"[outside, unsustainable, countryside, accessibility, location, access, modes, transport, private, poor, reliance, private, cars, majority, trip, reinforce, unsustainable, location]","['outside', 'unsustainable', 'countryside', 'accessibility', 'location', 'access', 'modes', 'transport', 'private', 'poor', 'reliance', 'private', 'cars', 'majority', 'trip', 'reinforce', 'unsustainable', 'location']",0.739189,,,,,0.156558,0.0595,,,,,


In [109]:
def export_csv():
    
    path = "/Users/GlassShark1/Python/Refusals Data/"
    filepath = Path(path + filename + ".csv") 
    print(filepath)
    #filepath.parent.mkdir(parents=True, exist_ok=True)  
    df.to_csv(filepath) 

In [110]:
export_csv()

/Users/GlassShark1/Python/Refusals Data/LDA_topics_20_passes_100(12_05_2023)new.csv


In [113]:
from pathlib import Path
path = "/Users/GlassShark1/Python/Refusals Data/"
filepath = Path(path + filename + "topics.csv") 
print(filepath)
#filepath.parent.mkdir(parents=True, exist_ok=True)  
topics_df.to_csv(filepath)

/Users/GlassShark1/Python/Refusals Data/LDA_topics_20_passes_100(12_05_2023)topics.csv


In [111]:
topics_df

Unnamed: 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Topic 8,Topic 9,Topic 10,Topic 11,Topic 12,Topic 13,Topic 14,Topic 15,Topic 16,Topic 17,Topic 18,Topic 19,Topic 20
0,provision,access,neighbour,residential,aonb,access,reference,harm,character,residential,need,conservation,environment,conservation,species,countryside,heritage,space,flood,countryside
1,secure,safe,overlook,neighbour,landscape,boundary,approve,public,design,open,affordable,harm,harm,special,protect,character,world,amenity,risk,character
2,affordable,highway,property,noise,rural,reverse,exist,substantial,fail,countryside,business,character,natural,integrity,absence,absence,landscape,future,assessment,location
3,mechanism,suitable,design,proximity,set,open,country,open,scale,location,location,list,visual,adversely,biodivery,open,mine,park,zone,unsustainable
4,absence,fail,loss,close,conserve,harm,fail,require,appearance,require,evidence,heritage,location,combination,survey,special,harm,wall,absence,facilities
5,contributions,visibility,privacy,properties,beauty,field,grant,schedule,exist,reliant,fail,appearance,delivery,influence,advice,harm,universal,occupants,fail,service
6,conservation,vehicles,balcony,occupiers,enhance,view,change,ably,surround,unsustainable,purpose,fail,fail,zone,ecological,appearance,outstanding,standard,water,rural
7,financial,road,overbear,disturbance,management,incremental,subject,provision,mass,rural,market,public,limit,project,natural,rural,historic,character,surface,open
8,need,park,rear,amenities,character,north,matter,furthermore,appear,fail,rural,historic,provision,ability,adverse,infill,character,loss,test,sustainable
9,identify,safety,amenity,exist,residential,turn,reserve,exist,cramp,design,essential,set,economic,mitigate,assess,location,value,scheme,community,private


In [None]:
""" CORE MODEL VERSION
#Train your lda model using gensim.models.LdaMulticore and save it to 'lda_model'
#id2word is a mapping from word ids (integers) to words (strings)
lda_model =  gensim.models.LdaMulticore(bow_corpus, 
                                   num_topics = num_topics, # select no of topics to try and create from samples
                                   id2word = dictionary, # above comment                                 
                                   passes = num_passes, # number of passes the model with make (> passes = more thorough????)
                                   workers = 2) # no of extra processes to use for parallelization. Uses all available cores by default
                                   """

In [None]:
#unseen_document = "The proposed development raises direct conflict with the requirements of adopted policy in the Cornwall Local Plan as the site is clearly located in the open countryside where new homes will only be permitted where there are special circumstances, none of which have been identified in this particular case. The proposed development would clearly erode the rural character of this location by introducing further built development that would result in material harm to the character and appearance of the countryside, thus not conserving the landscape character and natural beauty of the Area of Outstanding Natural Beauty (AONB) in this location. The site is not a sustainable or accessible location for a new dwelling and the development would therefore be contrary to development plan policy and cause material environmental harm to the rural character of the area, contrary to policies 1, 2, 3, 7, 12, 23 and 27 in the Cornwall Local Plan Strategic Policies 2010 - 2030 "

In [None]:
"""# Data preprocessing step for the unseen document
# bow_vector = list of tuples with number representing a word in the corpus and the count of those words???
bow_vector = dictionary.doc2bow(stopwords_and_preprocess(unseen_document))

for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    # number at the end seems to return n words and the strength of their relationship to topic????
    print("Score: {}\n Topic: {}\n".format(score, lda_model.print_topic(index, 10)))"""

In [None]:
# set up logging files to keep track of model / topics etc

In [None]:
'''Preview BOW for our sample preprocessed document - this is on the 20th record as an example'''
"""
document_num = 0 #think this is just the 20th record?

bow_doc_x = bow_corpus[document_num]

for i in range(len(bow_doc_x)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_x[i][0], 
                                                     dictionary[bow_doc_x[i][0]], 
                                                     bow_doc_x[i][1]))"""

In [None]:
"""ASK MN: DON'T UNDERSTAND THIS WELL - CAN'T SEE WHERE IT'S APPLIED??

alpha and eta are hyperparameters that affect sparsity of the document-topic (theta) and topic-word (lambda) distributions. We will let these be the default values for now(default value is 1/num_topics)

Alpha is the per document topic distribution.

High alpha: Every document has a mixture of all topics(documents appear similar to each other).
Low alpha: Every document has a mixture of very few topics
Eta is the per topic word distribution.

High eta: Each topic has a mixture of most words(topics appear similar to each other).
Low eta: Each topic has a mixture of few words."""

In [None]:
'''Checking dictionary created'''
"""count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 100:
        break"""