In [1]:
from nltk.corpus import stopwords # stopwords (most common words that don't mean much)
import spacy
import pandas as pd
import re
import string
from spacy.matcher import PhraseMatcher


# Initialize spaCy
nlp = spacy.load('en_core_web_sm')

# load the dataframe from an excel file. For now keep an unedited copy and one to manipulate
og_df = pd.read_csv("/Users/GlassShark1/Python/Refusals Data/All Refusals 2019-2022 w devtypes.csv")
df = og_df.copy()

In [2]:
def jupyter_display_max(max_rows):
    # set display so you can see all columns, all rows and all cell contents (up to 1k characters)
    pd.options.display.max_columns = None
    pd.set_option('display.max_rows', max_rows)
    pd.options.display.max_colwidth = 1000
    
# set JN to display full extent of data
jupyter_display_max(df.shape[0])

In [3]:
def cleaning_text(text):
    
    # replaces instances of more than 1 space with a single space
    text = re.sub(' +', ' ', text)
    # strips any unwanted whitespace from either end
    text = text.rstrip().lstrip()
    
    return text

In [4]:
def saints_rule(text):
    # removes full stop in St. before extracting parish names so it is not read as a full stop
    if 'St. ' in text:
        text.replace('St. ', 'St ')
    #specific case where the word 'form' skews results, but we don't want to lose 'information'
    if ' form' in text:
        text.replace( ' form', ' ')
    return text

In [5]:
# simple func to return length. Used to apply to a list of values
def leng_func(x):
    return len(x)

In [6]:
# function that takes a set of values to be replaced with a single value
def replace_text(text, string_set, returnstr):
    for string in string_set:
        text = text.replace(string, returnstr)
    return text

In [7]:
def drop_col_by_labels(df,column_names):
    for col in column_names:
        if col in df.columns:
            df = df.drop(col, axis=1)
    return df

In [8]:
def Perm_regex(text):
    
    # Identify permission references
    matchtype = r'(?i)(?:PA)?\d{2}[/|_]\d{5}' # e.g. PA12/12345
    matches = re.findall(matchtype, text)
    
    # sort the list by highest number of characters first. This means CLPSP will be replaced before CLP, 
    # so you aren't left with instances of SP on their own
    matches.sort(reverse = True,key=leng_func)
    
    # Don't return anything if it's an empty list / no matches
    if matches ==[]:
        matches = ""
    
    # remove dups by set and turn into a single comma seperated string
    matches = set(matches)
    matches = ",".join(matches)

    return matches

In [9]:
def CLP_regex(text):
    matchtype = r'(Cornwall Local Plan Strategic Policies|Cornwall Local Plan|CLP) (?:\d{4}\s*-\s*\d{4}|\d{4}?)?'
    #matchtype = r'(Cornwall Local Plan Strategic Policies|Cornwall Local Plan|CLP)\s*((\(\d{4}(?:\s*-\s*\d{4})?\))|\d{4}\s*-\s*\d{4}|\d{4})?'
    matches = re.findall(matchtype, text)
    
    # sort the list by highest number of characters first. This means CLPSP will be replaced before CLP, 
    # so you aren't left with instances of SP on their own
    matches.sort(reverse = True,key=leng_func)
    
    # Don't return anything if it's an empty list / no matches
    if matches ==[]:
        matches = ""
        
    # remove dups by set and turn into a single comma seperated string
    matches = set(matches)
    matches = ",".join(matches)
        
    return matches

In [10]:
def AONB_regex(text):
    matchtype = r'(?i)\barea\s*of\s*outstanding\s*natural\s*beauty\b|\bAONB\b|\(AONB\)'
    matches = re.findall(matchtype, text)
    
    # sort the list by highest number of characters first. This means CLPSP will be replaced before CLP, 
    # so you aren't left with instances of SP on their own
    matches.sort(reverse = True,key=leng_func)
    
    # Don't return anything if it's an empty list / no matches
    if matches ==[]:
        matches = ""
        
    # remove dups by set and turn into a single comma seperated string
    matches = set(matches)
    matches = ",".join(matches)
        
    return matches

In [11]:
def NPD_regex(text):
    matchtype = r'\b(?:[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\s+(?:N(?:eighbourhood)?\s*(?:D(?:evelopment)?\s*)?P(?:lan)?|NDP)\b(?:\s+\([^\)]+\))?(?:\s+\d{4}(?:\s*(?:to|-)\s*\d{4})?)?'    
    matches = re.findall(matchtype, text)
    
    # sort the list by highest number of characters first. This means CLPSP will be replaced before CLP, 
    # so you aren't left with instances of SP on their own
    matches.sort(reverse = True,key=leng_func)
    
    # Don't return anything if it's an empty list / no matches
    if matches ==[]:
        matches = ""
        
    # remove dups by set and turn into a single comma seperated string
    matches = set(matches)
    matches = ",".join(matches)
        
    return matches

In [12]:
# Define a function to match categories in the text and create category columns
def match_categories(text):
    
    # initializes a PhraseMatcher object named matcher using the vocabulary of the nlp model
    matcher = PhraseMatcher(nlp.vocab)
    # empty dictionary to store the matched category sentences
    category_sentences = {}

    # for each category and it's examples - this bit is about getting the matcher to understand your patterns
    # and store them efficiently. It is an instance of the PhraseMatcher class
    for category, examples in example_snippets.items():
        # converts into spaCy Doc objects
        category_patterns = [nlp(example) for example in examples]
        # adds them to the matcher object
        matcher.add(category, None, *category_patterns)
        # initializes empty list for each category in the category_sentences dictionary
        category_sentences[category] = []

    # processes the input text using the nlp model, creating a Doc object named doc
    doc = nlp(text)
    # matches the patterns in the matcher object against the doc, obtains matches in the matches variable
    matches = matcher(doc)

    # iterates over each match and retrieves the matched category and sentence
    for match_id, start, end in matches:
        matched_category = matcher.vocab.strings[match_id]
        matched_sentence = doc[start:end].text
        #  appends matched sentence to the corresponding category in the category_sentences dictionary
        category_sentences[matched_category].append(matched_sentence)
    
    return category_sentences

In [14]:
# remove spaces (spell check TBD)
df['RefusalReasons'] = df['RefusalReasons'].apply(cleaning_text)

In [15]:
# Tidying up refusalreasons text and extracting key info in columns


# get rid of confusing 'St.' strings
df["RefusalReasons"] = df["RefusalReasons"].apply(saints_rule)
# create a column containing a string of references to NPDs
df['NDPs Referenced'] = df["RefusalReasons"].apply(NPD_regex)
# create a column containing a string of references to CLP
df['CLPs Referenced'] = df["RefusalReasons"].apply(CLP_regex)
# create a column containing a string of permission references
df['Perms Referenced'] = df["RefusalReasons"].apply(Perm_regex)
# create a column containing a string of AONB references
df['AONB Referenced'] = df["RefusalReasons"].apply(AONB_regex)
# Tidy the various AONB references to return only 'AONB'
# replace null values with empty string
df['AONB Referenced'].fillna('', inplace=True)
# update non-null values with 'AONB'
df.loc[df['AONB Referenced'] != '', 'AONB Referenced'] = 'AONB'
# replace all iterations of AONB with 'AONB' in the main data
#df['RefusalReasons'] = df['RefusalReasons'].apply(replace_text, args=(all_AONB_list, 'AONB'))
# drop unused columns
df = drop_col_by_labels(df,['Unnamed: 0','Missing Dev_Type'])

In [None]:
'''First cut the source data down by exluding categories that don't share common refusal reasons - e.g.
very specific types of development

# maybe exclude:
# CLUED - Certificate of Lawful Development - these generally refer to whether the use of something
# is lawful or not, so typically refusal reasons differ here
# TPO/TPA - Tree Protection  - generally refusals are for public amenity reasons
#'Listed Building Consent (alter/extend)' - borderline one because it's to do with alterations/extensions

exclude_cats = ['CLEUD/CLOPED','TPO applications','Not Required - Hedgerow Removal','TCA applications']
# Prior Approval - AF2 type == agricultural stuff?
borderline_cats = ['Listed Building Consent (alter/extend)','Prior Approval - AF2 type','Notification - Rail type']

# filter categories to exclude from the df. '~' negates/gives you the opposite of something
df = df[~df['Dev_Type'].isin(exclude_cats)]

# as you've removed entries from the index, they will be out of order, reset to fix
# also required when applying boolean masks later, or when merging on results by index later
df = df.reset_index()

# intital text stripping, including phrases as opposed words and common sentences
# plus stripping of perm references etc, plus stripping of lemmatized versions of text_to_remove
df['cleaned'] = df['RefusalReasons'].apply(strip_specific_text)

df['cleaned'] = df['cleaned'].apply(strip_numbers)

df['cleaned'] = df['cleaned'].apply(stopwords_and_preprocess)

# 'cleaned' is an object datatype, cast as str to allow you to search for substrings for refinement
df['checking col'] = df['cleaned'].astype(str)


In [16]:
# Dictionary, each category is the key, each key has a list of snippets
example_snippets = {
    'Flooding and Drainage': [
        'flood risk',
        'flood zones',
        'flood risk assessment',
        'not been demonstrated that the site is safe from flooding',
        'policy 26',
        'other available sites at a lower risk of flooding',
        'significant risk of flooding',
        'not acceptable in terms of the risk of flooding',
        'critical drainage area.',
        'functional floodplain',
        'would not be appropriately flood resilient',
        'would not be safe access or egress in a 1 in 100-year flood event',
        'drainage strategy',
        'surface water drainage disposal',
        'surface water disposal'
    ],
    'Neighbours and amenities': [
        'overlooking',
        'overbearing',
        'intrusive',
        'elevated postion',
        'impact upon occupiers neighbouring',
        'losses of privacy',
        'privacy loss',
        'scale, form, massing and proximity to the neighbouring',
        'loss of light',
        'neighbouring residents',
        'amenity enjoyed',
        'paragraphs 127 and 130 of the national planning policy framework',
        'noise disturbance',
        'public amenity',
        'loss of an area of public open space',
        'loss of amenity',
        'noise and disturbance',
        'residential amenity',
        'neighbouring dwellings',
        'no arboricultural evidence',
        'amenity value'
        
    ],
    'Access, Road Safety': [
        'vehicular traffic',
        'lack of accessibility to services',
        'limited visibility',
        'increased risk of collision',
        'highway safety',
        'safe and suitable access',
        'vehicle and pedestrian conflict',
        'poor visibility',
        'accessing and egressing',
        'safe or suitable passing',
        'manoeuvring space',
        'passing space',
        'policy 27',
        'congestion',
        'safe and suitable access to the site for all users',
        'impact on the local road network',
        'no appropriate access',
        'reliant on private motor vehicles'
    ],
    'History and Heritage': [
        'historic mining',
        'eroding its historic character',
        'whs',
        'world heritage site',
        'historic functional relationship',
        'heritage asset',
        'area of great landscape value',
        'historic settlement',
        'record of the morphology',
        'historic impact assessment',
        'authenticity and integrity',
        'designated  assets',
        'listed building',
        'historic interest'
    ],
    'Open Countryside': [
        'open countryside',
        'open-countryside',
        'countryside location',
        'in the countryside',
        'isolated rural dwelling',
        'increase the urban built form',
        'divorced from any settlement',
        'introduction of built form',
        'outside of the defined settlement boundaries',
        'sporadic',
        'unmistakably rural',
        'unsustainable form of development',
        'intrinsic character',
        'unsustainable and undesirable',
        'reliance on private vehicle',
        'not sustainable development',
        'unsustainable traffic movements'
    ],
    'AH': [
        'affordable housing need',
        'need for affordable housing ',
        'not affordable housing led',
        'no affordable housing provision'
    ],
    'Conservation and Biodiversity': [
        'biodiversty',
        'bio-diversity',
        'environmental harm',
        'special area of conservation ',
        ' sac ',
        'bats',
        'reptiles',
        'birds',
        'insects',
        'habitat',
        'species',
        'site of special scientific interest',
        'sssi',
        'protect the natural environment',
        'foraging',
        'wildlife',
        'derogation',
        'ecological'
        'woodland',
        'marsh'
    ],
    'Design': [
        'fail to respect its form',
        'by virtue of its design',
        'traditional pattern of development',
        'by reason of its form',
        'established pattern',
        'existing built form',
        'incongruous',
        'discordant',
        'uncharacteristic',
        'scale and massing',
        'disproportionate',
        'detrimental to the street scene',
        'design guide',
        'poor design',
        'fail to reflect local distinctiveness',
        'fail to integrate',
        'unsympathetic material',
        'cramped',
        'contrived',
        'established character of the area',
        'local character',
        'functional appearance',
        'bulk',
        'character of the building',
        'dominate the street scene'
        
    ],
    'Landscape, Appearance, Character': [
        'area of outstanding natural beauty',
        'heritage coast',
        'scenic',
        'beauty',
        'harm the rural characteristics',
        'visable from the public',
        'landscape character',
        'character and appearance',
        'widely visable',
        'area of great landscape value',
        'aglv',
        'dominant feature',
        'landscape harm',
        'harmful to the character',
        'harmful visual impact',
        'appearance and character',
        'character of the area'
        
    ],
    'Occupancy evidence': [
        'not been used an independant residential',
        'demonstrating independent occupation',
        'not been used for residential purposes',
        'in excess of 10 years',
        'in excess of ten years',
        'in excess of 10yrs',
        'breach of the occupancy condition',
        'been used for domestic purposes' 
    ]
}

In [17]:
# create simple list of all the categories above for headers and to create dfs later
all_cats_list = []
for k,v in example_snippets.items():
    all_cats_list.append(k)

In [18]:
for k,v in example_snippets.items():
    print(k),
    print(v)
    print('\n')

Flooding and Drainage
['flood risk', 'flood zones', 'flood risk assessment', 'not been demonstrated that the site is safe from flooding', 'policy 26', 'other available sites at a lower risk of flooding', 'significant risk of flooding', 'not acceptable in terms of the risk of flooding', 'critical drainage area.', 'functional floodplain', 'would not be appropriately flood resilient', 'would not be safe access or egress in a 1 in 100-year flood event', 'drainage strategy', 'surface water drainage disposal', 'surface water disposal']


Neighbours and amenities
['overlooking', 'overbearing', 'intrusive', 'elevated postion', 'impact upon occupiers neighbouring', 'losses of privacy', 'privacy loss', 'scale, form, massing and proximity to the neighbouring', 'loss of light', 'neighbouring residents', 'amenity enjoyed', 'paragraphs 127 and 130 of the national planning policy framework', 'noise disturbance', 'public amenity', 'loss of an area of public open space', 'loss of amenity', 'noise and d

In [19]:
df['RefusalReasons'] = df['RefusalReasons'].apply(lambda x:x.lower())

In [20]:
# Apply the function to the 'RefusalReasons' column
category_sentences = df['RefusalReasons'].apply(match_categories).tolist()

# create a df from the dictionary category_sentences, that has columns for each cat and rows for each record
category_df = pd.DataFrame(category_sentences)

In [21]:
# Merge the category columns with the original DataFrame
df = pd.concat([df, category_df], axis=1)

In [22]:
# for each category, make the column values string (not object) and remove empty lists
for col in all_cats_list:
    df[col] = df[col].astype(str)
df = df.replace('[]','')

In [23]:
# Count the number of empty strings in each row within the specified columns. strip in case there is whitespace
df['blank_count'] = df[all_cats_list].apply(lambda row: row.str.strip().eq('').sum(), axis=1)

In [24]:
# create a dictionary to hold a df per category, filtering out any with no matches for that category
cat_dict = {}
for col in all_cats_list:
    cat_dict[col] = df.copy()
    cat_dict[col] = cat_dict[col][cat_dict[col][col] != ""]
    print(col)

Flooding and Drainage
Neighbours and amenities
Access, Road Safety
History and Heritage
Open Countryside
AH
Conservation and Biodiversity
Design
Landscape, Appearance, Character
Occupancy evidence


In [25]:
def strip_specific_text(text):
    
    text_to_remove = [
        "Cornwall Local Plan Strategic Policies 2010 - 2030",
        "Cornwall Local Plan Strategic Policies 2010-2030",
        "Cornwall Local Plan Strategic Policies",
        "CLP",
        "Cornwall Local Plan",
        "policies",
        "National Planning Policy Framework",
        "NPPF",
        "paragraph",
        "paragraphs"
    ]
    
    # for each thing you want to remove from the text
    for phrase in text_to_remove:
        # if the lower case version is in the lower case version of the text, replace it with nothing (delete)
        if phrase.lower() in text.lower():
            #print(phrase, " in text")
            text = text.replace(phrase.lower(), "")
    
    return text

In [26]:
def binary_category_values(df):
    
    # simplify the data, showing only 1/0 output if a record is tagged with a category
    # create these as new columns with suffix '_ML'
    for category in all_cats_list:
        #replace any nulls with zero
        df[category +'_ML'] = df[category].fillna(0)
        df[category +'_ML'] = df[category +'_ML'].replace('',0)
        df[category +'_ML'] = df[category +'_ML'].apply(lambda x: 1 if x != 0 else 0)
    
    return df

In [27]:
def lemma(text):
    
    # Process the text
    doc = nlp(text)
    
    # Lemmatize each token and join them back into a string
    text = ' '.join([token.lemma_ for token in doc])
    
    return text

In [28]:
def text_process(text):

    # first make message lower case to make other operations simpler
    text = text.lower()
    
    # remove unwanted/not useful text/phrases from the string
    text = strip_specific_text(text)
    
    # remove any number references
    # create a list of characters if the character is a letter or a space
    text = [char for char in text if char.isalpha() or char == " "]
    # join the characters again with 'nothing' - as spaces are included above
    text = "".join(text)
    
    # Check characters to see if they are in punctuation
    text = [char for char in text if char not in string.punctuation]
    # Join the characters again to form the string without punctuation
    text = ''.join(text)
    
    # convert the msg without punctuation to a string of lemmas
    text = lemma(text)
    
    # Now just remove any stopwords, return as a list to be vectorized
    return [word for word in text.split() if word.lower() not in stopwords.words('english')]

In [29]:
df = binary_category_values(df)

In [30]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.multioutput import ClassifierChain
from sklearn.svm import SVC
from sklearn.metrics import classification_report

In [31]:
allcatslist = []
for col in df.columns:
    if col[-2:] == 'ML':
        allcatslist.append(col)
    
print(allcatslist)

['Flooding and Drainage_ML', 'Neighbours and amenities_ML', 'Access, Road Safety_ML', 'History and Heritage_ML', 'Open Countryside_ML', 'AH_ML', 'Conservation and Biodiversity_ML', 'Design_ML', 'Landscape, Appearance, Character_ML', 'Occupancy evidence_ML']


In [33]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multioutput import ClassifierChain

classification_dict = {}

# Split the data into training and testing sets
msg_train, msg_test, label_train, label_test = train_test_split(df['RefusalReasons'], df[allcatslist], test_size=0.2)

# Pipeline takes a list of tuples (name of the step, what it's doing)
pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=text_process)),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('clf', ClassifierChain(SVC(probability=True))),  # Apply Binary Relevance with SVM classifier
])

# Fit the data to the model
pipeline.fit(msg_train, label_train)

# Predict probabilities for the test data
probs = pipeline.predict_proba(msg_test)

# Adjust the threshold to control classification outcome
threshold = 0.5  # Adjust the threshold as desired
preds = (probs >= threshold).astype(int)

# Compute precision, recall, and other metrics
report = classification_report(label_test, preds, target_names=allcatslist, output_dict=True)

# Make a dataframe from the dictionary
df_report = pd.DataFrame(report)

# Add the report into a dictionary of reports
#classification_dict[category] = df_report


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [34]:
df_report

Unnamed: 0,Flooding and Drainage_ML,Neighbours and amenities_ML,"Access, Road Safety_ML",History and Heritage_ML,Open Countryside_ML,AH_ML,Conservation and Biodiversity_ML,Design_ML,"Landscape, Appearance, Character_ML",Occupancy evidence_ML,micro avg,macro avg,weighted avg,samples avg
precision,1.0,0.84,0.95,0.814286,0.874126,0.666667,0.836735,0.814815,0.878307,0.8,0.862016,0.847493,0.860865,0.738956
recall,0.9375,0.724138,0.974359,0.890625,0.925926,0.5,0.836735,0.66,0.798077,0.666667,0.818851,0.791403,0.818851,0.713494
f1-score,0.967742,0.777778,0.962025,0.850746,0.899281,0.571429,0.836735,0.729282,0.836272,0.727273,0.839879,0.815856,0.837241,0.708558
support,16.0,58.0,39.0,64.0,135.0,4.0,49.0,100.0,208.0,6.0,679.0,679.0,679.0,679.0


In [None]:
"""
precision == abilty to detect true positives (ratio of true postives to true postitives + false positives)
recall == also ability to detect true positives, but is ratio of true positives to true positives + false positives)
F1 Score: Harmonic mean of precision and recall. 1 == best possible score
support == number of instances of the category found in the test dataset, according to the model alone.
support does not have anything to do with the original labels.
"""

In [57]:
def export_excel(df, filename, sheet_name=None):

    from pathlib import Path
    path = "/Users/GlassShark1/Python/Refusals Data/"
    filepath = Path(path + filename + ".xlsx")
    print(filepath)

    if sheet_name is None:
        df.to_excel(filepath, index=False)
    else:
        df.to_excel(filepath, index=False, sheet_name=sheet_name)



In [58]:
export_excel(df, "Verbatim_Ref_Reasons_280523", sheet_name="Data")
export_excel(df_report,"VRR_class_rpt_280523", sheet_name="class_rpt")

/Users/GlassShark1/Python/Refusals Data/Verbatim_Ref_Reasons_280523.xlsx
/Users/GlassShark1/Python/Refusals Data/VRR_class_rpt_280523.xlsx


In [None]:
"""


LEMMA VERSION STARTS HERE


"""

In [None]:
import spacy
import pandas as pd
from spacy.matcher import PhraseMatcher

# Initialize spaCy
nlp = spacy.load('en_core_web_sm')

# Load the dataframe from an Excel file. For now, keep an unedited copy and one to manipulate
og_df = pd.read_csv("/Users/GlassShark1/Python/Refusals Data/All Refusals 2019-2022 w devtypes.csv")
df = og_df.copy()

example_snippets = {
    'Flooding and Drainage': [
        [{'LOWER': 'flood', 'LEMMA': 'flood'}, {'IS_ALPHA': True, 'OP': '*'}, {'LOWER': 'risk', 'LEMMA': 'risk'}],
        [{'LOWER': 'flood', 'LEMMA': 'flood'}, {'LOWER': 'zone', 'LEMMA': 'zone'}],
        [{'LOWER': 'demonstrate', 'LEMMA': 'demonstrate'}, {'LOWER': 'safe', 'LEMMA': 'safe'}, {'LOWER': 'flood', 'LEMMA': 'flood'}],
        [{'LOWER': 'policy', 'LEMMA': 'policy'}, {'LOWER': '26'}],
        [{'LOWER': 'available', 'LEMMA': 'available'}, {'LOWER': 'site', 'LEMMA': 'site'}, {'LOWER': 'low', 'LEMMA': 'low'}, {'LOWER': 'risk', 'LEMMA': 'risk'}, {'LOWER': 'flood', 'LEMMA': 'flood'}],
        [{'LOWER': 'significant', 'LEMMA': 'significant'}, {'LOWER': 'risk', 'LEMMA': 'risk'}, {'LOWER': 'flood', 'LEMMA': 'flood'}],
        [{'LOWER': 'not'}, {'LOWER': 'acceptable', 'LEMMA': 'acceptable'}, {'LOWER': 'term', 'LEMMA': 'term'}, {'LOWER': 'risk', 'LEMMA': 'risk'}, {'LOWER': 'flood', 'LEMMA': 'flood'}],
        [{'LOWER': 'critical', 'LEMMA': 'critical'}, {'LOWER': 'drainage', 'LEMMA': 'drainage'}, {'LOWER': 'area', 'LEMMA': 'area'}],
        [{'LOWER': 'functional', 'LEMMA': 'functional'}, {'LOWER': 'floodplain', 'LEMMA': 'floodplain'}],
        [{'LOWER': 'would'}, {'LOWER': 'not'}, {'LOWER': 'appropriately', 'LEMMA': 'appropriately'}, {'LOWER': 'flood', 'LEMMA': 'flood'}, {'LOWER': 'resilient', 'LEMMA': 'resilient'}],
        [{'LOWER': 'would'}, {'LOWER': 'not'}, {'LOWER': 'safe', 'LEMMA': 'safe'}, {'LOWER': 'access', 'LEMMA': 'access'}, {'LOWER': 'egress', 'LEMMA': 'egress'}, {'LOWER': '1'}, {'LOWER': '100'}, {'LOWER': 'year'}, {'LOWER': 'flood', 'LEMMA': 'flood'}, {'LOWER': 'event', 'LEMMA': 'event'}],
        [{'LOWER': 'drainage', 'LEMMA': 'drainage'}, {'LOWER': 'strategy', 'LEMMA': 'strategy'}],
        [{'LOWER': 'surface', 'LEMMA': 'surface'}, {'LOWER': 'water', 'LEMMA': 'water'}, {'LOWER': 'drainage', 'LEMMA': 'drainage'}, {'LOWER': 'disposal', 'LEMMA': 'disposal'}],
        [{'LOWER': 'surface', 'LEMMA': 'surface'}, {'LOWER': 'water', 'LEMMA': 'water'}, {'LOWER': 'disposal', 'LEMMA': 'disposal'}]
    ],
    'Neighbours and amenities': [
        [{'LOWER': 'overlooking', 'LEMMA': 'overlook'}],
        [{'LOWER': 'overbearing', 'LEMMA': 'overbear'}],
        [{'LOWER': 'intrusive', 'LEMMA': 'intrusive'}],
        [{'LOWER': 'elevated', 'LEMMA': 'elevate'}, {'LOWER': 'position', 'LEMMA': 'position'}],
        [{'LOWER': 'impact', 'LEMMA': 'impact'}, {'LOWER': 'occupier', 'LEMMA': 'occupier'}, {'LOWER': 'neighbouring', 'LEMMA': 'neighbour'}],
        [{'LOWER': 'loss', 'LEMMA': 'loss'}, {'LOWER': 'privacy', 'LEMMA': 'privacy'}],
        [{'LOWER': 'privacy', 'LEMMA': 'privacy'}, {'LOWER': 'loss', 'LEMMA': 'loss'}],
        [{'LOWER': 'scale', 'LEMMA': 'scale'}, {'LOWER': 'form', 'LEMMA': 'form'}, {'LOWER': 'massing', 'LEMMA': 'mass'}, {'LOWER': 'proximity', 'LEMMA': 'proximity'}, {'LOWER': 'neighbouring', 'LEMMA': 'neighbour'}],
        [{'LOWER': 'loss', 'LEMMA': 'loss'}, {'LOWER': 'light', 'LEMMA': 'light'}],
        [{'LOWER': 'neighbouring', 'LEMMA': 'neighbour'}, {'LOWER': 'resident', 'LEMMA': 'resident'}],
        [{'LOWER': 'amenity', 'LEMMA': 'amenity'}, {'LOWER': 'enjoy', 'LEMMA': 'enjoy'}],
        [{'LOWER': 'paragraph', 'LEMMA': 'paragraph'}, {'LOWER': '127'}, {'LOWER': '130'}, {'LOWER': 'national', 'LEMMA': 'national'}, {'LOWER': 'planning', 'LEMMA': 'plan'}, {'LOWER': 'policy', 'LEMMA': 'policy'}, {'LOWER': 'framework', 'LEMMA': 'framework'}],
        [{'LOWER': 'noise', 'LEMMA': 'noise'}, {'LOWER': 'disturbance', 'LEMMA': 'disturbance'}]
    ],
    'No appropriate access': [
        [{'LOWER': 'vehicular', 'LEMMA': 'vehicular'}, {'LOWER': 'traffic', 'LEMMA': 'traffic'}],
        [{'LOWER': 'accessibility', 'LEMMA': 'accessibility'}, {'LOWER': 'service', 'LEMMA': 'service'}],
        [{'LOWER': 'limited', 'LEMMA': 'limited'}, {'LOWER': 'visibility', 'LEMMA': 'visibility'}],
        [{'LOWER': 'increased', 'LEMMA': 'increase'}, {'LOWER': 'risk', 'LEMMA': 'risk'}, {'LOWER': 'collision', 'LEMMA': 'collision'}],
        [{'LOWER': 'highway', 'LEMMA': 'highway'}, {'LOWER': 'safety', 'LEMMA': 'safety'}],
        [{'LOWER': 'safe', 'LEMMA': 'safe'}, {'LOWER': 'suitable', 'LEMMA': 'suitable'}, {'LOWER': 'access', 'LEMMA': 'access'}],
        [{'LOWER': 'vehicle', 'LEMMA': 'vehicle'}, {'LOWER': 'pedestrian', 'LEMMA': 'pedestrian'}, {'LOWER': 'conflict', 'LEMMA': 'conflict'}],
        [{'LOWER': 'poor', 'LEMMA': 'poor'}, {'LOWER': 'visibility', 'LEMMA': 'visibility'}],
        [{'LOWER': 'accessing', 'LEMMA': 'access'}, {'LOWER': 'egressing', 'LEMMA': 'egress'}],
        [{'LOWER': 'safe', 'LEMMA': 'safe'}, {'LOWER': 'suitable', 'LEMMA': 'suitable'}, {'LOWER': 'pass', 'LEMMA': 'pass'}],
        [{'LOWER': 'manoeuvring', 'LEMMA': 'manoeuvre'}, {'LOWER': 'space', 'LEMMA': 'space'}],
        [{'LOWER': 'passing', 'LEMMA': 'pass'}, {'LOWER': 'space', 'LEMMA': 'space'}],
        [{'LOWER': 'policy', 'LEMMA': 'policy'}, {'LOWER': '27'}],
        [{'LOWER': 'congestion', 'LEMMA': 'congestion'}],
        [{'LOWER': 'safe', 'LEMMA': 'safe'}, {'LOWER': 'suitable', 'LEMMA': 'suitable'}, {'LOWER': 'access', 'LEMMA': 'access'}, {'LOWER': 'site', 'LEMMA': 'site'}, {'LOWER': 'user', 'LEMMA': 'user'}],
        [{'LOWER': 'impact', 'LEMMA': 'impact'}, {'LOWER': 'local', 'LEMMA': 'local'}, {'LOWER': 'road', 'LEMMA': 'road'}, {'LOWER': 'network', 'LEMMA': 'network'}]
    ],
    'History and Heritage': [
        [{'LOWER': 'historic', 'LEMMA': 'historic'}, {'LOWER': 'mining', 'LEMMA': 'mining'}],
        [{'LOWER': 'erode', 'LEMMA': 'erode'}, {'LOWER': 'historic', 'LEMMA': 'historic'}, {'LOWER': 'character', 'LEMMA': 'character'}],
        [{'LOWER': 'WHS', 'LEMMA': 'WHS'}, {'LOWER': 'world', 'LEMMA': 'world'}, {'LOWER': 'heritage', 'LEMMA': 'heritage'}, {'LOWER': 'site', 'LEMMA': 'site'}],
        [{'LOWER': 'historic', 'LEMMA': 'historic'}, {'LOWER': 'functional', 'LEMMA': 'functional'}, {'LOWER': 'relationship', 'LEMMA': 'relationship'}],
        [{'LOWER': 'heritage', 'LEMMA': 'heritage'}, {'LOWER': 'asset', 'LEMMA': 'asset'}],
        [{'LOWER': 'area', 'LEMMA': 'area'}, {'LOWER': 'great', 'LEMMA': 'great'}, {'LOWER': 'landscape', 'LEMMA': 'landscape'}, {'LOWER': 'value', 'LEMMA': 'value'}],
        [{'LOWER': 'historic', 'LEMMA': 'historic'}, {'LOWER': 'settlement', 'LEMMA': 'settlement'}],
        [{'LOWER': 'record', 'LEMMA': 'record'}, {'LOWER': 'morphology', 'LEMMA': 'morphology'}],
        [{'LOWER': 'historic', 'LEMMA': 'historic'}, {'LOWER': 'impact', 'LEMMA': 'impact'}, {'LOWER': 'assessment', 'LEMMA': 'assessment'}],
        [{'LOWER': 'authenticity', 'LEMMA': 'authenticity'}, {'LOWER': 'integrity', 'LEMMA': 'integrity'}],
        [{'LOWER': 'designate', 'LEMMA': 'designate'}, {'LOWER': 'asset', 'LEMMA': 'asset'}],
        [{'LOWER': 'list', 'LEMMA': 'list'}, {'LOWER': 'building', 'LEMMA': 'building'}]
    ],
    'Open Countryside': [
        [{'LOWER': 'open', 'LEMMA': 'open'}, {'LOWER': 'countryside', 'LEMMA': 'countryside'}],
        [{'LOWER': 'increase', 'LEMMA': 'increase'}, {'LOWER': 'encroachment', 'LEMMA': 'encroachment'}],
        [{'LOWER': 'scenic', 'LEMMA': 'scenic'}, {'LOWER': 'beauty', 'LEMMA': 'beauty'}, {'LOWER': 'area', 'LEMMA': 'area'}],
        [{'LOWER': 'green', 'LEMMA': 'green'}, {'LOWER': 'wedge', 'LEMMA': 'wedge'}],
        [{'LOWER': 'open', 'LEMMA': 'open'}, {'LOWER': 'area', 'LEMMA': 'area'}, {'LOWER': 'restrain', 'LEMMA': 'restrain'}],
        [{'LOWER': 'natural', 'LEMMA': 'natural'}, {'LOWER': 'landscape', 'LEMMA': 'landscape'}, {'LOWER': 'feature', 'LEMMA': 'feature'}],
        [{'LOWER': 'visual', 'LEMMA': 'visual'}, {'LOWER': 'amenity', 'LEMMA': 'amenity'}],
        [{'LOWER': 'promote', 'LEMMA': 'promote'}, {'LOWER': 'recreation', 'LEMMA': 'recreation'}, {'LOWER': 'amenity', 'LEMMA': 'amenity'}],
        [{'LOWER': 'open', 'LEMMA': 'open'}, {'LOWER': 'space', 'LEMMA': 'space'}],
        [{'LOWER': 'policy', 'LEMMA': 'policy'}, {'LOWER': '35'}],
        [{'LOWER': 'valued', 'LEMMA': 'value'}, {'LOWER': 'landscape', 'LEMMA': 'landscape'}, {'LOWER': 'feature', 'LEMMA': 'feature'}],
        [{'LOWER': 'impact', 'LEMMA': 'impact'}, {'LOWER': 'character', 'LEMMA': 'character'}, {'LOWER': 'area', 'LEMMA': 'area'}]
    ],
        'AH': [
        [{'LOWER': 'affordable', 'LEMMA': 'affordable'}, {'LOWER': 'housing', 'LEMMA': 'housing'}, {'LOWER': 'need', 'LEMMA': 'need'}],
        [{'LOWER': 'need', 'LEMMA': 'need'}, {'LOWER': 'for', 'LEMMA': 'for'}, {'LOWER': 'affordable', 'LEMMA': 'affordable'}, {'LOWER': 'housing', 'LEMMA': 'housing'}],
        [{'LOWER': 'no', 'LEMMA': 'no'}, {'LOWER': 'affordable', 'LEMMA': 'affordable'}, {'LOWER': 'housing', 'LEMMA': 'housing'}, {'LOWER': 'provision', 'LEMMA': 'provision'}],
        [{'LOWER': 'not', 'LEMMA': 'not'}, {'LOWER': 'affordable', 'LEMMA': 'affordable'}, {'LOWER': 'housing', 'LEMMA': 'housing'}, {'LOWER': 'led', 'LEMMA': 'led'}]
    ],
    'Conservation and Biodiversity': [
        [{'LOWER': 'biodiversty', 'LEMMA': 'biodiversity'}],
        [{'LOWER': 'special', 'LEMMA': 'special'}, {'LOWER': 'area', 'LEMMA': 'area'}, {'LOWER': 'of', 'LEMMA': 'of'}, {'LOWER': 'conservation', 'LEMMA': 'conservation'}],
        [{'LOWER': 'SAC', 'LEMMA': 'SAC'}, {'LOWER': 'site', 'LEMMA': 'site'}],
        [{'LOWER': 'bat', 'LEMMA': 'bat'}, {'LOWER': 'species', 'LEMMA': 'species'}],
        [{'LOWER': 'reptile', 'LEMMA': 'reptile'}, {'LOWER': 'species', 'LEMMA': 'species'}],
        [{'LOWER': 'bird', 'LEMMA': 'bird'}, {'LOWER': 'species', 'LEMMA': 'species'}],
        [{'LOWER': 'insect', 'LEMMA': 'insect'}, {'LOWER': 'species', 'LEMMA': 'species'}],
        [{'LOWER': 'habitat', 'LEMMA': 'habitat'}],
        [{'LOWER': 'species', 'LEMMA': 'species'}],
        [{'LOWER': 'site', 'LEMMA': 'site'}, {'LOWER': 'of', 'LEMMA': 'of'}, {'LOWER': 'special', 'LEMMA': 'special'}, {'LOWER': 'scientific', 'LEMMA': 'scientific'}, {'LOWER': 'interest', 'LEMMA': 'interest'}],
        [{'LOWER': 'SSSI', 'LEMMA': 'SSSI'}, {'LOWER': 'site', 'LEMMA': 'site'}],
        [{'LOWER': 'protect', 'LEMMA': 'protect'}, {'LOWER': 'the', 'LEMMA': 'the'}, {'LOWER': 'natural', 'LEMMA': 'natural'}, {'LOWER': 'environment', 'LEMMA': 'environment'}],
        [{'LOWER': 'ecological', 'LEMMA': 'ecological'}, {'LOWER': 'woodland', 'LEMMA': 'woodland'}],
        [{'LOWER': 'foraging', 'LEMMA': 'foraging'}],
        [{'LOWER': 'marsh', 'LEMMA': 'marsh'}],
        [{'LOWER': 'wildlife', 'LEMMA': 'wildlife'}],
        [{'LOWER': 'derogation', 'LEMMA': 'derogation'}]
    ],
    'Design': [
        [{'LOWER': 'fail', 'LEMMA': 'fail'}, {'LOWER': 'to', 'LEMMA': 'to'}, {'LOWER': 'respect', 'LEMMA': 'respect'}, {'LOWER': 'its', 'LEMMA': 'its'}, {'LOWER': 'form', 'LEMMA': 'form'}],
        [{'LOWER': 'traditional', 'LEMMA': 'traditional'}, {'LOWER': 'pattern', 'LEMMA': 'pattern'}, {'LOWER': 'of', 'LEMMA': 'of'}, {'LOWER': 'development', 'LEMMA': 'development'}],
        [{'LOWER': 'by', 'LEMMA': 'by'}, {'LOWER': 'reason', 'LEMMA': 'reason'}, {'LOWER': 'of', 'LEMMA': 'of'}, {'LOWER': 'its', 'LEMMA': 'its'}, {'LOWER': 'form', 'LEMMA': 'form'}],
        [{'LOWER': 'established', 'LEMMA': 'established'}, {'LOWER': 'pattern', 'LEMMA': 'pattern'}],
        [{'LOWER': 'existing', 'LEMMA': 'existing'}, {'LOWER': 'built', 'LEMMA': 'built'}, {'LOWER': 'form', 'LEMMA': 'form'}],
        [{'LOWER': 'incongruous', 'LEMMA': 'incongruous'}],
        [{'LOWER': 'discordant', 'LEMMA': 'discordant'}],
        [{'LOWER': 'uncharacteristic', 'LEMMA': 'uncharacteristic'}],
        [{'LOWER': 'scale', 'LEMMA': 'scale'}, {'LOWER': 'and', 'LEMMA': 'and'}, {'LOWER': 'massing', 'LEMMA': 'massing'}],
        [{'LOWER': 'disproportionate', 'LEMMA': 'disproportionate'}],
        [{'LOWER': 'detrimental', 'LEMMA': 'detrimental'}, {'LOWER': 'to', 'LEMMA': 'to'}, {'LOWER': 'the', 'LEMMA': 'the'}, {'LOWER': 'street', 'LEMMA': 'street'}, {'LOWER': 'scene', 'LEMMA': 'scene'}],
        [{'LOWER': 'design', 'LEMMA': 'design'}, {'LOWER': 'guide', 'LEMMA': 'guide'}],
        [{'LOWER': 'poor', 'LEMMA': 'poor'}, {'LOWER': 'design', 'LEMMA': 'design'}],
        [{'LOWER': 'fail', 'LEMMA': 'fail'}, {'LOWER': 'to', 'LEMMA': 'to'}, {'LOWER': 'reflect', 'LEMMA': 'reflect'}, {'LOWER': 'local', 'LEMMA': 'local'}, {'LOWER': 'distinctiveness', 'LEMMA': 'distinctiveness'}],
        [{'LOWER': 'fail', 'LEMMA': 'fail'}, {'LOWER': 'to', 'LEMMA': 'to'}, {'LOWER': 'integrate', 'LEMMA': 'integrate'}],
        [{'LOWER': 'cramped', 'LEMMA': 'cramped'}],
        [{'LOWER': 'contrived', 'LEMMA': 'contrived'}],
        [{'LOWER': 'established', 'LEMMA': 'established'}, {'LOWER': 'character', 'LEMMA': 'character'}, {'LOWER': 'of', 'LEMMA': 'of'}, {'LOWER': 'the', 'LEMMA': 'the'}, {'LOWER': 'area', 'LEMMA': 'area'}],
        [{'LOWER': 'local', 'LEMMA': 'local'}, {'LOWER': 'character', 'LEMMA': 'character'}],
        [{'LOWER': 'unsympathetic', 'LEMMA': 'unsympathetic'},{'LOWER': 'material', 'LEMMA': 'material'}],
        [{'LOWER': 'functional', 'LEMMA': 'functional'}, {'LOWER': 'appearance', 'LEMMA': 'appearance'}],
        [{'LOWER': 'bulk', 'LEMMA': 'bulk'}]
    ],
    'Landscape and Appearance': [
        [{'LOWER': 'area', 'LEMMA': 'area'}, {'LOWER': 'of', 'LEMMA': 'of'}, {'LOWER': 'outstanding', 'LEMMA': 'outstanding'}, {'LOWER': 'natural', 'LEMMA': 'natural'}, {'LOWER': 'beauty', 'LEMMA': 'beauty'}],
        [{'LOWER': 'heritage', 'LEMMA': 'heritage'}, {'LOWER': 'coast', 'LEMMA': 'coast'}],
        [{'LOWER': 'scenic', 'LEMMA': 'scenic'}],
        [{'LOWER': 'beauty', 'LEMMA': 'beauty'}],
        [{'LOWER': 'harm', 'LEMMA': 'harm'}, {'LOWER': 'the', 'LEMMA': 'the'}, {'LOWER': 'rural', 'LEMMA': 'rural'}, {'LOWER': 'characteristics', 'LEMMA': 'characteristic'}],
        [{'LOWER': 'visable', 'LEMMA': 'visible'}, {'LOWER': 'from', 'LEMMA': 'from'}, {'LOWER': 'the', 'LEMMA': 'the'}, {'LOWER': 'public', 'LEMMA': 'public'}],
        [{'LOWER': 'landscape', 'LEMMA': 'landscape'}, {'LOWER': 'character', 'LEMMA': 'character'}],
        [{'LOWER': 'character', 'LEMMA': 'character'}, {'LOWER': 'and', 'LEMMA': 'and'}, {'LOWER': 'appearance', 'LEMMA': 'appearance'}],
        [{'LOWER': 'widely', 'LEMMA': 'widely'}, {'LOWER': 'visible', 'LEMMA': 'visible'}],
        [{'LOWER': 'area', 'LEMMA': 'area'}, {'LOWER': 'of', 'LEMMA': 'of'}, {'LOWER': 'great', 'LEMMA': 'great'}, {'LOWER': 'landscape', 'LEMMA': 'landscape'}, {'LOWER': 'value', 'LEMMA': 'value'}],
        [{'LOWER': 'AGLV', 'LEMMA': 'AGLV'}]
    ]
}

# Create a simple list of all the categories above for headers and to create dfs later
all_cats_list = list(example_snippets.keys())

# Define a function to match categories in the text and create category columns
def match_categories(text):
    # Initializes a PhraseMatcher object named matcher using the vocabulary of the nlp model
    matcher = PhraseMatcher(nlp.vocab)
    # Empty dictionary to store the matched category sentences
    category_sentences = {}

    # Iterate over each category and its examples
    for category, examples in example_snippets.items():
        # Convert examples into spaCy patterns
        category_patterns = [nlp.tokenizer(' '.join([t.get('LOWER', '') for t in example])) for example in examples]

        # Add the patterns to the matcher object
        matcher.add(category, None, *category_patterns)
        # Initialize an empty list for each category in the category_sentences dictionary
        category_sentences[category] = []

    # Process the input text using the nlp model, creating a Doc object named doc
    doc = nlp(text)
    # Match the patterns in the matcher object against the doc, obtain matches in the matches variable
    matches = matcher(doc)

    # Iterate over each match and retrieve the matched category and sentence
    for match_id, start, end in matches:
        matched_category = matcher.vocab.strings[match_id]
        matched_sentence = doc[start:end].text
        # Append the matched sentence to the corresponding category in the category_sentences dictionary
        category_sentences[matched_category].append(matched_sentence)

    return category_sentences

# Apply the function to the 'RefusalReasons' column
category_sentences = df['RefusalReasons'].apply(match_categories).tolist()

category_df = pd.DataFrame(category_sentences)

# Merge the category columns with the original DataFrame
df = pd.concat([df, category_df], axis=1)


In [None]:
for category, examples in example_snippets.items():
    print(category)

In [None]:
cat_dict = {}

In [None]:
df.head()

In [None]:
# 
for col in all_cats_list:
    df[col] = df[col].astype(str)
df = df.replace('[]','')

In [None]:
for col in all_cats_list:
    cat_dict[col] = df.copy()
    cat_dict[col] = cat_dict[col][cat_dict[col][col] != ""]
    print(col)

In [None]:
for reason in cat_dict['Flooding and Drainage']['Flooding and Drainage']:
    print(cat_dict['Flooding and Drainage']['RefusalReasons'])
    print('\n')

In [None]:
len(cat_dict['Flooding and Drainage'])

In [None]:
cat_dict['Flooding and Drainage']['RefusalReasons'][1019]