In [1]:
from nltk.corpus import stopwords # stopwords (most common words that don't mean much)
import spacy
import pandas as pd
import re
import string
from spacy.matcher import PhraseMatcher


# Initialize spaCy
nlp = spacy.load('en_core_web_sm')

# load the dataframe from an excel file. For now keep an unedited copy and one to manipulate
og_df = pd.read_csv("/Users/GlassShark1/Python/Refusals Data/All Refusals 2019-2022 w devtypes.csv")
df = og_df.copy()

In [2]:
def jupyter_display_max(max_rows):
    # set display so you can see all columns, all rows and all cell contents (up to 1k characters)
    pd.options.display.max_columns = None
    pd.set_option('display.max_rows', max_rows)
    pd.options.display.max_colwidth = 1000
    
# set JN to display full extent of data
jupyter_display_max(df.shape[0])

In [3]:
def cleaning_text(text):
    
    # replaces instances of more than 1 space with a single space
    text = re.sub(' +', ' ', text)
    # strips any unwanted whitespace from either end
    text = text.rstrip().lstrip()
    
    return text

In [4]:
def saints_rule(text):
    # removes full stop in St. before extracting parish names so it is not read as a full stop
    if 'St. ' in text:
        text.replace('St. ', 'St ')
    #specific case where the word 'form' skews results, but we don't want to lose 'information'
    if ' form' in text:
        text.replace( ' form', ' ')
    return text

In [5]:
# simple func to return length. Used to apply to a list of values
def leng_func(x):
    return len(x)

In [6]:
# function that takes a set of values to be replaced with a single value
def replace_text(text, string_set, returnstr):
    for string in string_set:
        text = text.replace(string, returnstr)
    return text

In [7]:
def drop_col_by_labels(df,column_names):
    for col in column_names:
        if col in df.columns:
            df = df.drop(col, axis=1)
    return df

In [8]:
def Perm_regex(text):
    
    # Identify permission references
    matchtype = r'(?i)(?:PA)?\d{2}[/|_]\d{5}' # e.g. PA12/12345
    matches = re.findall(matchtype, text)
    
    # sort the list by highest number of characters first. This means CLPSP will be replaced before CLP, 
    # so you aren't left with instances of SP on their own
    matches.sort(reverse = True,key=leng_func)
    
    # Don't return anything if it's an empty list / no matches
    if matches ==[]:
        matches = ""
    
    # remove dups by set and turn into a single comma seperated string
    matches = set(matches)
    matches = ",".join(matches)

    return matches

In [9]:
def CLP_regex(text):
    matchtype = r'(Cornwall Local Plan Strategic Policies|Cornwall Local Plan|CLP) (?:\d{4}\s*-\s*\d{4}|\d{4}?)?'
    #matchtype = r'(Cornwall Local Plan Strategic Policies|Cornwall Local Plan|CLP)\s*((\(\d{4}(?:\s*-\s*\d{4})?\))|\d{4}\s*-\s*\d{4}|\d{4})?'
    matches = re.findall(matchtype, text)
    
    # sort the list by highest number of characters first. This means CLPSP will be replaced before CLP, 
    # so you aren't left with instances of SP on their own
    matches.sort(reverse = True,key=leng_func)
    
    # Don't return anything if it's an empty list / no matches
    if matches ==[]:
        matches = ""
        
    # remove dups by set and turn into a single comma seperated string
    matches = set(matches)
    matches = ",".join(matches)
        
    return matches

In [10]:
def AONB_regex(text):
    matchtype = r'(?i)\barea\s*of\s*outstanding\s*natural\s*beauty\b|\bAONB\b|\(AONB\)'
    matches = re.findall(matchtype, text)
    
    # sort the list by highest number of characters first. This means CLPSP will be replaced before CLP, 
    # so you aren't left with instances of SP on their own
    matches.sort(reverse = True,key=leng_func)
    
    # Don't return anything if it's an empty list / no matches
    if matches ==[]:
        matches = ""
        
    # remove dups by set and turn into a single comma seperated string
    matches = set(matches)
    matches = ",".join(matches)
        
    return matches

In [11]:
def NPD_regex(text):
    matchtype = r'\b(?:[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\s+(?:N(?:eighbourhood)?\s*(?:D(?:evelopment)?\s*)?P(?:lan)?|NDP)\b(?:\s+\([^\)]+\))?(?:\s+\d{4}(?:\s*(?:to|-)\s*\d{4})?)?'    
    matches = re.findall(matchtype, text)
    
    # sort the list by highest number of characters first. This means CLPSP will be replaced before CLP, 
    # so you aren't left with instances of SP on their own
    matches.sort(reverse = True,key=leng_func)
    
    # Don't return anything if it's an empty list / no matches
    if matches ==[]:
        matches = ""
        
    # remove dups by set and turn into a single comma seperated string
    matches = set(matches)
    matches = ",".join(matches)
        
    return matches

In [12]:
# Define a function to match categories in the text and create category columns
def match_categories(text):
    
    # initializes a PhraseMatcher object named matcher using the vocabulary of the nlp model
    matcher = PhraseMatcher(nlp.vocab)
    # empty dictionary to store the matched category sentences
    category_sentences = {}

    # for each category and it's examples - this bit is about getting the matcher to understand your patterns
    # and store them efficiently. It is an instance of the PhraseMatcher class
    for category, examples in example_snippets.items():
        # converts into spaCy Doc objects
        category_patterns = [nlp(example) for example in examples]
        # adds them to the matcher object
        matcher.add(category, None, *category_patterns)
        # initializes empty list for each category in the category_sentences dictionary
        category_sentences[category] = []

    # processes the input text using the nlp model, creating a Doc object named doc
    doc = nlp(text)
    # matches the patterns in the matcher object against the doc, obtains matches in the matches variable
    matches = matcher(doc)

    # iterates over each match and retrieves the matched category and sentence
    for match_id, start, end in matches:
        matched_category = matcher.vocab.strings[match_id]
        matched_sentence = doc[start:end].text
        #  appends matched sentence to the corresponding category in the category_sentences dictionary
        category_sentences[matched_category].append(matched_sentence)
    
    return category_sentences

In [13]:
# remove spaces (spell check TBD)
df['RefusalReasons'] = df['RefusalReasons'].apply(cleaning_text)

In [14]:
# Tidying up refusalreasons text and extracting key info in columns


# get rid of confusing 'St.' strings
df["RefusalReasons"] = df["RefusalReasons"].apply(saints_rule)
# create a column containing a string of references to NPDs
df['NDPs Referenced'] = df["RefusalReasons"].apply(NPD_regex)
# create a column containing a string of references to CLP
df['CLPs Referenced'] = df["RefusalReasons"].apply(CLP_regex)
# create a column containing a string of permission references
df['Perms Referenced'] = df["RefusalReasons"].apply(Perm_regex)
# create a column containing a string of AONB references
df['AONB Referenced'] = df["RefusalReasons"].apply(AONB_regex)
# Tidy the various AONB references to return only 'AONB'
# replace null values with empty string
df['AONB Referenced'].fillna('', inplace=True)
# update non-null values with 'AONB'
df.loc[df['AONB Referenced'] != '', 'AONB Referenced'] = 'AONB'
# replace all iterations of AONB with 'AONB' in the main data
#df['RefusalReasons'] = df['RefusalReasons'].apply(replace_text, args=(all_AONB_list, 'AONB'))
# drop unused columns
df = drop_col_by_labels(df,['Unnamed: 0','Missing Dev_Type'])

In [39]:
'''First cut the source data down by exluding categories that don't share common refusal reasons - e.g.
very specific types of development

# maybe exclude:
# CLUED - Certificate of Lawful Development - these generally refer to whether the use of something
# is lawful or not, so typically refusal reasons differ here
# TPO/TPA - Tree Protection  - generally refusals are for public amenity reasons
#'Listed Building Consent (alter/extend)' - borderline one because it's to do with alterations/extensions

exclude_cats = ['CLEUD/CLOPED','TPO applications','Not Required - Hedgerow Removal','TCA applications']
# Prior Approval - AF2 type == agricultural stuff?
borderline_cats = ['Listed Building Consent (alter/extend)','Prior Approval - AF2 type','Notification - Rail type']

# filter categories to exclude from the df. '~' negates/gives you the opposite of something
df = df[~df['Dev_Type'].isin(exclude_cats)]

# as you've removed entries from the index, they will be out of order, reset to fix
# also required when applying boolean masks later, or when merging on results by index later
df = df.reset_index()

# intital text stripping, including phrases as opposed words and common sentences
# plus stripping of perm references etc, plus stripping of lemmatized versions of text_to_remove
df['cleaned'] = df['RefusalReasons'].apply(strip_specific_text)

df['cleaned'] = df['cleaned'].apply(strip_numbers)

df['cleaned'] = df['cleaned'].apply(stopwords_and_preprocess)

# 'cleaned' is an object datatype, cast as str to allow you to search for substrings for refinement
df['checking col'] = df['cleaned'].astype(str)'''


"First cut the source data down by exluding categories that don't share common refusal reasons - e.g.\nvery specific types of development\n\n# maybe exclude:\n# CLUED - Certificate of Lawful Development - these generally refer to whether the use of something\n# is lawful or not, so typically refusal reasons differ here\n# TPO/TPA - Tree Protection  - generally refusals are for public amenity reasons\n#'Listed Building Consent (alter/extend)' - borderline one because it's to do with alterations/extensions\n\nexclude_cats = ['CLEUD/CLOPED','TPO applications','Not Required - Hedgerow Removal','TCA applications']\n# Prior Approval - AF2 type == agricultural stuff?\nborderline_cats = ['Listed Building Consent (alter/extend)','Prior Approval - AF2 type','Notification - Rail type']\n\n# filter categories to exclude from the df. '~' negates/gives you the opposite of something\ndf = df[~df['Dev_Type'].isin(exclude_cats)]\n\n# as you've removed entries from the index, they will be out of order, 

In [19]:
# Dictionary, each category is the key, each key has a list of snippets
example_snippets = {
    'Flooding and Drainage': [
        'flood risk',
        'flood zones',
        'flood risk assessment',
        'not been demonstrated that the site is safe from flooding',
        'policy 26',
        'other available sites at a lower risk of flooding',
        'significant risk of flooding',
        'not acceptable in terms of the risk of flooding',
        'critical drainage area.',
        'functional floodplain',
        'would not be appropriately flood resilient',
        'would not be safe access or egress in a 1 in 100-year flood event',
        'drainage strategy',
        'surface water drainage disposal',
        'surface water disposal',
        'flood zone',
        'flood defense',
        'flood',
        'surface water',
        'drainage'
    ],
    'Neighbours and amenities': [
        'overlooking',
        'overbearing',
        'intrusive',
        'elevated postion',
        'impact upon occupiers neighbouring',
        'losses of privacy',
        'privacy loss',
        'scale, form, massing and proximity to the neighbouring',
        'loss of light',
        'neighbouring residents',
        'amenity enjoyed',
        'paragraphs 127 and 130 of the national planning policy framework',
        'noise disturbance',
        'public amenity',
        'loss of an area of public open space',
        'loss of amenity',
        'noise and disturbance',
        'residential amenity',
        'neighbouring dwellings',
        'no arboricultural evidence',
        'amenity value',
        'visual amenities',
        'loss of privacy',
        'privacy of the occupiers',
        'neighbouring properties',
        'proximity to the neighbouring',
        'loss of outlook',
        'neighbouring property',
        'impacts to any occupiers',
        'degrade this amenity',
        'degrade the amenity',
        'proximity to residential',
        'increases to noise'
        
    ],
    'Access, Road Safety': [
        'vehicular traffic',
        'lack of accessibility to services',
        'limited visibility',
        'increased risk of collision',
        'highway safety',
        'safe and suitable access',
        'vehicle and pedestrian conflict',
        'poor visibility',
        'accessing and egressing',
        'safe or suitable passing',
        'manoeuvring space',
        'passing space',
        'policy 27',
        'congestion',
        'safe and suitable access to the site for all users',
        'impact on the local road network',
        'no appropriate access',
        'reliant on private motor vehicles',
        'collision',
        'blind reversing',
        'drivers exiting',
        'vehicular movements',
        'manoeuvrability issue',
        'manoeuvrability issues',
        'proposed access',
        'undesirable access',
        'sustainable modes of travel',
        'proposed parking',
        'narrow and restricted',
        'poor accessibility',
        'narrow and winding',
        'conflict between users',
        'oncoming traffic',
        'passing spaces',
        'increase in vehicular',
        'vehicle movements',
        'vehicle and pedestrian',
        'vehicles leaving'
    ],
    'History and Heritage': [
        'historic mining',
        'eroding its historic character',
        'whs',
        'world heritage site',
        'historic functional relationship',
        'heritage asset',
        'area of great landscape value',
        'historic settlement',
        'record of the morphology',
        'historic impact assessment',
        'authenticity and integrity',
        'designated  assets',
        'listed building',
        'historic interest',
        'archaeological',
        'buried',
        'heritage assets',
        'oustanding universal value',
        'historical character',
        'significance of the historic',
        'irreplacable building',
        'integrity and authenticity',
        'hertitage values',
        'historic land use',
        'ancient monuments',
        'battlefields',
        'listed buildings',
        'loss of historic',
        'grade ii',
        'historic setting',
        'historic environment',
        'historic building',
        'non-designated'
    ],
    'Open Countryside': [
        'open countryside',
        'open-countryside',
        'countryside location',
        'in the countryside',
        'isolated rural dwelling',
        'increase the urban built form',
        'divorced from any settlement',
        'introduction of built form',
        'outside of the defined settlement boundaries',
        'sporadic',
        'unmistakably rural',
        'unsustainable form of development',
        'unsustainable and undesirable',
        'reliance on private vehicle',
        'unsustainable traffic movements',
        'AONB',
        'outside the settlement',
        'within the countryside',
        'outside of the settlement',
        'intrusion of the built form',
        'divorced from any established',
        'divorced from any settlement',
        'into the countryside',
        'infill or rounding off',
        'encroachment of the built form',
        'outside of the development boundary'
    ],
    'AH': [
        'affordable housing need',
        'need for affordable housing ',
        'not affordable housing led',
        'no affordable housing provision',
        'not considered to be affordable housing led',
        'not affordable',
        'insufficient financial viability',
        'fail to be affordable housing led',
        'affordability of the properties',
        'affordable housing thresholds',
        'fail to deliver an affordable housing',
        'fail to deliver affordable housing',
        'not an affordable housing led scheme'
    ],
    'Conservation and Biodiversity': [
        'biodiversty',
        'bio-diversity',
        'environmental harm',
        'special area of conservation ',
        ' sac ',
        'bats',
        'reptiles',
        'birds',
        'insects',
        'habitat',
        'species',
        'site of special scientific interest',
        'sssi',
        'protect the natural environment',
        'foraging',
        'wildlife',
        'derogation',
        'ecological'
        'woodland',
        'marsh',
        'biodiversity',
        'ecology'
    ],
    'Design': [
        'fail to respect its form',
        'by virtue of its design',
        'traditional pattern of development',
        'by reason of its form',
        'established pattern',
        'incongruous',
        'uncharacteristic',
        'scale and massing',
        'disproportionate',
        'detrimental to the street scene',
        'design guide',
        'poor design',
        'fail to reflect local distinctiveness',
        'fail to integrate',
        'unsympathetic material',
        'contrived',
        'established character of the area',
        'functional appearance',
        'bulk',
        'character of the building',
        'dominate the street scene',
        'scale, massing',
        'poor quality',
        'height and massing',
        'proportions',
        'by reason of its design',
        'of detriment to the quality',
        'sub-standard',
        'traditional morphology',
        'not considered to be suitably constructed',
        'overall design',
        'form, scale',
        'scale and materials',
        'not promote local distinctiveness',
        'height and mass',
        'appropriate scale',
        'modern in appearance',
        'out of keeping',
        'unsympathetic',
        'contemporary design',
        'scale, mass',
        'confused identity',
        'too large',
        'due to its scale',
        'local distinctiveness'
        
    ],
    'Landscape, Appearance, Character': [
        'area of outstanding natural beauty',
        'heritage coast',
        'scenic',
        'beauty',
        'harm the rural characteristics',
        'visable from the public',
        'landscape character',
        'character and appearance',
        'widely visable',
        'area of great landscape value',
        'aglv',
        'dominant feature',
        'landscape harm',
        'harmful to the character',
        'harmful visual impact',
        'appearance and character',
        'character of the area',
        'intrinsic rural character',
        'rural landscape character',
        'erosion of the rural character',
        'detriment of the character',
        'local distinctiveness',
        'understanding of the setting',
        'local character'
        
    ],
    'Occupancy evidence': [
        'not been used an independant residential',
        'demonstrating independent occupation',
        'not been used for residential purposes',
        'in excess of 10 years',
        'in excess of ten years',
        'in excess of 10yrs',
        'breach of the occupancy condition',
        'been used for domestic purposes',
        'not a registered address',
        'continuous period in excess',
        'certificate of lawfulness',
        'onus of proof',
        'lawful development certificate',
        'veracity of the application',
        'continuous residential use',
        'sufficiently precise and unambiguous'
    ]
}

In [20]:
# create simple list of all the categories above for headers and to create dfs later
all_cats_list = []
for k,v in example_snippets.items():
    all_cats_list.append(k)

In [21]:
for k,v in example_snippets.items():
    print(k),
    print(v)
    print('\n')

Flooding and Drainage
['flood risk', 'flood zones', 'flood risk assessment', 'not been demonstrated that the site is safe from flooding', 'policy 26', 'other available sites at a lower risk of flooding', 'significant risk of flooding', 'not acceptable in terms of the risk of flooding', 'critical drainage area.', 'functional floodplain', 'would not be appropriately flood resilient', 'would not be safe access or egress in a 1 in 100-year flood event', 'drainage strategy', 'surface water drainage disposal', 'surface water disposal', 'flood zone', 'flood defense', 'flood', 'surface water', 'drainage']


Neighbours and amenities
['overlooking', 'overbearing', 'intrusive', 'elevated postion', 'impact upon occupiers neighbouring', 'losses of privacy', 'privacy loss', 'scale, form, massing and proximity to the neighbouring', 'loss of light', 'neighbouring residents', 'amenity enjoyed', 'paragraphs 127 and 130 of the national planning policy framework', 'noise disturbance', 'public amenity', 'l

In [22]:
df['RefusalReasons'] = df['RefusalReasons'].apply(lambda x:x.lower())

In [23]:
# Apply the function to the 'RefusalReasons' column
category_sentences = df['RefusalReasons'].apply(match_categories).tolist()

# create a df from the dictionary category_sentences, that has columns for each cat and rows for each record
category_df = pd.DataFrame(category_sentences)

In [24]:
# Merge the category columns with the original DataFrame
df = pd.concat([df, category_df], axis=1)

In [25]:
# for each category, make the column values string (not object) and remove empty lists
for col in all_cats_list:
    df[col] = df[col].astype(str)
df = df.replace('[]','')

In [26]:
# Count the number of empty strings in each row within the specified columns. strip in case there is whitespace
df['blank_count'] = df[all_cats_list].apply(lambda row: row.str.strip().eq('').sum(), axis=1)

In [27]:
# create a dictionary to hold a df per category, filtering out any with no matches for that category
cat_dict = {}
for col in all_cats_list:
    cat_dict[col] = df.copy()
    cat_dict[col] = cat_dict[col][cat_dict[col][col] != ""]
    print(col)

Flooding and Drainage
Neighbours and amenities
Access, Road Safety
History and Heritage
Open Countryside
AH
Conservation and Biodiversity
Design
Landscape, Appearance, Character
Occupancy evidence


In [28]:
def strip_specific_text(text):
    
    text_to_remove = [
        "Cornwall Local Plan Strategic Policies 2010 - 2030",
        "Cornwall Local Plan Strategic Policies 2010-2030",
        "Cornwall Local Plan Strategic Policies",
        "CLP",
        "Cornwall Local Plan",
        "policies",
        "National Planning Policy Framework",
        "NPPF",
        "paragraph",
        "paragraphs"
    ]
    
    # keep a list of specific words/phrases you want to remove so as to focus on topic modelling
    # World Heritage Site (WHS), Special Area of Conservation (SAC), Area of Great Landscape Value


    text_to_remove = [
            'The Local Planning Authority has acted positively and proactively in determining this application by identifying matters of concern with this proposal.',
            ' On this occasion, the issues are so fundamental that it is not possible to negotiate a satisfactory way forward due to the harm that has been clearly identified within the reason(s) for refusal.',
            'January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December',
            "National Planning Policy Framework","NPPF",
            "building", "built",
            "special justification",
            "demonstrate",
            "house", "permission", "condition",
            "section",
            "work",
            "intentions",
            "adopted",
            "adoption",
            "represent",
            "representing",
            "settlement",
            "planning",
            "Cornwall",
            "local",
            "plan",
            "development",
            "policies",
            "contrary",
            "proposal",
            "application",
            "policy",
            "policies",
            "paragraph","paragraphs",
            "permitted",
            "development",
            "area",
            "dwelling",
            "proposed",
            "national",
            "house",
            "housing",
            "justify",
            "town",
            "constitute",
            "sited",
            "siting",
            "guidance",
            "benefit",
            "justification",
            "point",
            "raise",
            "factor",
            "balance",
            "house",
            "identify",
            "fall",
            "land ",
            "provide",
            "circumstances",
            "considerable",
            "tree",
            "resulting",
            "site", 
            "result", 
            "reason",
            "Caradon", "consider", "conflict", 'aim', 'affect', 'impact', 'information','form'
            #"form" can't have form because of "information"
            ]
    # for each thing you want to remove from the text
    for phrase in text_to_remove:
        # if the lower case version is in the lower case version of the text, replace it with nothing (delete)
        if phrase.lower() in text.lower():
            #print(phrase, " in text")
            text = text.replace(phrase.lower(), "")
    
    return text

In [29]:
def binary_category_values(df):
    
    # simplify the data, showing only 1/0 output if a record is tagged with a category
    # create these as new columns with suffix '_ML'
    for category in all_cats_list:
        #replace any nulls with zero
        df[category +'_ML'] = df[category].fillna(0)
        df[category +'_ML'] = df[category +'_ML'].replace('',0)
        df[category +'_ML'] = df[category +'_ML'].apply(lambda x: 1 if x != 0 else 0)
    
    return df

In [30]:
def lemma(text):
    
    # Process the text
    doc = nlp(text)
    
    # Lemmatize each token and join them back into a string
    text = ' '.join([token.lemma_ for token in doc])
    
    return text

In [31]:
def text_process(text):

    # first make message lower case to make other operations simpler
    text = text.lower()
    
    # remove unwanted/not useful text/phrases from the string
    text = strip_specific_text(text)
    
    # remove any number references
    # create a list of characters if the character is a letter or a space
    text = [char for char in text if char.isalpha() or char == " "]
    # join the characters again with 'nothing' - as spaces are included above
    text = "".join(text)
    
    # Check characters to see if they are in punctuation
    text = [char for char in text if char not in string.punctuation]
    # Join the characters again to form the string without punctuation
    text = ''.join(text)
    
    # convert the msg without punctuation to a string of lemmas
    text = lemma(text)
    
    # Now just remove any stopwords, return as a list to be vectorized
    return [word for word in text.split() if word.lower() not in stopwords.words('english')]

In [32]:
df = binary_category_values(df)

In [33]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.multioutput import ClassifierChain
from sklearn.svm import SVC
from sklearn.metrics import classification_report

In [34]:
allcatslist = []
for col in df.columns:
    if col[-2:] == 'ML':
        allcatslist.append(col)
    
print(allcatslist)

['Flooding and Drainage_ML', 'Neighbours and amenities_ML', 'Access, Road Safety_ML', 'History and Heritage_ML', 'Open Countryside_ML', 'AH_ML', 'Conservation and Biodiversity_ML', 'Design_ML', 'Landscape, Appearance, Character_ML', 'Occupancy evidence_ML']


In [35]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multioutput import ClassifierChain

classification_dict = {}

# Split the data into training and testing sets
msg_train, msg_test, label_train, label_test = train_test_split(df['RefusalReasons'], df[allcatslist], test_size=0.2)

# Pipeline takes a list of tuples (name of the step, what it's doing)
pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=text_process)),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('clf', ClassifierChain(SVC(probability=True))),  # Apply Binary Relevance with SVM classifier
])

# Fit the data to the model
pipeline.fit(msg_train, label_train)

# Predict probabilities for the test data
probs = pipeline.predict_proba(msg_test)

# Adjust the threshold to control classification outcome
threshold = 0.5  # Adjust the threshold as desired
preds = (probs >= threshold).astype(int)

# Create a dataframe to store the results of msg_test
result_df = pd.DataFrame(probs, columns=allcatslist)

# Add an identifier column - the original index reference
result_df['Identifier'] = msg_test.index

# Merge the result dataframe with the original dataframe based on the index
result_df = df.merge(result_df, left_index=True, right_on='Identifier', how='left')

# Filter out the records that have non-null values in the specified category column
result_df = result_df[~result_df['Flooding and Drainage_ML_y'].isnull()]

# Compute precision, recall, and other metrics
report = classification_report(label_test, preds, target_names=allcatslist, output_dict=True)

# Make a dataframe from the dictionary
df_report = pd.DataFrame(report)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [36]:
result_df.head(50)

Unnamed: 0,Ref,Address,Description,DecDate,Link,RefusalReasons,DevType,docfragment,file_ext,filename,Decision,FromSearch,ToSearch,Dev_Type,NDPs Referenced,CLPs Referenced,Perms Referenced,AONB Referenced,Flooding and Drainage,Neighbours and amenities,"Access, Road Safety",History and Heritage,Open Countryside,AH,Conservation and Biodiversity,Design,"Landscape, Appearance, Character",Occupancy evidence,blank_count,Flooding and Drainage_ML_x,Neighbours and amenities_ML_x,"Access, Road Safety_ML_x",History and Heritage_ML_x,Open Countryside_ML_x,AH_ML_x,Conservation and Biodiversity_ML_x,Design_ML_x,"Landscape, Appearance, Character_ML_x",Occupancy evidence_ML_x,Flooding and Drainage_ML_y,Neighbours and amenities_ML_y,"Access, Road Safety_ML_y",History and Heritage_ML_y,Open Countryside_ML_y,AH_ML_y,Conservation and Biodiversity_ML_y,Design_ML_y,"Landscape, Appearance, Character_ML_y",Occupancy evidence_ML_y,Identifier
168.0,PA19/08993,Land South West Of Badgall Cottages Badgall Road Laneast Launceston Cornwall,Self Build Single Storey Dwelling,31/10/2019,https://planning.cornwall.gov.uk/online-applications/applicationDetails.do?keyVal=PZCQ3LFGIIP00&activeTab=summary,"the application site is located outside of the established and definable limits of any settlement, and does not represent infill or rounding off as defined in policy 3 of the cornwall local plan strategic policies 2010 - 2030. the proposal would represent a residential unit of accommodation in the countryside where no special justification has been satisfactorily demonstrated. as a consequence the proposed development is considered to represent undesirable, unsustainable, sporadic residential development which is beyond any defined development limit eroding the character and appearance of this part of the area of great landscape value. the development is therefore contrary to policy 3, 7 and 23 of the cornwall local plan and paragraph 79 and section 15 of the national planning policy framework 2019.",,REFUSAL,pdf,PA19_08993-R1FULZ_-_REFUSAL_NOTICE-4781836.pdf,Refusal one reason,01/01/2019,31/12/2019,Minor - Dwelling,,"Cornwall Local Plan Strategic Policies,Cornwall Local Plan",,,,,,['area of great landscape value'],"['infill or rounding off', 'in the countryside', 'sporadic']",,,,"['character and appearance', 'area of great landscape value']",,7,0,0,0,1,1,0,0,0,1,0,0.002133,0.013273,0.003589,0.944099,1.0,0.003134,0.00407,0.024152,0.989072,0.001962,0
280.0,PA19/08883,1 Tregunnick Lane Seaton Torpoint Cornwall PL11 3JT,Erection of domestic garage with associated works.,21/10/2019,https://planning.cornwall.gov.uk/online-applications/applicationDetails.do?keyVal=PZ40S7FG1FE00&activeTab=summary,"the site forms part of a cohesive, high quality, residential development, set back from the road frontage with open front amenity areas. the proposed development by reason of its siting would visually compete with the host dwelling and introduce an inappropriate and overly dominant feature within the street scene, harming the visual amenity of the immediate surroundings. as such, the proposal is contrary to policy 12 of the cornwall local plan 2010-2030, adopted november 2016, together with the provisions of the national planning policy framework 2019, with particular reference to paragraph 127.",,REFUSAL,pdf,PA19_08883-R1FULZ_-_REFUSAL_NOTICE-4769741.pdf,Refusal one reason,01/01/2019,31/12/2019,Householder,,Cornwall Local Plan,,,,,,,,,,,['dominant feature'],,9,0,0,0,0,0,0,0,0,1,0,0.002717,0.831987,0.017356,0.007846,0.101204,0.008771,0.002725,0.4244,0.302287,0.003872,3
58.0,PA19/07752,Highcroft Porkellis Helston Cornwall TR13 0LH,Lawful development certificate for a proposed new conservatory,24/10/2019,https://planning.cornwall.gov.uk/online-applications/applicationDetails.do?keyVal=PX9F8WFGHJI00&activeTab=summary,the proposed construction of the front extension does not constitute permitted development as it fails to comply with class a.1 (e)(i) of part 1 of the town and country planning (general permitted development)(england) order 2015 because it would be located on the principal elevation of the original dwellinghouse. therefore a planning application is required to be submitted to the local planning authority.,,REFUSAL,pdf,PA19_07752-R1LDZ_-_LD__PROPOSED__REFUSAL_NOTICE-4726713.pdf,Refusal one reason,01/01/2019,31/12/2019,CLEUD/CLOPED,,,,,,,,,,,,,,,10,0,0,0,0,0,0,0,0,0,0,0.002078,0.007804,0.014616,0.010057,0.016335,0.003697,0.008521,0.040155,0.027952,0.002088,12
108.0,PA19/07377,Trebean Fore Street Madron TR20 8SH,"Removal of existing garage and store, construction of new garage, store and studio",27/08/2019,https://planning.cornwall.gov.uk/online-applications/applicationDetails.do?keyVal=PWMNJ7FGJ9F00&activeTab=summary,"in extending the footprint of the garage/store so as to be in close proximity to the terrace and in significantly increasing the eaves height, ridge height, and massing of the building, the proposed development would appear as overly-domineering, out of character and harmful in the context of the adjacent historic terrace, and would fail to preserve the character and appearance of the conservation areathe proposed works would materially reduce the existing visual quality and character of the setting of the historic terrace and streetscape, and would be harmful to the character and appearance of the conservation area. the works are considered to raise design concerns that cause harm and that are not balanced by other factors acting in the public interest or benefit.the proposal is therefore contrary to policy 12 (paragraphs 1a and 1b) and policy 24 of the cornwall local plan strategic policies 2010-2030 (adopted 22nd november 2016); sections 1, 2.6, 2.9, 4.9, 7.3, 7.4, 7.7 and 7.10 ...",,REFUSAL,pdf,PA19_07377-R1FULZ_-_REFUSAL_NOTICE-4724357.pdf,Refusal one reason,01/01/2019,31/12/2019,Householder,,Cornwall Local Plan Strategic Policies,,,,,,['listed buildings'],,,,['design guide'],"['character and appearance', 'harmful to the character', 'character and appearance', 'character and appearance']",,7,0,0,0,1,0,0,0,1,1,0,0.002807,0.017171,0.002323,1.0,0.001363,0.002659,0.006462,0.989505,0.987871,0.001951,16
70.0,PA19/06450,Land North East Of Badgers Holt Nance Trink Cornwall,Outline planning permission with all matters reserved: New dwelling,08/08/2019,https://planning.cornwall.gov.uk/online-applications/applicationDetails.do?keyVal=PV6Z24FG1W400&activeTab=summary,"the site is an undeveloped plot of land lying in the open-countryside at nance to the south of st ives. despite lying in close proximity to a small number of residential properties it is clearly outside of any existing settlement. the proposal for new open market residential accommodation would thereby conflict with policies 1 and 3 of the cornwall local plan (2010-2030) and with paragraph 8 of the nppf (2019) which seek to ensure that new development is sustainably located. furthermore, no special justification has been submitted in support of the application that might allow for its support under policy 7 of the cornwall local plan (2010-2030) or paragraph 79 of the nppf (2019).",,REFUSAL,pdf,PA19_06450-R1OUTZ_-_REFUSAL_NOTICE-4644622.pdf,Refusal one reason,01/01/2019,31/12/2019,Minor - Dwelling,,Cornwall Local Plan,,,,,,,['open-countryside'],,,,,,9,0,0,0,0,1,0,0,0,0,0,0.005671,0.648185,0.001814,0.005655,0.756758,0.009629,0.003146,0.059716,0.069128,0.00571,27
13.0,PA19/05467,Chiverton Lane Orchard Perran Downs Lane Rosudgeon Cornwall TR20 9HQ,"Retention of extensions to dwelling, erection of extension to south elevation and change of use of land to domestic curtilage",26/07/2019,https://planning.cornwall.gov.uk/online-applications/applicationDetails.do?keyVal=PTNIJTFGGII00&activeTab=summary,"the proposed change of use of land to domestic curtilage, retained extensions and proposed extension is considered to represent an unjustified encroachment into the open countryside. such an encroachment of this scale and proportion together with associated domestic paraphernalia would be harmful to the character and appearance of the rural locality and the character of the world heritage site. the proposal would lead to less than substantial harm to the significance of the world heritage site, but such harm is not outweighed by any public benefit. the proposal is therefore contrary to policies 1, 2, 12, 23 and 24 of the cornwall local plan strategic policies 2010-2030 and paragraphs 8, 127, 184 and 196 of the national planning policy framework 2019.",,REFUSAL,pdf,PA19_05467-R1FULZ_-_REFUSAL_NOTICE-4773567.pdf,Refusal one reason,01/01/2019,31/12/2019,Householder,,Cornwall Local Plan Strategic Policies,,,,,,"['world heritage site', 'world heritage site']",['open countryside'],,,,"['harmful to the character', 'character and appearance']",,7,0,0,0,1,1,0,0,0,1,0,0.001557,0.022308,0.001308,1.0,0.848173,0.009514,0.003391,0.053779,0.951261,0.001885,31
210.0,PA19/05295,14 Meadowbank Mylor Bridge TR11 5SW,Construction of new dwelling and formation of two parking spaces each for existing and proposed dwelling,19/06/2019,https://planning.cornwall.gov.uk/online-applications/applicationDetails.do?keyVal=PTCOH2FGI1Q00&activeTab=summary,"the proposed dwelling, by reason of its scale, height and proximity to the boundary with no. 34 bells hill, will be un-neighbourly. the development is considered to cause an unacceptable degree of over-bearing to no. 34 bells hill, and also create a sense of enclosure and loss of light in the mornings to the neighbours rear garden. the proposal will also overlook the rear garden of no. 13 meadowbank from the first floor bedroom windows. the development is considered to be contrary to policy 12 of the cornwall local plan strategic policies 2010-2030, and paragraph 127 of the national planning policy framework 2019.",,REFUSAL,pdf,PA19_05295-R1FULZ_-_REFUSAL_NOTICE-4562260.pdf,Refusal one reason,01/01/2019,31/12/2019,Minor - Dwelling,,Cornwall Local Plan Strategic Policies,,,,['loss of light'],,,,,,,,,9,0,1,0,0,0,0,0,0,0,0,0.001535,0.961275,0.013647,0.016773,0.004349,0.004125,0.016794,0.5,0.053946,0.002423,33
296.0,PA19/05197,6 Penally Court Penally Hill Boscastle Cornwall PL35 0HG,Material change of the use of the land for the stationing of a caravan to be used as ancillary accommodation (resubmission of PA19/01151),17/06/2019,https://planning.cornwall.gov.uk/online-applications/applicationDetails.do?keyVal=PT8K3MFG1OU00&activeTab=summary,"by reason of its nature, scale, and siting outside the residential curtilage of 6 penally court, it is considered that the proposed development would be physically separated and not closely related to the principle dwelling. the location of the proposed caravan is not in a sustainable location. the proposal represents unsustainable development in the countryside and the caravan is considered to be harmful to the character and appearance of the conservation area and aonb. this is therefore contrary to policy 1, 2 (subsection 1), 12 and 23 of the cornwall local plan strategic policies 2010 - 2030 and paragraph 124 and 127 of the national planning policy framework 2019.",,REFUSAL,pdf,PA19_05197-R1FULZ_-_REFUSAL_NOTICE-4660172.pdf,Refusal one reason,01/01/2019,31/12/2019,Changes of Use,,Cornwall Local Plan Strategic Policies,,AONB,,,,,['in the countryside'],,,,"['harmful to the character', 'character and appearance']",,8,0,0,0,0,1,0,0,0,1,0,0.001168,0.018639,0.002645,0.00542,0.818669,0.003441,0.059275,0.059361,0.639218,0.002115,35
262.0,PA19/04929,Peartree Farm Trannack Helston TR13 0DQ,Proposed Agricultural Worker's permanent dwelling,17/06/2019,https://planning.cornwall.gov.uk/online-applications/applicationDetails.do?keyVal=PSRX45FGJP700&activeTab=summary,"the proposed development raises direct conflict with the requirements of adopted policy in the cornwall local plan as the site is clearly located in the open countryside where new homes will only be permitted where there are special circumstances. in this case, it has not been adequately demonstrated at this stage that there is up to date evidence of an essential need of the rural enterprise for the applicant to reside in a permanent dwelling in this location and that the business will remain viable in its own right for the foreseeable future. in the absence of such justification, the development would clearly erode the rural character of this location by introducing further built development that would result in material harm to the character and appearance of the countryside and the site is not a sustainable or accessible location for a new dwelling. the development is therefore unsustainable and would cause material environmental harm to the rural character of the area, contrary...",,REFUSAL,pdf,PA19_04929-R1FULZ_-_REFUSAL_NOTICE-4762761.pdf,Refusal one reason,01/01/2019,31/12/2019,Minor - Dwelling,,"Cornwall Local Plan Strategic Policies,Cornwall Local Plan",,,,,,,['open countryside'],,['environmental harm'],,"['character and appearance', 'character of the area']",,7,0,0,0,0,1,0,1,0,1,0,0.002329,0.001978,0.00392,0.003706,0.999999,0.007764,1.0,0.04163,0.876279,0.002596,38
202.0,PA19/04886,Site East Of Trewindies Trelake Lane Treknow Tintagel Cornwall PL34 0EW,Residential dwelling and associated works,18/06/2019,https://planning.cornwall.gov.uk/online-applications/applicationDetails.do?keyVal=PSPU2FFGIAH00&activeTab=summary,"the proposed dwelling, by reason of its location outside of the definable urban form of treknow and with limited physical barrier to its north and west, does not round off the settlement and visually extends the built form into the countryside eroding the character and appearance of this part of the area of outstanding natural beauty and heritage coast, in the absence of any special circumstances to justify it, the proposal represents an unsustainable and sporadic form of development in the countryside and, in the absence of any other material considerations which outweigh such harm, the proposal is contrary to policy md9 of the cornwall aonb management plan 2016-2021, policies 1, 3, 7, 21, 23 and env1 of the cornwall local plan strategic policies 2010 - 2030 (november 2016), and paragraph 79 of the national planning policy framework 2019.",,REFUSAL,pdf,PA19_04886-R1FULZ_-_REFUSAL_NOTICE-4546481.pdf,Refusal one reason,01/01/2019,31/12/2019,Minor - Dwelling,,Cornwall Local Plan Strategic Policies,,AONB,,,,,"['into the countryside', 'sporadic', 'in the countryside']",,,,"['character and appearance', 'area of outstanding natural beauty', 'beauty', 'heritage coast']",,8,0,0,0,0,1,0,0,0,1,0,0.000991,0.001531,0.00624,0.245647,1.0,0.001239,0.023613,0.035377,0.970468,0.002213,39


In [None]:
"""
precision == abilty to detect true positives (ratio of true postives to true postitives + false positives)
recall == also ability to detect true positives, but is ratio of true positives to true positives + false positives)
F1 Score: Harmonic mean of precision and recall. 1 == best possible score
support == number of instances of the category found in the test dataset, according to the model alone.
support does not have anything to do with the original labels.
"""

In [37]:
def export_excel(df, filename, sheet_name=None):

    from pathlib import Path
    path = "/Users/GlassShark1/Python/Refusals Data/"
    filepath = Path(path + filename + ".xlsx")
    print(filepath)

    if sheet_name is None:
        df.to_excel(filepath, index=False)
    else:
        df.to_excel(filepath, index=False, sheet_name=sheet_name)



In [38]:
export_excel(df, "Verbatim_Ref_Reasons_060623", sheet_name="Data")
export_excel(df_report,"VRR_class_rpt_060623", sheet_name="class_rpt")
export_excel(result_df,"VRR_result_rpt_060623", sheet_name="result_rpt")


/Users/GlassShark1/Python/Refusals Data/Verbatim_Ref_Reasons_060623.xlsx
/Users/GlassShark1/Python/Refusals Data/VRR_class_rpt_060623.xlsx
/Users/GlassShark1/Python/Refusals Data/VRR_result_rpt_060623.xlsx


In [40]:
df_report

Unnamed: 0,Flooding and Drainage_ML,Neighbours and amenities_ML,"Access, Road Safety_ML",History and Heritage_ML,Open Countryside_ML,AH_ML,Conservation and Biodiversity_ML,Design_ML,"Landscape, Appearance, Character_ML",Occupancy evidence_ML,micro avg,macro avg,weighted avg,samples avg
precision,1.0,0.909091,1.0,0.896552,0.924138,0.0,0.777778,0.845361,0.891753,0.952381,0.897472,0.819705,0.896876,0.781325
recall,0.904762,0.857143,0.818182,0.917647,0.937063,0.0,0.777778,0.725664,0.808411,0.833333,0.836387,0.757998,0.836387,0.766426
f1-score,0.95,0.882353,0.9,0.906977,0.930556,0.0,0.777778,0.780952,0.848039,0.888889,0.865854,0.786554,0.864448,0.7581
support,21.0,70.0,55.0,85.0,143.0,3.0,36.0,113.0,214.0,24.0,764.0,764.0,764.0,764.0


In [None]:
"""


LEMMA VERSION STARTS HERE


"""

In [None]:
import spacy
import pandas as pd
from spacy.matcher import PhraseMatcher

# Initialize spaCy
nlp = spacy.load('en_core_web_sm')

# Load the dataframe from an Excel file. For now, keep an unedited copy and one to manipulate
og_df = pd.read_csv("/Users/GlassShark1/Python/Refusals Data/All Refusals 2019-2022 w devtypes.csv")
df = og_df.copy()

example_snippets = {
    'Flooding and Drainage': [
        [{'LOWER': 'flood', 'LEMMA': 'flood'}, {'IS_ALPHA': True, 'OP': '*'}, {'LOWER': 'risk', 'LEMMA': 'risk'}],
        [{'LOWER': 'flood', 'LEMMA': 'flood'}, {'LOWER': 'zone', 'LEMMA': 'zone'}],
        [{'LOWER': 'demonstrate', 'LEMMA': 'demonstrate'}, {'LOWER': 'safe', 'LEMMA': 'safe'}, {'LOWER': 'flood', 'LEMMA': 'flood'}],
        [{'LOWER': 'policy', 'LEMMA': 'policy'}, {'LOWER': '26'}],
        [{'LOWER': 'available', 'LEMMA': 'available'}, {'LOWER': 'site', 'LEMMA': 'site'}, {'LOWER': 'low', 'LEMMA': 'low'}, {'LOWER': 'risk', 'LEMMA': 'risk'}, {'LOWER': 'flood', 'LEMMA': 'flood'}],
        [{'LOWER': 'significant', 'LEMMA': 'significant'}, {'LOWER': 'risk', 'LEMMA': 'risk'}, {'LOWER': 'flood', 'LEMMA': 'flood'}],
        [{'LOWER': 'not'}, {'LOWER': 'acceptable', 'LEMMA': 'acceptable'}, {'LOWER': 'term', 'LEMMA': 'term'}, {'LOWER': 'risk', 'LEMMA': 'risk'}, {'LOWER': 'flood', 'LEMMA': 'flood'}],
        [{'LOWER': 'critical', 'LEMMA': 'critical'}, {'LOWER': 'drainage', 'LEMMA': 'drainage'}, {'LOWER': 'area', 'LEMMA': 'area'}],
        [{'LOWER': 'functional', 'LEMMA': 'functional'}, {'LOWER': 'floodplain', 'LEMMA': 'floodplain'}],
        [{'LOWER': 'would'}, {'LOWER': 'not'}, {'LOWER': 'appropriately', 'LEMMA': 'appropriately'}, {'LOWER': 'flood', 'LEMMA': 'flood'}, {'LOWER': 'resilient', 'LEMMA': 'resilient'}],
        [{'LOWER': 'would'}, {'LOWER': 'not'}, {'LOWER': 'safe', 'LEMMA': 'safe'}, {'LOWER': 'access', 'LEMMA': 'access'}, {'LOWER': 'egress', 'LEMMA': 'egress'}, {'LOWER': '1'}, {'LOWER': '100'}, {'LOWER': 'year'}, {'LOWER': 'flood', 'LEMMA': 'flood'}, {'LOWER': 'event', 'LEMMA': 'event'}],
        [{'LOWER': 'drainage', 'LEMMA': 'drainage'}, {'LOWER': 'strategy', 'LEMMA': 'strategy'}],
        [{'LOWER': 'surface', 'LEMMA': 'surface'}, {'LOWER': 'water', 'LEMMA': 'water'}, {'LOWER': 'drainage', 'LEMMA': 'drainage'}, {'LOWER': 'disposal', 'LEMMA': 'disposal'}],
        [{'LOWER': 'surface', 'LEMMA': 'surface'}, {'LOWER': 'water', 'LEMMA': 'water'}, {'LOWER': 'disposal', 'LEMMA': 'disposal'}]
    ],
    'Neighbours and amenities': [
        [{'LOWER': 'overlooking', 'LEMMA': 'overlook'}],
        [{'LOWER': 'overbearing', 'LEMMA': 'overbear'}],
        [{'LOWER': 'intrusive', 'LEMMA': 'intrusive'}],
        [{'LOWER': 'elevated', 'LEMMA': 'elevate'}, {'LOWER': 'position', 'LEMMA': 'position'}],
        [{'LOWER': 'impact', 'LEMMA': 'impact'}, {'LOWER': 'occupier', 'LEMMA': 'occupier'}, {'LOWER': 'neighbouring', 'LEMMA': 'neighbour'}],
        [{'LOWER': 'loss', 'LEMMA': 'loss'}, {'LOWER': 'privacy', 'LEMMA': 'privacy'}],
        [{'LOWER': 'privacy', 'LEMMA': 'privacy'}, {'LOWER': 'loss', 'LEMMA': 'loss'}],
        [{'LOWER': 'scale', 'LEMMA': 'scale'}, {'LOWER': 'form', 'LEMMA': 'form'}, {'LOWER': 'massing', 'LEMMA': 'mass'}, {'LOWER': 'proximity', 'LEMMA': 'proximity'}, {'LOWER': 'neighbouring', 'LEMMA': 'neighbour'}],
        [{'LOWER': 'loss', 'LEMMA': 'loss'}, {'LOWER': 'light', 'LEMMA': 'light'}],
        [{'LOWER': 'neighbouring', 'LEMMA': 'neighbour'}, {'LOWER': 'resident', 'LEMMA': 'resident'}],
        [{'LOWER': 'amenity', 'LEMMA': 'amenity'}, {'LOWER': 'enjoy', 'LEMMA': 'enjoy'}],
        [{'LOWER': 'paragraph', 'LEMMA': 'paragraph'}, {'LOWER': '127'}, {'LOWER': '130'}, {'LOWER': 'national', 'LEMMA': 'national'}, {'LOWER': 'planning', 'LEMMA': 'plan'}, {'LOWER': 'policy', 'LEMMA': 'policy'}, {'LOWER': 'framework', 'LEMMA': 'framework'}],
        [{'LOWER': 'noise', 'LEMMA': 'noise'}, {'LOWER': 'disturbance', 'LEMMA': 'disturbance'}]
    ],
    'No appropriate access': [
        [{'LOWER': 'vehicular', 'LEMMA': 'vehicular'}, {'LOWER': 'traffic', 'LEMMA': 'traffic'}],
        [{'LOWER': 'accessibility', 'LEMMA': 'accessibility'}, {'LOWER': 'service', 'LEMMA': 'service'}],
        [{'LOWER': 'limited', 'LEMMA': 'limited'}, {'LOWER': 'visibility', 'LEMMA': 'visibility'}],
        [{'LOWER': 'increased', 'LEMMA': 'increase'}, {'LOWER': 'risk', 'LEMMA': 'risk'}, {'LOWER': 'collision', 'LEMMA': 'collision'}],
        [{'LOWER': 'highway', 'LEMMA': 'highway'}, {'LOWER': 'safety', 'LEMMA': 'safety'}],
        [{'LOWER': 'safe', 'LEMMA': 'safe'}, {'LOWER': 'suitable', 'LEMMA': 'suitable'}, {'LOWER': 'access', 'LEMMA': 'access'}],
        [{'LOWER': 'vehicle', 'LEMMA': 'vehicle'}, {'LOWER': 'pedestrian', 'LEMMA': 'pedestrian'}, {'LOWER': 'conflict', 'LEMMA': 'conflict'}],
        [{'LOWER': 'poor', 'LEMMA': 'poor'}, {'LOWER': 'visibility', 'LEMMA': 'visibility'}],
        [{'LOWER': 'accessing', 'LEMMA': 'access'}, {'LOWER': 'egressing', 'LEMMA': 'egress'}],
        [{'LOWER': 'safe', 'LEMMA': 'safe'}, {'LOWER': 'suitable', 'LEMMA': 'suitable'}, {'LOWER': 'pass', 'LEMMA': 'pass'}],
        [{'LOWER': 'manoeuvring', 'LEMMA': 'manoeuvre'}, {'LOWER': 'space', 'LEMMA': 'space'}],
        [{'LOWER': 'passing', 'LEMMA': 'pass'}, {'LOWER': 'space', 'LEMMA': 'space'}],
        [{'LOWER': 'policy', 'LEMMA': 'policy'}, {'LOWER': '27'}],
        [{'LOWER': 'congestion', 'LEMMA': 'congestion'}],
        [{'LOWER': 'safe', 'LEMMA': 'safe'}, {'LOWER': 'suitable', 'LEMMA': 'suitable'}, {'LOWER': 'access', 'LEMMA': 'access'}, {'LOWER': 'site', 'LEMMA': 'site'}, {'LOWER': 'user', 'LEMMA': 'user'}],
        [{'LOWER': 'impact', 'LEMMA': 'impact'}, {'LOWER': 'local', 'LEMMA': 'local'}, {'LOWER': 'road', 'LEMMA': 'road'}, {'LOWER': 'network', 'LEMMA': 'network'}]
    ],
    'History and Heritage': [
        [{'LOWER': 'historic', 'LEMMA': 'historic'}, {'LOWER': 'mining', 'LEMMA': 'mining'}],
        [{'LOWER': 'erode', 'LEMMA': 'erode'}, {'LOWER': 'historic', 'LEMMA': 'historic'}, {'LOWER': 'character', 'LEMMA': 'character'}],
        [{'LOWER': 'WHS', 'LEMMA': 'WHS'}, {'LOWER': 'world', 'LEMMA': 'world'}, {'LOWER': 'heritage', 'LEMMA': 'heritage'}, {'LOWER': 'site', 'LEMMA': 'site'}],
        [{'LOWER': 'historic', 'LEMMA': 'historic'}, {'LOWER': 'functional', 'LEMMA': 'functional'}, {'LOWER': 'relationship', 'LEMMA': 'relationship'}],
        [{'LOWER': 'heritage', 'LEMMA': 'heritage'}, {'LOWER': 'asset', 'LEMMA': 'asset'}],
        [{'LOWER': 'area', 'LEMMA': 'area'}, {'LOWER': 'great', 'LEMMA': 'great'}, {'LOWER': 'landscape', 'LEMMA': 'landscape'}, {'LOWER': 'value', 'LEMMA': 'value'}],
        [{'LOWER': 'historic', 'LEMMA': 'historic'}, {'LOWER': 'settlement', 'LEMMA': 'settlement'}],
        [{'LOWER': 'record', 'LEMMA': 'record'}, {'LOWER': 'morphology', 'LEMMA': 'morphology'}],
        [{'LOWER': 'historic', 'LEMMA': 'historic'}, {'LOWER': 'impact', 'LEMMA': 'impact'}, {'LOWER': 'assessment', 'LEMMA': 'assessment'}],
        [{'LOWER': 'authenticity', 'LEMMA': 'authenticity'}, {'LOWER': 'integrity', 'LEMMA': 'integrity'}],
        [{'LOWER': 'designate', 'LEMMA': 'designate'}, {'LOWER': 'asset', 'LEMMA': 'asset'}],
        [{'LOWER': 'list', 'LEMMA': 'list'}, {'LOWER': 'building', 'LEMMA': 'building'}]
    ],
    'Open Countryside': [
        [{'LOWER': 'open', 'LEMMA': 'open'}, {'LOWER': 'countryside', 'LEMMA': 'countryside'}],
        [{'LOWER': 'increase', 'LEMMA': 'increase'}, {'LOWER': 'encroachment', 'LEMMA': 'encroachment'}],
        [{'LOWER': 'scenic', 'LEMMA': 'scenic'}, {'LOWER': 'beauty', 'LEMMA': 'beauty'}, {'LOWER': 'area', 'LEMMA': 'area'}],
        [{'LOWER': 'green', 'LEMMA': 'green'}, {'LOWER': 'wedge', 'LEMMA': 'wedge'}],
        [{'LOWER': 'open', 'LEMMA': 'open'}, {'LOWER': 'area', 'LEMMA': 'area'}, {'LOWER': 'restrain', 'LEMMA': 'restrain'}],
        [{'LOWER': 'natural', 'LEMMA': 'natural'}, {'LOWER': 'landscape', 'LEMMA': 'landscape'}, {'LOWER': 'feature', 'LEMMA': 'feature'}],
        [{'LOWER': 'visual', 'LEMMA': 'visual'}, {'LOWER': 'amenity', 'LEMMA': 'amenity'}],
        [{'LOWER': 'promote', 'LEMMA': 'promote'}, {'LOWER': 'recreation', 'LEMMA': 'recreation'}, {'LOWER': 'amenity', 'LEMMA': 'amenity'}],
        [{'LOWER': 'open', 'LEMMA': 'open'}, {'LOWER': 'space', 'LEMMA': 'space'}],
        [{'LOWER': 'policy', 'LEMMA': 'policy'}, {'LOWER': '35'}],
        [{'LOWER': 'valued', 'LEMMA': 'value'}, {'LOWER': 'landscape', 'LEMMA': 'landscape'}, {'LOWER': 'feature', 'LEMMA': 'feature'}],
        [{'LOWER': 'impact', 'LEMMA': 'impact'}, {'LOWER': 'character', 'LEMMA': 'character'}, {'LOWER': 'area', 'LEMMA': 'area'}]
    ],
        'AH': [
        [{'LOWER': 'affordable', 'LEMMA': 'affordable'}, {'LOWER': 'housing', 'LEMMA': 'housing'}, {'LOWER': 'need', 'LEMMA': 'need'}],
        [{'LOWER': 'need', 'LEMMA': 'need'}, {'LOWER': 'for', 'LEMMA': 'for'}, {'LOWER': 'affordable', 'LEMMA': 'affordable'}, {'LOWER': 'housing', 'LEMMA': 'housing'}],
        [{'LOWER': 'no', 'LEMMA': 'no'}, {'LOWER': 'affordable', 'LEMMA': 'affordable'}, {'LOWER': 'housing', 'LEMMA': 'housing'}, {'LOWER': 'provision', 'LEMMA': 'provision'}],
        [{'LOWER': 'not', 'LEMMA': 'not'}, {'LOWER': 'affordable', 'LEMMA': 'affordable'}, {'LOWER': 'housing', 'LEMMA': 'housing'}, {'LOWER': 'led', 'LEMMA': 'led'}]
    ],
    'Conservation and Biodiversity': [
        [{'LOWER': 'biodiversty', 'LEMMA': 'biodiversity'}],
        [{'LOWER': 'special', 'LEMMA': 'special'}, {'LOWER': 'area', 'LEMMA': 'area'}, {'LOWER': 'of', 'LEMMA': 'of'}, {'LOWER': 'conservation', 'LEMMA': 'conservation'}],
        [{'LOWER': 'SAC', 'LEMMA': 'SAC'}, {'LOWER': 'site', 'LEMMA': 'site'}],
        [{'LOWER': 'bat', 'LEMMA': 'bat'}, {'LOWER': 'species', 'LEMMA': 'species'}],
        [{'LOWER': 'reptile', 'LEMMA': 'reptile'}, {'LOWER': 'species', 'LEMMA': 'species'}],
        [{'LOWER': 'bird', 'LEMMA': 'bird'}, {'LOWER': 'species', 'LEMMA': 'species'}],
        [{'LOWER': 'insect', 'LEMMA': 'insect'}, {'LOWER': 'species', 'LEMMA': 'species'}],
        [{'LOWER': 'habitat', 'LEMMA': 'habitat'}],
        [{'LOWER': 'species', 'LEMMA': 'species'}],
        [{'LOWER': 'site', 'LEMMA': 'site'}, {'LOWER': 'of', 'LEMMA': 'of'}, {'LOWER': 'special', 'LEMMA': 'special'}, {'LOWER': 'scientific', 'LEMMA': 'scientific'}, {'LOWER': 'interest', 'LEMMA': 'interest'}],
        [{'LOWER': 'SSSI', 'LEMMA': 'SSSI'}, {'LOWER': 'site', 'LEMMA': 'site'}],
        [{'LOWER': 'protect', 'LEMMA': 'protect'}, {'LOWER': 'the', 'LEMMA': 'the'}, {'LOWER': 'natural', 'LEMMA': 'natural'}, {'LOWER': 'environment', 'LEMMA': 'environment'}],
        [{'LOWER': 'ecological', 'LEMMA': 'ecological'}, {'LOWER': 'woodland', 'LEMMA': 'woodland'}],
        [{'LOWER': 'foraging', 'LEMMA': 'foraging'}],
        [{'LOWER': 'marsh', 'LEMMA': 'marsh'}],
        [{'LOWER': 'wildlife', 'LEMMA': 'wildlife'}],
        [{'LOWER': 'derogation', 'LEMMA': 'derogation'}]
    ],
    'Design': [
        [{'LOWER': 'fail', 'LEMMA': 'fail'}, {'LOWER': 'to', 'LEMMA': 'to'}, {'LOWER': 'respect', 'LEMMA': 'respect'}, {'LOWER': 'its', 'LEMMA': 'its'}, {'LOWER': 'form', 'LEMMA': 'form'}],
        [{'LOWER': 'traditional', 'LEMMA': 'traditional'}, {'LOWER': 'pattern', 'LEMMA': 'pattern'}, {'LOWER': 'of', 'LEMMA': 'of'}, {'LOWER': 'development', 'LEMMA': 'development'}],
        [{'LOWER': 'by', 'LEMMA': 'by'}, {'LOWER': 'reason', 'LEMMA': 'reason'}, {'LOWER': 'of', 'LEMMA': 'of'}, {'LOWER': 'its', 'LEMMA': 'its'}, {'LOWER': 'form', 'LEMMA': 'form'}],
        [{'LOWER': 'established', 'LEMMA': 'established'}, {'LOWER': 'pattern', 'LEMMA': 'pattern'}],
        [{'LOWER': 'existing', 'LEMMA': 'existing'}, {'LOWER': 'built', 'LEMMA': 'built'}, {'LOWER': 'form', 'LEMMA': 'form'}],
        [{'LOWER': 'incongruous', 'LEMMA': 'incongruous'}],
        [{'LOWER': 'discordant', 'LEMMA': 'discordant'}],
        [{'LOWER': 'uncharacteristic', 'LEMMA': 'uncharacteristic'}],
        [{'LOWER': 'scale', 'LEMMA': 'scale'}, {'LOWER': 'and', 'LEMMA': 'and'}, {'LOWER': 'massing', 'LEMMA': 'massing'}],
        [{'LOWER': 'disproportionate', 'LEMMA': 'disproportionate'}],
        [{'LOWER': 'detrimental', 'LEMMA': 'detrimental'}, {'LOWER': 'to', 'LEMMA': 'to'}, {'LOWER': 'the', 'LEMMA': 'the'}, {'LOWER': 'street', 'LEMMA': 'street'}, {'LOWER': 'scene', 'LEMMA': 'scene'}],
        [{'LOWER': 'design', 'LEMMA': 'design'}, {'LOWER': 'guide', 'LEMMA': 'guide'}],
        [{'LOWER': 'poor', 'LEMMA': 'poor'}, {'LOWER': 'design', 'LEMMA': 'design'}],
        [{'LOWER': 'fail', 'LEMMA': 'fail'}, {'LOWER': 'to', 'LEMMA': 'to'}, {'LOWER': 'reflect', 'LEMMA': 'reflect'}, {'LOWER': 'local', 'LEMMA': 'local'}, {'LOWER': 'distinctiveness', 'LEMMA': 'distinctiveness'}],
        [{'LOWER': 'fail', 'LEMMA': 'fail'}, {'LOWER': 'to', 'LEMMA': 'to'}, {'LOWER': 'integrate', 'LEMMA': 'integrate'}],
        [{'LOWER': 'cramped', 'LEMMA': 'cramped'}],
        [{'LOWER': 'contrived', 'LEMMA': 'contrived'}],
        [{'LOWER': 'established', 'LEMMA': 'established'}, {'LOWER': 'character', 'LEMMA': 'character'}, {'LOWER': 'of', 'LEMMA': 'of'}, {'LOWER': 'the', 'LEMMA': 'the'}, {'LOWER': 'area', 'LEMMA': 'area'}],
        [{'LOWER': 'local', 'LEMMA': 'local'}, {'LOWER': 'character', 'LEMMA': 'character'}],
        [{'LOWER': 'unsympathetic', 'LEMMA': 'unsympathetic'},{'LOWER': 'material', 'LEMMA': 'material'}],
        [{'LOWER': 'functional', 'LEMMA': 'functional'}, {'LOWER': 'appearance', 'LEMMA': 'appearance'}],
        [{'LOWER': 'bulk', 'LEMMA': 'bulk'}]
    ],
    'Landscape and Appearance': [
        [{'LOWER': 'area', 'LEMMA': 'area'}, {'LOWER': 'of', 'LEMMA': 'of'}, {'LOWER': 'outstanding', 'LEMMA': 'outstanding'}, {'LOWER': 'natural', 'LEMMA': 'natural'}, {'LOWER': 'beauty', 'LEMMA': 'beauty'}],
        [{'LOWER': 'heritage', 'LEMMA': 'heritage'}, {'LOWER': 'coast', 'LEMMA': 'coast'}],
        [{'LOWER': 'scenic', 'LEMMA': 'scenic'}],
        [{'LOWER': 'beauty', 'LEMMA': 'beauty'}],
        [{'LOWER': 'harm', 'LEMMA': 'harm'}, {'LOWER': 'the', 'LEMMA': 'the'}, {'LOWER': 'rural', 'LEMMA': 'rural'}, {'LOWER': 'characteristics', 'LEMMA': 'characteristic'}],
        [{'LOWER': 'visable', 'LEMMA': 'visible'}, {'LOWER': 'from', 'LEMMA': 'from'}, {'LOWER': 'the', 'LEMMA': 'the'}, {'LOWER': 'public', 'LEMMA': 'public'}],
        [{'LOWER': 'landscape', 'LEMMA': 'landscape'}, {'LOWER': 'character', 'LEMMA': 'character'}],
        [{'LOWER': 'character', 'LEMMA': 'character'}, {'LOWER': 'and', 'LEMMA': 'and'}, {'LOWER': 'appearance', 'LEMMA': 'appearance'}],
        [{'LOWER': 'widely', 'LEMMA': 'widely'}, {'LOWER': 'visible', 'LEMMA': 'visible'}],
        [{'LOWER': 'area', 'LEMMA': 'area'}, {'LOWER': 'of', 'LEMMA': 'of'}, {'LOWER': 'great', 'LEMMA': 'great'}, {'LOWER': 'landscape', 'LEMMA': 'landscape'}, {'LOWER': 'value', 'LEMMA': 'value'}],
        [{'LOWER': 'AGLV', 'LEMMA': 'AGLV'}]
    ]
}

# Create a simple list of all the categories above for headers and to create dfs later
all_cats_list = list(example_snippets.keys())

# Define a function to match categories in the text and create category columns
def match_categories(text):
    # Initializes a PhraseMatcher object named matcher using the vocabulary of the nlp model
    matcher = PhraseMatcher(nlp.vocab)
    # Empty dictionary to store the matched category sentences
    category_sentences = {}

    # Iterate over each category and its examples
    for category, examples in example_snippets.items():
        # Convert examples into spaCy patterns
        category_patterns = [nlp.tokenizer(' '.join([t.get('LOWER', '') for t in example])) for example in examples]

        # Add the patterns to the matcher object
        matcher.add(category, None, *category_patterns)
        # Initialize an empty list for each category in the category_sentences dictionary
        category_sentences[category] = []

    # Process the input text using the nlp model, creating a Doc object named doc
    doc = nlp(text)
    # Match the patterns in the matcher object against the doc, obtain matches in the matches variable
    matches = matcher(doc)

    # Iterate over each match and retrieve the matched category and sentence
    for match_id, start, end in matches:
        matched_category = matcher.vocab.strings[match_id]
        matched_sentence = doc[start:end].text
        # Append the matched sentence to the corresponding category in the category_sentences dictionary
        category_sentences[matched_category].append(matched_sentence)

    return category_sentences

# Apply the function to the 'RefusalReasons' column
category_sentences = df['RefusalReasons'].apply(match_categories).tolist()

category_df = pd.DataFrame(category_sentences)

# Merge the category columns with the original DataFrame
df = pd.concat([df, category_df], axis=1)


In [None]:
for category, examples in example_snippets.items():
    print(category)

In [None]:
cat_dict = {}

In [None]:
df.head()

In [None]:
# 
for col in all_cats_list:
    df[col] = df[col].astype(str)
df = df.replace('[]','')

In [None]:
for col in all_cats_list:
    cat_dict[col] = df.copy()
    cat_dict[col] = cat_dict[col][cat_dict[col][col] != ""]
    print(col)

In [None]:
for reason in cat_dict['Flooding and Drainage']['Flooding and Drainage']:
    print(cat_dict['Flooding and Drainage']['RefusalReasons'])
    print('\n')

In [None]:
len(cat_dict['Flooding and Drainage'])

In [None]:
cat_dict['Flooding and Drainage']['RefusalReasons'][1019]