In [23]:
import spacy
import pandas as pd
from spacy.matcher import PhraseMatcher


# Initialize spaCy
nlp = spacy.load('en_core_web_sm')

# load the dataframe from an excel file. For now keep an unedited copy and one to manipulate
og_df = pd.read_csv("/Users/GlassShark1/Python/Refusals Data/All Refusals 2019-2022 w devtypes.csv")
df = og_df.copy()

In [25]:
def jupyter_display_max(max_rows):
    # set display so you can see all columns, all rows and all cell contents (up to 1k characters)
    pd.options.display.max_columns = None
    pd.set_option('display.max_rows', max_rows)
    pd.options.display.max_colwidth = 1000
    
# set JN to display full extent of data
jupyter_display_max(df.shape[0])

In [24]:
# Dictionary, each category is the key, each key has a list of snippets
example_snippets = {
    'Flooding and Drainage': [
        'flood risk',
        'flood zones',
        'flood risk assessment',
        'not been demonstrated that the site is safe from flooding',
        'policy 26',
        'other available sites at a lower risk of flooding',
        'significant risk of flooding',
        'not acceptable in terms of the risk of flooding',
        'Critical Drainage Area.',
        'functional floodplain',
        'would not be appropriately flood resilient',
        'would not be safe access or egress in a 1 in 100-year flood event',
        'drainage strategy',
        'surface water drainage disposal',
        'surface water disposal'
    ],
    'Neighbours and amenities': [
        'overlooking',
        'overbearing',
        'intrusive',
        'elevated postion',
        'impact upon occupiers neighbouring',
        'losses of privacy',
        'privacy loss',
        'scale, form, massing and proximity to the neighbouring',
        'loss of light',
        'neighbouring residents',
        'amenity enjoyed',
        'paragraphs 127 and 130 of the National Planning Policy Framework',
        'noise disturbance'
    ],
    'No appropriate access': [
        'vehicular traffic',
        'lack of accessibility to services',
        'limited visibility',
        'increased risk of collision',
        'highway safety',
        'safe and suitable access',
        'vehicle and pedestrian conflict',
        'poor visibility',
        'accessing and egressing',
        'safe or suitable passing',
        'manoeuvring space',
        'passing space',
        'policy 27',
        'congestion',
        'safe and suitable access to the site for all users',
        'impact on the local road network'
    ],
    'History and Heritage': [
        'historic mining',
        'eroding its historic character',
        'WHS',
        'World Heritage Site',
        'historic functional relationship',
        'heritage asset',
        'Area of Great Landscape Value',
        'historic settlement',
        'record of the morphology',
        'historic impact assessment',
        'authenticity and integrity',
        'designated  assets',
        'listed building'
    ],
    'Open Countryside': [
        'open countryside',
        'increase the urban built form',
        'divorced from any settlement',
        'introduction of built form',
        'outside of the defined settlement boundaries',
        'sporadic development',
        'unmistakably rural',
        'unsustainable form of development',
        'intrinsic character',
        'unsustainable and undesirable',
        'reliance on private vehicle',
        'not sustainable development',
        'unsustainable traffic movements'
    ],
    'AH': [
        'affordable housing need',
        'need for affordable housing ',
        'not affordable housing led',
        'no affordable housing provision'
    ],
    'Conservation and Biodiversity': [
        'biodiversty',
        'Special Area of Conservation ',
        'SAC',
        'bats',
        'reptiles',
        'birds',
        'insects',
        'habitat',
        'species',
        'Site of Special Scientific Interest',
        'SSSI',
        'protect the natural environment',
        'foraging',
        'wildlife',
        'derogation',
        'ecological'
        'woodland',
        'marsh'
    ],
    'Design': [
        'fail to respect its form',
        'traditional pattern of development',
        'by reason of its form',
        'established pattern',
        'existing built form',
        'incongruous',
        'discordant',
        'uncharacteristic',
        'scale and massing',
        'disproportionate',
        'detrimental to the street scene',
        'design guide',
        'poor design',
        'fail to reflect local distinctiveness',
        'fail to integrate',
        'unsympathetic material',
        'cramped',
        'contrived',
        'established character of the area',
        'local character',
        'functional appearance',
        'bulk'
        
    ],
    'Landscape and Appearance': [
        'Area of Outstanding Natural Beauty',
        'Heritage Coast',
        'scenic',
        'beauty',
        'harm the rural characteristics',
        'visable from the public',
        'landscape character',
        'character and appearance',
        'widely visable',
        'Area of Great Landscape Value',
        'AGLV'
        
    ]
    # Add more categories and example sentences as needed
}

In [26]:
# create simple list of all the categories above for headers and to create dfs later
all_cats_list = []
for k,v in example_snippets.items():
    all_cats_list.append(k)

In [27]:
for k,v in example_snippets.items():
    print(k),
    print(v)
    print('\n')

Flooding and Drainage
['flood risk', 'flood zones', 'flood risk assessment', 'not been demonstrated that the site is safe from flooding', 'policy 26', 'other available sites at a lower risk of flooding', 'significant risk of flooding', 'not acceptable in terms of the risk of flooding', 'Critical Drainage Area.', 'functional floodplain', 'would not be appropriately flood resilient', 'would not be safe access or egress in a 1 in 100-year flood event', 'drainage strategy', 'surface water drainage disposal', 'surface water disposal']


Neighbours and amenities
['overlooking', 'overbearing', 'intrusive', 'elevated postion', 'impact upon occupiers neighbouring', 'losses of privacy', 'privacy loss', 'scale, form, massing and proximity to the neighbouring', 'loss of light', 'neighbouring residents', 'amenity enjoyed', 'paragraphs 127 and 130 of the National Planning Policy Framework', 'noise disturbance']


No appropriate access
['vehicular traffic', 'lack of accessibility to services', 'limit

In [28]:
# Define a function to match categories in the text and create category columns
def match_categories(text):
    
    # initializes a PhraseMatcher object named matcher using the vocabulary of the nlp model
    matcher = PhraseMatcher(nlp.vocab)
    # empty dictionary to store the matched category sentences
    category_sentences = {}

    # for each category and it's examples - this bit is about getting the matcher to understand your patterns
    # and store them efficiently. It is an instance of the PhraseMatcher class
    for category, examples in example_snippets.items():
        # converts into spaCy Doc objects
        category_patterns = [nlp(example) for example in examples]
        # adds them to the matcher object
        matcher.add(category, None, *category_patterns)
        # initializes empty list for each category in the category_sentences dictionary
        category_sentences[category] = []

    # processes the input text using the nlp model, creating a Doc object named doc
    doc = nlp(text)
    # matches the patterns in the matcher object against the doc, obtains matches in the matches variable
    matches = matcher(doc)

    # iterates over each match and retrieves the matched category and sentence
    for match_id, start, end in matches:
        matched_category = matcher.vocab.strings[match_id]
        matched_sentence = doc[start:end].text
        #  appends matched sentence to the corresponding category in the category_sentences dictionary
        category_sentences[matched_category].append(matched_sentence)
    
    return category_sentences

In [29]:
# Apply the function to the 'RefusalReasons' column
category_sentences = df['RefusalReasons'].apply(match_categories).tolist()

# create a df from the dictionary category_sentences, that has columns for each cat and rows for each record
category_df = pd.DataFrame(category_sentences)

In [30]:
# Merge the category columns with the original DataFrame
df = pd.concat([df, category_df], axis=1)

In [31]:
# for each category, make the column values string (not object) and remove empty lists
for col in all_cats_list:
    df[col] = df[col].astype(str)
df = df.replace('[]','')

In [32]:
df.head()

Unnamed: 0.1,Unnamed: 0,Ref,Address,Description,DecDate,Link,RefusalReasons,DevType,docfragment,file_ext,filename,Decision,FromSearch,ToSearch,Dev_Type,Missing Dev_Type,Flooding and Drainage,Neighbours and amenities,No appropriate access,History and Heritage,Open Countryside,AH,Conservation and Biodiversity,Design,Landscape and Appearance
0,0,PA19/08993,Land South West Of Badgall Cottages Badgall Road Laneast Launceston Cornwall,Self Build Single Storey Dwelling,31/10/2019,https://planning.cornwall.gov.uk/online-applications/applicationDetails.do?keyVal=PZCQ3LFGIIP00&activeTab=summary,"The application site is located outside of the established and definable limits of any settlement, and does not represent infill or rounding off as defined in policy 3 of the Cornwall Local Plan Strategic Policies 2010 - 2030. The proposal would represent a residential unit of accommodation in the countryside where no special justification has been satisfactorily demonstrated. As a consequence the proposed development is considered to represent undesirable, unsustainable, sporadic residential development which is beyond any defined development limit eroding the character and appearance of this part of the Area of Great Landscape Value. The development is therefore contrary to policy 3, 7 and 23 of The Cornwall Local Plan and paragraph 79 and Section 15 of the National Planning Policy Framework 2019.",,REFUSAL,pdf,PA19_08993-R1FULZ_-_REFUSAL_NOTICE-4781836.pdf,Refusal one reason,01/01/2019,31/12/2019,Minor - Dwelling,,,,,['Area of Great Landscape Value'],,,,['character and appearance'],"['character and appearance', 'Area of Great Landscape Value']"
1,1,PA19/08982,Malibu Cottage Widemouth Bay Bude Cornwall EX23 0AG,Proposed replacement dwelling with variation of condition 2 in respect of decision PA19/03272 dated 11.09.19,22/10/2019,https://planning.cornwall.gov.uk/online-applications/applicationDetails.do?keyVal=PZ7TO0FGFM600&activeTab=summary,"1 The proposed amendments to the approved development by reason of the increased scale, height and mass of the dwelling would significantly increase the visual prominence of the dwelling within a sensitive exposed location, with a high level of public experience and as such the dominant visual appearance would be detrimental to the landscape character and visual amenity of the Area of Great Landscape Value and to users of the beach, South West Coast Path and footpath. This is therefore contrary to Policy 2, 12 and 23 of The Cornwall Local Plan Strategic Policies 2010-2030 and paragraph 127 and 170 of the National Planning Policy Framework 2019.",,REFUSAL,pdf,PA19_08982-R1SECZ_-_REFUSAL_NOTICE-4764998.pdf,Refusal one reason,01/01/2019,31/12/2019,Minor - Dwelling,,,,,['Area of Great Landscape Value'],,,,,"['landscape character', 'Area of Great Landscape Value']"
2,2,PA19/09063,The Granary Ruan Minor Helston Cornwall TR12 7NA,Outline planning permission with all matters reserved. Detached single dwelling and garage,11/10/2019,https://planning.cornwall.gov.uk/online-applications/applicationDetails.do?keyVal=PZF3LMFG1W400&activeTab=summary,"The proposed development raises direct conflict with the requirements of adopted policy in the Cornwall Local Plan as the site is clearly located in the open countryside where new homes will only be permitted where there are special circumstances, none of which have been identified in this particular case. The proposed development would clearly erode the rural character of this location by introducing further built development that would result in material harm to the character and appearance of the countryside, thus not conserving the landscape character and natural beauty of the Area of Outstanding Natural Beauty (AONB) in this location. The site is not a sustainable or accessible location for a new dwelling and the development would therefore be contrary to development plan policy and cause material environmental harm to the rural character of the area, contrary to policies 1, 2, 3, 7, 12, 23 and 27 in the Cornwall Local Plan Strategic Policies 2010 - 2030 ...",,REFUSAL,pdf,PA19_09063-R1OUTZ_-_REFUSAL_NOTICE-4754357.pdf,Refusal one reason,01/01/2019,31/12/2019,Minor - Dwelling,,,,,,['open countryside'],,,['character and appearance'],"['character and appearance', 'beauty']"
3,3,PA19/08883,1 Tregunnick Lane Seaton Torpoint Cornwall PL11 3JT,Erection of domestic garage with associated works.,21/10/2019,https://planning.cornwall.gov.uk/online-applications/applicationDetails.do?keyVal=PZ40S7FG1FE00&activeTab=summary,"The site forms part of a cohesive, high quality, residential development, set back from the road frontage with open front amenity areas. The proposed development by reason of its siting would visually compete with the host dwelling and introduce an inappropriate and overly dominant feature within the street scene, harming the visual amenity of the immediate surroundings. As such, the proposal is contrary to Policy 12 of the Cornwall Local Plan 2010-2030, adopted November 2016, together with the provisions of the National Planning Policy Framework 2019, with particular reference to Paragraph 127.",,REFUSAL,pdf,PA19_08883-R1FULZ_-_REFUSAL_NOTICE-4769741.pdf,Refusal one reason,01/01/2019,31/12/2019,Householder,,,,,,,,,,
4,4,PA19/08792,Land At Goonearl Goonearl Scorrier TR16 5EB,Outline application with some matters reserved for construction of two dwellings to round-off the residential developments of Goonearl,29/10/2019,https://planning.cornwall.gov.uk/online-applications/applicationDetails.do?keyVal=PZ0GL2FGIE800&activeTab=summary,"The proposal for two dwellings outside any settlement and without any special justification results in an unsustainable form of development in the countryside. The accessibility of the location in accessing modes of transport other than the private car is poor which will result in reliance on private cars for the majority of trips reinforcing the unsustainable location of the development. The proposal is therefore contrary to Cornwall Local Plan 2010-2030 policies 1, 3, 7 and 27, St Agnes Neighbourhood Development Plan policies 1 and 2 and the aims at paragraphs 8 and 79 of the National Planning Policy Framework.",,REFUSAL,pdf,PA19_08792-R1OUTZ_-_REFUSAL_NOTICE-4764530.pdf,Refusal one reason,01/01/2019,31/12/2019,Minor - Dwelling,,,,,,['unsustainable form of development'],,,,


In [33]:
# create a dictionary to hold a df per category, filtering out any with no matches for that category
cat_dict = {}
for col in all_cats_list:
    cat_dict[col] = df.copy()
    cat_dict[col] = cat_dict[col][cat_dict[col][col] != ""]
    print(col)

Flooding and Drainage
Neighbours and amenities
No appropriate access
History and Heritage
Open Countryside
AH
Conservation and Biodiversity
Design
Landscape and Appearance


In [42]:
def export_csv(filename):
    from pathlib import Path
    path = "/Users/GlassShark1/Python/Refusals Data/"
    filepath = Path(path + filename + ".csv") 
    print(filepath)
    #filepath.parent.mkdir(parents=True, exist_ok=True)  
    df.to_csv(filepath) 

In [43]:
filename = 'verbatim refusalreasons 230523'

In [44]:
export_csv(filename)

/Users/GlassShark1/Python/Refusals Data/verbatim refusalreasons 230523.csv


In [36]:
# lemma version found 31:
len(cat_dict['Flooding and Drainage'])

56

In [1]:
"""


LEMMA VERSION STARTS HERE


"""

'\n\n\nLEMMA VERSION STARTS HERE\n\n\n'

In [47]:

}


In [4]:
for k,v in example_snippets.items():
    print(k)
    print('\n')
    print(v)

Flooding and Drainage


[[{'LOWER': 'flood', 'LEMMA': 'flood'}, {'IS_ALPHA': True, 'OP': '*'}, {'LOWER': 'risk', 'LEMMA': 'risk'}], [{'LOWER': 'flood', 'LEMMA': 'flood'}, {'LOWER': 'zone', 'LEMMA': 'zone'}], [{'LOWER': 'demonstrate', 'LEMMA': 'demonstrate'}, {'LOWER': 'safe', 'LEMMA': 'safe'}, {'LOWER': 'flood', 'LEMMA': 'flood'}], [{'LOWER': 'policy', 'LEMMA': 'policy'}, {'LOWER': '26'}], [{'LOWER': 'available', 'LEMMA': 'available'}, {'LOWER': 'site', 'LEMMA': 'site'}, {'LOWER': 'low', 'LEMMA': 'low'}, {'LOWER': 'risk', 'LEMMA': 'risk'}, {'LOWER': 'flood', 'LEMMA': 'flood'}], [{'LOWER': 'significant', 'LEMMA': 'significant'}, {'LOWER': 'risk', 'LEMMA': 'risk'}, {'LOWER': 'flood', 'LEMMA': 'flood'}], [{'LOWER': 'not'}, {'LOWER': 'acceptable', 'LEMMA': 'acceptable'}, {'LOWER': 'term', 'LEMMA': 'term'}, {'LOWER': 'risk', 'LEMMA': 'risk'}, {'LOWER': 'flood', 'LEMMA': 'flood'}], [{'LOWER': 'critical', 'LEMMA': 'critical'}, {'LOWER': 'drainage', 'LEMMA': 'drainage'}, {'LOWER': 'area', 'LEM

In [15]:
import spacy
import pandas as pd
from spacy.matcher import PhraseMatcher

# Initialize spaCy
nlp = spacy.load('en_core_web_sm')

# Load the dataframe from an Excel file. For now, keep an unedited copy and one to manipulate
og_df = pd.read_csv("/Users/GlassShark1/Python/Refusals Data/All Refusals 2019-2022 w devtypes.csv")
df = og_df.copy()

example_snippets = {
    'Flooding and Drainage': [
        [{'LOWER': 'flood', 'LEMMA': 'flood'}, {'IS_ALPHA': True, 'OP': '*'}, {'LOWER': 'risk', 'LEMMA': 'risk'}],
        [{'LOWER': 'flood', 'LEMMA': 'flood'}, {'LOWER': 'zone', 'LEMMA': 'zone'}],
        [{'LOWER': 'demonstrate', 'LEMMA': 'demonstrate'}, {'LOWER': 'safe', 'LEMMA': 'safe'}, {'LOWER': 'flood', 'LEMMA': 'flood'}],
        [{'LOWER': 'policy', 'LEMMA': 'policy'}, {'LOWER': '26'}],
        [{'LOWER': 'available', 'LEMMA': 'available'}, {'LOWER': 'site', 'LEMMA': 'site'}, {'LOWER': 'low', 'LEMMA': 'low'}, {'LOWER': 'risk', 'LEMMA': 'risk'}, {'LOWER': 'flood', 'LEMMA': 'flood'}],
        [{'LOWER': 'significant', 'LEMMA': 'significant'}, {'LOWER': 'risk', 'LEMMA': 'risk'}, {'LOWER': 'flood', 'LEMMA': 'flood'}],
        [{'LOWER': 'not'}, {'LOWER': 'acceptable', 'LEMMA': 'acceptable'}, {'LOWER': 'term', 'LEMMA': 'term'}, {'LOWER': 'risk', 'LEMMA': 'risk'}, {'LOWER': 'flood', 'LEMMA': 'flood'}],
        [{'LOWER': 'critical', 'LEMMA': 'critical'}, {'LOWER': 'drainage', 'LEMMA': 'drainage'}, {'LOWER': 'area', 'LEMMA': 'area'}],
        [{'LOWER': 'functional', 'LEMMA': 'functional'}, {'LOWER': 'floodplain', 'LEMMA': 'floodplain'}],
        [{'LOWER': 'would'}, {'LOWER': 'not'}, {'LOWER': 'appropriately', 'LEMMA': 'appropriately'}, {'LOWER': 'flood', 'LEMMA': 'flood'}, {'LOWER': 'resilient', 'LEMMA': 'resilient'}],
        [{'LOWER': 'would'}, {'LOWER': 'not'}, {'LOWER': 'safe', 'LEMMA': 'safe'}, {'LOWER': 'access', 'LEMMA': 'access'}, {'LOWER': 'egress', 'LEMMA': 'egress'}, {'LOWER': '1'}, {'LOWER': '100'}, {'LOWER': 'year'}, {'LOWER': 'flood', 'LEMMA': 'flood'}, {'LOWER': 'event', 'LEMMA': 'event'}],
        [{'LOWER': 'drainage', 'LEMMA': 'drainage'}, {'LOWER': 'strategy', 'LEMMA': 'strategy'}],
        [{'LOWER': 'surface', 'LEMMA': 'surface'}, {'LOWER': 'water', 'LEMMA': 'water'}, {'LOWER': 'drainage', 'LEMMA': 'drainage'}, {'LOWER': 'disposal', 'LEMMA': 'disposal'}],
        [{'LOWER': 'surface', 'LEMMA': 'surface'}, {'LOWER': 'water', 'LEMMA': 'water'}, {'LOWER': 'disposal', 'LEMMA': 'disposal'}]
    ],
    'Neighbours and amenities': [
        [{'LOWER': 'overlooking', 'LEMMA': 'overlook'}],
        [{'LOWER': 'overbearing', 'LEMMA': 'overbear'}],
        [{'LOWER': 'intrusive', 'LEMMA': 'intrusive'}],
        [{'LOWER': 'elevated', 'LEMMA': 'elevate'}, {'LOWER': 'position', 'LEMMA': 'position'}],
        [{'LOWER': 'impact', 'LEMMA': 'impact'}, {'LOWER': 'occupier', 'LEMMA': 'occupier'}, {'LOWER': 'neighbouring', 'LEMMA': 'neighbour'}],
        [{'LOWER': 'loss', 'LEMMA': 'loss'}, {'LOWER': 'privacy', 'LEMMA': 'privacy'}],
        [{'LOWER': 'privacy', 'LEMMA': 'privacy'}, {'LOWER': 'loss', 'LEMMA': 'loss'}],
        [{'LOWER': 'scale', 'LEMMA': 'scale'}, {'LOWER': 'form', 'LEMMA': 'form'}, {'LOWER': 'massing', 'LEMMA': 'mass'}, {'LOWER': 'proximity', 'LEMMA': 'proximity'}, {'LOWER': 'neighbouring', 'LEMMA': 'neighbour'}],
        [{'LOWER': 'loss', 'LEMMA': 'loss'}, {'LOWER': 'light', 'LEMMA': 'light'}],
        [{'LOWER': 'neighbouring', 'LEMMA': 'neighbour'}, {'LOWER': 'resident', 'LEMMA': 'resident'}],
        [{'LOWER': 'amenity', 'LEMMA': 'amenity'}, {'LOWER': 'enjoy', 'LEMMA': 'enjoy'}],
        [{'LOWER': 'paragraph', 'LEMMA': 'paragraph'}, {'LOWER': '127'}, {'LOWER': '130'}, {'LOWER': 'national', 'LEMMA': 'national'}, {'LOWER': 'planning', 'LEMMA': 'plan'}, {'LOWER': 'policy', 'LEMMA': 'policy'}, {'LOWER': 'framework', 'LEMMA': 'framework'}],
        [{'LOWER': 'noise', 'LEMMA': 'noise'}, {'LOWER': 'disturbance', 'LEMMA': 'disturbance'}]
    ],
    'No appropriate access': [
        [{'LOWER': 'vehicular', 'LEMMA': 'vehicular'}, {'LOWER': 'traffic', 'LEMMA': 'traffic'}],
        [{'LOWER': 'accessibility', 'LEMMA': 'accessibility'}, {'LOWER': 'service', 'LEMMA': 'service'}],
        [{'LOWER': 'limited', 'LEMMA': 'limited'}, {'LOWER': 'visibility', 'LEMMA': 'visibility'}],
        [{'LOWER': 'increased', 'LEMMA': 'increase'}, {'LOWER': 'risk', 'LEMMA': 'risk'}, {'LOWER': 'collision', 'LEMMA': 'collision'}],
        [{'LOWER': 'highway', 'LEMMA': 'highway'}, {'LOWER': 'safety', 'LEMMA': 'safety'}],
        [{'LOWER': 'safe', 'LEMMA': 'safe'}, {'LOWER': 'suitable', 'LEMMA': 'suitable'}, {'LOWER': 'access', 'LEMMA': 'access'}],
        [{'LOWER': 'vehicle', 'LEMMA': 'vehicle'}, {'LOWER': 'pedestrian', 'LEMMA': 'pedestrian'}, {'LOWER': 'conflict', 'LEMMA': 'conflict'}],
        [{'LOWER': 'poor', 'LEMMA': 'poor'}, {'LOWER': 'visibility', 'LEMMA': 'visibility'}],
        [{'LOWER': 'accessing', 'LEMMA': 'access'}, {'LOWER': 'egressing', 'LEMMA': 'egress'}],
        [{'LOWER': 'safe', 'LEMMA': 'safe'}, {'LOWER': 'suitable', 'LEMMA': 'suitable'}, {'LOWER': 'pass', 'LEMMA': 'pass'}],
        [{'LOWER': 'manoeuvring', 'LEMMA': 'manoeuvre'}, {'LOWER': 'space', 'LEMMA': 'space'}],
        [{'LOWER': 'passing', 'LEMMA': 'pass'}, {'LOWER': 'space', 'LEMMA': 'space'}],
        [{'LOWER': 'policy', 'LEMMA': 'policy'}, {'LOWER': '27'}],
        [{'LOWER': 'congestion', 'LEMMA': 'congestion'}],
        [{'LOWER': 'safe', 'LEMMA': 'safe'}, {'LOWER': 'suitable', 'LEMMA': 'suitable'}, {'LOWER': 'access', 'LEMMA': 'access'}, {'LOWER': 'site', 'LEMMA': 'site'}, {'LOWER': 'user', 'LEMMA': 'user'}],
        [{'LOWER': 'impact', 'LEMMA': 'impact'}, {'LOWER': 'local', 'LEMMA': 'local'}, {'LOWER': 'road', 'LEMMA': 'road'}, {'LOWER': 'network', 'LEMMA': 'network'}]
    ],
    'History and Heritage': [
        [{'LOWER': 'historic', 'LEMMA': 'historic'}, {'LOWER': 'mining', 'LEMMA': 'mining'}],
        [{'LOWER': 'erode', 'LEMMA': 'erode'}, {'LOWER': 'historic', 'LEMMA': 'historic'}, {'LOWER': 'character', 'LEMMA': 'character'}],
        [{'LOWER': 'WHS', 'LEMMA': 'WHS'}, {'LOWER': 'world', 'LEMMA': 'world'}, {'LOWER': 'heritage', 'LEMMA': 'heritage'}, {'LOWER': 'site', 'LEMMA': 'site'}],
        [{'LOWER': 'historic', 'LEMMA': 'historic'}, {'LOWER': 'functional', 'LEMMA': 'functional'}, {'LOWER': 'relationship', 'LEMMA': 'relationship'}],
        [{'LOWER': 'heritage', 'LEMMA': 'heritage'}, {'LOWER': 'asset', 'LEMMA': 'asset'}],
        [{'LOWER': 'area', 'LEMMA': 'area'}, {'LOWER': 'great', 'LEMMA': 'great'}, {'LOWER': 'landscape', 'LEMMA': 'landscape'}, {'LOWER': 'value', 'LEMMA': 'value'}],
        [{'LOWER': 'historic', 'LEMMA': 'historic'}, {'LOWER': 'settlement', 'LEMMA': 'settlement'}],
        [{'LOWER': 'record', 'LEMMA': 'record'}, {'LOWER': 'morphology', 'LEMMA': 'morphology'}],
        [{'LOWER': 'historic', 'LEMMA': 'historic'}, {'LOWER': 'impact', 'LEMMA': 'impact'}, {'LOWER': 'assessment', 'LEMMA': 'assessment'}],
        [{'LOWER': 'authenticity', 'LEMMA': 'authenticity'}, {'LOWER': 'integrity', 'LEMMA': 'integrity'}],
        [{'LOWER': 'designate', 'LEMMA': 'designate'}, {'LOWER': 'asset', 'LEMMA': 'asset'}],
        [{'LOWER': 'list', 'LEMMA': 'list'}, {'LOWER': 'building', 'LEMMA': 'building'}]
    ],
    'Open Countryside': [
        [{'LOWER': 'open', 'LEMMA': 'open'}, {'LOWER': 'countryside', 'LEMMA': 'countryside'}],
        [{'LOWER': 'increase', 'LEMMA': 'increase'}, {'LOWER': 'encroachment', 'LEMMA': 'encroachment'}],
        [{'LOWER': 'scenic', 'LEMMA': 'scenic'}, {'LOWER': 'beauty', 'LEMMA': 'beauty'}, {'LOWER': 'area', 'LEMMA': 'area'}],
        [{'LOWER': 'green', 'LEMMA': 'green'}, {'LOWER': 'wedge', 'LEMMA': 'wedge'}],
        [{'LOWER': 'open', 'LEMMA': 'open'}, {'LOWER': 'area', 'LEMMA': 'area'}, {'LOWER': 'restrain', 'LEMMA': 'restrain'}],
        [{'LOWER': 'natural', 'LEMMA': 'natural'}, {'LOWER': 'landscape', 'LEMMA': 'landscape'}, {'LOWER': 'feature', 'LEMMA': 'feature'}],
        [{'LOWER': 'visual', 'LEMMA': 'visual'}, {'LOWER': 'amenity', 'LEMMA': 'amenity'}],
        [{'LOWER': 'promote', 'LEMMA': 'promote'}, {'LOWER': 'recreation', 'LEMMA': 'recreation'}, {'LOWER': 'amenity', 'LEMMA': 'amenity'}],
        [{'LOWER': 'open', 'LEMMA': 'open'}, {'LOWER': 'space', 'LEMMA': 'space'}],
        [{'LOWER': 'policy', 'LEMMA': 'policy'}, {'LOWER': '35'}],
        [{'LOWER': 'valued', 'LEMMA': 'value'}, {'LOWER': 'landscape', 'LEMMA': 'landscape'}, {'LOWER': 'feature', 'LEMMA': 'feature'}],
        [{'LOWER': 'impact', 'LEMMA': 'impact'}, {'LOWER': 'character', 'LEMMA': 'character'}, {'LOWER': 'area', 'LEMMA': 'area'}]
    ],
        'AH': [
        [{'LOWER': 'affordable', 'LEMMA': 'affordable'}, {'LOWER': 'housing', 'LEMMA': 'housing'}, {'LOWER': 'need', 'LEMMA': 'need'}],
        [{'LOWER': 'need', 'LEMMA': 'need'}, {'LOWER': 'for', 'LEMMA': 'for'}, {'LOWER': 'affordable', 'LEMMA': 'affordable'}, {'LOWER': 'housing', 'LEMMA': 'housing'}],
        [{'LOWER': 'no', 'LEMMA': 'no'}, {'LOWER': 'affordable', 'LEMMA': 'affordable'}, {'LOWER': 'housing', 'LEMMA': 'housing'}, {'LOWER': 'provision', 'LEMMA': 'provision'}],
        [{'LOWER': 'not', 'LEMMA': 'not'}, {'LOWER': 'affordable', 'LEMMA': 'affordable'}, {'LOWER': 'housing', 'LEMMA': 'housing'}, {'LOWER': 'led', 'LEMMA': 'led'}]
    ],
    'Conservation and Biodiversity': [
        [{'LOWER': 'biodiversty', 'LEMMA': 'biodiversity'}],
        [{'LOWER': 'special', 'LEMMA': 'special'}, {'LOWER': 'area', 'LEMMA': 'area'}, {'LOWER': 'of', 'LEMMA': 'of'}, {'LOWER': 'conservation', 'LEMMA': 'conservation'}],
        [{'LOWER': 'SAC', 'LEMMA': 'SAC'}, {'LOWER': 'site', 'LEMMA': 'site'}],
        [{'LOWER': 'bat', 'LEMMA': 'bat'}, {'LOWER': 'species', 'LEMMA': 'species'}],
        [{'LOWER': 'reptile', 'LEMMA': 'reptile'}, {'LOWER': 'species', 'LEMMA': 'species'}],
        [{'LOWER': 'bird', 'LEMMA': 'bird'}, {'LOWER': 'species', 'LEMMA': 'species'}],
        [{'LOWER': 'insect', 'LEMMA': 'insect'}, {'LOWER': 'species', 'LEMMA': 'species'}],
        [{'LOWER': 'habitat', 'LEMMA': 'habitat'}],
        [{'LOWER': 'species', 'LEMMA': 'species'}],
        [{'LOWER': 'site', 'LEMMA': 'site'}, {'LOWER': 'of', 'LEMMA': 'of'}, {'LOWER': 'special', 'LEMMA': 'special'}, {'LOWER': 'scientific', 'LEMMA': 'scientific'}, {'LOWER': 'interest', 'LEMMA': 'interest'}],
        [{'LOWER': 'SSSI', 'LEMMA': 'SSSI'}, {'LOWER': 'site', 'LEMMA': 'site'}],
        [{'LOWER': 'protect', 'LEMMA': 'protect'}, {'LOWER': 'the', 'LEMMA': 'the'}, {'LOWER': 'natural', 'LEMMA': 'natural'}, {'LOWER': 'environment', 'LEMMA': 'environment'}],
        [{'LOWER': 'ecological', 'LEMMA': 'ecological'}, {'LOWER': 'woodland', 'LEMMA': 'woodland'}],
        [{'LOWER': 'foraging', 'LEMMA': 'foraging'}],
        [{'LOWER': 'marsh', 'LEMMA': 'marsh'}],
        [{'LOWER': 'wildlife', 'LEMMA': 'wildlife'}],
        [{'LOWER': 'derogation', 'LEMMA': 'derogation'}]
    ],
    'Design': [
        [{'LOWER': 'fail', 'LEMMA': 'fail'}, {'LOWER': 'to', 'LEMMA': 'to'}, {'LOWER': 'respect', 'LEMMA': 'respect'}, {'LOWER': 'its', 'LEMMA': 'its'}, {'LOWER': 'form', 'LEMMA': 'form'}],
        [{'LOWER': 'traditional', 'LEMMA': 'traditional'}, {'LOWER': 'pattern', 'LEMMA': 'pattern'}, {'LOWER': 'of', 'LEMMA': 'of'}, {'LOWER': 'development', 'LEMMA': 'development'}],
        [{'LOWER': 'by', 'LEMMA': 'by'}, {'LOWER': 'reason', 'LEMMA': 'reason'}, {'LOWER': 'of', 'LEMMA': 'of'}, {'LOWER': 'its', 'LEMMA': 'its'}, {'LOWER': 'form', 'LEMMA': 'form'}],
        [{'LOWER': 'established', 'LEMMA': 'established'}, {'LOWER': 'pattern', 'LEMMA': 'pattern'}],
        [{'LOWER': 'existing', 'LEMMA': 'existing'}, {'LOWER': 'built', 'LEMMA': 'built'}, {'LOWER': 'form', 'LEMMA': 'form'}],
        [{'LOWER': 'incongruous', 'LEMMA': 'incongruous'}],
        [{'LOWER': 'discordant', 'LEMMA': 'discordant'}],
        [{'LOWER': 'uncharacteristic', 'LEMMA': 'uncharacteristic'}],
        [{'LOWER': 'scale', 'LEMMA': 'scale'}, {'LOWER': 'and', 'LEMMA': 'and'}, {'LOWER': 'massing', 'LEMMA': 'massing'}],
        [{'LOWER': 'disproportionate', 'LEMMA': 'disproportionate'}],
        [{'LOWER': 'detrimental', 'LEMMA': 'detrimental'}, {'LOWER': 'to', 'LEMMA': 'to'}, {'LOWER': 'the', 'LEMMA': 'the'}, {'LOWER': 'street', 'LEMMA': 'street'}, {'LOWER': 'scene', 'LEMMA': 'scene'}],
        [{'LOWER': 'design', 'LEMMA': 'design'}, {'LOWER': 'guide', 'LEMMA': 'guide'}],
        [{'LOWER': 'poor', 'LEMMA': 'poor'}, {'LOWER': 'design', 'LEMMA': 'design'}],
        [{'LOWER': 'fail', 'LEMMA': 'fail'}, {'LOWER': 'to', 'LEMMA': 'to'}, {'LOWER': 'reflect', 'LEMMA': 'reflect'}, {'LOWER': 'local', 'LEMMA': 'local'}, {'LOWER': 'distinctiveness', 'LEMMA': 'distinctiveness'}],
        [{'LOWER': 'fail', 'LEMMA': 'fail'}, {'LOWER': 'to', 'LEMMA': 'to'}, {'LOWER': 'integrate', 'LEMMA': 'integrate'}],
        [{'LOWER': 'cramped', 'LEMMA': 'cramped'}],
        [{'LOWER': 'contrived', 'LEMMA': 'contrived'}],
        [{'LOWER': 'established', 'LEMMA': 'established'}, {'LOWER': 'character', 'LEMMA': 'character'}, {'LOWER': 'of', 'LEMMA': 'of'}, {'LOWER': 'the', 'LEMMA': 'the'}, {'LOWER': 'area', 'LEMMA': 'area'}],
        [{'LOWER': 'local', 'LEMMA': 'local'}, {'LOWER': 'character', 'LEMMA': 'character'}],
        [{'LOWER': 'unsympathetic', 'LEMMA': 'unsympathetic'},{'LOWER': 'material', 'LEMMA': 'material'}],
        [{'LOWER': 'functional', 'LEMMA': 'functional'}, {'LOWER': 'appearance', 'LEMMA': 'appearance'}],
        [{'LOWER': 'bulk', 'LEMMA': 'bulk'}]
    ],
    'Landscape and Appearance': [
        [{'LOWER': 'area', 'LEMMA': 'area'}, {'LOWER': 'of', 'LEMMA': 'of'}, {'LOWER': 'outstanding', 'LEMMA': 'outstanding'}, {'LOWER': 'natural', 'LEMMA': 'natural'}, {'LOWER': 'beauty', 'LEMMA': 'beauty'}],
        [{'LOWER': 'heritage', 'LEMMA': 'heritage'}, {'LOWER': 'coast', 'LEMMA': 'coast'}],
        [{'LOWER': 'scenic', 'LEMMA': 'scenic'}],
        [{'LOWER': 'beauty', 'LEMMA': 'beauty'}],
        [{'LOWER': 'harm', 'LEMMA': 'harm'}, {'LOWER': 'the', 'LEMMA': 'the'}, {'LOWER': 'rural', 'LEMMA': 'rural'}, {'LOWER': 'characteristics', 'LEMMA': 'characteristic'}],
        [{'LOWER': 'visable', 'LEMMA': 'visible'}, {'LOWER': 'from', 'LEMMA': 'from'}, {'LOWER': 'the', 'LEMMA': 'the'}, {'LOWER': 'public', 'LEMMA': 'public'}],
        [{'LOWER': 'landscape', 'LEMMA': 'landscape'}, {'LOWER': 'character', 'LEMMA': 'character'}],
        [{'LOWER': 'character', 'LEMMA': 'character'}, {'LOWER': 'and', 'LEMMA': 'and'}, {'LOWER': 'appearance', 'LEMMA': 'appearance'}],
        [{'LOWER': 'widely', 'LEMMA': 'widely'}, {'LOWER': 'visible', 'LEMMA': 'visible'}],
        [{'LOWER': 'area', 'LEMMA': 'area'}, {'LOWER': 'of', 'LEMMA': 'of'}, {'LOWER': 'great', 'LEMMA': 'great'}, {'LOWER': 'landscape', 'LEMMA': 'landscape'}, {'LOWER': 'value', 'LEMMA': 'value'}],
        [{'LOWER': 'AGLV', 'LEMMA': 'AGLV'}]
    ]
}

# Create a simple list of all the categories above for headers and to create dfs later
all_cats_list = list(example_snippets.keys())

# Define a function to match categories in the text and create category columns
def match_categories(text):
    # Initializes a PhraseMatcher object named matcher using the vocabulary of the nlp model
    matcher = PhraseMatcher(nlp.vocab)
    # Empty dictionary to store the matched category sentences
    category_sentences = {}

    # Iterate over each category and its examples
    for category, examples in example_snippets.items():
        # Convert examples into spaCy patterns
        category_patterns = [nlp.tokenizer(' '.join([t.get('LOWER', '') for t in example])) for example in examples]

        # Add the patterns to the matcher object
        matcher.add(category, None, *category_patterns)
        # Initialize an empty list for each category in the category_sentences dictionary
        category_sentences[category] = []

    # Process the input text using the nlp model, creating a Doc object named doc
    doc = nlp(text)
    # Match the patterns in the matcher object against the doc, obtain matches in the matches variable
    matches = matcher(doc)

    # Iterate over each match and retrieve the matched category and sentence
    for match_id, start, end in matches:
        matched_category = matcher.vocab.strings[match_id]
        matched_sentence = doc[start:end].text
        # Append the matched sentence to the corresponding category in the category_sentences dictionary
        category_sentences[matched_category].append(matched_sentence)

    return category_sentences

# Apply the function to the 'RefusalReasons' column
category_sentences = df['RefusalReasons'].apply(match_categories).tolist()

category_df = pd.DataFrame(category_sentences)

# Merge the category columns with the original DataFrame
df = pd.concat([df, category_df], axis=1)


In [16]:
for category, examples in example_snippets.items():
    print(category)

Flooding and Drainage
Neighbours and amenities
No appropriate access
History and Heritage
Open Countryside
AH
Conservation and Biodiversity
Design
Landscape and Appearance


In [17]:
cat_dict = {}

In [18]:
df.head()

Unnamed: 0.1,Unnamed: 0,Ref,Address,Description,DecDate,Link,RefusalReasons,DevType,docfragment,file_ext,...,Missing Dev_Type,Flooding and Drainage,Neighbours and amenities,No appropriate access,History and Heritage,Open Countryside,AH,Conservation and Biodiversity,Design,Landscape and Appearance
0,0,PA19/08993,Land South West Of Badgall Cottages Badgall Ro...,Self Build Single Storey Dwelling,31/10/2019,https://planning.cornwall.gov.uk/online-applic...,The application site is located outside of t...,,REFUSAL,pdf,...,,[],[],[],[],[],[],[],[character and appearance],[character and appearance]
1,1,PA19/08982,Malibu Cottage Widemouth Bay Bude Cornwall EX2...,Proposed replacement dwelling with variation o...,22/10/2019,https://planning.cornwall.gov.uk/online-applic...,1 The proposed amendments to the approved dev...,,REFUSAL,pdf,...,,[],[],[],[],[],[],[],[],[landscape character]
2,2,PA19/09063,The Granary Ruan Minor Helston Cornwall TR12 7NA,Outline planning permission with all matters r...,11/10/2019,https://planning.cornwall.gov.uk/online-applic...,The proposed development raises direct conflic...,,REFUSAL,pdf,...,,[],[],[],[],[open countryside],[],[],[character and appearance],"[character and appearance, beauty]"
3,3,PA19/08883,1 Tregunnick Lane Seaton Torpoint Cornwall PL1...,Erection of domestic garage with associated wo...,21/10/2019,https://planning.cornwall.gov.uk/online-applic...,"The site forms part of a cohesive, high qual...",,REFUSAL,pdf,...,,[],[],[],[],[visual amenity],[],[],[],[]
4,4,PA19/08792,Land At Goonearl Goonearl Scorrier TR16 5EB,Outline application with some matters reserved...,29/10/2019,https://planning.cornwall.gov.uk/online-applic...,The proposal for two dwellings outside any set...,,REFUSAL,pdf,...,,[],[],[],[],[],[],[],[],[]


In [19]:
# 
for col in all_cats_list:
    df[col] = df[col].astype(str)
df = df.replace('[]','')

In [20]:
for col in all_cats_list:
    cat_dict[col] = df.copy()
    cat_dict[col] = cat_dict[col][cat_dict[col][col] != ""]
    print(col)

Flooding and Drainage
Neighbours and amenities
No appropriate access
History and Heritage
Open Countryside
AH
Conservation and Biodiversity
Design
Landscape and Appearance


In [21]:
for reason in cat_dict['Flooding and Drainage']['Flooding and Drainage']:
    print(cat_dict['Flooding and Drainage']['RefusalReasons'])
    print('\n')

80      In the absence  of a suitable site-specific  F...
113     The submitted red edged site location subject ...
115     The proposal would introduce a new residential...
274      1 The site is located within Flood  Zone 1 (l...
307      1 The proposal relates to a site within Flood...
320      1 The access arrangement  proposed  to serve ...
332      1 The proposal is located  within  the open c...
410      1 The application fails to demonstrate the pr...
426      1 The proposed surface  water drainage system...
588      1The proposal represents a cramped form of de...
608      1 The site comprises an undeveloped steeply s...
755     Given the inadequacy of the submitted Flood Ri...
878      1 A dwelling at this location would fail to r...
895      1 A dwelling at this location would fail to r...
1013     1 The proposed development is for a home offi...
1019    1 The proposed dwellings represent an extensio...
1039     1 The proposal, by reason of its associated c...
1091     1 The

In [22]:
len(cat_dict['Flooding and Drainage'])

31

In [None]:
cat_dict['Flooding and Drainage']['RefusalReasons'][1019]