In [None]:
import nltk
import pandas as pd
import geopandas as gpd
from IPython.display import Markdown, display
import shapefile
import logging
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import matplotlib.font_manager as fm
import gensim
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import re
import pyLDAvis.gensim_models
import numpy as np
import seaborn as sns
from os import path
from PIL import Image
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import FreqDist
from nltk import pos_tag
from nltk.classify import NaiveBayesClassifier
from gensim import corpora, models
from gensim.models import CoherenceModel, LdaModel, LdaMulticore
from gensim.models.phrases import Phraser, Phrases
from collections import Counter
from gensim.corpora import Dictionary


#nltk.download('wordnet')
#nltk.download('averaged_perceptron_tagger')
#nltk.download('punkt')
#nltk.download('stopwords')


### Preprocessing functions

In [None]:
# preprocessing 
def text_preprocessing(text):
    """Clean one text value and return its lemmatized, stop-word-free tokens.

    Steps: strip URLs and HTML remnants, replace punctuation with spaces,
    tokenize with NLTK's WordPunctTokenizer, noun-lemmatize every token longer
    than one character, then drop tokens found in the module-level
    ``stop_words`` set.

    Parameters
    ----------
    text : str or None
        Raw text. Anything that is not a string (None, float NaN from pandas)
        is treated as missing.

    Returns
    -------
    list of str
        The cleaned tokens; an empty list for missing input.
    """
    # pandas hands missing values over as None or float NaN; re.sub cannot take either
    if not isinstance(text, str):
        return []

    # Format words and remove unwanted characters
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\<a href', ' ', text)
    text = re.sub(r'&amp;', '', text)
    # Must run before the punctuation strip below: that strip replaces '/',
    # after which '<br />' can no longer match and 'br' would survive as a token
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'\'', ' ', text)

    # Tokenize each word
    tokens = nltk.WordPunctTokenizer().tokenize(text)

    # Lemmatize each word (as nouns); build the lemmatizer once, not per token
    ##text = [nltk.stem.WordNetLemmatizer().lemmatize(token, pos='v') for token in text if len(token) > 1]
    lemmatizer = nltk.stem.WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(token, pos='n') for token in tokens if len(token) > 1]

    # Remove stopwords
    return [word for word in lemmas if word not in stop_words]

# Convert list to string
def to_string(text):
    """Return the items of ``text`` converted to ``str`` and joined by single spaces."""
    return ' '.join(str(item) for item in text)

# clean text
def clean_text(text, exceptions=(), short_word_length=3):
    """Strip digits from ``text`` and drop short words.

    Parameters
    ----------
    text : str
        Text to clean.
    exceptions : container of str, optional
        Short words to keep regardless of their length. ``None`` is treated as
        "no exceptions".
    short_word_length : int, optional
        Words with at most this many characters are dropped (default 3, i.e.
        only words of four or more characters survive).

    Returns
    -------
    str
        The surviving words joined by single spaces.
    """
    # An immutable default avoids sharing one list object between calls
    if exceptions is None:
        exceptions = ()

    # Remove numbers (every digit character, including those inside words)
    digit_free = re.sub(r'\d+', '', text)

    # Split on whitespace and keep long words plus explicitly allowed short ones
    kept_words = [
        word for word in digit_free.split()
        if len(word) > short_word_length or word in exceptions
    ]

    # Join the cleaned words back into a string
    return ' '.join(kept_words)

#convert words to numbers
def words_to_numbers(text):
    """Replace spelled-out numbers ('one' .. 'ten', 'single') with digits.

    Matching is case-insensitive and limited to whole words. The input is
    converted with ``str()`` first, so non-string values are processed as
    their string representation.

    Parameters
    ----------
    text : object
        Value whose string form is scanned.

    Returns
    -------
    tuple
        ``(cleaned_text, extracted_values)`` where ``cleaned_text`` is the text
        with number words replaced by digits and ``extracted_values`` is the
        list of digit strings in order of appearance, or ``None`` when no
        number word was found.
    """
    word_to_number = {
        'one': '1',
        'two': '2',
        'three': '3',
        'four': '4',
        'five': '5',
        'six': '6',
        'seven': '7',
        'eight': '8',
        'nine': '9',
        'ten': '10',
        'single': '1'
    }

    # A single pattern derived from the mapping keeps extraction and
    # substitution in agreement (previously 'single' was replaced but never
    # reported among the extracted values)
    number_word_re = re.compile(
        r'\b(?:' + '|'.join(word_to_number) + r')\b', flags=re.IGNORECASE
    )

    source = str(text)
    matched_words = number_word_re.findall(source)
    extracted_values = [word_to_number[word.lower()] for word in matched_words] or None
    cleaned_text = number_word_re.sub(
        lambda match: word_to_number[match.group(0).lower()], source
    )
    return (cleaned_text, extracted_values)

# extract specified words - use class
def extract_classes(text):
    """Collect use-class references from the string form of ``text``.

    Two shapes are recognised, case-insensitively: ``class <digits>`` and
    ``classes <digits>,<digits>,...<digits>``.

    Returns the matches joined by commas, or ``None`` when nothing matches.
    """
    matches = re.findall(
        r'\b(?:class\s\d+|classes(?:\s\d+,)+\d+)\b',
        str(text),
        flags=re.IGNORECASE,
    )
    if not matches:
        return None
    return ','.join(matches)

# remove specified words - use class
def remove_classes(text):
    """Delete use-class references from the string form of ``text``.

    Removes, case-insensitively, every ``class <digits>`` and
    ``classes <digits>,<digits>,...<digits>`` occurrence and returns the
    remaining text; surrounding whitespace is left untouched.
    """
    use_class_re = re.compile(
        r'\b(?:class\s\d+|classes(?:\s\d+,)+\d+)\b', flags=re.IGNORECASE
    )
    return use_class_re.sub('', str(text))

# extract and remove numeric with copy to new column

def extract_and_remove_numeric(text):
    """Pull standalone numbers out of the string form of ``text``.

    A standalone number is a run of digits, optionally with a decimal part,
    delimited by word boundaries (so the '12' in '12a' is not one).

    Parameters
    ----------
    text : object
        Value whose string form is scanned.

    Returns
    -------
    tuple
        ``(cleaned_text, extracted_values)`` where ``cleaned_text`` has every
        standalone number removed and ``extracted_values`` is the numbers
        joined by commas, or ``None`` when there are none.
    """
    # Pattern to match numbers with word breaks at the start and end
    pattern = r'\b\d+(?:\.\d+)?\b'
    source = str(text)

    # Find numeric values
    numeric_values = re.findall(pattern, source)
    extracted_values = ','.join(numeric_values) if numeric_values else None

    # Remove with the same pattern used for extraction. Building a second
    # regex out of the matched strings left '.' unescaped (so '1.5' also
    # deleted '1x5') and let a shorter alternative win ('1|1.5' turned '1.5'
    # into '.5').
    cleaned_text = re.sub(pattern, '', source)
    return (cleaned_text, extracted_values)

def remove_standalone_numeric(words_list):
    """Return the items of ``words_list`` that are not purely numeric.

    An item is dropped only when the whole item is a number (digits with an
    optional decimal part). Items that merely start with a number, such as
    '3.5m' or '12-abc', are kept.

    Note: when given a plain string the function iterates over its characters,
    so every digit character is dropped and a list of the remaining characters
    is returned.
    """
    # fullmatch anchors both ends; re.match only anchored the start and so
    # also discarded tokens like '3.5m'
    numeric_token = re.compile(r'\d+(?:\.\d+)?')
    return [word for word in words_list if not numeric_token.fullmatch(word)]

# extract specific words to new column
def extract_units(text):
    """List the unit words that occur in the string form of ``text``.

    Whole-word, case-insensitive matches of hectares, ha, units,
    dwellinghouse(s), dwellings, metres, m and storey are returned in order of
    appearance, joined by commas. Returns ``None`` when none occur.
    """
    unit_re = re.compile(
        r'\b(?:hectares|ha|units|dwellinghouse|dwellinghouses|dwellings|metres|m|storey)\b',
        flags=re.IGNORECASE,
    )
    found = unit_re.findall(str(text))
    if found:
        return ','.join(found)
    return None

### Stop words

In [None]:
# Project-specific additions to the stop-word list
optional_stop_words = {'and','all','by','for','more','none','not','null','of','or','over','than','with','local','major','class','storey'}
# Final stop-word set: NLTK's English list plus the additions above
stop_words = set(stopwords.words('english')) | optional_stop_words

### Read in data for Planning Applications from local directory with subset options and drop year 2020 and 2021 due to covid


In [None]:
# Load the planning-application shapefile from the working directory
planapp_shp_path = 'pub_plnapppol.shp'
planapp_gdf_read = gpd.read_file(planapp_shp_path)

In [None]:
# Keep only the attribute columns used later in the notebook
planapp_columns = ['year', 'local_auth', 'proposal', 'dev_desc', 'appl_desc', 'stat_desc', 'xcoord', 'ycoord']
planapp_gdf = planapp_gdf_read[planapp_columns]

In [None]:
# Drop the covid years 2020 and 2021.
# 3016 is excluded as well — presumably a mistyped year in the data; confirm.
excluded_years = (2020, 2021, 3016)
for excluded_year in excluded_years:
    planapp_gdf = planapp_gdf[planapp_gdf['year'] != excluded_year]

In [None]:
# Optional random subsample for faster experimentation
#subset_size = 100000
# Continue from the column-subset, year-filtered frame built in the cells above.
# Reassigning from the raw `planapp_gdf_read` here silently undid the exclusion
# of 2020/2021. The copy makes the column assignments that follow operate on an
# independent frame rather than on a slice.
planapp_gdf = planapp_gdf.copy()  # append .sample(n=subset_size) to subsample

### Define columns for Description, Application, Status, Proposal

In [None]:
# Lower-cased working copies of the free-text columns (new column -> source column)
lowercase_columns = {
    "text_proposal": "proposal",
    "text_desc": "dev_desc",
    "text_app": "appl_desc",
    "text_status": "stat_desc",
}
for target_column, source_column in lowercase_columns.items():
    planapp_gdf[target_column] = planapp_gdf[source_column].str.lower()

# The proposal text is the feature analysed in the rest of the notebook
planapp_gdf["feature"] = planapp_gdf["text_proposal"]

### Words to numbers and remove numerics - also download units and quantities

In [None]:
# Change number words to digits; words_to_numbers returns a
# (cleaned_text, extracted_values) tuple that is expanded into two columns
planapp_gdf[['feature_cleaned_1', 'numeric_values']] = planapp_gdf['feature'].apply(words_to_numbers).apply(pd.Series)

# Extract use classes
planapp_gdf['use_class'] = planapp_gdf['feature_cleaned_1'].apply(extract_classes)
# Save only the extracted use-class column to CSV
planapp_gdf['use_class'].to_csv('useclass.csv', index=False)

# Extract standalone numeric values into 'quantity' and remove them from the text
planapp_gdf[['feature_cleaned_2', 'quantity']] = planapp_gdf['feature_cleaned_1'].apply(extract_and_remove_numeric).apply(pd.Series)
# NOTE(review): 'feature_cleaned_2' holds strings at this point, so
# remove_standalone_numeric iterates over characters and the ''.join below
# glues the survivors back together — the net effect is stripping every
# remaining digit character. Confirm that this is the intended behaviour.
planapp_gdf['feature_cleaned_2'] = planapp_gdf['feature_cleaned_2'].apply(remove_standalone_numeric)
planapp_gdf['feature_cleaned_2'] = planapp_gdf['feature_cleaned_2'].apply(''.join)

# Extract units from the original feature text
planapp_gdf['units'] = planapp_gdf['feature'].apply(extract_units)
# Select both columns with ONE list of labels; [['units'],['quantity']] passes
# a tuple of lists as the key, which pandas rejects
planapp_gdf_download = planapp_gdf[['units', 'quantity']]
planapp_gdf_download.to_csv('classes_quantities_units2.csv')

### Pull out list of smallest words to identify additional stopwords

In [None]:
# Plain NLTK English stop words (without the project-specific additions)
nltk_stopwords = set(stopwords.words('english'))

# Concatenate every non-null 'text_desc' value into one string
non_null_descriptions = planapp_gdf['text_desc'].dropna()
desc_words = ' '.join(non_null_descriptions.tolist())

# Word counts over the whole description corpus
word_counter = Counter(desc_words.split())

# Among the 30 most common words overall, keep those of 4 characters or less
most_frequent = [entry[0] for entry in word_counter.most_common(30) if len(entry[0]) <= 4]

# ...and of those, the ones NLTK does not already treat as stop words
filtered_most_frequent = [candidate for candidate in most_frequent if candidate not in nltk_stopwords]

# Counts of every word of 4 characters or less in the corpus
word_counter_filtered = {}
for token, occurrences in word_counter.items():
    if len(token) <= 4:
        word_counter_filtered[token] = occurrences

# One row per short word, indexed by the word itself
df_filtered = pd.DataFrame.from_dict(
    word_counter_filtered, orient='index', columns=["count"]
).rename_axis("words")

# Write the table to a CSV file
df_filtered.to_csv('word_counter_smallest_proposal.csv')

### Pre processing and visualisation via word count and word cloud

In [None]:
# Tokenise, lemmatize and stop-word-filter every cleaned proposal text
planapp_gdf["feature_cleaned_3"] = [
    text_preprocessing(document) for document in planapp_gdf["feature_cleaned_2"]
]

In [None]:
# Flatten the list of lists into a single list of words
desc_words_list = [word for sublist in planapp_gdf['feature_cleaned_3'] for word in sublist]

# Join all word corpus
desc_words = ' '.join(desc_words_list)

# Count and find the 20 most frequent words after cleaning
word_counter = Counter(desc_words.split())
most_frequent = word_counter.most_common(20)

# Bar plot of frequent words. The frame gets a real name: assigning to `_`
# would overwrite IPython's "last output" variable for the rest of the session.
most_frequent_df = pd.DataFrame(most_frequent, columns=("words", "count"))
fig, ax = plt.subplots(figsize=(20, 10))
sns.barplot(x='words', y='count', data=most_frequent_df, palette='winter', ax=ax)
plt.xticks(rotation=45)
plt.show()

In [None]:
# Combine all words from the 'feature_cleaned_3' column into a single list
desc_words_list = [word for sublist in planapp_gdf['feature_cleaned_3'] for word in sublist]

# Join all words into a single string
data = ' '.join(desc_words_list)

# Calculate word frequencies in one pass. Calling list.count() once per
# distinct word rescans the whole corpus every time, which does not scale.
word_frequencies = dict(Counter(desc_words_list))

# Generate the word cloud using word frequencies
wordcloud = WordCloud(background_color='white', width=800, height=400).generate_from_frequencies(word_frequencies)

# Display the generated image
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

### Tokenisation of phrases to create words 

In [None]:
# Map each token to an integer id
id2word = Dictionary(planapp_gdf["feature_cleaned_3"])
# Create Corpus: Term Document Frequency (bag-of-words per document)
corpus = [id2word.doc2bow(tokens) for tokens in planapp_gdf["feature_cleaned_3"]]

### First model run to calculate coherence and perplexity including bigrams and trigrams

In [None]:
# Helper used by the topic-count sweep below
def calculate_perplexity(lda_model, corpus, dictionary):
    """Return ``(lda_model.log_perplexity(corpus), lda_model.bound(corpus))``.

    ``dictionary`` is accepted but not used.

    NOTE(review): per gensim's documentation ``log_perplexity`` yields a
    per-word likelihood bound rather than the perplexity itself — confirm
    before interpreting the first element as a perplexity.
    """
    log_perplexity_value = lda_model.log_perplexity(corpus)
    bound_value = lda_model.bound(corpus)
    return log_perplexity_value, bound_value

# Define alpha and beta (shared by this sweep and the final model)
alpha = 0.5 #'auto'
beta = 0.1 #'auto'

texts = planapp_gdf["feature_cleaned_3"]

# Create the bigram data
bigram = Phrases(texts, min_count=5, threshold=100)
bigram_phraser = Phraser(bigram)
texts_bigram = [bigram_phraser[text] for text in texts]

# Create the trigram data
trigram = Phrases(bigram[texts_bigram], min_count=5, threshold=100)
trigram_phraser = Phraser(trigram)
texts_trigram = [trigram_phraser[bigram_phraser[text]] for text in texts]

# Create the dictionary from the trigram data (replaces the unigram id2word)
id2word = Dictionary(texts_trigram)

# Term Document Frequency (corpus) for the trigram data (replaces the unigram corpus)
corpus = [id2word.doc2bow(text) for text in texts_trigram]

# Set up the number of topics to iterate over; range(1, 5) evaluates 1 to 4 topics
num_topics_range = range(1, 5)  # Choose the range of the number of topics to evaluate

# Create lists to store the results
number_of_topics = []
coherence_scores = []
perplexity_scores = []

# Train LDA models with different numbers of topics and calculate perplexity and coherence
for num_topics in num_topics_range:
    lda_model = LdaMulticore(corpus=corpus,
                             id2word=id2word,
                             iterations=50,
                             num_topics=num_topics,
                             alpha=alpha,
                             eta=beta,  # 'eta' is used for beta
                             workers=12,
                             passes=10,
                             # Same seed as the final model, so the sweep is
                             # more repeatable between runs
                             random_state=100)

    perplexity, bound = calculate_perplexity(lda_model, corpus, id2word)
    perplexity_scores.append(perplexity)
    coherence_model_lda = CoherenceModel(model=lda_model,
                                         texts=texts_trigram,  # Use the trigram data here
                                         dictionary=id2word,
                                         coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    number_of_topics.append(num_topics)
    coherence_scores.append(coherence_lda)

    # Print coherence score and perplexity for each iteration
    print(f"Iteration {num_topics}: Coherence Score = {coherence_lda}, Perplexity = {perplexity}, Bound = {bound}")

In [None]:
# Collect the sweep results and add 3-point moving averages
# (min_periods=1, so the first rows average over fewer points)
topic_metrics = pd.DataFrame({'number_of_topics': number_of_topics,
                              'coherence_score': coherence_scores,
                              'perplexity_score': perplexity_scores}).assign(
    coherence_score_smoothed=lambda metrics: metrics['coherence_score'].rolling(window=3, min_periods=1).mean(),
    perplexity_score_smoothed=lambda metrics: metrics['perplexity_score'].rolling(window=3, min_periods=1).mean(),
)

# Plot the smoothed coherence score against the number of topics
fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(data=topic_metrics, x='number_of_topics', y='coherence_score_smoothed', label='Coherence Score', ax=ax)
ax.set_xlabel('Number of Topics')
ax.set_ylabel('Score')
ax.set_title('Coherence vs. Number of Topics')
ax.legend()
plt.show()


In [None]:
# Plot the smoothed perplexity score against the number of topics
fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(data=topic_metrics, x='number_of_topics', y='perplexity_score_smoothed', label='Perplexity Score', ax=ax)
ax.set_xlabel('Number of Topics')
ax.set_ylabel('Score')
ax.set_title('Perplexity vs. Number of Topics')
ax.legend()
plt.show()

### Second model run with selected number of topics

In [None]:
# Number of topics for the final model
n_topics = 13

# Fit the single-core LDA model on the n-gram corpus, reusing the alpha/beta
# priors defined for the sweep
lda_model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=n_topics,
    random_state=100,
    update_every=1,
    chunksize=10,
    passes=10,
    alpha=alpha,
    eta=beta,
    iterations=50,
    per_word_topics=True,
)

### Check for bigrams

In [None]:
# Collect every dictionary token that contains an underscore, i.e. every
# phrase produced by the bigram/trigram phrasers
bigrams_list = [token for token in id2word.values() if '_' in token]
# Print the bigrams list
print("List of bigrams:", bigrams_list)



In [None]:
# Print the top words of every topic in the fitted model. The topic count is
# read from the model itself: `num_topics` is only the leftover loop variable
# of the earlier sweep and does not match the final model.
for topic_id in range(lda_model.num_topics):
    print(f"Topic {topic_id + 1}:")
    for word, probability in lda_model.show_topic(topic_id):
        print(f"{word}: {probability:.4f}")
    print("\n")

### Visualisation using pyLDAvis and save to HTML - not of use with bigrams and trigrams

In [None]:
pyLDAvis.enable_notebook()

# Prepare the visualization data from the fitted model
vis_data = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary=lda_model.id2word)

# Plain DataFrame copies of the topic and token tables
topic_info_df = pd.DataFrame(vis_data.topic_info)
token_table_df = pd.DataFrame(vis_data.token_table)

# Topic coordinates with complex values reduced to their real part and
# any remaining NaN replaced by 0
topic_coords_df = pd.DataFrame(vis_data.topic_coordinates.applymap(np.real)).fillna(0)

# Rebuild the PreparedData object around the sanitised tables; the remaining
# fields are taken over from vis_data unchanged
prepared_fields = dict(
    topic_coordinates=topic_coords_df,
    topic_info=topic_info_df,
    token_table=token_table_df,
    R=vis_data.R,
    lambda_step=vis_data.lambda_step,
    plot_opts=vis_data.plot_opts,
    topic_order=vis_data.topic_order,
)
updated_data = pyLDAvis.PreparedData(**prepared_fields)

# Display the visualization
pyLDAvis.display(updated_data)


In [None]:
# Save the sanitised object built in the previous cell (complex coordinates
# reduced to real values, NaN filled), so the HTML file matches what was
# displayed instead of the raw `vis_data`
pyLDAvis.save_html(updated_data, 'lda_visualization_proposal_tri5aug_v2FINAL.html')

### Visualisation of bigrams and trigrams

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt



# Assuming you have already created bigrams and trigrams in your text data

# Count how often each bigram and trigram token occurs in the phrased corpus.
# A token's n-gram size is the number of '_'-separated parts it has.
bigram_frequencies = {}  # bigram token -> occurrences
trigram_frequencies = {}  # trigram token -> occurrences
frequencies_by_size = {2: bigram_frequencies, 3: trigram_frequencies}

for tokens in texts_trigram:
    for token in tokens:
        # Unigrams (1 part) and longer phrases (4+ parts) have no bucket
        bucket = frequencies_by_size.get(len(token.split('_')))
        if bucket is not None:
            bucket[token] = bucket.get(token, 0) + 1

# Create word clouds for bigrams and trigrams





In [None]:
# Build the bigram word cloud from the frequency table
bigram_wordcloud = WordCloud(background_color='white', width=1000, height=600)
bigram_wordcloud = bigram_wordcloud.generate_from_frequencies(bigram_frequencies)

# Render it on a larger, axis-free figure
fig, ax = plt.subplots(figsize=(12, 8))
ax.imshow(bigram_wordcloud, interpolation='bilinear')
ax.axis('off')

plt.show()


In [None]:
# Build the trigram word cloud from the frequency table
trigram_wordcloud = WordCloud(background_color='white', width=800, height=400)
trigram_wordcloud = trigram_wordcloud.generate_from_frequencies(trigram_frequencies)

# Render it on an axis-free figure
fig, ax = plt.subplots(figsize=(12, 8))
ax.imshow(trigram_wordcloud, interpolation='bilinear')
ax.axis('off')

plt.show()
