## Import packages

In [1]:
#import pdfplumber
import os
import glob
import PyPDF2
import PyPDF4
from nltk import word_tokenize
import nltk
import spacy
import re
from scipy.spatial.distance import cityblock, euclidean, cosine
import numpy as np
import pandas as pd
import gensim.downloader as api



# Section 1: Extract text and clean
### Create text files from Bristol Climate Action Plan

In [4]:
if not os.path.isdir('Bristol'): 
    os.mkdir('Bristol')

In [5]:
pdf = PyPDF4.PdfFileReader(open('one-city-climate-strategy.pdf', 'rb'))
for i in range(pdf.numPages):
    num = str(i)
    text = pdf.pages[i].extractText()
    with open(f'Bristol/bristol{num}.txt', 'w', encoding='utf-8') as f:
        f.write(text)


### Now tidy up each file

In [4]:
files = glob.glob('Bristol/bristol*.txt')

#create an empty text file to receive the text...
with open('Bristol/full_text.txt', 'w') as f:
    pass

In [5]:
#First, delete empty lines
for i in range(len(files)):
    item = files[i]
    with open(item, 'r+', encoding='utf-8') as f:
        text = f.readlines()
        #print(text)
        new_text = ''
        for line in text:
            new_text += line.replace('\n', '')
        # now append text to master text file
        with open('Bristol/bristol_full.txt', 'a', encoding='utf-8') as p:
            p.write(new_text)


### Some further cleaning

Inserting spaces between digits followed immediately by letters, or letters immediately followed by digits

In [6]:
#Define a regex expression to recognise letters followed by digits or digits followed by letters
dl = re.compile(r'(?<=\d)(?=[^\d\s,.])|(?<=[^\d\s,.])(?=\d)')

In [7]:
# Run a test
test_string = 'Bristol will be carbon neutral by2030 and from 2040will only use 140EVs. 2025 will see the launch of 1.5kWh battery.'
print(dl.sub(' ', test_string))

Bristol will be carbon neutral by 2030 and from 2040 will only use 140 EVs. 2025 will see the launch of 1.5 kWh battery.


In [8]:
#Open, take text, amend, and then wipe and rewrite
climate_plan_text = ''
with open('Bristol/bristol_full.txt', 'r+', encoding='utf-8') as f:
    text = f.read()
    climate_plan_text += dl.sub(' ', text)

with open('Bristol/bristol_full.txt', 'w', encoding='utf-8') as f:
    f.write(climate_plan_text)

In some places, years are followed by an extra digit - usually a rogue page number, or something like that.

In [9]:
# Define a regex that matches the 5th digit in a 5 digit number. We assume that any other numbers use comma separations. 
yr = re.compile(r'(\d)(?<=\d{5})')

In [10]:
test_string_2 = 'climate resilient Bristol by 20302 Foreword'
print(yr.sub('', test_string_2))

climate resilient Bristol by 2030 Foreword


In [11]:
# Now apply this to the document:

climate_plan_text = yr.sub('', climate_plan_text)
with open('Bristol/bristol_full.txt', 'w', encoding = 'utf-8') as f:
    f.write(climate_plan_text)

We have some full stops surrounded by words, e.g. "action.This"

In [12]:
#Define a regex expression to match the full stops surrounded by letters and insert a space.
fs = re.compile(r'\.(?=\w)')

In [13]:
#Run a test...
test_string_3 = 'in Bristol.This will help.Testing, testing. Some are. Written correctly.'
print(fs.sub('. ', test_string_3))

in Bristol. This will help. Testing, testing. Some are. Written correctly.


In [14]:
# Now apply to text

climate_plan_text = fs.sub('. ', climate_plan_text)

with open('Bristol/bristol_full.txt', 'w', encoding = 'utf-8') as f:
    f.write(climate_plan_text)

 Now we have 'Bristol/bristol_full.txt' as the whole climate action plan

# Section 2: Experimenting with NLP. 
## Part 1: Experimenting with NLTK (not working)

In [15]:
climate_plan_text = ''
with open('Bristol/bristol_full.txt', 'r', encoding = 'utf-8') as f:
    climate_plan_text += f.read()

In [16]:
print(len(climate_plan_text))

582256


In [17]:
climate_plan_text[100:150]

'From the One City Environmental Sustainability Boa'

### Sentence segmentation

In [18]:
#Instantiate a sentence tokenizer, segment and then tokenize the sentences. 

sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
sents = sent_tokenizer.tokenize(climate_plan_text)

In [19]:
#A bit of quality control.
print(sents[30:40])

['The strategy will be delivered by organisations, communities and individuals working together.', 'The engagement and collaboration is just the start of what is needed.', 'We will continue to have an open dialogue and engage with individuals, households, organisations and businesses over the coming decade.', 'The evidence baseTo develop the One City Climate Strategy, we commissioned evidence-based reports on reducing greenhouse gas emissions and on improving our resilience to the impacts of a changing climate.', 'The ˜ve studies that inform this strategy are: 1.', 'A scope 1 and 2 baseline and gap analysis.', 'This sets out the baseline, historic trends, as well as the trajectory from actions that are already planned or in place and the trajectory to meet the UK national net zero target by 2050.', '2.', 'Total business emissions study.', 'This was an assessment of scope 1, 2 and 3 emissions associated with business activities in Bristol.']


In [20]:
# Word tokenize each sentence. 
sentences = [nltk.word_tokenize(sent) for sent in sents]

In [21]:
# And position tag the words in each sentence. 
sentences_pos = [nltk.pos_tag(sent) for sent in sentences]

In [22]:
# A bit more quality control
print(sentences_pos[45])

[('Net', 'JJ'), ('zero', 'NN'), ('by', 'IN'), ('2030', 'CD'), ('scope', 'NN'), ('1', 'CD'), ('and', 'CC'), ('2', 'CD'), ('study', 'NN'), ('.', '.')]


### Now for some chunking - fire up the chunker!

In [23]:
#first define some chunk grammar
grammar = r"""
NP: {<DT|PP\$>?<JJ>*<NN>} #chunk determiner/possessssive, adjectives and noun
"""

In [24]:
# Second, pass the grammar to the chunk parser
cp = nltk.RegexpParser(grammar)

In [25]:
#Finish this bit later...

## Part 2: Topic modelling with gensim

This works & will extract topics

In [26]:
#First tokenize the text with scikit-learn
nlp = spacy.load('en_core_web_sm')
nlp
from spacy.lang.en import English
parser = English()

In [27]:
def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        elif token.like_url:
            lda_tokens.append('URL')
        elif token.orth_.startswith('@'):
            lda_tokens.append('SCREEN_NAME')
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens

In [28]:
# We use NLTK’s Wordnet to find the meanings of words, synonyms, antonyms, and more. In addition, we use WordNetLemmatizer to get the root word.
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma
    
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Ben\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [29]:
# Filter out stop words:
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ben\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [30]:
# Now we can define a function to prepare the text for topic modelling:
def prepare_text_for_lda(text):
    tokens = tokenize(text)
    tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if token not in en_stop]
    tokens = [get_lemma(token) for token in tokens]
    return tokens

In [31]:
text = prepare_text_for_lda(climate_plan_text)

In [32]:
text_data = [prepare_text_for_lda(sent) for sent in sents]


### LDA = Latent Dirichlet Allocation

This topic modelling is an unsupervised machine learning method that helps us discover hidden semantic structures in a document, that allows us to learn topic representations of documents in a corpus. The model can be applied to any kinds of labels on documents, such as tags on posts on a website.

In [33]:
from gensim import corpora
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(sent) for sent in text_data]

In [34]:
import pickle
pickle.dump(corpus, open('corpus.pkl', 'wb'))
dictionary.save('dictionary.gensim')

In [35]:
import gensim
NUM_TOPICS = 20
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
ldamodel.save('model5.gensim')

topics = ldamodel.print_topics(num_words = 4)
for topic in topics:
    print(topic)

(0, '0.055*"electricity" + 0.027*"demand" + 0.021*"change" + 0.019*"renewable"')
(1, '0.030*"enable" + 0.029*"strategy" + 0.029*"across" + 0.026*"change"')
(2, '0.030*"consumption" + 0.030*"energy" + 0.021*"approach" + 0.015*"business"')
(3, '0.031*"support" + 0.029*"national" + 0.026*"local" + 0.026*"regional"')
(4, '0.026*"infrastructure" + 0.022*"support" + 0.015*"water" + 0.015*"clear"')
(5, '0.081*"bristol" + 0.042*"emission" + 0.036*"services" + 0.027*"public"')
(6, '0.054*"infrastructure" + 0.035*"green" + 0.025*"climate" + 0.025*"change"')
(7, '0.031*"delivery" + 0.030*"evidence" + 0.027*"bristol" + 0.025*"climate"')
(8, '0.048*"climate" + 0.024*"bristol" + 0.022*"carbon" + 0.021*"theme"')
(9, '0.029*"bristol" + 0.018*"programme" + 0.015*"include" + 0.015*"infrastructure"')
(10, '0.045*"climate" + 0.018*"impact" + 0.017*"report" + 0.015*"emission"')
(11, '0.038*"climate" + 0.034*"bristol" + 0.027*"change" + 0.021*"emergency"')
(12, '0.050*"climate" + 0.038*"change" + 0.037*"str

## Part 3: Bag of words analysis, sentence by sentence (not yet working)

In [36]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

In [37]:
# We'll use 'sents', which is a list generated above, each sentence is an item in the list. We'll try to find the topic of each sentence. 
print(sents[:5])

['1 ClimateOne CityStrategyA strategy for a carbon neutral, climate resilient Bristol by 2030 ForewordFrom the One City Environmental Sustainability BoardWe are facing a climate emergency.', 'As a city we need to act now to reduce direct and indirect carbon emissions to net zero.', 'We need to prepare and adapt to deal with the projected impacts of climate change.', 'In the One City Plan, Bristol committed to becoming carbon neutral and climate resilient by 2030.', 'To achieve this, over the next decade, we need to radically rethink how we live, work and invest in the city.']


In [38]:
# instantiate the vectorizer and fit it to our sentences
vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(sents)

In [39]:
#convert counts to tf-idf
transformer = TfidfTransformer(norm = None)

#initialize and fit TfidfVectorizer
tfidf_scores_transformed = transformer.fit_transform(counts)

In [40]:
#check if tf-idf scores are equal
vectorizer = TfidfVectorizer(norm = None)
tfidf_scores = vectorizer.fit_transform(sents)

In [41]:
if np.allclose(tfidf_scores_transformed.todense(), tfidf_scores.todense()):
    print(pd.DataFrame({'Are the tf-idf scores all the same?': ['YES']}))
else:
    print(pd.DataFrame({'Are the tf-idf scores all the same?': ['No, something is wrong :(']}))

  Are the tf-idf scores all the same?
0                                 YES


In [42]:
# get vocabulary of terms
try:
    feature_names = vectorizer.get_feature_names()
except:
    print('Hm, no')
    pass

In [43]:
# get article index
try:
    sent_index = [f'Sentence {i+1}' for i in range(len(sents))]
    print('Hm, yes')
except:
    print('Hm, no')
    pass


Hm, yes


In [44]:
# create pandas DataFrame with word counts
try:
  df_word_counts = pd.DataFrame(counts.T.todense(), index=feature_names, columns=sent_index)
  print(df_word_counts)
except:
  pass

               Sentence 1  Sentence 2  Sentence 3  Sentence 4  Sentence 5  \
00                      0           0           0           0           0   
000                     0           0           0           0           0   
005                     0           0           0           0           0   
0217                    0           0           0           0           0   
05                      0           0           0           0           0   
...                   ...         ...         ...         ...         ...   
ﬁenabling               0           0           0           0           0   
ﬁgreenﬁ                 0           0           0           0           0   
ﬁin                     0           0           0           0           0   
ﬁthe                    0           0           0           0           0   
ﬂintroduction           0           0           0           0           0   

               Sentence 6  Sentence 7  Sentence 8  Sentence 9  Sentence 10 

In [45]:
#Create dataframe with tf-idf scores
try:
  df_tf_idf = pd.DataFrame(tfidf_scores_transformed.T.todense(), index=feature_names, columns=sent_index)
  print(df_tf_idf)
except:
  pass

try:
  df_tf_idf = pd.DataFrame(tfidf_scores.T.todense(), index=feature_names, columns=sent_index)
  print(df_tf_idf)
except:
  pass

               Sentence 1  Sentence 2  Sentence 3  Sentence 4  Sentence 5  \
00                    0.0         0.0         0.0         0.0         0.0   
000                   0.0         0.0         0.0         0.0         0.0   
005                   0.0         0.0         0.0         0.0         0.0   
0217                  0.0         0.0         0.0         0.0         0.0   
05                    0.0         0.0         0.0         0.0         0.0   
...                   ...         ...         ...         ...         ...   
ﬁenabling             0.0         0.0         0.0         0.0         0.0   
ﬁgreenﬁ               0.0         0.0         0.0         0.0         0.0   
ﬁin                   0.0         0.0         0.0         0.0         0.0   
ﬁthe                  0.0         0.0         0.0         0.0         0.0   
ﬂintroduction         0.0         0.0         0.0         0.0         0.0   

               Sentence 6  Sentence 7  Sentence 8  Sentence 9  Sentence 10 

In [46]:
for i in range(100, 200):
  print(df_tf_idf[[f'Sentence {i}']].idxmax())

Sentence 100    lasts
dtype: object
Sentence 101    cold
dtype: object
Sentence 102    still
dtype: object
Sentence 103    impacts
dtype: object
Sentence 104    clearer
dtype: object
Sentence 105    causing
dtype: object
Sentence 106    progressively
dtype: object
Sentence 107    know
dtype: object
Sentence 108    ranging
dtype: object
Sentence 109    2003
dtype: object
Sentence 110    supply
dtype: object
Sentence 111    risk
dtype: object
Sentence 112    foundations
dtype: object
Sentence 113    projected
dtype: object
Sentence 114    bristolintroductionbristol
dtype: object
Sentence 115    yet
dtype: object
Sentence 116    uk
dtype: object
Sentence 117    rst
dtype: object
Sentence 118    concerned
dtype: object
Sentence 119    citybristol
dtype: object
Sentence 120    other
dtype: object
Sentence 121    91
dtype: object
Sentence 122    contributor
dtype: object
Sentence 123    most
dtype: object
Sentence 124    house
dtype: object
Sentence 125    fuel
dtype: object
Sentence 126    

### Start again - try a Gensim analysis....
https://radimrehurek.com/gensim/auto_examples/core/run_corpora_and_vector_spaces.html#sphx-glr-auto-examples-core-run-corpora-and-vector-spaces-py

# Phrase / Key Word matching
Using SpaCy 

In [47]:
nlp = spacy.load('en_core_web_sm')
from spacy.matcher import PhraseMatcher
phrase_matcher = PhraseMatcher(nlp.vocab)

In [56]:
# phrases = ['electric vehicles', 'electric vehicle', 'charging', 'transport', 'cars', 'zero emission vehicles', 'emission vehicles']

# patterns = [nlp(text) for text in phrases]

# phrase_matcher.add('EVs', None, *patterns)

# sentence = nlp(climate_plan_text)

# matched_phrases = phrase_matcher(sentence)

# for match_id, start, end in matched_phrases:
#     string_id = nlp.vocab.strings[match_id]
#     span = sentence[start:end]
#     print(match_id, string_id, start, end, span.text)
print(len(climate_plan_text))

582256


In [57]:
print(climate_plan_text)

olders and the public with transparency about the process. As one of our key principles is learning and evolving this plan, this review will feed into continuous review and revision of the delivery plans. This will enable us to be agile and expend e˙ort on the most e˙ective interventions. Next stepsIn 2020, and in parallel with the delivery planning, we need to develop a clear framework for monitoring and review in subsequent years. We will:  -Develop indicators for mitigation and resilience action covering both outputs (what is delivered through actions) and outcomes (the di˙erence made by the outputs). This monitoring will aim to identify and mitigate any unintended negative consequences.  -Establish a transparent process to monitor delivery, communicate progress and update planning. A note on o˝settingAchieving carbon neutrality will mean managing residual emissions remaining after all planned interventions have been made. This will typically involve o˙setting; this is a complex iss

In [49]:
text = climate_plan_text

phrases  = ['electric vehicles', 'electric vehicle', 'charging', 'cars', 'zero emission vehicles', 'emission vehicles']

patterns = [nlp(t) for t in phrases]

phrase_matcher.add('EVs', None, *patterns)

doc = nlp(text)



In [50]:
tags = []
tagged_sentences = []

In [51]:
matches = phrase_matcher(doc)
for match_id, start, end in matches:
    span = doc[start:end]
    tags.append(span.text)
    tagged_sentences.append(span.sent)
    print(span.text)

charging
charging
electric vehicles
electric vehicle
charging
cars
cars
charging
cars
emission vehicles
electric vehicle
charging
charging
emission vehicles
emission vehicles
charging
charging
electric vehicles
electric vehicle
charging
cars
cars
charging
cars
emission vehicles
electric vehicle
charging
charging
emission vehicles
emission vehicles
charging
charging
electric vehicles
electric vehicle
charging
cars
cars
charging
cars
emission vehicles
electric vehicle
charging
charging
emission vehicles
emission vehicles
charging
charging
electric vehicles
electric vehicle
charging
cars
cars
charging
cars
emission vehicles
electric vehicle
charging
charging
emission vehicles
emission vehicles


In [52]:
matches = phrase_matcher(doc)
for match_id, start, end in matches:
    span = doc[start:end]
    print(span.sent)

This will include working with regulators to change regulatory frameworks; -New ˜nancing approaches such as Green Bonds, crowdsourcing and more; -Growing the ability to raise local investment through charging levies, business rates etc;  -Addressing high risk shortfalls in capital and revenue budgets which could undermine our resilience, and;  -Incentivising action through procurement and supply chains.
We will need signi˜cant new walking, cycling and public transport infrastructure, as well as charging infrastructure for electric vehicles, or other zero carbon fuels.
We will need signi˜cant new walking, cycling and public transport infrastructure, as well as charging infrastructure for electric vehicles, or other zero carbon fuels.
 -Development of a citywide plan for electric vehicle charging and hydrogen refuelling infrastructure and engage with the market.
 -Development of a citywide plan for electric vehicle charging and hydrogen refuelling infrastructure and engage with the marke

In [53]:
# for sent in doc.sents:
#     for match_id, start, end in phrase_matcher(nlp(sent.text)):
#         if nlp.vocab.strings[match_id] in ['EVs']:
#             print(sent.text)

In [6]:
data = {'tags': tags, 'tagged_sents': tagged_sentences}
df = pd.DataFrame(data = data)
df.head()

NameError: name 'tags' is not defined

## Seeing whether it's possible to download a website and do keyword extraction on it#

 Run conda install -c conda-forge scrapy!