# Topic Modelling for Corporate Sustainability Report

This notebook focuses on parsing the contents of the pdf and identifying the topics that are focused/talked about using spacy's nlp tool

In [1]:
import pandas as pd
import spacy
import re
import PyPDF2
from tqdm import tqdm
from bs4 import BeautifulSoup

In [2]:
nlp = spacy.load('en')

In [17]:
# Function takes in a report and breaks it up into individual sentences
def convert_pdf_into_sentences(text):
    # Remove unnecessary spaces and line breaks
    text = re.sub(r'\x0c\x0c|\x0c', "", str(text))
    text = re.sub('\n ', '', str(text))
    text = re.sub('\n', ' ', str(text))
    text = ' '.join(text.split())
    text = " " + text + "  "
    text = text.replace("\n", " ")
    if "”" in text: text = text.replace(".”", "”.")
    if "\"" in text: text = text.replace(".\"", "\".")
    if "!" in text: text = text.replace("!\"", "\"!")
    if "?" in text: text = text.replace("?\"", "\"?")
    text = text.replace(".", ".<stop>")
    text = text.replace("?", "?<stop>")
    text = text.replace("!", "!<stop>")
    text = text.replace("<prd>", ".")
    sentences = text.split("<stop>")
    sentences = sentences[:-1]

    # Filter for sentences with more than 100 characters
    sentences = [s.strip() for s in sentences if len(s) > 100]
    return sentences


def stopWordRemoval(sentences):
    sentencesCleaned = []
    for sentence in sentences:
        text_tokens = nlp(sentence)
        tokens_without_sw = [word.text for word in text_tokens if not word.text in nlp.Defaults.stop_words]
        sentencesCleaned.append(' '.join(tokens_without_sw))
    return sentencesCleaned

In [4]:
myFile = open('Apple_Environmental_Progress_Report_2021.pdf', mode = 'rb')
pdf_reader = PyPDF2.PdfFileReader(myFile)


# Read the entire file into a single string of texts
text = ""
for p in range(pdf_reader.numPages):
    page = pdf_reader.getPage(p)
    text += page.extractText()
myFile.close()



In [18]:
# Parse the large string into proper sentences
sentences = convert_pdf_into_sentences(text)

# Removing stop words
sentences_wo_sw = stopWordRemoval(sentences)

In [23]:
# Storing the sentences in a dataframe
df = pd.DataFrame(sentences_wo_sw, columns=['Sentence'])
df.head()

Unnamed: 0,Sentence
0,Covering fiscal year 2020 Environmental Progre...
1,Environmental Progress Report 2Introduction Cl...
2,"In year like , Apple continued work global net..."
3,"As company , moved ahead greater urgency creat..."
4,"Apple carbon neutral worldwide operations , co..."


# Topic Modelling with Latent Dirichlet Allocation

In [26]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [25]:
cv = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
dtm = cv.fit_transform(df['Sentence'])
dtm

<1223x2115 sparse matrix of type '<class 'numpy.longlong'>'
	with 17057 stored elements in Compressed Sparse Row format>

In [27]:
# random_state takes on an arbitrary value. Be sure to initialise it

# n_components: Number of topics we want. The more we have, the broader the range of topics 
# which makes it harder to analyse
LDA = LatentDirichletAllocation(n_components=7, random_state=42)

In [None]:
LatentDirichletAllocation()

In [28]:
LDA.fit(dtm)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='batch', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=7, n_jobs=None, n_topics=None, perp_tol=0.1,
             random_state=42, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

Showing the top few words per topic (i.e the sentences)

In [29]:
# Retrieving the top 10 words for each sentence and classify the sentences base on topic numbers 
# (we can vary accordingly)
def retrieveTopWords():
    wordTopics = {}
    for indx, topic in enumerate(LDA.components_):
        topWords = [cv.get_feature_names()[i] for i in topic.argsort()[-10:]]
        wordTopics[indx] = topWords
    return wordTopics

In [33]:
# Attaching all the information to our original dataframe
topic_results = LDA.transform(dtm)
df['topicNumber'] = topic_results.argmax(axis=1)


topWordsByTopic = retrieveTopWords()
# Map the results of the words to the sentences
df['topWords'] = df['topicNumber'].map(topWordsByTopic)

In [34]:
df.head(10)

Unnamed: 0,Sentence,topicNumber,topWords
0,Covering fiscal year 2020 Environmental Progre...,2,"[000, change, climate, environmental, appendix..."
1,Environmental Progress Report 2Introduction Cl...,2,"[000, change, climate, environmental, appendix..."
2,"In year like , Apple continued work global net...",0,"[100, footprint, climate, projects, percent, e..."
3,"As company , moved ahead greater urgency creat...",6,"[chemicals, safety, supply, recycled, supplier..."
4,"Apple carbon neutral worldwide operations , co...",0,"[100, footprint, climate, projects, percent, e..."
5,"Those products use recycled materials , like 4...",2,"[000, change, climate, environmental, appendix..."
6,"Well year , setting ambitious goals Apple help...",0,"[100, footprint, climate, projects, percent, e..."
7,That progress powered new renewable energy˜pro...,0,"[100, footprint, climate, projects, percent, e..."
8,While helped bring 4 gigawatts renewable energ...,1,"[efficiency, facilities, 2020, fiscal, use, ap..."
9,That included innovative new partnership Conse...,3,"[conservation, chemistries, impact, environmen..."


In [36]:
# Analyse the topics
df_topic_0 = df[df['topicNumber'] == 0]
df_topic_0.head()

Unnamed: 0,Sentence,topicNumber,topWords
2,"In year like , Apple continued work global net...",0,"[100, footprint, climate, projects, percent, e..."
4,"Apple carbon neutral worldwide operations , co...",0,"[100, footprint, climate, projects, percent, e..."
6,"Well year , setting ambitious goals Apple help...",0,"[100, footprint, climate, projects, percent, e..."
7,That progress powered new renewable energy˜pro...,0,"[100, footprint, climate, projects, percent, e..."
10,"At moment rebuilding , know help spur economic...",0,"[100, footprint, climate, projects, percent, e..."


In [38]:
# What is topic 1 about?
print(df_topic_0.iloc[0]['topWords'])

['100', 'footprint', 'climate', 'projects', 'percent', 'energy', 'apple', 'renewable', 'emissions', 'carbon']


## What are the top 10 words for each topic?

In [40]:
for index, topic in enumerate(LDA.components_):
    print(f'Top 10 words for topic {index}')
    print([cv.get_feature_names()[i] for i in topic.argsort()[-10:]])
    print('\n')

Top 10 words for topic 0
['100', 'footprint', 'climate', 'projects', 'percent', 'energy', 'apple', 'renewable', 'emissions', 'carbon']


Top 10 words for topic 1
['efficiency', 'facilities', '2020', 'fiscal', 'use', 'apple', 'year', 'data', 'renewable', 'energy']


Top 10 words for topic 2
['000', 'change', 'climate', 'environmental', 'appendix', 'progress', 'report', 'smarter', 'chemistry', 'resources']


Top 10 words for topic 3
['conservation', 'chemistries', 'impact', 'environmental', 'materials', 'information', 'life', 'use', 'product', 'apple']


Top 10 words for topic 4
['use', 'local', 'industry', 'devices', 'percent', 've', 'solar', 'project', 'power', 'repair']


Top 10 words for topic 5
['new', 'cooling', 'local', 'air', 'waste', 'use', 'data', 'percent', 'sources', 'water']


Top 10 words for topic 6
['chemicals', 'safety', 'supply', 'recycled', 'suppliers', 'use', 'material', 'apple', 'materials', 'products']


