# Exporting representative documents per topic of dtm3 k =20

In [1]:
import logging
import random
import warnings
import tempfile
import os
from subprocess import PIPE
import numpy as np

from gensim import utils, corpora, matutils
from gensim.utils import check_output

logger = logging.getLogger(__name__)


class DtmModel(utils.SaveLoad):
    """Python wrapper using `DTM implementation <https://github.com/magsilva/dtm/tree/master/bin>`_.

    Communication between DTM and Python takes place by passing around data files on disk and executing
    the DTM binary as a subprocess.

    Warnings
    --------
    This is **only** python wrapper for `DTM implementation <https://github.com/magsilva/dtm/tree/master/bin>`_,
    you need to install original implementation first and pass the path to binary to ``dtm_path``.

    """
    def __init__(self, dtm_path, corpus=None, time_slices=None, mode='fit', model='dtm', num_topics=100,
                 id2word=None, prefix=None, lda_sequence_min_iter=6, lda_sequence_max_iter=20, lda_max_em_iter=10,
                 alpha=0.01, top_chain_var=0.005, rng_seed=0, initialize_lda=True):
        """

        Parameters
        ----------
        dtm_path : str
            Path to the dtm binary, e.g. `/home/username/dtm/dtm/main`.
        corpus : iterable of iterable of (int, int)
            Collection of texts in BoW format.
        time_slices : list of int
            Sequence of timestamps.
        mode : {'fit', 'time'}, optional
            Controls the mode of the mode: 'fit' is for training, 'time' for analyzing documents through time
            according to a DTM, basically a held out set.
        model : {'fixed', 'dtm'}, optional
            Control model that will be runned: 'fixed' is for DIM and 'dtm' for DTM.
        num_topics : int, optional
            Number of topics.
        id2word : :class:`~gensim.corpora.dictionary.Dictionary`, optional
            Mapping between tokens ids and words from corpus, if not specified - will be inferred from `corpus`.
        prefix : str, optional
            Prefix for produced temporary files.
        lda_sequence_min_iter : int, optional
             Min iteration of LDA.
        lda_sequence_max_iter : int, optional
            Max iteration of LDA.
        lda_max_em_iter : int, optional
             Max em optimization iterations in LDA.
        alpha : int, optional
            Hyperparameter that affects sparsity of the document-topics for the LDA models in each timeslice.
        top_chain_var : float, optional
            This hyperparameter controls one of the key aspect of topic evolution which is the speed at which
            these topics evolve. A smaller top_chain_var leads to similar word distributions over multiple timeslice.

        rng_seed : int, optional
             Random seed.
        initialize_lda : bool, optional
             If True - initialize DTM with LDA.

        """
        if not os.path.isfile(dtm_path):
            raise ValueError("dtm_path must point to the binary file, not to a folder")

        self.dtm_path = dtm_path
        self.id2word = id2word
        if self.id2word is None:
            logger.warning("no word id mapping provided; initializing from corpus, assuming identity")
            self.id2word = utils.dict_from_corpus(corpus)
            self.num_terms = len(self.id2word)
        else:
            self.num_terms = 0 if not self.id2word else 1 + max(self.id2word.keys())
        if self.num_terms == 0:
            raise ValueError("cannot compute DTM over an empty collection (no terms)")
        self.num_topics = num_topics

        try:
            lencorpus = len(corpus)
        except TypeError:
            logger.warning("input corpus stream has no len(); counting documents")
            lencorpus = sum(1 for _ in corpus)
        if lencorpus == 0:
            raise ValueError("cannot compute DTM over an empty corpus")
        if model == "fixed" and any(not text for text in corpus):
            raise ValueError("""There is a text without words in the input corpus.
                    This breaks method='fixed' (The DIM model).""")
        if lencorpus != sum(time_slices):
            raise ValueError(
                "mismatched timeslices %{slices} for corpus of len {clen}"
                .format(slices=sum(time_slices), clen=lencorpus)
            )
        self.lencorpus = lencorpus
        if prefix is None:
            rand_prefix = hex(random.randint(0, 0xffffff))[2:] + '_'
            prefix = os.path.join(tempfile.gettempdir(), rand_prefix)

        self.prefix = prefix
        self.time_slices = time_slices
        self.lda_sequence_min_iter = int(lda_sequence_min_iter)
        self.lda_sequence_max_iter = int(lda_sequence_max_iter)
        self.lda_max_em_iter = int(lda_max_em_iter)
        self.alpha = alpha
        self.top_chain_var = top_chain_var
        self.rng_seed = rng_seed
        self.initialize_lda = str(initialize_lda).lower()

        self.lambda_ = None
        self.obs_ = None
        self.lhood_ = None
        self.gamma_ = None
        self.init_alpha = None
        self.init_beta = None
        self.init_ss = None
        self.em_steps = []
        self.influences_time = []

        if corpus is not None:
            self.train(corpus, time_slices, mode, model)

    def fout_liklihoods(self):
        """Get path to temporary lhood data file.

        Returns
        -------
        str
            Path to lhood data file.

        """
        return self.prefix + 'train_out/lda-seq/' + 'lhoods.dat'

    def fout_gamma(self):
        """Get path to temporary gamma data file.

        Returns
        -------
        str
            Path to gamma data file.

        """
        return self.prefix + 'train_out/lda-seq/' + 'gam.dat'

    def fout_prob(self):
        """Get template of path to temporary file.

        Returns
        -------
        str
            Path to file.

        """
        return self.prefix + 'train_out/lda-seq/' + 'topic-{i}-var-e-log-prob.dat'

    def fout_observations(self):
        """Get template of path to temporary file.

        Returns
        -------
        str
            Path to file.

        """
        return self.prefix + 'train_out/lda-seq/' + 'topic-{i}-var-obs.dat'

    def fout_influence(self):
        """Get template of path to temporary file.

        Returns
        -------
        str
            Path to file.

        """
        return self.prefix + 'train_out/lda-seq/' + 'influence_time-{i}'

    def foutname(self):
        """Get path to temporary file.

        Returns
        -------
        str
            Path to file.

        """
        return self.prefix + 'train_out'

    def fem_steps(self):
        """Get path to temporary em_step data file.

        Returns
        -------
        str
            Path to em_step data file.

        """
        return self.prefix + 'train_out/' + 'em_log.dat'

    def finit_alpha(self):
        """Get path to initially trained lda alpha file.

        Returns
        -------
        str
            Path to initially trained lda alpha file.

        """
        return self.prefix + 'train_out/' + 'initial-lda.alpha'

    def finit_beta(self):
        """Get path to initially trained lda beta file.

        Returns
        -------
        str
            Path to initially trained lda beta file.

        """
        return self.prefix + 'train_out/' + 'initial-lda.beta'

    def flda_ss(self):
        """Get path to initial lda binary file.

        Returns
        -------
        str
            Path to initial lda binary file.

        """
        return self.prefix + 'train_out/' + 'initial-lda-ss.dat'

    def fcorpustxt(self):
        """Get path to temporary file.

        Returns
        -------
        str
            Path to multiple train binary file.

        """
        return self.prefix + 'train-mult.dat'

    def fcorpus(self):
        """Get path to corpus file.

        Returns
        -------
        str
            Path to corpus file.

        """
        return self.prefix + 'train'

    def ftimeslices(self):
        """Get path to time slices binary file.

        Returns
        -------
        str
            Path to time slices binary file.

        """
        return self.prefix + 'train-seq.dat'

    def convert_input(self, corpus, time_slices):
        """Convert corpus into LDA-C format by :class:`~gensim.corpora.bleicorpus.BleiCorpus` and save to temp file.
        Path to temporary file produced by :meth:`~gensim.models.wrappers.dtmmodel.DtmModel.ftimeslices`.

        Parameters
        ----------
        corpus : iterable of iterable of (int, float)
            Corpus in BoW format.
        time_slices : list of int
            Sequence of timestamps.

        """
        logger.info("serializing temporary corpus to %s", self.fcorpustxt())
        # write out the corpus in a file format that DTM understands:
        corpora.BleiCorpus.save_corpus(self.fcorpustxt(), corpus)

        with utils.open(self.ftimeslices(), 'wb') as fout:
            fout.write(utils.to_utf8(str(len(self.time_slices)) + "\n"))
            for sl in time_slices:
                fout.write(utils.to_utf8(str(sl) + "\n"))

    def train(self, corpus, time_slices, mode, model):
        """Train DTM model.

        Parameters
        ----------
        corpus : iterable of iterable of (int, int)
            Collection of texts in BoW format.
        time_slices : list of int
            Sequence of timestamps.
        mode : {'fit', 'time'}, optional
            Controls the mode of the mode: 'fit' is for training, 'time' for analyzing documents through time
            according to a DTM, basically a held out set.
        model : {'fixed', 'dtm'}, optional
            Control model that will be runned: 'fixed' is for DIM and 'dtm' for DTM.

        """
        self.convert_input(corpus, time_slices)

        arguments = \
            "--ntopics={p0} --model={mofrl}  --mode={p1} --initialize_lda={p2} --corpus_prefix={p3} " \
            "--outname={p4} --alpha={p5}".format(
                p0=self.num_topics, mofrl=model, p1=mode, p2=self.initialize_lda,
                p3=self.fcorpus(), p4=self.foutname(), p5=self.alpha
            )

        params = \
            "--lda_max_em_iter={p0} --lda_sequence_min_iter={p1}  --lda_sequence_max_iter={p2} " \
            "--top_chain_var={p3} --rng_seed={p4} ".format(
                p0=self.lda_max_em_iter, p1=self.lda_sequence_min_iter, p2=self.lda_sequence_max_iter,
                p3=self.top_chain_var, p4=self.rng_seed
            )

        arguments = arguments + " " + params
        logger.info("training DTM with args %s", arguments)

        cmd = [self.dtm_path] + arguments.split()
        logger.info("Running command %s", cmd)
        check_output(args=cmd, stderr=PIPE)

        self.em_steps = np.loadtxt(self.fem_steps())
        self.init_ss = np.loadtxt(self.flda_ss())

        if self.initialize_lda:
            self.init_alpha = np.loadtxt(self.finit_alpha())
            self.init_beta = np.loadtxt(self.finit_beta())

        self.lhood_ = np.loadtxt(self.fout_liklihoods())

        # document-topic proportions
        self.gamma_ = np.loadtxt(self.fout_gamma())
        # cast to correct shape, gamme[5,10] is the proprtion of the 10th topic
        # in doc 5
        self.gamma_.shape = (self.lencorpus, self.num_topics)
        # normalize proportions
        self.gamma_ /= self.gamma_.sum(axis=1)[:, np.newaxis]

        self.lambda_ = np.zeros((self.num_topics, self.num_terms * len(self.time_slices)))
        self.obs_ = np.zeros((self.num_topics, self.num_terms * len(self.time_slices)))

        for t in range(self.num_topics):
            topic = "%03d" % t
            self.lambda_[t, :] = np.loadtxt(self.fout_prob().format(i=topic))
            self.obs_[t, :] = np.loadtxt(self.fout_observations().format(i=topic))
        # cast to correct shape, lambda[5,10,0] is the proportion of the 10th
        # topic in doc 5 at time 0
        self.lambda_.shape = (self.num_topics, self.num_terms, len(self.time_slices))
        self.obs_.shape = (self.num_topics, self.num_terms, len(self.time_slices))
        # extract document influence on topics for each time slice
        # influences_time[0] , influences at time 0
        if model == 'fixed':
            for k, t in enumerate(self.time_slices):
                stamp = "%03d" % k
                influence = np.loadtxt(self.fout_influence().format(i=stamp))
                influence.shape = (t, self.num_topics)
                # influence[2,5] influence of document 2 on topic 5
                self.influences_time.append(influence)

    def print_topics(self, num_topics=10, times=5, num_words=10):
        """Alias for :meth:`~gensim.models.wrappers.dtmmodel.DtmModel.show_topics`.

        Parameters
        ----------
        num_topics : int, optional
            Number of topics to return, set `-1` to get all topics.
        times : int, optional
            Number of times.
        num_words : int, optional
            Number of words.

        Returns
        -------
        list of str
            Topics as a list of strings

        """
        return self.show_topics(num_topics, times, num_words, log=True)

    def show_topics(self, num_topics=10, times=5, num_words=10, log=False, formatted=True):
        """Get the `num_words` most probable words for `num_topics` number of topics at 'times' time slices.

        Parameters
        ----------
        num_topics : int, optional
            Number of topics to return, set `-1` to get all topics.
        times : int, optional
            Number of times.
        num_words : int, optional
            Number of words.
        log : bool, optional
            THIS PARAMETER WILL BE IGNORED.
        formatted : bool, optional
            If `True` - return the topics as a list of strings, otherwise as lists of (weight, word) pairs.

        Returns
        -------
        list of str
            Topics as a list of strings (if formatted=True) **OR**
        list of (float, str)
            Topics as list of (weight, word) pairs (if formatted=False)

        """
        if num_topics < 0 or num_topics >= self.num_topics:
            num_topics = self.num_topics
            chosen_topics = range(num_topics)
        else:
            num_topics = min(num_topics, self.num_topics)
            chosen_topics = range(num_topics)

        if times < 0 or times >= len(self.time_slices):
            times = len(self.time_slices)
            chosen_times = range(times)
        else:
            times = min(times, len(self.time_slices))
            chosen_times = range(times)

        shown = []
        for time in chosen_times:
            for i in chosen_topics:
                if formatted:
                    topic = self.print_topic(i, time, topn=num_words)
                else:
                    topic = self.show_topic(i, time, topn=num_words)
                shown.append(topic)
        return shown

    def show_topic(self, topicid, time, topn=50, num_words=None):
        """Get `num_words` most probable words for the given `topicid`.

        Parameters
        ----------
        topicid : int
            Id of topic.
        time : int
            Timestamp.
        topn : int, optional
            Top number of topics that you'll receive.
        num_words : int, optional
            DEPRECATED PARAMETER, use `topn` instead.

        Returns
        -------
        list of (float, str)
            Sequence of probable words, as a list of `(word_probability, word)`.

        """
        if num_words is not None:  # deprecated num_words is used
            warnings.warn("The parameter `num_words` is deprecated, will be removed in 4.0.0, use `topn` instead.")
            topn = num_words

        topics = self.lambda_[:, :, time]
        topic = topics[topicid]
        # likelihood to probability
        topic = np.exp(topic)
        # normalize to probability dist
        topic = topic / topic.sum()
        # sort according to prob
        bestn = matutils.argsort(topic, topn, reverse=True)
        beststr = [(topic[idx], self.id2word[idx]) for idx in bestn]
        return beststr

    def print_topic(self, topicid, time, topn=10, num_words=None):
        """Get the given topic, formatted as a string.

        Parameters
        ----------
        topicid : int
            Id of topic.
        time : int
            Timestamp.
        topn : int, optional
            Top number of topics that you'll receive.
        num_words : int, optional
            DEPRECATED PARAMETER, use `topn` instead.

        Returns
        -------
        str
            The given topic in string format, like '0.132*someword + 0.412*otherword + ...'.

        """
        if num_words is not None:  # deprecated num_words is used
            warnings.warn("The parameter `num_words` is deprecated, will be removed in 4.0.0, use `topn` instead.")
            topn = num_words

        return ' + '.join('%.3f*%s' % v for v in self.show_topic(topicid, time, topn=topn))

    def dtm_vis(self, corpus, time):
        """Get data specified by pyLDAvis format.

        Parameters
        ----------
        corpus : iterable of iterable of (int, float)
            Collection of texts in BoW format.
        time : int
            Sequence of timestamp.

        Notes
        -----
        All of these are needed to visualise topics for DTM for a particular time-slice via pyLDAvis.

        Returns
        -------
        doc_topic : numpy.ndarray
            Document-topic proportions.
        topic_term : numpy.ndarray
            Calculated term of topic suitable for pyLDAvis format.
        doc_lengths : list of int
            Length of each documents in corpus.
        term_frequency : numpy.ndarray
            Frequency of each word from vocab.
        vocab : list of str
            List of words from docpus.

        """
        topic_term = np.exp(self.lambda_[:, :, time]) / np.exp(self.lambda_[:, :, time]).sum()
        topic_term *= self.num_topics

        doc_topic = self.gamma_

        doc_lengths = [len(doc) for doc_no, doc in enumerate(corpus)]

        term_frequency = np.zeros(len(self.id2word))
        for doc_no, doc in enumerate(corpus):
            for pair in doc:
                term_frequency[pair[0]] += pair[1]

        vocab = [self.id2word[i] for i in range(0, len(self.id2word))]
        # returns numpy arrays for doc_topic proportions, topic_term proportions, and document_lengths, term_frequency.
        # these should be passed to the `pyLDAvis.prepare` method to visualise one time-slice of DTM topics.
        return doc_topic, topic_term, doc_lengths, term_frequency, vocab

    def dtm_coherence(self, time, num_words=20):
        """Get all topics of a particular time-slice without probability values for it to be used.
        For either "u_mass" or "c_v" coherence.

        Parameters
        ----------
        num_words : int
            Number of words.
        time : int
            Timestamp

        Returns
        -------
        coherence_topics : list of list of str
            All topics of a particular time-slice without probability values for it to be used.

        Warnings
        --------
        TODO: because of print format right now can only return for 1st time-slice, should we fix the coherence
        printing or make changes to the print statements to mirror DTM python?

        """
        coherence_topics = []
        for topic_no in range(0, self.num_topics):
            topic = self.show_topic(topicid=topic_no, time=time, num_words=num_words)
            coherence_topic = []
            for prob, word in topic:
                coherence_topic.append(word)
            coherence_topics.append(coherence_topic)

        return coherence_topics

In [3]:
import matplotlib.pyplot as plt
import numpy as np
import pickle
import pandas as pd
import numpy as np
import pickle
import pandas as pd
import csv
from docx import Document

# Load the model from disk
print("Loading model from disk...")
with open('dtm_model_subset_100_percent.pkl', 'rb') as file:
    model = pickle.load(file)
print("Model loaded.")

# Load the preprocessed dataset
print("Loading data...")
df = pd.read_csv('/Users/giacomoraederscheidt/Dropbox/Paper_Giacomo_Lotti/0 Data/coal_data_preprocessed.csv')
df['Date'] = pd.to_datetime(df['Date'])

# Loop through each topic ID and create documents for each
for topic_id in range(model.gamma_.shape[1]):
    # Add a new column to the dataframe for the topic share
    df[f'Topic{topic_id}_Share'] = model.gamma_[:, topic_id]

    # Get the top 40 articles with the highest share of the current topic
    top_articles = df.nlargest(40, f'Topic{topic_id}_Share')

    # Display the articles with the highest topic share for the current topic
    print(f"Top 10 articles with highest topic share for topic {topic_id}:")
    print(top_articles)

    # Save the top articles to a new CSV file
    top_articles.to_csv('/Users/giacomoraederscheidt/Dropbox/Paper_Giacomo_Lotti/0 Text-Tables/top_40_articles_topic_{topic_id}.csv', index=False)

    # Display the top 10 words for these articles for further analysis
    top_words = top_articles['Processed_Article'].apply(lambda x: x.split()).explode().value_counts().head(40)
    print(f"Top 10 words in top articles for topic {topic_id}:")
    print(top_words)

    # Create a Word document
    doc = Document()

    # Add a title to the document
    doc.add_heading(f'Top 40 Articles for Topic {topic_id}', 0)

    # Loop through the top articles and add them to the document
    for index, row in top_articles.iterrows():
        doc.add_heading(f"Article ID: {row['Unnamed: 0']}", level=1)
        doc.add_paragraph(f"Date: {row['Date']}")
        doc.add_paragraph(f"Title: {row['Title']}")
        doc.add_paragraph(f"Newspaper Outlet: {row['News Outlet']}")
        doc.add_paragraph(f"Topic Share: {row[f'Topic{topic_id}_Share']:.2f}")
        doc.add_paragraph("Article Text:")
        doc.add_paragraph(row['Article_Text'])
        doc.add_page_break()

    # Save the document
    doc.save(f'/Users/giacomoraederscheidt/Dropbox/Paper_Giacomo_Lotti/0 Text-Tables/top_40_articles_topic_{topic_id}.docx')

    print(f"Word document with top articles for topic {topic_id} created successfully.")





Loading model from disk...
Model loaded.
Loading data...
Top 10 articles with highest topic share for topic 0:
      Unnamed: 0.1  Unnamed: 0                 Jurisdiction  Location  \
6103         18173        7502                International       NaN   
4877         14865       14641                International       NaN   
6121         18213        8677                International       NaN   
7214         19554       43232                International       NaN   
4881         14872       15227                International       NaN   
6118         18206        8444                International       NaN   
5712         17136       49674                International       NaN   
3111         11094        5470                International       NaN   
5509         16709       32580                International       NaN   
5307         16413       22524                International       NaN   
7121         19365       25413                International       NaN   
4028         

Top 10 words in top articles for topic 1:
climat       416
africa       275
south        262
countri      249
chang        231
energi       196
emiss        189
transit      172
xa           162
carbon       156
develop      144
cop          124
global       119
agreement    112
commit       108
nation       100
need          87
financ        82
plan          82
govern        77
economi       76
sa            74
world         73
ga            71
support       69
invest        69
report        67
target        66
fund          65
action        64
power         63
fuel          62
greenhous     62
bank          62
renew         61
fossil        60
negoti        59
risk          58
reduc         56
confer        56
Name: Processed_Article, dtype: int64
Word document with top articles for topic 1 created successfully.
Top 10 articles with highest topic share for topic 2:
      Unnamed: 0.1  Unnamed: 0                 Jurisdiction  Location  \
3479         11970       20310                I

Top 10 words in top articles for topic 2:
share        115
mine         113
compani      103
yesterday     79
asset         74
sharehold     66
staff         61
writer        60
south         53
resourc       51
hold          48
optimum       43
project       43
offer         42
price         40
group         40
bn            39
deal          38
market        38
african       38
glencor       37
invest        36
list          36
fund          36
anglo         33
acquisit      33
develop       33
busi          32
end           31
properti      31
oper          31
africa        31
capit         29
valu          29
own           28
result        28
produc        27
announc       27
plan          26
buy           25
Name: Processed_Article, dtype: int64
Word document with top articles for topic 2 created successfully.
Top 10 articles with highest topic share for topic 3:
      Unnamed: 0.1  Unnamed: 0                 Jurisdiction  Location  \
204            359       80390                I

Word document with top articles for topic 3 created successfully.
Top 10 articles with highest topic share for topic 4:
      Unnamed: 0.1  Unnamed: 0                 Jurisdiction  Location  \
5398         16551       27224                International       NaN   
1972          5532       31630                International       NaN   
6463         18598       42363                International       NaN   
4029         13387        4536                International       NaN   
1866          5408       27213                International       NaN   
7172         19469       35159                International       NaN   
303            474        4794                International       NaN   
92             169       29358                International       NaN   
2973         10871        2190                International       NaN   
1295          4007        2272                International       NaN   
1736          5245       23601                International       NaN   
7079

Word document with top articles for topic 4 created successfully.
Top 10 articles with highest topic share for topic 5:
      Unnamed: 0.1  Unnamed: 0             Jurisdiction  Location ContentType  \
2711         10330       60349            International       NaN        News   
3582         12132       26589            International       NaN        News   
7025         19232       10462            International       NaN        News   
2678         10242       55762            International       NaN        News   
2710         10329       60348            International       NaN        News   
7353         19826       65471            International       NaN        News   
4919         14942       30380            International       NaN        News   
2831         10579       75573            International       NaN        News   
2714         10333       60373            International       NaN        News   
2724         10352       60852            International       NaN     

Top 10 words in top articles for topic 6:
xa              116
team             91
busi             86
leagu            86
nation           81
game             79
play             74
player           68
ladi             67
nyda             66
coach            63
school           62
cape             60
youth            57
stadium          55
thunderbird      51
footbal          49
match            46
women            45
start            45
club             43
citi             38
develop          37
sundown          37
univers          36
banyana          36
win              35
season           35
championship     34
manag            34
high             34
south            33
young            33
challeng         32
pupil            32
place            31
back             31
fund             31
us               31
grant            31
Name: Processed_Article, dtype: int64
Word document with top articles for topic 6 created successfully.
Top 10 articles with highest topic share for topic 7:


Word document with top articles for topic 7 created successfully.
Top 10 articles with highest topic share for topic 8:
      Unnamed: 0.1  Unnamed: 0                 Jurisdiction  Location  \
2228          6989       59387                International       NaN   
7126         19385       27330                International       NaN   
6535         18684       51312                International       NaN   
3796         12724       58677                International       NaN   
6572         18724       55976                International       NaN   
6637         18796       59737                International       NaN   
7299         19676       55659                International       NaN   
5051         15133       55719                International       NaN   
1574          4864       17407                International       NaN   
2186          5804       41534                International       NaN   
2374          7428       81403                International       NaN   
5138

Word document with top articles for topic 8 created successfully.
Top 10 articles with highest topic share for topic 9:
      Unnamed: 0.1  Unnamed: 0   Jurisdiction  Location ContentType  \
3020         10943        3548  International       NaN        News   
3160         11240        7276  International       NaN        News   
3666         12367       42615  International       NaN        News   
3789         12715       58228  International       NaN        News   
3424         11896       18718  International       NaN        News   
3112         11100        5669  International       NaN        News   
3093         11054        5105  International       NaN        News   
3167         11261        7708  International       NaN        News   
3091         11043        4965  International       NaN        News   
5553         16787       36287  International       NaN        News   
719            970       27778  International       NaN        News   
3736         12610       519

Top 10 articles with highest topic share for topic 10:
      Unnamed: 0.1  Unnamed: 0                           Jurisdiction  \
7804         22974       62705                          International   
7790         22954       59182              International; Tamil Nadu   
7948         23296       48825                          International   
7921         23264       37465             International; Maharashtra   
7900         23241       26117                          International   
7964         23314       57713              International; Tamil Nadu   
5091         15189       62082                          International   
7646         22714       26164                          International   
7782         22941       57664                          International   
7533         22490       27191                          International   
268            436        3106                          International   
7880         23215        4010                          International

Word document with top articles for topic 10 created successfully.
Top 10 articles with highest topic share for topic 11:
      Unnamed: 0.1  Unnamed: 0                 Jurisdiction  Location  \
3788         12713       58066                International       NaN   
6413         18546       38768                International       NaN   
3586         12146       27562                International       NaN   
5887         17434       63240                International       NaN   
6606         18764       58215  U.S. Federal; International       NaN   
3770         12681       56536                International       NaN   
5571         16810       37617  U.S. Federal; International       NaN   
5411         16567       27616                International       NaN   
6301         18422       26867                International       NaN   
1345          4094        5068  U.S. Federal; International       NaN   
3591         12152       27775                International       NaN   
34

Word document with top articles for topic 11 created successfully.
Top 10 articles with highest topic share for topic 12:
      Unnamed: 0.1  Unnamed: 0                 Jurisdiction  Location  \
4856         14833        9004  U.S. Federal; International       NaN   
4956         15004       37032                International       NaN   
3358         11812       16971  U.S. Federal; International       NaN   
7194         19512       39563  U.S. Federal; International       NaN   
4948         14995       36575  U.S. Federal; International       NaN   
2600         10058       43370  U.S. Federal; International       NaN   
3547         12057       22262                International       NaN   
1598          5072       19512  U.S. Federal; International       NaN   
2559          9861       22711  U.S. Federal; International       NaN   
4946         14993       36492                International       NaN   
6131         18229        8977                International       NaN   
49

Top 10 words in top articles for topic 13:
africa           377
south            317
trade            271
countri          209
invest           195
sa               195
african          178
export           155
bric             146
develop          141
busi             138
market           132
china            129
bn               129
growth           125
industri         121
econom           113
economi          112
sector           102
govern            97
compani           96
bank              91
import            88
product           86
global            85
opportun          80
manufactur        68
state             63
oper              61
increas           60
world             60
good              57
intern            55
infrastructur     54
grow              53
chines            53
local             52
contin            52
foreign           52
partner           50
Name: Processed_Article, dtype: int64
Word document with top articles for topic 13 created successfully.
Top 10 artic

Top 10 words in top articles for topic 14:
climat        367
chang         312
water         200
africa        164
south         162
peopl         132
world         126
global        106
citi           95
warm           91
countri        84
increas        81
need           79
energi         74
temperatur     72
plastic        71
report         71
wast           69
level          68
food           67
get            63
cape           61
recycl         60
product        58
drought        58
system         57
emiss          57
human          57
weather        55
impact         55
flood          54
power          53
work           51
environ        50
industri       50
commun         50
mani           50
part           50
pollut         49
live           49
Name: Processed_Article, dtype: int64
Word document with top articles for topic 14 created successfully.
Top 10 articles with highest topic share for topic 15:
      Unnamed: 0.1  Unnamed: 0                             Jurisdiction  \
49

Word document with top articles for topic 15 created successfully.
Top 10 articles with highest topic share for topic 16:
      Unnamed: 0.1  Unnamed: 0                         Jurisdiction  Location  \
7958         23308       55097                        International       NaN   
7965         23316       57782                        International       NaN   
2531          9789       11775                        International       NaN   
7644         22712       24821                        International       NaN   
5530         16742       34521                        International       NaN   
2425          9601         245               Florida; International       NaN   
3659         12338       40825                        International       NaN   
3754         12650       54971                        International       NaN   
2535          9797       13010                        International       NaN   
1149          1947       78228                        International 

Word document with top articles for topic 16 created successfully.
Top 10 articles with highest topic share for topic 17:
      Unnamed: 0.1  Unnamed: 0  \
6073         18094        5629   
6078         18101        5927   
3397         11860       18041   
2484          9704        5608   
6027         17975        2369   
492            681       13157   
6083         18113        6284   
4191         13563       12777   
3241         11480       10507   
6137         18241        9257   
6089         18143        6655   
6079         18102        6007   
5811         17315       58182   
2260          7281       72856   
1329          4067        4128   
601            806       18250   
6080         18110        6225   
6257         18375       22232   
1381          4167        6752   
4201         13657       14555   
720            971       27864   
2198          5817       41879   
5391         16541       26863   
522            714       14697   
7680         22765       334

Top 10 words in top articles for topic 18:
mine          598
miner         272
industri      234
women         165
compani       148
state         141
resourc       127
right         108
develop       106
govern         99
need           89
sector         88
south          86
depart         85
act            76
sa             75
nationalis     72
chang          65
invest         65
africa         63
african        62
xa             59
bill           54
polici         50
nation         50
support        48
process        48
propos         47
tax            47
mr             46
issu           46
expropri       45
minist         45
anc            44
system         44
amend          44
investor       43
work           42
law            42
busi           41
Name: Processed_Article, dtype: int64
Word document with top articles for topic 18 created successfully.
Top 10 articles with highest topic share for topic 19:
      Unnamed: 0.1  Unnamed: 0                 Jurisdiction  Location  \
2426

Top 10 words in top articles for topic 19:
port             327
transnet         161
ton              151
termin           142
export           129
bay              128
rail             125
richard          108
capac            106
durban            98
line              89
contain           82
handl             81
maputo            67
increas           59
plan              58
cargo             56
ship              55
freight           52
invest            51
train             48
compani           47
oper              47
rbct              43
tfr               43
south             43
infrastructur     41
sa                40
africa            40
improv            38
project           37
volum             36
cape              36
bn                35
expans            35
manag             35
east              33
transport         32
busi              32
servic            30
Name: Processed_Article, dtype: int64
Word document with top articles for topic 19 created successfully.


In [2]:
# Extract top articles for spike periods:

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pickle
import pandas as pd
from docx import Document

# Load the model from disk
print("Loading model from disk...")
with open('dtm_model_subset_100_percent.pkl', 'rb') as file:
    model = pickle.load(file)
print("Model loaded.")

# Load the preprocessed dataset
print("Loading data...")
df = pd.read_csv('/Users/giacomoraederscheidt/Dropbox/Paper_Giacomo_Lotti/0 Data/coal_data_preprocessed.csv')
df['Date'] = pd.to_datetime(df['Date'])

# Define topics and their corresponding topic IDs and date ranges
topics = {
    "anti-mining activism": {
        "id": 9,
        "date_ranges": [
            ("2017-08-01", "2018-03-31")
        ]
    },
    "climate change impact": {
        "id": 14,
        "date_ranges": [
            ("2017-08-01", "2018-01-31")
        ]
    },
    "state capture and corruption": {
        "id": 7,
        "date_ranges": [
            ("2017-08-01", "2018-02-28"),
            ("2020-08-01", "2021-02-28")
        ]
    },
    "energy policy": {
        "id": 4,
        "date_ranges": [
            ("2020-06-01", "2021-02-28")
        ]
    },
    "transition": {
        "id": 1,
        "date_ranges": [
            ("2021-06-01", "2022-06-28")
        ]
    }
}

# Function to extract and save top articles for a given topic and date range
def extract_and_save_articles(topic_name, topic_id, start_date, end_date):
    # Filter data for the date range
    date_range = (df['Date'] >= start_date) & (df['Date'] <= end_date)
    df_filtered = df[date_range]
    
    # Add a new column to the dataframe for the topic share
    df_filtered[f'Topic{topic_id}_Share'] = model.gamma_[df_filtered.index, topic_id]
    
    # Get the top 20 articles with the highest share of the topic
    top_articles = df_filtered.nlargest(20, f'Topic{topic_id}_Share')
    
    # Save the top articles to a new CSV file
    csv_filename = f'/Users/giacomoraederscheidt/Dropbox/Paper_Giacomo_Lotti/0 Text-Tables/top_20_articles_{topic_name.replace(" ", "_")}_{start_date}_to_{end_date}.csv'
    top_articles.to_csv(csv_filename, index=False)
    
    # Create a Word document
    doc = Document()
    
    # Add a title to the document
    doc.add_heading(f'Top 20 Articles for {topic_name} ({start_date} to {end_date})', 0)
    
    # Loop through the top articles and add them to the document
    for index, row in top_articles.iterrows():
        doc.add_heading(f"Article ID: {row['Unnamed: 0']}", level=1)
        doc.add_paragraph(f"Date: {row['Date']}")
        doc.add_paragraph(f"Title: {row['Title']}")
        doc.add_paragraph(f"Newspaper Outlet: {row['News Outlet']}")
        doc.add_paragraph(f"Topic Share: {row[f'Topic{topic_id}_Share']:.2f}")
        doc.add_paragraph("Article Text:")
        doc.add_paragraph(row['Article_Text'])
        doc.add_page_break()
    
    # Save the document
    doc_filename = f'/Users/giacomoraederscheidt/Dropbox/Paper_Giacomo_Lotti/0 Text-Tables/top_20_articles_{topic_name.replace(" ", "_")}_{start_date}_to_{end_date}.docx'
    doc.save(doc_filename)
    
    print(f"Articles for '{topic_name}' ({start_date} to {end_date}) saved to {csv_filename} and {doc_filename}")

# Extract and save articles for each topic and date range
for topic_name, topic_info in topics.items():
    topic_id = topic_info["id"]
    for date_range in topic_info["date_ranges"]:
        start_date, end_date = date_range
        extract_and_save_articles(topic_name, topic_id, start_date, end_date)

print("All articles extracted and saved successfully.")


Loading model from disk...
Model loaded.
Loading data...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered[f'Topic{topic_id}_Share'] = model.gamma_[df_filtered.index, topic_id]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered[f'Topic{topic_id}_Share'] = model.gamma_[df_filtered.index, topic_id]


Articles for 'anti-mining activism' (2017-08-01 to 2018-03-31) saved to /Users/giacomoraederscheidt/Dropbox/Paper_Giacomo_Lotti/0 Text-Tables/top_20_articles_anti-mining_activism_2017-08-01_to_2018-03-31.csv and /Users/giacomoraederscheidt/Dropbox/Paper_Giacomo_Lotti/0 Text-Tables/top_20_articles_anti-mining_activism_2017-08-01_to_2018-03-31.docx
Articles for 'climate change impact' (2017-08-01 to 2018-01-31) saved to /Users/giacomoraederscheidt/Dropbox/Paper_Giacomo_Lotti/0 Text-Tables/top_20_articles_climate_change_impact_2017-08-01_to_2018-01-31.csv and /Users/giacomoraederscheidt/Dropbox/Paper_Giacomo_Lotti/0 Text-Tables/top_20_articles_climate_change_impact_2017-08-01_to_2018-01-31.docx


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered[f'Topic{topic_id}_Share'] = model.gamma_[df_filtered.index, topic_id]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered[f'Topic{topic_id}_Share'] = model.gamma_[df_filtered.index, topic_id]


Articles for 'state capture and corruption' (2017-08-01 to 2018-02-28) saved to /Users/giacomoraederscheidt/Dropbox/Paper_Giacomo_Lotti/0 Text-Tables/top_20_articles_state_capture_and_corruption_2017-08-01_to_2018-02-28.csv and /Users/giacomoraederscheidt/Dropbox/Paper_Giacomo_Lotti/0 Text-Tables/top_20_articles_state_capture_and_corruption_2017-08-01_to_2018-02-28.docx
Articles for 'state capture and corruption' (2020-08-01 to 2021-02-28) saved to /Users/giacomoraederscheidt/Dropbox/Paper_Giacomo_Lotti/0 Text-Tables/top_20_articles_state_capture_and_corruption_2020-08-01_to_2021-02-28.csv and /Users/giacomoraederscheidt/Dropbox/Paper_Giacomo_Lotti/0 Text-Tables/top_20_articles_state_capture_and_corruption_2020-08-01_to_2021-02-28.docx


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered[f'Topic{topic_id}_Share'] = model.gamma_[df_filtered.index, topic_id]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered[f'Topic{topic_id}_Share'] = model.gamma_[df_filtered.index, topic_id]


Articles for 'energy policy' (2020-06-01 to 2021-02-28) saved to /Users/giacomoraederscheidt/Dropbox/Paper_Giacomo_Lotti/0 Text-Tables/top_20_articles_energy_policy_2020-06-01_to_2021-02-28.csv and /Users/giacomoraederscheidt/Dropbox/Paper_Giacomo_Lotti/0 Text-Tables/top_20_articles_energy_policy_2020-06-01_to_2021-02-28.docx
Articles for 'transition' (2021-06-01 to 2022-06-28) saved to /Users/giacomoraederscheidt/Dropbox/Paper_Giacomo_Lotti/0 Text-Tables/top_20_articles_transition_2021-06-01_to_2022-06-28.csv and /Users/giacomoraederscheidt/Dropbox/Paper_Giacomo_Lotti/0 Text-Tables/top_20_articles_transition_2021-06-01_to_2022-06-28.docx
All articles extracted and saved successfully.


In [8]:
# Extract all articles from 2020 containing the word "renewables"


print("Loading data...")
df = pd.read_csv('/Users/giacomoraederscheidt/Dropbox/Paper_Giacomo_Lotti/0 Data/coal_data_preprocessed.csv')
df['Date'] = pd.to_datetime(df['Date'])

filtered_df = df[(df['Date'].dt.year == 2020) & (df['Article_Text'].str.contains('renewable', case=False, na=False))]

print(f"Found {len(filtered_df)} articles containing the word 'renewable' from the year 2020:")
print(filtered_df[['Date', 'Article_Text']])

# Save the filtered articles to a new CSV file (optional)
filtered_df.to_csv('/Users/giacomoraederscheidt/Dropbox/Paper_Giacomo_Lotti/0 Text-Tables/filtered_articles_2020_renewables.csv', index=False)

Loading data...
Found 58 articles containing the word 'renewable' from the year 2020:
                          Date  \
963  2020-01-26 00:00:00+00:00   
964  2020-02-02 00:00:00+00:00   
965  2020-02-09 00:00:00+00:00   
967  2020-02-16 00:00:00+00:00   
971  2020-03-01 00:00:00+00:00   
979  2020-07-12 00:00:00+00:00   
990  2020-11-08 00:00:00+00:00   
993  2020-11-15 00:00:00+00:00   
2234 2020-02-06 00:00:00+00:00   
2235 2020-02-18 00:00:00+00:00   
2236 2020-03-13 00:00:00+00:00   
2733 2020-02-13 00:00:00+00:00   
2735 2020-02-14 00:00:00+00:00   
2736 2020-02-14 00:00:00+00:00   
2737 2020-03-05 00:00:00+00:00   
2742 2020-03-16 00:00:00+00:00   
2763 2020-10-20 00:00:00+00:00   
2765 2020-11-05 00:00:00+00:00   
2766 2020-11-12 00:00:00+00:00   
2767 2020-11-26 00:00:00+00:00   
3840 2020-05-19 00:00:00+00:00   
3841 2020-06-04 00:00:00+00:00   
3844 2020-07-31 00:00:00+00:00   
4541 2020-02-06 00:00:00+00:00   
4542 2020-02-06 00:00:00+00:00   
4548 2020-04-16 00:00:00+00:00

In [19]:
import matplotlib.pyplot as plt
import numpy as np
import pickle
import pandas as pd
from docx import Document

# Load the model from disk
print("Loading model from disk...")
with open('dtm_model_subset_100_percent.pkl', 'rb') as file:
    model = pickle.load(file)
print("Model loaded.")

# Load the preprocessed dataset
print("Loading data...")
df = pd.read_csv('/Users/giacomoraederscheidt/Dropbox/Paper_Giacomo_Lotti/0 Data/coal_data_preprocessed.csv')
df['Date'] = pd.to_datetime(df['Date'])

# Define topics and their corresponding topic IDs and date ranges
topics = {
    "Eskom crisis2": {
        "id": 8,
        "date_ranges": [
            ("2021-01-01", "2021-12-31")
        ]
        }
}

# Function to extract and save top articles for a given topic and date range
def extract_and_save_articles(topic_name, topic_id, start_date, end_date):
    # Filter data for the date range
    date_range = (df['Date'] >= start_date) & (df['Date'] <= end_date)
    df_filtered = df[date_range]
    
    # Add a new column to the dataframe for the topic share
    df_filtered[f'Topic{topic_id}_Share'] = model.gamma_[df_filtered.index, topic_id]
    
    # Get the top 20 articles with the highest share of the topic
    top_articles = df_filtered.nlargest(100, f'Topic{topic_id}_Share')
    
    # Save the top articles to a new CSV file
    csv_filename = f'/Users/giacomoraederscheidt/Dropbox/Paper_Giacomo_Lotti/0 Text-Tables/top_20_articles_{topic_name.replace(" ", "_")}_{start_date}_to_{end_date}.csv'
    top_articles.to_csv(csv_filename, index=False)
    
    # Create a Word document
    doc = Document()
    
    # Add a title to the document
    doc.add_heading(f'Top 20 Articles for {topic_name} ({start_date} to {end_date})', 0)
    
    # Loop through the top articles and add them to the document
    for index, row in top_articles.iterrows():
        doc.add_heading(f"Article ID: {row['Unnamed: 0']}", level=1)
        doc.add_paragraph(f"Date: {row['Date']}")
        doc.add_paragraph(f"Title: {row['Title']}")
        doc.add_paragraph(f"Newspaper Outlet: {row['News Outlet']}")
        doc.add_paragraph(f"Topic Share: {row[f'Topic{topic_id}_Share']:.2f}")
        doc.add_paragraph("Article Text:")
        doc.add_paragraph(row['Article_Text'])
        doc.add_page_break()
    
    # Save the document
    doc_filename = f'/Users/giacomoraederscheidt/Dropbox/Paper_Giacomo_Lotti/0 Text-Tables/top_20_articles_{topic_name.replace(" ", "_")}_{start_date}_to_{end_date}.docx'
    doc.save(doc_filename)
    
    print(f"Articles for '{topic_name}' ({start_date} to {end_date}) saved to {csv_filename} and {doc_filename}")

# Extract and save articles for each topic and date range
for topic_name, topic_info in topics.items():
    topic_id = topic_info["id"]
    for date_range in topic_info["date_ranges"]:
        start_date, end_date = date_range
        extract_and_save_articles(topic_name, topic_id, start_date, end_date)

print("All articles extracted and saved successfully.")


Loading model from disk...
Model loaded.
Loading data...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered[f'Topic{topic_id}_Share'] = model.gamma_[df_filtered.index, topic_id]


Articles for 'Eskom crisis2' (2021-01-01 to 2021-12-31) saved to /Users/giacomoraederscheidt/Dropbox/Paper_Giacomo_Lotti/0 Text-Tables/top_20_articles_Eskom_crisis2_2021-01-01_to_2021-12-31.csv and /Users/giacomoraederscheidt/Dropbox/Paper_Giacomo_Lotti/0 Text-Tables/top_20_articles_Eskom_crisis2_2021-01-01_to_2021-12-31.docx
All articles extracted and saved successfully.


In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pickle
import pandas as pd
from docx import Document

# Load the model from disk
print("Loading model from disk...")
with open('dtm_model_subset_100_percent.pkl', 'rb') as file:
    model = pickle.load(file)
print("Model loaded.")

# Load the preprocessed dataset
print("Loading data...")
df = pd.read_csv('/Users/giacomoraederscheidt/Dropbox/Paper_Giacomo_Lotti/0 Data/coal_data_preprocessed.csv')
df['Date'] = pd.to_datetime(df['Date'])


keywords = ["COP26", "just energy transition partnership"]
date_ranges = [("2020-01-01", "2022-12-31")]

# Function to extract and save articles containing specific keywords within a date range
def extract_and_save_articles(keyword, start_date, end_date):
    # Filter data for the date range
    date_range = (df['Date'] >= start_date) & (df['Date'] <= end_date)
    df_filtered = df[date_range]
    
    # Filter for articles containing the keyword
    df_filtered = df_filtered[df_filtered['Article_Text'].str.contains(keyword, case=False, na=False)]
    
    # Get the top 100 articles
    top_articles = df_filtered.nlargest(100, 'Date')
    
    # Save the top articles to a new CSV file
    csv_filename = f'/Users/giacomoraederscheidt/Dropbox/Paper_Giacomo_Lotti/0 Text-Tables/top_100_articles_{keyword.replace(" ", "_")}_{start_date}_to_{end_date}.csv'
    top_articles.to_csv(csv_filename, index=False)
    
    # Create a Word document
    doc = Document()
    
    # Add a title to the document
    doc.add_heading(f'Top 100 Articles for {keyword} ({start_date} to {end_date})', 0)
    
    # Loop through the top articles and add them to the document
    for index, row in top_articles.iterrows():
        doc.add_heading(f"Article ID: {row['Unnamed: 0']}", level=1)
        doc.add_paragraph(f"Date: {row['Date']}")
        doc.add_paragraph(f"Title: {row['Title']}")
        doc.add_paragraph(f"Newspaper Outlet: {row['News Outlet']}")
        doc.add_paragraph("Article Text:")
        doc.add_paragraph(row['Article_Text'])
        doc.add_page_break()
    
    # Save the document
    doc_filename = f'/Users/giacomoraederscheidt/Dropbox/Paper_Giacomo_Lotti/0 Text-Tables/top_100_articles_{keyword.replace(" ", "_")}_{start_date}_to_{end_date}.docx'
    doc.save(doc_filename)
    
    print(f"Articles containing '{keyword}' ({start_date} to {end_date}) saved to {csv_filename} and {doc_filename}")

# Extract and save articles for each keyword and date range
for keyword in keywords:
    for start_date, end_date in date_ranges:
        extract_and_save_articles(keyword, start_date, end_date)

print("All articles extracted and saved successfully.")


Loading model from disk...
Model loaded.
Loading data...
Articles containing 'COP26' (2020-01-01 to 2022-12-31) saved to /Users/giacomoraederscheidt/Dropbox/Paper_Giacomo_Lotti/0 Text-Tables/top_100_articles_COP26_2020-01-01_to_2022-12-31.csv and /Users/giacomoraederscheidt/Dropbox/Paper_Giacomo_Lotti/0 Text-Tables/top_100_articles_COP26_2020-01-01_to_2022-12-31.docx
Articles containing 'just energy transition partnership' (2020-01-01 to 2022-12-31) saved to /Users/giacomoraederscheidt/Dropbox/Paper_Giacomo_Lotti/0 Text-Tables/top_100_articles_just_energy_transition_partnership_2020-01-01_to_2022-12-31.csv and /Users/giacomoraederscheidt/Dropbox/Paper_Giacomo_Lotti/0 Text-Tables/top_100_articles_just_energy_transition_partnership_2020-01-01_to_2022-12-31.docx
All articles extracted and saved successfully.


In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pickle
import pandas as pd
from docx import Document

# Load the model from disk
print("Loading model from disk...")
with open('dtm_model_subset_100_percent.pkl', 'rb') as file:
    model = pickle.load(file)
print("Model loaded.")

# Load the preprocessed dataset
print("Loading data...")
df = pd.read_csv('/Users/giacomoraederscheidt/Dropbox/Paper_Giacomo_Lotti/0 Data/coal_data_preprocessed.csv')
df['Date'] = pd.to_datetime(df['Date'])
df['Year'] = df['Date'].dt.year

# Loop through each topic ID and create documents for each
for topic_id in range(model.gamma_.shape[1]):
    # Add a new column to the dataframe for the topic share
    df[f'Topic{topic_id}_Share'] = model.gamma_[:, topic_id]

    # Create an empty dataframe to hold the most representative articles for each year
    most_representative_articles = pd.DataFrame()

    # Loop through each year and find the most representative article
    for year in df['Year'].unique():
        df_year = df[df['Year'] == year]
        if not df_year.empty:
            # Get the article with the highest topic share for the current year
            most_representative_article = df_year.loc[df_year[f'Topic{topic_id}_Share'].idxmax()]
            most_representative_articles = most_representative_articles.append(most_representative_article, ignore_index=True)
    
    # Save the most representative articles to a new CSV file
    most_representative_articles.to_csv(f'/Users/giacomoraederscheidt/Dropbox/Paper_Giacomo_Lotti/0 Text-Tables/most_representative_articles_topic_{topic_id}.csv', index=False)

    # Create a Word document
    doc = Document()

    # Add a title to the document
    doc.add_heading(f'Most Representative Articles for Topic {topic_id} by Year', 0)

    # Loop through the most representative articles and add them to the document
    for index, row in most_representative_articles.iterrows():
        doc.add_heading(f"Year: {row['Year']} - Article ID: {row['Unnamed: 0']}", level=1)
        doc.add_paragraph(f"Date: {row['Date']}")
        doc.add_paragraph(f"Title: {row['Title']}")
        doc.add_paragraph(f"Newspaper Outlet: {row['News Outlet']}")
        doc.add_paragraph(f"Topic Share: {row[f'Topic{topic_id}_Share']:.2f}")
        doc.add_paragraph("Article Text:")
        doc.add_paragraph(row['Article_Text'])
        doc.add_page_break()

    # Save the document
    doc.save(f'/Users/giacomoraederscheidt/Dropbox/Paper_Giacomo_Lotti/0 Text-Tables/most_representative_articles_topic_{topic_id}.docx')

    print(f"Most representative articles for topic {topic_id} saved to CSV and Word documents successfully.")


Loading model from disk...
Model loaded.
Loading data...


  most_representative_articles = most_representative_articles.append(most_representative_article, ignore_index=True)
  most_representative_articles = most_representative_articles.append(most_representative_article, ignore_index=True)


Most representative articles for topic 0 saved to CSV and Word documents successfully.
Most representative articles for topic 1 saved to CSV and Word documents successfully.


  most_representative_articles = most_representative_articles.append(most_representative_article, ignore_index=True)


Most representative articles for topic 2 saved to CSV and Word documents successfully.
Most representative articles for topic 3 saved to CSV and Word documents successfully.


  most_representative_articles = most_representative_articles.append(most_representative_article, ignore_index=True)
  most_representative_articles = most_representative_articles.append(most_representative_article, ignore_index=True)


Most representative articles for topic 4 saved to CSV and Word documents successfully.


  most_representative_articles = most_representative_articles.append(most_representative_article, ignore_index=True)


Most representative articles for topic 5 saved to CSV and Word documents successfully.


  most_representative_articles = most_representative_articles.append(most_representative_article, ignore_index=True)


Most representative articles for topic 6 saved to CSV and Word documents successfully.


  most_representative_articles = most_representative_articles.append(most_representative_article, ignore_index=True)


Most representative articles for topic 7 saved to CSV and Word documents successfully.


  most_representative_articles = most_representative_articles.append(most_representative_article, ignore_index=True)


Most representative articles for topic 8 saved to CSV and Word documents successfully.


  most_representative_articles = most_representative_articles.append(most_representative_article, ignore_index=True)


Most representative articles for topic 9 saved to CSV and Word documents successfully.


  most_representative_articles = most_representative_articles.append(most_representative_article, ignore_index=True)


Most representative articles for topic 10 saved to CSV and Word documents successfully.


  most_representative_articles = most_representative_articles.append(most_representative_article, ignore_index=True)


Most representative articles for topic 11 saved to CSV and Word documents successfully.


  most_representative_articles = most_representative_articles.append(most_representative_article, ignore_index=True)


Most representative articles for topic 12 saved to CSV and Word documents successfully.


  most_representative_articles = most_representative_articles.append(most_representative_article, ignore_index=True)
  most_representative_articles = most_representative_articles.append(most_representative_article, ignore_index=True)


Most representative articles for topic 13 saved to CSV and Word documents successfully.
Most representative articles for topic 14 saved to CSV and Word documents successfully.


  most_representative_articles = most_representative_articles.append(most_representative_article, ignore_index=True)


Most representative articles for topic 15 saved to CSV and Word documents successfully.


  most_representative_articles = most_representative_articles.append(most_representative_article, ignore_index=True)


Most representative articles for topic 16 saved to CSV and Word documents successfully.


  most_representative_articles = most_representative_articles.append(most_representative_article, ignore_index=True)


Most representative articles for topic 17 saved to CSV and Word documents successfully.


  most_representative_articles = most_representative_articles.append(most_representative_article, ignore_index=True)


Most representative articles for topic 18 saved to CSV and Word documents successfully.


  most_representative_articles = most_representative_articles.append(most_representative_article, ignore_index=True)


Most representative articles for topic 19 saved to CSV and Word documents successfully.
