In [1]:
"""
pip install scattertext
pip install astropy
pip install empath
pip install flashtext
pip install gensim
pip install umap-learn
pip install -U pip setuptools wheel
pip install -U spacy
python3 -m spacy download en_core_web_sm
"""

'\npip install scattertext\npip install astropy\npip install empath\npip install flashtext\npip install gensim\npip install umap-learn\npip install -U pip setuptools wheel\npip install -U spacy\npython3 -m spacy download en_core_web_sm\n'

In [2]:
import scattertext as st
import spacy
import nltk
import os
import pandas as pd

# Load the spaCy pipeline "en_core_web_sm" once for the whole notebook; the
# resulting object is later handed to scattertext as its `nlp` parser.
nlp = spacy.load("en_core_web_sm")

In [3]:
def get_articles_by_directory(directory):
    """Recursively collect the text of every ``.txt`` file under ``directory``.

    Files are decoded as UTF-8 with undecodable bytes dropped, and are visited
    in sorted order so the result is identical on every run and platform.
    A file that cannot be read is reported on stdout and skipped.

    Args:
        directory: Root directory to walk.

    Returns:
        A tuple ``(articles, article_paths)`` of two equally long lists, where
        ``articles[i]`` is the content of the file at ``article_paths[i]``.
    """
    articles = []
    article_paths = []
    for root, dirs, files in os.walk(directory):
        # os.walk yields entries in arbitrary order; sorting ``dirs`` in place
        # also fixes the order in which sub-directories are descended into.
        dirs.sort()
        for file in sorted(files):
            if not file.endswith(".txt"):
                continue
            path = os.path.join(root, file)
            try:
                # Explicit encoding: without it the decoding depends on the
                # locale of the machine running the notebook.
                with open(path, "r", encoding="utf-8", errors="ignore") as f:
                    text = f.read()
            except Exception as e:
                print(f"Error reading file: {path}\n{e}")
                continue
            # Append both only after a successful read so the lists stay aligned.
            articles.append(text)
            article_paths.append(path)
    print(f"Found {len(articles)} articles in {directory}.")
    return articles, article_paths

def tokenize(articles):
    """Split each article into sentences with ``nltk.sent_tokenize``.

    The result always has one entry per input article, in input order, so it
    stays aligned with any parallel list (such as the article paths). If
    sentence splitting fails for an article, the error is printed and the
    whole article is kept as a single "sentence" instead of being dropped.

    Args:
        articles: Iterable of article texts.

    Returns:
        A list containing one list of sentence strings per article.
    """
    tokenized_articles = []
    failures = 0
    for article in articles:
        try:
            sentences = nltk.sent_tokenize(article)
        except Exception as e:
            print(f"Tokenization error:\n{e}")
            failures += 1
            # Fall back to the untouched text; dropping the article would
            # shift every later entry relative to the caller's other lists.
            sentences = [article]
        tokenized_articles.append(sentences)
    # Report only the articles that were actually sentence-split.
    print(f"Tokenized {len(tokenized_articles) - failures} articles.")
    if failures:
        print(f"Kept {failures} articles unsplit after tokenization errors.")
    return tokenized_articles

def get_type(path):
    """Classify an article as "Presse", "NGO" or "IGO" from its path.

    The markers are tested in that order as plain substrings of ``path``, and
    the first one found is returned.

    Args:
        path: Path of the article file.

    Returns:
        One of "Presse", "NGO" or "IGO".

    Raises:
        AssertionError: If none of the markers occurs in ``path``.
    """
    for marker in ("Presse", "NGO", "IGO"):
        if marker in path:
            return marker
    # Raised explicitly instead of via ``assert False`` so the check is not
    # stripped (and None silently returned) when Python runs with -O.
    raise AssertionError(f"Could not determine type of article: {path}")

def get_org(path):
    """Return the organisation prefix of an article's file name.

    The organisation is the part of the file name (directories removed) that
    precedes the first hyphen; a name without a hyphen is returned whole.

    Args:
        path: Path of the article file.

    Returns:
        The text before the first "-" in the final path component.
    """
    # basename honours the platform's separator(s); the paths are built with
    # os.path.join, so splitting on a literal "/" fails on Windows.
    filename = os.path.basename(path)
    return filename.split("-", 1)[0]



In [4]:
# Read every article below the corpus root together with its file path.
articles, article_paths = get_articles_by_directory("COP/articles/by_org/")

# Sentence-split each article, then join its sentences back with single spaces.
articles = tokenize(articles)
articles = [" ".join(sentences) for sentences in articles]

# Per-article metadata derived from the file path.
types = list(map(get_type, article_paths))
orgs = list(map(get_org, article_paths))

# One row per article: text, source path, organisation type and organisation.
articles_df = pd.DataFrame(
    {
        "article": articles,
        "path": article_paths,
        "type": types,
        "org": orgs,
    }
)
print(articles_df.head())

Found 1561 articles in COP/articles/by_org/.
Tokenized 1561 articles.
                                             article  \
0  A report of Working Group I of the Intergovern...   
1  Climate Change 2007: Synthesis Report Summary ...   
2  Contribution of Working Group II to the Fourth...   
3  Contribution of Working Group III to the Fourt...   
4   Investing in REDD-plus Consensus Recommendati...   

                                                path type   org  
0  COP/articles/by_org/IGO/IGO_COP15/IPCC-2007-1.txt  IGO  IPCC  
1  COP/articles/by_org/IGO/IGO_COP15/IPCC-2007-2.txt  IGO  IPCC  
2  COP/articles/by_org/IGO/IGO_COP15/IPCC-2007-3.txt  IGO  IPCC  
3  COP/articles/by_org/IGO/IGO_COP15/IPCC-2007-4.txt  IGO  IPCC  
4  COP/articles/by_org/IGO/IGO_COP15/REDD-2010-3.txt  IGO  REDD  


In [6]:
# Build the scattertext corpus from articles_df: the "article" column supplies
# the text, the "type" column the category, and `nlp` is used as the parser.
corpus = st.CorpusFromPandas(
    articles_df, text_col="article", category_col="type", nlp=nlp
).build()


In [7]:
# Show the first ten index entries of the scaled-F-score-vs-background table.
print(corpus.get_scaled_f_scores_vs_background().index[:10].tolist())

['climate', 'emissions', 'warming', 'fossil', 'adaptation', 'renewable', 'carbon', 'ghg', 'mitigation', 'obama']


In [8]:
# Dispersion statistics from scattertext, extended with the plot coordinates
# used by the scatter plots below:
#   X / Xpos - the Frequency column and its log-scaled value
#   Y / Ypos - the "Rosengren's S" column and its scaled value
# A single .assign chain builds a new frame instead of mutating the one
# returned by get_df(), so re-running the cell always gives the same result.
dispersion = st.Dispersion(corpus)
dispersion_df = dispersion.get_df().assign(
    X=lambda df: df["Frequency"],
    Xpos=lambda df: st.Scalers.log_scale(df["X"]),
    Y=lambda df: df["Rosengren's S"],
    Ypos=lambda df: st.Scalers.scale(df["Y"]),
)
dispersion_df.head(3)

: 

In [None]:
# Render the dispersion scatter plot (positions come from the Xpos/Ypos
# columns of dispersion_df) and save it as a standalone HTML page.
corpus_df = corpus.get_df()
html = st.dataframe_scattertext(
    corpus,
    plot_df=dispersion_df,
    # Per-document label built from organisation and type, e.g. "IPCC (IGO)".
    metadata=corpus_df["org"] + " (" + corpus_df["type"].str.upper() + ")",
    ignore_categories=True,
    x_label="Log Frequency",
    y_label="Rosengren's S",
    y_axis_labels=["Less Dispersion", "Medium", "More Dispersion"],
    #term_description_column=["Frequency"]
)
# `html` is a str, so it is written as text with an explicit UTF-8 encoding.
# Writing html.encode("utf-8") to a file opened in text mode raises TypeError.
with open("Test.html", "w", encoding="utf-8") as f:
    f.write(html)


In [None]:
# todo filter out stopwords
# ? factor out correlation
from sklearn.neighbors import KNeighborsRegressor

# Estimate the dispersion expected at each frequency with a 10-nearest-
# neighbour regression of Y on X (fitted and evaluated on the same points).
# Residual is the observed minus the expected value; ColorScore is that
# residual passed through scattertext's scale_center_zero_abs scaler.
dispersion_df = dispersion_df.assign(
    Expected=lambda df: KNeighborsRegressor(n_neighbors=10).fit(
        df.X.values.reshape(-1, 1), df.Y
    ).predict(df.X.values.reshape(-1, 1)),
    Residual=lambda df: df.Y - df.Expected,
    ColorScore=lambda df: st.Scalers.scale_center_zero_abs(df.Residual),
)

corpus_df = corpus.get_df()
html = st.dataframe_scattertext(
    corpus,
    plot_df=dispersion_df,
    # Per-document label built from organisation and type, e.g. "IPCC (IGO)".
    metadata=corpus_df['org'] + ' (' + corpus_df['type'].str.upper() + ')',
    ignore_categories=True,
    x_label='Log Frequency',
    y_label="Rosengren's S",
    y_axis_labels=['Less Dispersion', 'Medium', 'More Dispersion'],
    color_score_column='ColorScore',
    header_names={'upper': 'Lower than Expected', 'lower': 'More than Expected'},
    left_list_column='Residual',
    background_color='#e5e5e3'
)
# `html` is a str, so it is written as text with an explicit UTF-8 encoding.
# Writing html.encode("utf-8") to a file opened in text mode raises TypeError.
with open("Test2.html", "w", encoding="utf-8") as f:
    f.write(html)