In [1]:
import pandas as pd
import pickle
import numpy as np

import spacy
# Load the spaCy pipeline once; the same `nlp` object is handed to every
# scattertext corpus built in this notebook.
nlp = spacy.load("en_core_web_sm")

import scattertext as st


In [2]:
# # Import the different sheets.

# yrs = ["2019", "2020"]
# df = pd.DataFrame()

# for sheet in yrs:
#     temp_df = pd.read_excel("../data/2019 - 2020 JCRP Metrics -EB.xlsx", 
#                    sheet_name = sheet)
#     df = pd.concat([df, temp_df])
    
# df = df.dropna(axis=0, how="all")

# # Many of the Title entries are "Selected Abstracts..." and come from the "Literature Update" article type.
# # However, Vol 40 doesn't have an article type, so we have to filter on the titles instead.

# df = df[df["Title"].str.contains("Selected Abstracts From Recent Publications") == False]

In [3]:
# Restore the previously pickled object (used below as a pandas DataFrame with
# at least the Tweets, Volume, Title and Full_Abstract columns).
# NOTE: unpickling can execute arbitrary code, so only load pickles you trust.
with open("../data/dataframe_with_abstracts.pickle", mode="rb") as pickle_handle:
    df = pickle.load(pickle_handle)

# Abstracts
---

### Look at top words by popularity.

In [4]:
# Percentile-rank every row by its tweet count (pct=True scales ranks to (0, 1]).
df["twitter_rank"] = df["Tweets"].rank(pct=True)

# Keep only the two tails of the distribution: the bottom and top thirds.
# Rows ranked from 0.33 to 0.67 inclusive land in neither frame.
df_bottom_tweets = df.loc[df["twitter_rank"] < 0.33]
df_top_tweets = df.loc[df["twitter_rank"] > 0.67]

In [30]:
# Stack the least- and most-tweeted groups, keeping only the rank and the abstract text.
abstract_cols = ["twitter_rank", "Full_Abstract"]
df_twitter = pd.concat([df_bottom_tweets[abstract_cols], df_top_tweets[abstract_cols]])

# Turn the rank into the two category labels scattertext will compare.
# Only ranks below 0.33 or above 0.67 are present, so 0.5 separates the groups.
df_twitter["Popularity"] = (
    df_twitter["twitter_rank"]
    .gt(0.5)
    .map({True: "Popular", False: "Less Popular"})
)


In [31]:
# Build a scattertext corpus from the abstracts, with one category per
# popularity label. Documents are parsed with the shared spaCy pipeline.
corpus_factory = st.CorpusFromPandas(
    df_twitter,
    category_col="Popularity",
    text_col="Full_Abstract",
    nlp=nlp,
)
corpus = corpus_factory.build()

In [32]:
# Render the interactive explorer as an HTML string.
# "Popular" is the category being contrasted; the two *_name arguments are
# only the labels shown in the plot.
html = st.produce_scattertext_explorer(
    corpus,
    category="Popular",
    category_name="Popular",
    not_category_name="Less Popular",
    width_in_pixels=1000,
    # Term filters (see the scattertext docs for exact semantics).
    minimum_term_frequency=6,
    pmi_threshold_coefficient=0,
)

In [33]:
# Save the explorer to disk; open the resulting file in a browser to explore it.
# The context manager guarantees the handle is flushed and closed, instead of
# relying on garbage collection of an anonymous file object.
with open('../images/abstract_popularity_words.html', 'wb') as html_file:
    html_file.write(html.encode('utf-8'))

### Look at top phrases by popularity.

In [34]:
# Build a phrase-level corpus from the abstracts: features come from
# PhraseMachinePhrases instead of the default extractor, and the result is
# compacted with AssociationCompactor(4000) to limit the number of terms.
phrase_corpus_factory = st.CorpusFromPandas(
    df_twitter,
    category_col="Popularity",
    text_col="Full_Abstract",
    feats_from_spacy_doc=st.PhraseMachinePhrases(),
    nlp=nlp,
)
corpus = phrase_corpus_factory.build().compact(st.AssociationCompactor(4000))

In [35]:
# Render the phrase explorer as an HTML string.
# "Popular" is the category being contrasted; the two *_name arguments are
# only the labels shown in the plot.
html = st.produce_scattertext_explorer(
    corpus,
    category="Popular",
    category_name="Popular",
    not_category_name="Less Popular",
    width_in_pixels=1000,
    # No frequency filtering here: the corpus was already compacted.
    minimum_term_frequency=0,
    pmi_threshold_coefficient=0,
    # Position terms by dense rank and score them by rank difference.
    transform=st.dense_rank,
    term_scorer=st.RankDifference(),
)

In [36]:
# Save the explorer to disk; open the resulting file in a browser to explore it.
# The context manager guarantees the handle is flushed and closed, instead of
# relying on garbage collection of an anonymous file object.
with open('../images/abstract_popularity_phrases.html', 'wb') as html_file:
    html_file.write(html.encode('utf-8'))

### Compare volumes - words.

In [12]:
# Keep only the columns needed for the volume comparison. Take an explicit copy
# so the column added below is written to an independent frame rather than to a
# subset of `df` (which raises pandas' SettingWithCopyWarning).
df_vol = df[["Volume", "Full_Abstract"]].copy()

# scattertext needs a string category per row: format the numeric volume with
# no decimals (e.g. 39.0 -> "39").
df_vol["Vol_cat"] = df_vol["Volume"].apply(lambda volume: f"{volume:.0f}")

In [13]:
# Build a scattertext corpus from the abstracts, with one category per
# volume label. Documents are parsed with the shared spaCy pipeline.
corpus_factory = st.CorpusFromPandas(
    df_vol,
    category_col="Vol_cat",
    text_col="Full_Abstract",
    nlp=nlp,
)
corpus = corpus_factory.build()

In [14]:
# Render the interactive explorer as an HTML string.
# "39" is the category being contrasted; "40" is only the display label
# for the documents outside that category.
html = st.produce_scattertext_explorer(
    corpus,
    category="39",
    category_name="39",
    not_category_name="40",
    width_in_pixels=1000,
    # Term filters (see the scattertext docs for exact semantics).
    minimum_term_frequency=6,
    pmi_threshold_coefficient=0,
)

In [15]:
# Save the explorer to disk; open the resulting file in a browser to explore it.
# The context manager guarantees the handle is flushed and closed, instead of
# relying on garbage collection of an anonymous file object.
with open('../images/abstract_volume_words.html', 'wb') as html_file:
    html_file.write(html.encode('utf-8'))

### Compare volumes - phrases.

In [16]:
# Build a phrase-level corpus from the abstracts, grouped by volume: features
# come from PhraseMachinePhrases instead of the default extractor, and the
# result is compacted with AssociationCompactor(4000) to limit the number of terms.
phrase_corpus_factory = st.CorpusFromPandas(
    df_vol,
    category_col="Vol_cat",
    text_col="Full_Abstract",
    feats_from_spacy_doc=st.PhraseMachinePhrases(),
    nlp=nlp,
)
corpus = phrase_corpus_factory.build().compact(st.AssociationCompactor(4000))

In [17]:
# Render the phrase explorer as an HTML string.
# "39" is the category being contrasted; "40" is only the display label
# for the documents outside that category.
html = st.produce_scattertext_explorer(
    corpus,
    category="39",
    category_name="39",
    not_category_name="40",
    width_in_pixels=1000,
    # No frequency filtering here: the corpus was already compacted.
    minimum_term_frequency=0,
    pmi_threshold_coefficient=0,
    # Position terms by dense rank and score them by rank difference.
    transform=st.dense_rank,
    term_scorer=st.RankDifference(),
)

In [18]:
# Save the explorer to disk; open the resulting file in a browser to explore it.
# The context manager guarantees the handle is flushed and closed, instead of
# relying on garbage collection of an anonymous file object.
with open('../images/abstract_volume_phrases.html', 'wb') as html_file:
    html_file.write(html.encode('utf-8'))

# Titles
---

### Compare volumes - Words

In [19]:
# Keep only the columns needed for the title comparison. Take an explicit copy
# so the column added below is written to an independent frame rather than to a
# subset of `df` (which raises pandas' SettingWithCopyWarning).
df_vol = df[["Volume", "Title"]].copy()

# scattertext needs a string category per row: format the numeric volume with
# no decimals (e.g. 39.0 -> "39").
df_vol["Vol_cat"] = df_vol["Volume"].apply(lambda volume: f"{volume:.0f}")

In [20]:
# Build a scattertext corpus from the article titles, one category per volume label.
corpus = st.CorpusFromPandas(
    df_vol,
    category_col="Vol_cat",
    text_col="Title",
    nlp=nlp,
).build()

# Render the interactive explorer as an HTML string.
# "39" is the category being contrasted; "40" is only the display label
# for the documents outside that category.
html = st.produce_scattertext_explorer(
    corpus,
    category="39",
    category_name="39",
    not_category_name="40",
    minimum_term_frequency=6,
    pmi_threshold_coefficient=0,
    width_in_pixels=1000,
)

# Save the explorer to disk; open the resulting file in a browser to explore it.
# The context manager guarantees the handle is flushed and closed, instead of
# relying on garbage collection of an anonymous file object.
with open('../images/title_volume_words.html', 'wb') as html_file:
    html_file.write(html.encode('utf-8'))

### Compare volumes - phrases.

In [21]:
# Build a phrase-level corpus from the article titles, grouped by volume.
# Features come from PhraseMachinePhrases, and the corpus is compacted with
# AssociationCompactor(4000) to limit the number of terms.
corpus = st.CorpusFromPandas(
    df_vol,
    category_col="Vol_cat",
    text_col="Title",
    feats_from_spacy_doc=st.PhraseMachinePhrases(),
    nlp=nlp,
).build().compact(st.AssociationCompactor(4000))

# Render the phrase explorer as an HTML string.
# "39" is the category being contrasted; "40" is only the display label
# for the documents outside that category.
html = st.produce_scattertext_explorer(
    corpus,
    category="39",
    category_name="39",
    not_category_name="40",
    minimum_term_frequency=0,
    pmi_threshold_coefficient=0,
    # Position terms by dense rank and score them by rank difference.
    transform=st.dense_rank,
    term_scorer=st.RankDifference(),
    width_in_pixels=1000,
)

# Save the explorer to disk; open the resulting file in a browser to explore it.
# The context manager guarantees the handle is flushed and closed, instead of
# relying on garbage collection of an anonymous file object.
with open('../images/title_volume_phrases.html', 'wb') as html_file:
    html_file.write(html.encode('utf-8'))

### Compare popularity - Words.

In [27]:
# Stack the least- and most-tweeted groups, keeping only the rank and the title text.
title_cols = ["twitter_rank", "Title"]
df_twitter = pd.concat([df_bottom_tweets[title_cols], df_top_tweets[title_cols]])

# Turn the rank into the two category labels scattertext will compare.
# Only ranks below 0.33 or above 0.67 are present, so 0.5 separates the groups.
df_twitter["Popularity"] = (
    df_twitter["twitter_rank"]
    .gt(0.5)
    .map({True: "Popular", False: "Less Popular"})
)


In [28]:
# Build a scattertext corpus from the article titles, one category per popularity label.
corpus = st.CorpusFromPandas(
    df_twitter,
    category_col="Popularity",
    text_col="Title",
    nlp=nlp,
).build()

# Render the interactive explorer as an HTML string.
# "Popular" is the category being contrasted; the two *_name arguments are
# only the labels shown in the plot.
html = st.produce_scattertext_explorer(
    corpus,
    category="Popular",
    category_name="Popular",
    not_category_name="Less Popular",
    minimum_term_frequency=6,
    pmi_threshold_coefficient=0,
    width_in_pixels=1000,
)

# Save the explorer to disk; open the resulting file in a browser to explore it.
# The context manager guarantees the handle is flushed and closed, instead of
# relying on garbage collection of an anonymous file object.
with open('../images/title_popularity_words.html', 'wb') as html_file:
    html_file.write(html.encode('utf-8'))

### Compare popularity - Phrases.

In [29]:
# Build a phrase-level corpus from the article titles, grouped by popularity.
# Features come from PhraseMachinePhrases, and the corpus is compacted with
# AssociationCompactor(4000) to limit the number of terms.
corpus = st.CorpusFromPandas(
    df_twitter,
    category_col="Popularity",
    text_col="Title",
    feats_from_spacy_doc=st.PhraseMachinePhrases(),
    nlp=nlp,
).build().compact(st.AssociationCompactor(4000))

# Render the phrase explorer as an HTML string.
# "Popular" is the category being contrasted; the two *_name arguments are
# only the labels shown in the plot.
html = st.produce_scattertext_explorer(
    corpus,
    category="Popular",
    category_name="Popular",
    not_category_name="Less Popular",
    minimum_term_frequency=0,
    pmi_threshold_coefficient=0,
    # Position terms by dense rank and score them by rank difference.
    transform=st.dense_rank,
    term_scorer=st.RankDifference(),
    width_in_pixels=1000,
)

# Save the explorer to disk; open the resulting file in a browser to explore it.
# The context manager guarantees the handle is flushed and closed, instead of
# relying on garbage collection of an anonymous file object.
with open('../images/title_popularity_phrases.html', 'wb') as html_file:
    html_file.write(html.encode('utf-8'))