In [1]:
# Mount Google Drive in the Colab runtime so that the files referenced under
# /content/drive (input CSVs, exported charts) are reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd

import altair as alt

# Five-colour palette used for the topic lines (via `color_scheme` below).
data5_palette = ["#DE6449", "#00D856", "#66999B", "#52D1DC", "#F3A712"]
# Chart background colour.
BG = '#0c0c0c'
# Foreground colour: axis/legend/title text and the bar marks.
FG = "#eadeda"
# Altair colour scale that maps topics onto the palette above.
color_scheme = alt.Scale(range=data5_palette)

# Input CSVs on the shared drive.
# Read by preprocess_dfs; it uses the `lang`, `date_chart` and `topic` columns.
FEATS_PATH = "/content/drive/Shareddrives/progettone_gruppo_5/csv/topics_csv3/cleaned_lang_topics.csv"
# Passed to preprocess_dfs, which accepts it but does not read the file.
FULL_PATH = "/content/drive/Shareddrives/progettone_gruppo_5/csv/combined_df_final.csv"
# Word table for the topics (file name suggests the Italian topics);
# preprocess_dfs uses its `Category`, `Freq` and `Total` columns.
WORDS_PATH = "/content/drive/Shareddrives/progettone_gruppo_5/csv/topics_csv3/topics_words_ita.csv"

In [3]:
def preprocess_dfs(feats_path: str,
                   full_path: str,
                   words_path: str,
                   language: str = "Italian"):
    """Build the monthly topic-share table and the per-topic word table.

    Parameters
    ----------
    feats_path : str
        Path to a ';'-separated CSV containing at least the columns
        ``lang``, ``date_chart`` (formatted as ``%d.%m.%Y``) and ``topic``.
    full_path : str
        Not read by this function; kept so existing calls keep working.
    words_path : str
        Path to a ';'-separated CSV containing at least the columns
        ``Category``, ``Freq`` and ``Total``.
    language : str, default "Italian"
        Only rows whose ``lang`` equals this value are kept.

    Returns
    -------
    total : pandas.DataFrame
        One row per (month, topic) pair present in the data, with columns
        ``date_chart`` (last day of the month), ``total`` (rows in that
        month), ``topic``, ``number`` (rows of that topic in that month)
        and ``proportion`` (``number / total``).
    lda_out : pandas.DataFrame
        The words table plus ``topic`` (a copy of ``Category``) and
        ``F_T`` (``Freq / Total``).
    """
    feats = pd.read_csv(feats_path, sep=";")
    # .copy() gives an independent frame, so the column assignment below
    # does not write into a filtered slice (avoids SettingWithCopyWarning).
    complete = feats[feats["lang"] == language].copy()

    parsed_dates = pd.to_datetime(complete["date_chart"], format="%d.%m.%Y")
    # Label each row with the last day of its month. This yields the same
    # month-end labels as grouping with the 'M' frequency alias, without
    # depending on that alias (deprecated in recent pandas releases).
    complete["date_chart"] = parsed_dates + pd.offsets.MonthEnd(0)

    # Rows per (month, topic).
    grouped = (
        complete.groupby(["date_chart", "topic"])
        .size()
        .reset_index(name="number")
    )
    # Rows per month, regardless of topic.
    monthly = complete.groupby("date_chart").size().reset_index(name="total")

    total = pd.merge(monthly, grouped, on="date_chart")
    total["proportion"] = total["number"] / total["total"]

    lda_out = pd.read_csv(words_path, sep=";")
    # Duplicate Category under the name `topic` so both returned tables
    # share a `topic` column.
    lda_out["topic"] = lda_out["Category"]
    lda_out["F_T"] = lda_out["Freq"] / lda_out["Total"]

    return total, lda_out

In [4]:
# Italian run: `language` is left at its default ("Italian").
total, lda_out = preprocess_dfs(FEATS_PATH, FULL_PATH, WORDS_PATH)

In [5]:
# Topic highlighted when the chart first loads. If the preferred label is not
# present in the data, fall back to the first available topic: an initial
# selection pointing at a missing label would dim every line and leave the
# bar chart empty until the user clicks the legend.
preferred_topic = "amore"
available_topics = sorted(total["topic"].dropna().unique().tolist())
if preferred_topic in available_topics:
    initial_topic = preferred_topic
elif available_topics:
    initial_topic = available_topics[0]
else:
    initial_topic = None

selection_options = {"fields": ['topic'], "bind": 'legend'}
if initial_topic is not None:
    selection_options["init"] = [{"topic": initial_topic}]
# Legend-bound selector on `topic`; it drives both charts below.
selection = alt.selection_multi(**selection_options)

# Left panel: monthly share of each topic; unselected topics are faded.
lines = alt.Chart(total).mark_line(
    strokeWidth=3.5
).encode(
    x=alt.X("yearmonth(date_chart):T",
            title=None,
            axis=alt.Axis(format="%Y")),
    y=alt.Y("proportion:Q", title="Presenza media dei topic"),
    color=alt.Color("topic:N", scale=color_scheme,
                    legend=alt.Legend(title="Topic italiani")),
    opacity=alt.condition(selection, alt.value(1), alt.value(0.05))
).add_selection(
    selection
).properties(
    title="Presenza dei topic nel tempo",
    width=600,
    height=200,
)

# Right panel: words of the selected topic(s), ranked by Freq.
bars = alt.Chart(lda_out).mark_bar(
    tooltip=True,
    color=FG,
    opacity=0.9
).encode(
    x=alt.X('mean(F_T):Q',
            title="Frequenza della parola nel topic",
            scale=alt.Scale(domain=(0, 1))),
    y=alt.Y('Term:N', sort="-x", title=None),
).transform_filter(
    selection
).transform_window(
    rank='rank(Freq)',
    sort=[alt.SortField('Freq', order='descending')]
).transform_filter(
    # NOTE(review): `rank < 10` keeps ranks 1-9; use `<= 10` if a top-10
    # list was intended.
    (alt.datum.rank < 10)
).properties(
    title="Parole più frequenti",
    width=200,
    height=200,
)


# Side-by-side layout with the dark theme applied at the top level.
concat = alt.hconcat(
    lines,
    bars,
    center=True
).resolve_legend(
    color="independent",
    size="independent"
).configure(
    background=BG
).configure_axis(
    labelColor=FG,
    gridOpacity=0.2,
    titleColor=FG,
).configure_legend(
    padding=10,
    cornerRadius=10,
    labelColor=FG,
    titleColor=FG,
    titleFontWeight="bold",
    columns=1,
).configure_title(
    color=FG
).configure_view(
    strokeOpacity=0
).properties(
    autosize=alt.AutoSizeParams(resize=True),
)

# Export a standalone HTML page (with the embed action menu disabled), then
# display the chart as the cell output.
concat.save('/content/drive/Shareddrives/progettone_gruppo_5/'
            'grafici per sito/lda_ITA.html', embed_options={'actions': False})
concat

In [6]:
# English run. A dedicated constant is used instead of overwriting WORDS_PATH,
# so re-running the Italian load cell afterwards still reads the Italian file.
# NOTE(review): this file lives under topics_csv2 while FEATS_PATH points to
# topics_csv3 - confirm both come from the same topic-model run.
WORDS_PATH_ENG = "/content/drive/Shareddrives/progettone_gruppo_5/csv/topics_csv2/topics_words_eng.csv"
# `total` and `lda_out` are rebound here to the English tables.
total, lda_out = preprocess_dfs(FEATS_PATH,
                                FULL_PATH,
                                WORDS_PATH_ENG,
                                language="English")

In [7]:
# Topic highlighted when the chart first loads. "amore" is kept as the
# preferred label, but if the data has no such topic the first available one
# is used instead: an initial selection pointing at a missing label would dim
# every line and leave the bar chart empty until the user clicks the legend.
preferred_topic = "amore"
available_topics = sorted(total["topic"].dropna().unique().tolist())
if preferred_topic in available_topics:
    initial_topic = preferred_topic
elif available_topics:
    initial_topic = available_topics[0]
else:
    initial_topic = None

selection_options = {"fields": ['topic'], "bind": 'legend'}
if initial_topic is not None:
    selection_options["init"] = [{"topic": initial_topic}]
# Legend-bound selector on `topic`; it drives both charts below.
selection = alt.selection_multi(**selection_options)

# Left panel: monthly share of each topic; unselected topics are faded.
lines = alt.Chart(total).mark_line(
    strokeWidth=3.5
).encode(
    x=alt.X("yearmonth(date_chart):T",
            title=None,
            axis=alt.Axis(format="%Y")),
    y=alt.Y("proportion:Q", title="Presenza media dei topic"),
    color=alt.Color("topic:N", scale=color_scheme,
                    legend=alt.Legend(title="Topic inglesi")),
    opacity=alt.condition(selection, alt.value(1), alt.value(0.1))
).add_selection(
    selection
).properties(
    title="Presenza dei topic nel tempo",
    width=600,
    height=200,
)

# Right panel: words of the selected topic(s), ranked by Freq.
# No fixed x-domain is set here, so the axis adapts to the data.
bars = alt.Chart(lda_out).mark_bar(
    tooltip=True,
    color=FG,
    opacity=0.9
).encode(
    x=alt.X('mean(F_T):Q', title="Frequenza della parola nel topic"),
    y=alt.Y('Term:N', sort="-x", title=None),
).transform_filter(
    selection
).transform_window(
    rank='rank(Freq)',
    sort=[alt.SortField('Freq', order='descending')]
).transform_filter(
    # NOTE(review): `rank < 10` keeps ranks 1-9; use `<= 10` if a top-10
    # list was intended.
    (alt.datum.rank < 10)
).properties(
    title="Parole più frequenti",
    width=200,
    height=200,
)


# Side-by-side layout with the dark theme applied at the top level.
concat = alt.hconcat(lines,bars).resolve_legend(
    color="independent",
    size="independent"
).configure(
    background=BG
).configure_axis(
    labelColor=FG,
    gridOpacity=0.2,
    titleColor=FG
).configure_legend(
    padding=10,
    cornerRadius=10,
    labelColor=FG,
    titleColor=FG,
    titleFontWeight="bold",
    columns=1,
).configure_title(
    color=FG
).configure_view(
    strokeOpacity=0
).properties(
    autosize=alt.AutoSizeParams(resize=True)
)

# Export a standalone HTML page (with the embed action menu disabled), then
# display the chart as the cell output.
concat.save('/content/drive/Shareddrives/progettone_gruppo_5/'
            'grafici per sito/lda_ENG.html', embed_options={'actions': False})
concat