In [79]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [80]:
# Third-party imports.
import nltk
import pandas as pd
from datasets import load_dataset

# Fetch the NLTK stopword corpus. This is kept ahead of the ttta imports, as in
# the original cell order, in case ttta reads the corpus at import time
# (TODO confirm whether that ordering is actually required).
nltk.download('stopwords')

# Project imports.
from ttta.methods.lda_prototype import LDAPrototype
from ttta.methods.rolling_lda import RollingLDA
from ttta.methods.topical_changes import TopicalChanges
from ttta.preprocessing.preprocess import preprocess, create_dtm

# Load the SpeakGer sample and convert its "train" split to a pandas DataFrame.
dataset = load_dataset("K-RLange/SpeakGer_sample")
df = dataset["train"].to_pandas()

# Keep only the speeches from Nordrhein-Westfalen. The explicit .copy() makes
# `df` an independent frame, so that the column assignments in later cells do
# not operate on a slice of the full frame (SettingWithCopyWarning).
df = df[df["State"] == "Nordrhein-Westfalen"].copy()

Using custom data configuration K-RLange--SpeakGer_sample-e587677b140360d9
Found cached dataset csv (C:/Users/LarsG/.cache/huggingface/datasets/K-RLange___csv/K-RLange--SpeakGer_sample-e587677b140360d9/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


  0%|          | 0/1 [00:00<?, ?it/s]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\LarsG\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Preprocess NRW SpeakGer toy dataset

In [81]:
# Tokenise the speeches, parse the dates, and bring the frame into the column
# layout the models expect ("date", "text", "individual"). This is written as
# one non-mutating chain instead of column assignment plus `inplace=True`, so
# the frame produced by the loading cell is never modified in place.
df = (
    df.assign(
        Speech=preprocess(df["Speech"], language="german"),
        Date=pd.to_datetime(df["Date"]),
    )
    # make column names compliant with the method
    .rename(columns={"Date": "date", "Speech": "text", "Party": "individual"})
    # keep the first row for every (date, individual) combination
    .drop_duplicates(subset=["date", "individual"])
)

In [82]:
# Inspect the preprocessed frame (renamed columns, tokenised "text").
df

Unnamed: 0,date,State,individual,text
121551,2022-01-26,Nordrhein-Westfalen,['spd'],"[gerne, unterschiedlichen, menschen, zusammen,..."
121554,2022-01-26,Nordrhein-Westfalen,['fdp'],"[vergleich, delta, sehen, weniger, schwere, ve..."
121582,2022-01-26,Nordrhein-Westfalen,['cdu'],"[geehrter, herr, prsident, liebe, kolleginnen,..."
121601,2022-01-26,Nordrhein-Westfalen,['die grünen'],"[zeiten, denen, gesellschaftliche, einigung, r..."
121658,2022-01-26,Nordrhein-Westfalen,['afd'],"[frau, prsidentin, damen, herren, zwei, jahre,..."
...,...,...,...,...
132832,2022-12-21,Nordrhein-Westfalen,['cdu'],"[geehrter, herr, prsident, verehrte, kolleginn..."
132833,2022-12-21,Nordrhein-Westfalen,['spd'],"[herr, prsident, kolleginnen, kollegen, gerade..."
132845,2022-12-21,Nordrhein-Westfalen,['die grünen'],"[herr, prsident, vielen, dank, lieber, herr, k..."
132868,2022-12-21,Nordrhein-Westfalen,['fdp'],"[herr, prsident, geehrten, damen, herren, beha..."


In [83]:
# Distinct values of the "individual" (party) column. In the recorded output
# they are stringified lists such as "['spd']", including the empty "[]" and
# multi-party entries.
df["individual"].unique()

array(["['spd']", "['fdp']", "['cdu']", "['die grünen']", "['afd']", '[]',
       "['fdp', 'spd', 'cdu']", "['fdp', 'cdu']", "['other']",
       "['spd', 'cdu']"], dtype=object)

## Simplification of text
The second model in particular, the Poisson Reduced Rank Regression with Time Dependent Word Weights, is computationally very intensive. Therefore, in the following, each text is filtered down to the 100 most frequent tokens.

In [84]:
from collections import Counter
from itertools import chain

# Number of most frequent tokens kept as vocabulary.
TOP_N_TOKENS = 100

# Count every token over all documents. chain.from_iterable walks the token
# lists in document order without building the intermediate copies that
# sum(list_of_lists, []) creates on every addition (quadratic in total length).
tokens = Counter(chain.from_iterable(df["text"]))
top_100_counts = tokens.most_common(TOP_N_TOKENS)
top_100_tokens = {token for token, _ in top_100_counts}

# Restrict each document to the retained vocabulary, record the remaining
# token count, and drop documents that end up empty.
df = df.assign(
    text=df["text"].apply(
        lambda doc: [token for token in doc if token in top_100_tokens]
    )
)
df = df.assign(len_list=df["text"].apply(len))
df = df.loc[df["len_list"] > 0]

In [85]:
# Inspect the frame after restricting the texts to the most frequent tokens;
# "len_list" holds the number of tokens left per document.
df

Unnamed: 0,date,State,individual,text,len_list
121551,2022-01-26,Nordrhein-Westfalen,['spd'],"[menschen, viele, kollegen, viele, landtag, no...",8
121554,2022-01-26,Nordrhein-Westfalen,['fdp'],"[menschen, deutlich, deshalb, ganz, deutschlan...",27
121582,2022-01-26,Nordrhein-Westfalen,['cdu'],"[geehrter, herr, prsident, liebe, kolleginnen,...",18
121601,2022-01-26,Nordrhein-Westfalen,['die grünen'],"[denen, mehr, debatte, eigentlich, genau, mcht...",9
121658,2022-01-26,Nordrhein-Westfalen,['afd'],"[frau, prsidentin, damen, herren, zwei, jahre,...",79
...,...,...,...,...,...
132832,2022-12-21,Nordrhein-Westfalen,['cdu'],"[geehrter, herr, prsident, kolleginnen, kolleg...",9
132833,2022-12-21,Nordrhein-Westfalen,['spd'],"[herr, prsident, kolleginnen, kollegen, gerade...",8
132845,2022-12-21,Nordrhein-Westfalen,['die grünen'],"[herr, prsident, vielen, dank, herr, landtag]",6
132868,2022-12-21,Nordrhein-Westfalen,['fdp'],"[herr, prsident, geehrten, damen, herren, zwei...",23


### Fitting Poisson Reduced Rank Regression with Time **Independent** Word Weights

In [86]:
from ttta.methods.prr_time_independent_weights import (
    PoissonReducedRankTimeIndependentWordWeights,
)

# Build the time-independent model with K=2 and fit it on the filtered frame
# for a single iteration. The value returned by fit() is kept because it is
# passed as `warm_start` to the time-dependent model further down.
model = PoissonReducedRankTimeIndependentWordWeights(K=2)
trained_params = model.fit(df, n_iter=1)

Converting Dataframe to dtm matrix...
Conversion from Dataframe to dtm successful!
Fitting a poisson Reduced Rank Model with time indep. Word Weights for 100 tokens and 340 documents


  0%|          | 0/1 [00:00<?, ?it/s]

In [87]:
# get the params
model.get_params()

{'K': 2,
 'alpha': array([1.12446903e-311, 1.12446903e-311, 1.12446903e-311, 1.12446903e-311,
        1.12446903e-311, 1.12446903e-311, 1.12446903e-311, 1.12446903e-311,
        1.12446903e-311, 1.12446903e-311, 1.12446903e-311, 1.12446903e-311,
        1.12446903e-311, 1.12446903e-311, 1.12446903e-311, 1.12446903e-311,
        1.12446903e-311, 1.12446903e-311, 1.12446903e-311, 1.12446903e-311,
        1.12446903e-311, 1.12446902e-311, 1.12449695e-311, 1.12446902e-311,
        1.12446902e-311, 1.12446902e-311, 1.12446902e-311, 1.12446902e-311,
        1.12446902e-311, 1.12446902e-311, 1.12446902e-311, 1.12446902e-311,
        1.12446900e-311, 1.12446900e-311, 1.12446900e-311, 1.12446900e-311,
        1.12446900e-311, 1.12446900e-311, 1.12446900e-311, 1.12446900e-311,
        1.12446900e-311, 1.12446900e-311, 1.12446900e-311, 1.12446900e-311,
        1.12446900e-311, 1.12446900e-311, 1.12446900e-311, 1.12446900e-311,
        1.12446900e-311, 1.12446900e-311, 1.12446900e-311, 1.12446900e

In [88]:
## save the model
from pathlib import Path
model.save(Path("./test.pkl"))

## load the model
loaded_model = model.load(Path("./test.pkl"))

### Fitting Poisson Reduced Rank Regression with Time **Dependent** Word Weights 

In [89]:
from ttta.methods.prr_time_dependent_weights import (
    PenalizedPoissonReducedRankTimeDependentWordWeights,
)

# calculate_IJT(df) returns a mapping whose "I", "T" and "J" entries are
# forwarded to the constructor; K=2 matches the time-independent model.
stats = PenalizedPoissonReducedRankTimeDependentWordWeights.calculate_IJT(df)
prr_dependent_weights = PenalizedPoissonReducedRankTimeDependentWordWeights(
    I=stats["I"],
    T=stats["T"],
    J=stats["J"],
    K=2,
)

In [90]:
# refactor the trained params from the time independent model
prr_dependent_weights.fit(df, n_iter=10, warm_start=trained_params)

Using params from warm start...
Convert initial parameters to right dimensions
Optimize model...


  0%|          | 0/10 [00:00<?, ?it/s]

  return -mu + np.multiply(y, np.log(mu))
  return -mu + np.multiply(y, np.log(mu))
  return -mu + np.multiply(y, np.log(mu))


PoissonReducedRankParameters(logged_dtm=array([[ 0.46000266,  0.07257485, -0.21197128, ..., -0.0569241 ,
        -0.0543828 , -0.0569241 ],
       [-0.45441866, -0.9127238 , -0.09496784, ...,  0.7383456 ,
         0.7855644 ,  0.7383456 ],
       [ 0.86608267,  0.35410333, -0.32039666, ..., -0.2835467 ,
        -0.2916205 , -0.2835467 ],
       ...,
       [ 0.4934659 , -0.40700746, -0.40286064, ...,  0.33218026,
         0.36357498,  0.33218026],
       [-2.9176326 , -0.00743532,  1.5083549 , ..., -0.00625968,
        -0.04915094, -0.00625968],
       [-2.319788  ,  0.05505896,  1.2213471 , ..., -0.05442691,
        -0.092134  , -0.05442691]], dtype=float32), alpha=array([-2.3692477, -3.3653865, -2.2528574, -2.2391005, -2.7031195,
       -3.4749622, -3.5238266, -2.397674 , -3.4181385, -2.4647996,
       -3.4632335, -2.9383802, -2.896027 , -3.4277892, -2.2311873,
       -2.3783696, -2.3894765, -2.1656716, -2.6064363, -2.606798 ,
       -2.3009756, -2.4545097, -2.6238482, -3.3702805, -2

In [91]:
## get the current params of the model
prr_dependent_weights.get_params()

{'K': 2,
 'alpha': array([-1.09015444e+24, -1.09015444e+24, -1.09015444e+24, -1.09015444e+24,
        -1.09015444e+24, -1.09015444e+24, -1.09015444e+24, -1.09015444e+24,
        -1.09015444e+24, -1.09015444e+24, -1.09015444e+24, -1.09015444e+24,
        -1.09015444e+24, -1.09015444e+24, -1.09015444e+24, -1.09015444e+24,
        -1.09015444e+24, -1.09015444e+24, -1.09015444e+24, -1.09015444e+24,
        -1.09015444e+24, -1.09015444e+24, -1.09015444e+24, -1.09015444e+24,
        -1.09015444e+24, -1.09015444e+24, -1.09015444e+24, -1.09015444e+24,
        -1.09015444e+24, -1.09015444e+24, -1.09015444e+24, -1.09015444e+24,
        -1.09015444e+24, -1.09015444e+24, -1.09015444e+24, -1.09015444e+24,
        -1.09015444e+24, -1.09015444e+24, -1.09015444e+24, -1.09015444e+24,
        -1.09015444e+24, -1.09015444e+24, -1.09015444e+24, -1.09015444e+24,
        -1.09015444e+24, -1.09015444e+24, -1.09015444e+24, -1.09015444e+24,
        -1.09015444e+24, -1.09015444e+24, -1.09015444e+24, -1.09015444

In [93]:
# Write the time-dependent model to disk and read it back from the same file.
prr_model_path = Path("./test_prr_model.pkl")

## save the model
prr_dependent_weights.save(prr_model_path)

## load the model
new_model = prr_dependent_weights.load(prr_model_path)