In [1]:
%load_ext autoreload
%autoreload 2

In [1]:
import pandas as pd
import nltk
from datasets import load_dataset
df = load_dataset("K-RLange/SpeakGer_sample")
df = df["train"].to_pandas()
df = df[df["State"] == "Nordrhein-Westfalen"]
nltk.download('stopwords')
from ttta.methods.lda_prototype import LDAPrototype
from ttta.methods.rolling_lda import RollingLDA
from ttta.methods.topical_changes import TopicalChanges
from ttta.preprocessing.preprocess import preprocess, create_dtm

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kalange\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Preprocess NRW SpeakerGer toy dataset

In [2]:
df["Speech"] = preprocess(df["Speech"], language="german")
df["Date"] = pd.to_datetime(df["Date"])
## make column names compliant with the method
df.rename({"Date":"date", "Speech":"text", "Party":"individual"}, axis = 1, inplace=True)
df.drop_duplicates(["date", "individual"], inplace=True)

In [82]:
df

Unnamed: 0,date,State,individual,text
121551,2022-01-26,Nordrhein-Westfalen,['spd'],"[gerne, unterschiedlichen, menschen, zusammen,..."
121554,2022-01-26,Nordrhein-Westfalen,['fdp'],"[vergleich, delta, sehen, weniger, schwere, ve..."
121582,2022-01-26,Nordrhein-Westfalen,['cdu'],"[geehrter, herr, prsident, liebe, kolleginnen,..."
121601,2022-01-26,Nordrhein-Westfalen,['die grünen'],"[zeiten, denen, gesellschaftliche, einigung, r..."
121658,2022-01-26,Nordrhein-Westfalen,['afd'],"[frau, prsidentin, damen, herren, zwei, jahre,..."
...,...,...,...,...
132832,2022-12-21,Nordrhein-Westfalen,['cdu'],"[geehrter, herr, prsident, verehrte, kolleginn..."
132833,2022-12-21,Nordrhein-Westfalen,['spd'],"[herr, prsident, kolleginnen, kollegen, gerade..."
132845,2022-12-21,Nordrhein-Westfalen,['die grünen'],"[herr, prsident, vielen, dank, lieber, herr, k..."
132868,2022-12-21,Nordrhein-Westfalen,['fdp'],"[herr, prsident, geehrten, damen, herren, beha..."


In [83]:
df.individual.unique()

array(["['spd']", "['fdp']", "['cdu']", "['die grünen']", "['afd']", '[]',
       "['fdp', 'spd', 'cdu']", "['fdp', 'cdu']", "['other']",
       "['spd', 'cdu']"], dtype=object)

## Simplification of text
Especially the second Poisson Reduced Rank Regression with Dependent Word Weights is computational very intensive. Thus, in the following, the text tokens are filtered for the top 100 tokens.

In [3]:
from collections import Counter
tokens = Counter(sum(df["text"], []))
top_100_counts = tokens.most_common(100)
top_100_tokens = set([x[0] for x in top_100_counts])
df.loc[:,"text"] = df["text"].apply(lambda x: [token for token in x if token in top_100_tokens])
df["len_list"] = df["text"].apply(lambda x: len(x))
df = df.loc[df["len_list"]>0]

In [5]:
df

Unnamed: 0,date,State,individual,text,len_list
121551,2022-01-26,Nordrhein-Westfalen,['spd'],"[menschen, für, viele, über, können, über, kol...",12
121554,2022-01-26,Nordrhein-Westfalen,['fdp'],"[menschen, deutlich, deshalb, ganz, deutschlan...",34
121582,2022-01-26,Nordrhein-Westfalen,['cdu'],"[geehrter, herr, präsident, liebe, kolleginnen...",21
121601,2022-01-26,Nordrhein-Westfalen,['die grünen'],"[denen, mehr, genau, möchte, vergangenen, nrw,...",7
121658,2022-01-26,Nordrhein-Westfalen,['afd'],"[frau, präsidentin, damen, herren, zwei, jahre...",83
...,...,...,...,...,...
132832,2022-12-21,Nordrhein-Westfalen,['cdu'],"[geehrter, herr, präsident, kolleginnen, kolle...",8
132833,2022-12-21,Nordrhein-Westfalen,['spd'],"[herr, präsident, kolleginnen, kollegen, gerad...",9
132845,2022-12-21,Nordrhein-Westfalen,['die grünen'],"[herr, präsident, vielen, dank, herr, landtag]",6
132868,2022-12-21,Nordrhein-Westfalen,['fdp'],"[herr, präsident, geehrten, damen, herren, zwe...",25


### Fitting Poisson Reduced Rank Regression with Time **Independent** Word Weights

In [4]:
from ttta.methods.prr_time_independent_weights import PoissonReducedRankTimeIndependentWordWeights
model = PoissonReducedRankTimeIndependentWordWeights(K=2)
trained_params = model.fit(df,n_iter=10)

Converting Dataframe to dtm matrix...
Conversion from Dataframe to dtm successful!
Fitting a poisson Reduced Rank Model with time indep. Word Weights for 100 tokens and 340 documents


  data.set_index([individual_column, date_column])


  0%|          | 0/10 [00:00<?, ?it/s]

In [7]:
# get the params
print(model.get_params())

{'K': 2, 'alpha': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), 'beta': array([4., 1., 0., 0., 0., 0., 0., 0., 0., 0., 2., 0., 0., 1., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
       0., 2., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
     

In [88]:
## save the model
from pathlib import Path
model.save(Path("./test.pkl"))

## load the model
loaded_model = model.load(Path("./test.pkl"))

### Fitting Poisson Reduced Rank Regression with Time **Dependent** Word Weights 

In [8]:
from ttta.methods.prr_time_dependent_weights import PenalizedPoissonReducedRankTimeDependentWordWeights
stats = PenalizedPoissonReducedRankTimeDependentWordWeights.calculate_IJT(df)
prr_dependent_weights = PenalizedPoissonReducedRankTimeDependentWordWeights(I = stats["I"], T=stats["T"], J =stats["J"], K=2)

In [9]:
# refactor the trained params from the time independent model
prr_dependent_weights.fit(df, n_iter=10, warm_start=trained_params)

Using params from warm start...
Convert initial parameters to right dimensions
Optimize model...


  data.set_index([individual_column, date_column])


  0%|          | 0/10 [00:00<?, ?it/s]

  return -mu + np.multiply(y, np.log(mu))
  return -mu + np.multiply(y, np.log(mu))
  return -mu + np.multiply(y, np.log(mu))


PoissonReducedRankParameters(logged_dtm=array([[-0.9467907 , -0.70209646,  0.02085066, ...,  0.09877634,
         0.08023071,  0.09877634],
       [-0.14094496, -1.4929705 ,  0.2824707 , ...,  0.7031288 ,
         0.863822  ,  0.7031288 ],
       [-0.76228166,  0.6688566 , -0.23152876, ..., -0.5323806 ,
        -0.6925998 , -0.5323806 ],
       ...,
       [ 0.99868774, -1.0810666 ,  0.34453583, ...,  0.7990222 ,
         1.0330353 ,  0.7990222 ],
       [ 0.5504198 ,  0.56750345, -0.04418039, ..., -0.13642502,
        -0.14440274, -0.13642502],
       [-1.9405074 , -0.30871725, -0.18468523, ..., -0.35796642,
        -0.52903676, -0.35796642]], dtype=float32), alpha=array([-3.759965 , -4.651976 , -3.928913 , -3.8975034, -5.324263 ,
       -5.816733 , -5.110508 , -4.14675  , -4.9548   , -4.4760637,
       -5.0853376, -5.5029116, -5.139976 , -5.330044 , -3.8596213,
       -4.3910213, -4.0870004, -3.8978634, -4.317847 , -4.5477304,
       -4.210807 , -4.8043976, -6.45348  , -4.6956882, -5

In [10]:
## get the current params of the model
prr_dependent_weights.get_params()

{'K': 2,
 'alpha': array([9.950825e+23, 9.950825e+23, 9.950825e+23, 9.950825e+23,
        9.950825e+23, 9.950825e+23, 9.950825e+23, 9.950825e+23,
        9.950825e+23, 9.950825e+23, 9.950825e+23, 9.950825e+23,
        9.950825e+23, 9.950825e+23, 9.950825e+23, 9.950825e+23,
        9.950825e+23, 9.950825e+23, 9.950825e+23, 9.950825e+23,
        9.950825e+23, 9.950825e+23, 9.950825e+23, 9.950825e+23,
        9.950825e+23, 9.950825e+23, 9.950825e+23, 9.950825e+23,
        9.950825e+23, 9.950825e+23, 9.950825e+23, 9.950825e+23,
        9.950825e+23, 9.950825e+23, 9.950825e+23, 9.950825e+23,
        9.950825e+23, 9.950825e+23, 9.950825e+23, 9.950825e+23,
        9.950825e+23, 9.950825e+23, 9.950825e+23, 9.950825e+23,
        9.950825e+23, 9.950825e+23, 9.950825e+23, 9.950825e+23,
        9.950825e+23, 9.950825e+23, 9.950825e+23, 9.950825e+23,
        9.950825e+23, 9.950825e+23, 9.950825e+23, 9.950825e+23,
        9.950825e+23, 9.950825e+23, 9.950825e+23, 9.950825e+23,
        9.950825e+23, 

In [12]:
print(3)

{'K': 2, 'alpha': array([9.950825e+23, 9.950825e+23, 9.950825e+23, 9.950825e+23,
       9.950825e+23, 9.950825e+23, 9.950825e+23, 9.950825e+23,
       9.950825e+23, 9.950825e+23, 9.950825e+23, 9.950825e+23,
       9.950825e+23, 9.950825e+23, 9.950825e+23, 9.950825e+23,
       9.950825e+23, 9.950825e+23, 9.950825e+23, 9.950825e+23,
       9.950825e+23, 9.950825e+23, 9.950825e+23, 9.950825e+23,
       9.950825e+23, 9.950825e+23, 9.950825e+23, 9.950825e+23,
       9.950825e+23, 9.950825e+23, 9.950825e+23, 9.950825e+23,
       9.950825e+23, 9.950825e+23, 9.950825e+23, 9.950825e+23,
       9.950825e+23, 9.950825e+23, 9.950825e+23, 9.950825e+23,
       9.950825e+23, 9.950825e+23, 9.950825e+23, 9.950825e+23,
       9.950825e+23, 9.950825e+23, 9.950825e+23, 9.950825e+23,
       9.950825e+23, 9.950825e+23, 9.950825e+23, 9.950825e+23,
       9.950825e+23, 9.950825e+23, 9.950825e+23, 9.950825e+23,
       9.950825e+23, 9.950825e+23, 9.950825e+23, 9.950825e+23,
       9.950825e+23, 9.950825e+23, 9.

In [93]:
## load the model
prr_dependent_weights.save(Path("./test_prr_model.pkl"))

## loaded model
new_model = prr_dependent_weights.load(Path("./test_prr_model.pkl"))