In [1]:
import pandas as pd
import numpy as np
import numpy as np
from tqdm import tqdm
import sqlalchemy
from sqlalchemy import text
from pandarallel import pandarallel
import regex as re
pandarallel.initialize(progress_bar=True)
tqdm.pandas()

INFO: Pandarallel will run on 24 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

https://nalepae.github.io/pandarallel/troubleshooting/


In [2]:
import pyLDAvis
from pyLDAvis import gensim_models
import gensim.corpora as corpora
import pyLDAvis.gensim_models
import os
import gensim
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
from sqlalchemy import create_engine
engine = create_engine('postgresql+psycopg2://postgres:4750@192.168.0.137:5432/ba1')

def sql_read(topics,lim):
    stat= "SELECT record_id, collectiontitle_token,abstract_token,title_token,class FROM  ke_stage.ba_corpus_2 WHERE class LIKE "+str(topics)+" LIMIT " +str(lim)
    df = pd.read_sql_query(sqlalchemy.text(str(stat)),engine)
    return df

def to_data(df):
    data=[]
    for row in tqdm(df['combined'].values):
        row = row.split(",")
        data.append(row)
    return data
def to_id_corpus(data):
    # Create Dictionary
    id2word = corpora.Dictionary(data)
    # Term Document Frequency
    corpus = [id2word.doc2bow(text) for text in data]
    return corpus, id2word

def row2data(row):
    data=[]
    row = row.split(",")
    data.append(row)
    return data

def get_topic(row ,lda):
    to_pro = []
    data = row2data(row)
    corpus, id2w = to_id_corpus(data)
    
    topic = lda.get_document_topics(corpus, minimum_probability=0.5, minimum_phi_value=None,
                                   per_word_topics=False)
    for t in topic:
            to_pro.append(t)
    return(to_pro)

def count_class_pop(df):
    counted=df['class'].value_counts()
    counted = counted.to_frame()
    counted["population"] = counted['class'].values / len(df)
    counted["pop_perc"] = counted['population'].values * 100
    #ax = sns.barplot(data=counted,x="pop_perc",  y= counted.index, orient="h")
    #ax.set(xlabel="Anteil der Publikation in %",ylabel="Klasse")
    #plt.show()
    lowest_c = counted.min()['class']
    return counted , lowest_c

def combine_tokens(df):
    df['combined'] = df[df.columns[1:3]].parallel_apply(lambda x: ','.join(x.astype(str)),axis=1)
    df = df.drop(['title_token',
                'abstract_token','collectiontitle_token'
                ],axis =1 )
    df = df[df["combined"].str.len() > 3]
    return df

In [5]:
lim=5000000
size=lim
df_med = sql_read("'Medizin'",lim)                 
df_land = sql_read("'Landwirtschaft'",lim)          
df_umwelt = sql_read("'Umweltwissenschaften'",lim)     
df_ern = sql_read("'ErnÃ¤hrung'",lim)     
df = pd.concat([df_med, df_land,df_umwelt,df_ern])
df = df.replace('', np.nan)
df = df.drop(df[pd.isna(df['collectiontitle_token']) & pd.isna(df['abstract_token']) & pd.isna(df['title_token'])].index)
df = df.replace(np.nan,'')
df = combine_tokens(df)

while size > 1000:
    size = int(size)
    df_med = df.loc[df['class'] == "Medizin"].head(int(size))
    df_land = df.loc[df['class'] =='Landwirtschaft'].head(int(size))         
    df_umwelt = df.loc[df['class'] =='Umweltwissenschaften'].head(int(size))  
    df_ern = df.loc[df['class'] =='ErnÃ¤hrung'].head(int(size))            
    df = pd.concat([df_med, df_land,df_umwelt,df_ern])
    counted, lowest_c = count_class_pop(df)
    df_med = df.loc[df['class'] == "Medizin"].head(int(lowest_c))
    df_land = df.loc[df['class'] =='Landwirtschaft'].head(int(lowest_c))         
    df_umwelt = df.loc[df['class'] =='Umweltwissenschaften'].head(int(lowest_c))  
    df_ern = df.loc[df['class'] =='ErnÃ¤hrung'].head(int(lowest_c))            
    df = pd.concat([df_med, df_land,df_umwelt,df_ern])
    counted, lowest_c = count_class_pop(df)

    df_train, df_test = train_test_split(df, test_size=0.25)
    df_test.reset_index(drop=True, inplace=True)
    df_train.reset_index(drop=True, inplace=True)

    train_data = to_data(df_train)
    test_data = to_data(df_test)
    train_corpus, train_id2w = to_id_corpus(train_data)
    test_corpus, test_id2w = to_id_corpus(test_data)
    num_topics = 4
    while num_topics < 13:
        lda = gensim.models.ldamulticore.LdaMulticore(
                                corpus=train_corpus,
                                num_topics=num_topics,
                                id2word=train_id2w,
                                chunksize=1000,
                                workers=4, # Num. Processing Cores - 1
                                passes=30,
                                eval_every = 6,
                                per_word_topics=False)
        df_test["topic"] = df_test["combined"].apply(lambda x: get_topic(x , lda))
        df_test["topic"] = df_test["topic"].astype("str")
        df_test["topic"] = df_test["topic"].replace(to_replace=r'[^\d|\.|\,]', value='', regex=True)
        df_test["topic"] = df_test["topic"].replace('', np.nan)
        df_test[["topic","certainty"]] =  df_test["topic"].progress_apply(lambda x: pd.Series(str(x).split(",")))
        count_series =  df_test.groupby(['class','topic'])['class'].count()
        count_series = count_series.to_frame()
        count_series.columns = ['count']
        count_series = count_series.reset_index()
        print("Anzahl der Publiktationen pro Klasse:" ,int(lowest_c))
        ax = sns.barplot(data=count_series,x="count",  y="class", orient="h", hue="topic")
        ax.set(xlabel="Menge an Zugewiesenen Topics pro Klasse",ylabel="Klasse(Averbis) & Topic(LDA)")
        ax.text(x=0.5, y=1.1, s='Klassifzierungsgenauigkeit des LDA Models anhand des Testdatensatzes',
                fontsize=13, weight='bold',
                ha='center', va='bottom', transform=ax.transAxes)
        ax.text(x=0.5, y=1.05, s="bei einer Testdatensatzgröße von " + str(len(df_test))+ " Publikationen und "+ str(num_topics)+' "Topics"',
                fontsize=8, alpha=0.75, ha='center', va='bottom', transform=ax.transAxes)
        
        fig = ax.get_figure()
        fig.savefig("grafiken\\Klasse_zu_pub_"+str(size)+"_topics_"+str(num_topics)+".png",dpi=300, bbox_inches = "tight") 
        fig.clf()
        count_series.to_csv("lda_score_csv\\topic_population_at_"+str(size)+"_topics_"+str(num_topics)+".csv")
        num_topics = num_topics + 4

    size = size/2

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=728144), Label(value='0 / 728144')…

100%|██████████| 5161161/5161161 [04:44<00:00, 18145.89it/s]
100%|██████████| 1720387/1720387 [01:40<00:00, 17139.49it/s]
100%|██████████| 1720387/1720387 [11:20<00:00, 2527.92it/s] 


Anzahl der Publiktationen pro Klasse: 1720387


100%|██████████| 1720387/1720387 [17:20<00:00, 1652.94it/s] 


Anzahl der Publiktationen pro Klasse: 1720387
