In [15]:
import pandas as pd
import numpy as np
import numpy as np
from tqdm import tqdm
import sqlalchemy
from sqlalchemy import text
from pandarallel import pandarallel
import regex as re
pandarallel.initialize(progress_bar=False, nb_workers= 8)
tqdm.pandas()

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

https://nalepae.github.io/pandarallel/troubleshooting/


In [16]:
import pyLDAvis
from pyLDAvis import gensim_models
import gensim.corpora as corpora
import pyLDAvis.gensim_models
import os
import gensim
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt

In [17]:
from sqlalchemy import create_engine
engine = create_engine('postgresql+psycopg2://postgres:4750@192.168.0.137:5432/ba1')

def sql_read(topics,lim):
    stat= "SELECT record_id, collectiontitle_token,abstract_token,title_token,class FROM  ke_stage.ba_corpus_2 WHERE class LIKE "+str(topics)+" LIMIT " +str(lim)
    df = pd.read_sql_query(sqlalchemy.text(str(stat)),engine)
    return df

def to_data(df):
    data=[]
    for row in tqdm(df['combined'].values):
        row = row.split(",")
        data.append(row)
    return data
def to_id_corpus(data):
    # Create Dictionary
    id2word = corpora.Dictionary(data)
    # Term Document Frequency
    corpus = [id2word.doc2bow(text) for text in data]
    return corpus, id2word

def row2data(row):
    data=[]
    row = row.split(",")
    data.append(row)
    return data

def get_topic(row ,lda):
    to_pro = []
    data = row2data(row)
    corpus, id2w = to_id_corpus(data)
    
    topic = lda.get_document_topics(corpus, minimum_probability=0.5, minimum_phi_value=None,
                                   per_word_topics=False)
    for t in topic:
            to_pro.append(t)
    return(to_pro)

def count_class_pop(df):
    counted=df['class'].value_counts()
    counted = counted.to_frame()
    counted["population"] = counted['class'].values / len(df)
    counted["pop_perc"] = counted['population'].values * 100
    #ax = sns.barplot(data=counted,x="pop_perc",  y= counted.index, orient="h")
    #ax.set(xlabel="Anteil der Publikation in %",ylabel="Klasse")
    #plt.show()
    lowest_c = counted.min()['class']
    return counted , lowest_c

def combine_tokens(df):
    df['combined'] = df[df.columns[1:3]].parallel_apply(lambda x: ','.join(x.astype(str)),axis=1)
    df = df.drop(['title_token',
                'abstract_token','collectiontitle_token'
                ],axis =1 )
    df = df[df["combined"].str.len() > 3]
    return df

In [18]:
max_df_size = 1000000
min_df_size = 1000

modi = 4

while min_df_size < max_df_size:
    lim=min_df_size    
    print("loading ", int(lim)," Einträge aus der Datenbank")
    df_med = sql_read("'Medizin'",lim)                 
    df_land = sql_read("'Landwirtschaft'",lim)          
    df_umwelt = sql_read("'Umweltwissenschaften'",lim)     
    df_ern = sql_read("'ErnÃ¤hrung'",lim)     
    df = pd.concat([df_med, df_land,df_umwelt,df_ern])
    df = df.replace('', np.nan)
    df = df.drop(df[pd.isna(df['collectiontitle_token']) & pd.isna(df['abstract_token']) & pd.isna(df['title_token'])].index)
    df = df.replace(np.nan,'')
    df = combine_tokens(df)

    min_df_size = int(min_df_size)
    df_med = df.loc[df['class'] == "Medizin"].head(int(min_df_size))
    df_land = df.loc[df['class'] =='Landwirtschaft'].head(int(min_df_size))         
    df_umwelt = df.loc[df['class'] =='Umweltwissenschaften'].head(int(min_df_size))  
    df_ern = df.loc[df['class'] =='ErnÃ¤hrung'].head(int(min_df_size))            
    df = pd.concat([df_med, df_land,df_umwelt,df_ern])
    counted, lowest_c = count_class_pop(df)
    df_med = df.loc[df['class'] == "Medizin"].head(int(lowest_c))
    df_land = df.loc[df['class'] =='Landwirtschaft'].head(int(lowest_c))         
    df_umwelt = df.loc[df['class'] =='Umweltwissenschaften'].head(int(lowest_c))  
    df_ern = df.loc[df['class'] =='ErnÃ¤hrung'].head(int(lowest_c))            
    df = pd.concat([df_med, df_land,df_umwelt,df_ern])
    counted, lowest_c = count_class_pop(df)

    df_train, df_test = train_test_split(df, test_size=0.25)
    df_test.reset_index(drop=True, inplace=True)
    df_train.reset_index(drop=True, inplace=True)

    train_data = to_data(df_train)
    test_data = to_data(df_test)
    train_corpus, train_id2w = to_id_corpus(train_data)
    test_corpus, test_id2w = to_id_corpus(test_data)
    num_topics = 16
    while num_topics < 25:
        print("trainiere LDA Model mit ", int(num_topics)," Topics")    
        lda = gensim.models.ldamulticore.LdaMulticore(
                                corpus=train_corpus,
                                num_topics=num_topics,
                                id2word=train_id2w,
                                chunksize=1000,
                                workers=10, # Num. Processing Cores - 1
                                passes=30,
                                eval_every = 6,
                                per_word_topics=False)
        df_test["topic"] = df_test["combined"].apply(lambda x: get_topic(x , lda))
        df_test["topic"] = df_test["topic"].astype("str")
        df_test["topic"] = df_test["topic"].replace(to_replace=r'[^\d|\.|\,]', value='', regex=True)
        df_test["topic"] = df_test["topic"].replace('', np.nan)
        df_test[["topic","certainty"]] =  df_test["topic"].progress_apply(lambda x: pd.Series(str(x).split(",")))
        count_series =  df_test.groupby(['class','topic'])['class'].count()
        count_series = count_series.to_frame()
        count_series.columns = ['count']
        count_series = count_series.reset_index()
        print("Anzahl der Publiktationen pro Klasse:" ,int(lowest_c), " und ", int( num_topics)," Topics")
        ax = sns.barplot(data=count_series,x="count",  y="class", orient="h", hue="topic")
        ax.set(xlabel="Menge an Zugewiesenen Topics pro Klasse",ylabel="Klasse(Averbis) & Topic(LDA)")
        ax.text(x=0.5, y=1.1, s='Klassifzierungsgenauigkeit des LDA Models anhand des Testdatensatzes',
                fontsize=13, weight='bold',
                ha='center', va='bottom', transform=ax.transAxes)
        ax.text(x=0.5, y=1.05, s="bei einer Testdatensatzgröße von " + str(len(df_test))+ " Publikationen und "+ str(num_topics)+' "Topics"',
                fontsize=8, alpha=0.75, ha='center', va='bottom', transform=ax.transAxes)
        
        fig = ax.get_figure()
        fig.savefig("grafiken\\Klasse_zu_pub_"+str(min_df_size)+"_topics_"+str(num_topics)+".png",dpi=300, bbox_inches = "tight") 
        fig.clf()
        count_series.to_csv("lda_score_csv\\topic_population_at_"+str(min_df_size)+"_topics_"+str(num_topics)+".csv")
        num_topics = num_topics + 4

    min_df_size = min_df_size*modi

loading  1000  Einträge aus der Datenbank


100%|██████████| 843/843 [00:00<00:00, 64841.34it/s]
100%|██████████| 281/281 [00:00<00:00, 70263.47it/s]


trainiere LDA Model mit  16  Topics


100%|██████████| 281/281 [00:00<00:00, 4844.84it/s]


Anzahl der Publiktationen pro Klasse: 281  und  16  Topics
trainiere LDA Model mit  20  Topics


100%|██████████| 281/281 [00:00<00:00, 5854.10it/s]


Anzahl der Publiktationen pro Klasse: 281  und  20  Topics
trainiere LDA Model mit  24  Topics


100%|██████████| 281/281 [00:00<00:00, 5619.95it/s]


Anzahl der Publiktationen pro Klasse: 281  und  24  Topics
loading  4000  Einträge aus der Datenbank


100%|██████████| 3888/3888 [00:00<00:00, 68210.91it/s]
100%|██████████| 1296/1296 [00:00<00:00, 86404.89it/s]


trainiere LDA Model mit  16  Topics


100%|██████████| 1296/1296 [00:00<00:00, 5837.62it/s] 


Anzahl der Publiktationen pro Klasse: 1296  und  16  Topics
trainiere LDA Model mit  20  Topics


100%|██████████| 1296/1296 [00:00<00:00, 5785.55it/s] 


Anzahl der Publiktationen pro Klasse: 1296  und  20  Topics
trainiere LDA Model mit  24  Topics


100%|██████████| 1296/1296 [00:00<00:00, 1357.06it/s]


Anzahl der Publiktationen pro Klasse: 1296  und  24  Topics
loading  16000  Einträge aus der Datenbank


100%|██████████| 16335/16335 [00:00<00:00, 49054.34it/s]
100%|██████████| 5445/5445 [00:00<00:00, 61875.46it/s]


trainiere LDA Model mit  16  Topics


100%|██████████| 5445/5445 [00:00<00:00, 5544.75it/s] 


Anzahl der Publiktationen pro Klasse: 5445  und  16  Topics
trainiere LDA Model mit  20  Topics


100%|██████████| 5445/5445 [00:01<00:00, 2775.44it/s] 


Anzahl der Publiktationen pro Klasse: 5445  und  20  Topics
trainiere LDA Model mit  24  Topics


100%|██████████| 5445/5445 [00:00<00:00, 5500.41it/s] 


Anzahl der Publiktationen pro Klasse: 5445  und  24  Topics
loading  64000  Einträge aus der Datenbank


100%|██████████| 65130/65130 [00:01<00:00, 43623.33it/s]
100%|██████████| 21710/21710 [00:01<00:00, 19753.67it/s]


trainiere LDA Model mit  16  Topics


100%|██████████| 21710/21710 [00:07<00:00, 2927.40it/s]


Anzahl der Publiktationen pro Klasse: 21710  und  16  Topics
trainiere LDA Model mit  20  Topics


100%|██████████| 21710/21710 [00:07<00:00, 2877.32it/s]


Anzahl der Publiktationen pro Klasse: 21710  und  20  Topics
trainiere LDA Model mit  24  Topics


100%|██████████| 21710/21710 [00:06<00:00, 3143.29it/s]


Anzahl der Publiktationen pro Klasse: 21710  und  24  Topics
loading  256000  Einträge aus der Datenbank


100%|██████████| 263058/263058 [00:06<00:00, 40220.61it/s]
100%|██████████| 87686/87686 [00:01<00:00, 68293.97it/s]


trainiere LDA Model mit  16  Topics


100%|██████████| 87686/87686 [00:29<00:00, 3001.38it/s] 


Anzahl der Publiktationen pro Klasse: 87686  und  16  Topics
trainiere LDA Model mit  20  Topics


100%|██████████| 87686/87686 [00:35<00:00, 2471.21it/s] 


Anzahl der Publiktationen pro Klasse: 87686  und  20  Topics
trainiere LDA Model mit  24  Topics


100%|██████████| 87686/87686 [00:34<00:00, 2518.41it/s] 


Anzahl der Publiktationen pro Klasse: 87686  und  24  Topics


<Figure size 640x480 with 0 Axes>