In [11]:
import pandas as pd
import numpy as np
import numpy as np
from tqdm import tqdm
import sqlalchemy
from sqlalchemy import text
from pandarallel import pandarallel
import regex as re
pandarallel.initialize(progress_bar=False, nb_workers= 8)
tqdm.pandas()

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

https://nalepae.github.io/pandarallel/troubleshooting/


In [12]:
import pyLDAvis
from pyLDAvis import gensim_models
import gensim.corpora as corpora
import pyLDAvis.gensim_models
import os
import gensim
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt

In [13]:
from sqlalchemy import create_engine
engine = create_engine('postgresql+psycopg2://postgres:4750@192.168.0.137:5432/ba1')

def sql_read(topics,lim):
    stat= "SELECT record_id, collectiontitle_token,abstract_token,title_token,class FROM  ke_stage.ba_corpus_2 WHERE class LIKE "+str(topics)+" LIMIT " +str(lim)
    df = pd.read_sql_query(sqlalchemy.text(str(stat)),engine)
    return df

def to_data(df):
    data=[]
    for row in tqdm(df['combined'].values):
        row = row.split(",")
        data.append(row)
    return data
def to_id_corpus(data):
    # Create Dictionary
    id2word = corpora.Dictionary(data)
    # Term Document Frequency
    corpus = [id2word.doc2bow(text) for text in data]
    return corpus, id2word

def row2data(row):
    data=[]
    row = row.split(",")
    data.append(row)
    return data

def get_topic(row ,lda):
    to_pro = []
    data = row2data(row)
    corpus, id2w = to_id_corpus(data)
    
    topic = lda.get_document_topics(corpus, minimum_probability=0.5, minimum_phi_value=None,
                                   per_word_topics=False)
    for t in topic:
            to_pro.append(t)
    return(to_pro)

def count_class_pop(df):
    counted=df['class'].value_counts()
    counted = counted.to_frame()
    counted["population"] = counted['class'].values / len(df)
    counted["pop_perc"] = counted['population'].values * 100
    #ax = sns.barplot(data=counted,x="pop_perc",  y= counted.index, orient="h")
    #ax.set(xlabel="Anteil der Publikation in %",ylabel="Klasse")
    #plt.show()
    lowest_c = counted.min()['class']
    return counted , lowest_c

def combine_tokens(df):
    df['combined'] = df[df.columns[1:3]].parallel_apply(lambda x: ','.join(x.astype(str)),axis=1)
    df = df.drop(['title_token',
                'abstract_token','collectiontitle_token'
                ],axis =1 )
    df = df[df["combined"].str.len() > 3]
    return df

In [15]:
max_df_size = 50000
min_df_size = 100

modi = 4

while min_df_size < max_df_size:
    lim=min_df_size    
    print("loading ", int(lim)," Einträge aus der Datenbank")
    df_med = sql_read("'Medizin'",lim)                 
    df_land = sql_read("'Landwirtschaft'",lim)          
    df_umwelt = sql_read("'Umweltwissenschaften'",lim)     
    df_ern = sql_read("'ErnÃ¤hrung'",lim)     
    df = pd.concat([df_med, df_land,df_umwelt,df_ern])
    df = df.replace('', np.nan)
    df = df.drop(df[pd.isna(df['collectiontitle_token']) & pd.isna(df['abstract_token']) & pd.isna(df['title_token'])].index)
    df = df.replace(np.nan,'')
    df = combine_tokens(df)

    min_df_size = int(min_df_size)
    df_med = df.loc[df['class'] == "Medizin"].head(int(min_df_size))
    df_land = df.loc[df['class'] =='Landwirtschaft'].head(int(min_df_size))         
    df_umwelt = df.loc[df['class'] =='Umweltwissenschaften'].head(int(min_df_size))  
    df_ern = df.loc[df['class'] =='ErnÃ¤hrung'].head(int(min_df_size))            
    df = pd.concat([df_med, df_land,df_umwelt,df_ern])
    counted, lowest_c = count_class_pop(df)
    df_med = df.loc[df['class'] == "Medizin"].head(int(lowest_c))
    df_land = df.loc[df['class'] =='Landwirtschaft'].head(int(lowest_c))         
    df_umwelt = df.loc[df['class'] =='Umweltwissenschaften'].head(int(lowest_c))  
    df_ern = df.loc[df['class'] =='ErnÃ¤hrung'].head(int(lowest_c))            
    df = pd.concat([df_med, df_land,df_umwelt,df_ern])
    counted, lowest_c = count_class_pop(df)

    df_train, df_test = train_test_split(df, test_size=0.25)
    df_test.reset_index(drop=True, inplace=True)
    df_train.reset_index(drop=True, inplace=True)

    train_data = to_data(df_train)
    test_data = to_data(df_test)
    train_corpus, train_id2w = to_id_corpus(train_data)
    test_corpus, test_id2w = to_id_corpus(test_data)
    num_topics = 4
    while num_topics < 5:
        print("trainiere LDA Model mit ", int(num_topics)," Topics")    
        lda = gensim.models.ldamulticore.LdaMulticore(
                                corpus=train_corpus,
                                num_topics=num_topics,
                                id2word=train_id2w,
                                chunksize=1000,
                                workers=10, # Num. Processing Cores - 1
                                passes=30,
                                eval_every = 6,
                                per_word_topics=False)
        df_test["topic"] = df_test["combined"].apply(lambda x: get_topic(x , lda))
        df_test["topic"] = df_test["topic"].astype("str")
        df_test["topic"] = df_test["topic"].replace(to_replace=r'[^\d|\.|\,]', value='', regex=True)
        df_test["topic"] = df_test["topic"].replace('', np.nan)
        df_test[["topic","certainty"]] =  df_test["topic"].progress_apply(lambda x: pd.Series(str(x).split(",")))
        df_test.to_csv("scores_csv\\Klasse_zu_pub_"+str(min_df_size)+"_topics_"+str(num_topics)+".csv")
        df_topic_terms = pd.DataFrame(lda.print_topics())
        df_topic_terms.to_csv("scores_csv\\terms_zu_pub_"+str(min_df_size)+"_topics_"+str(num_topics)+".csv")
        #count_series =  df_test.groupby(['class','topic'])['class'].count()
        #count_series = count_series.to_frame()
        #count_series.columns = ['count']
        #count_series = count_series.reset_index()
        #r_classes = df_test['class'].value_counts()
        #print("Anzahl der Publiktationen pro Klasse:" ,int(lowest_c), " und ", int( num_topics)," Topics")
        #ax = sns.barplot(data=count_series,x="count",  y="class", orient="h", hue="topic")
        #ax.set(xlabel="Menge an Zugewiesenen Topics pro Klasse",ylabel="Klasse(Averbis) & Topic(LDA)")
        #ax.text(x=0.5, y=1.1, s='Klassifzierungsgenauigkeit des LDA Models anhand des Testdatensatzes',
        #        fontsize=13, weight='bold',
        #        ha='center', va='bottom', transform=ax.transAxes)
        #ax.text(x=0.5, y=1.05, s="bei einer Testdatensatzgröße von " + str(len(df_test))+ " Publikationen und "+ str(num_topics)+' "Topics"',
        #        fontsize=8, alpha=0.75, ha='center', va='bottom', transform=ax.transAxes)
       # 
        #fig = ax.get_figure()
        #fig.savefig("grafiken\\Klasse_zu_pub_"+str(min_df_size)+"_topics_"+str(num_topics)+".png",dpi=300, bbox_inches = "tight") 
        #fig.clf()
        #r_classes.to_csv("lda_score_csv_test\\classes_at_"+str(min_df_size)+"_topics_"+str(num_topics)+".csv")
        num_topics = num_topics + 4

    min_df_size = min_df_size*modi

loading  100  Einträge aus der Datenbank


100%|██████████| 21/21 [00:00<?, ?it/s]
100%|██████████| 7/7 [00:00<?, ?it/s]


trainiere LDA Model mit  4  Topics


100%|██████████| 7/7 [00:00<00:00, 6990.51it/s]


loading  400  Einträge aus der Datenbank


100%|██████████| 300/300 [00:00<00:00, 199728.76it/s]
100%|██████████| 100/100 [00:00<?, ?it/s]


trainiere LDA Model mit  4  Topics


100%|██████████| 100/100 [00:00<00:00, 14293.57it/s]


loading  1600  Einträge aus der Datenbank


100%|██████████| 1599/1599 [00:00<00:00, 200421.12it/s]
100%|██████████| 533/533 [00:00<00:00, 264469.90it/s]


trainiere LDA Model mit  4  Topics


100%|██████████| 533/533 [00:00<00:00, 14235.91it/s]


loading  6400  Einträge aus der Datenbank


100%|██████████| 6471/6471 [00:00<00:00, 149602.54it/s]
100%|██████████| 2157/2157 [00:00<00:00, 190802.97it/s]


trainiere LDA Model mit  4  Topics


100%|██████████| 2157/2157 [00:00<00:00, 12113.29it/s]


loading  25600  Einträge aus der Datenbank


100%|██████████| 26400/26400 [00:00<00:00, 137630.46it/s]
100%|██████████| 8800/8800 [00:00<00:00, 131552.24it/s]


trainiere LDA Model mit  4  Topics


100%|██████████| 8800/8800 [00:00<00:00, 10867.43it/s]


[(0,
  '0.009*"" + 0.006*"water" + 0.006*"soil" + 0.006*"specie" + 0.005*"wa" + 0.004*"plant" + 0.004*"area" + 0.003*"study" + 0.003*"model" + 0.003*"result"'),
 (1,
  '0.016*"wa" + 0.011*"patient" + 0.009*"study" + 0.008*"" + 0.004*"risk" + 0.004*"group" + 0.004*"xa" + 0.004*"disease" + 0.004*"data" + 0.003*"treatment"'),
 (2,
  '0.016*"cell" + 0.011*"gene" + 0.009*"" + 0.008*"expression" + 0.007*"protein" + 0.005*"dna" + 0.004*"wa" + 0.004*"cells" + 0.004*"sequence" + 0.003*"cancer"'),
 (3,
  '0.020*"wa" + 0.009*"" + 0.006*"effect" + 0.005*"concentration" + 0.005*"study" + 0.004*"xa" + 0.004*"acid" + 0.004*"activity" + 0.004*"level" + 0.004*"result"')]