In [1]:
import pandas as pd

import MyLib.nlp as nlp
import MyLib.HTML_prep as HTML_prep
import MyLib.PDF_prep as PDF_prep
import MyLib.analysis as analysis 
%load_ext autoreload
%autoreload 2

from numpy import nan
from tqdm import tqdm
tqdm.pandas()

## My API key  & library_Id are stored in another file
api_key,library_id = pd.read_json("Zotero_API_key.json", typ='series')

library_type="group"

def print_time():
    from datetime import datetime
    now = datetime.now()
    current_time = now.strftime("%H:%M:%S")
    print("Current Time =", current_time)

def d(l):
    pd.set_option('display.max_colwidth', l)



from pyzotero import zotero
zot = zotero.Zotero(library_id, library_type, api_key)

ID_Universities='EWSYI3RS' # to access the files in the university subfolder.
ID_SURF='WPXRQVIU'

In [2]:
helpDict={}

for i in zot.collections_sub(ID_Universities):
    name,key=i["data"]["name"],i["data"]["key"]
    print(f"name: {name} --> key: {key}")
    collection_items=zot.collection_items(key)
    helpDict.update({k["key"]:k["data"]|{"Uni":name} for k in collection_items})
    
#Add Surf
name="surf"
key=ID_SURF
print(f"name: {name} --> key: {key}")
collection_items=zot.collection_items(key)
helpDict.update({k["key"]:k["data"]|{"Uni":name} for k in collection_items})
df=pd.DataFrame(helpDict).T

print(len(df))

name: Radboud University --> key: 5JQEUPHJ
name: Leiden University --> key: P483KEM8
name: OpenUniversiteitNederland --> key: X4HX8ZQE
name: Wageningen University --> key: QMTMEV7G
name: University of Twente --> key: QFGTICY2
name: Utrecht University --> key: 9A2UJIVA
name: Rotterdam University --> key: Z4EJ3VVG
name: University of Groningen --> key: P63XN58M
name: Vrije Universiteit Amsterdam --> key: AZJ9ZEBZ
name: Tilburg University --> key: XF9572P7
name: Eindhoven University of Technoloy --> key: FKNPJ5UD
name: Maastricht Univerisity --> key: QW34VSNN
name: Delft University --> key: FZDYP465
name: University of Amsterdam --> key: 92FECCMX
name: surf --> key: WPXRQVIU
1236


In [None]:
Claimed_Parents=df["parentItem"].dropna().to_list()
Real_Parents=df.index.to_list()
WithoutParents=list(set(Claimed_Parents)-set(Real_Parents))
WithoutParents

In [None]:
Parents_columns=['title', 'date','itemType','language']
WithParents_columns=["key",'parentItem','url','filename',"Uni"]

WithParents=df.dropna(subset="parentItem")[WithParents_columns]
Parents=df[df.itemType.apply(lambda x: x!="attachment")][Parents_columns]

df=WithParents.merge(Parents, left_on="parentItem",right_index=True, how="right")
df.head(5)

#df2[df2.filename.isna()].itemType

In [None]:
df.Uni.value_counts()

In [None]:
# Correct item type of uTwente Thesies
df.loc[df.url.apply(lambda x: "essay.utwente.nl" in str(x)),"itemType"]="thesis"
print(df.itemType.value_counts())

In [None]:
# remove not useful items
DropTypeList=["journalArticle","conferencePaper","thesis","book","bookSection","note","film","dataset"]

df=df[~df["itemType"].isin(DropTypeList)]

print(df.itemType.value_counts())

In [None]:
def fileName(key,filename):
    DIR=f"C:\\Users\\mr\\Zotero\\storage\\{key}\\"
    filename=DIR+str(filename)
    return filename

df["filepath"]=df.apply(lambda x: fileName(x.key, x.filename), axis=1).drop_duplicates()

print(df.filepath[0],print(len(df)))

In [None]:
df.to_json("2023-09-11_Uni_Files_raw.json")
# CHeck for duplicates / errors in Zotero: 
df[df.index.duplicated()]

In [None]:
df=pd.read_json("2023-09-11_Uni_Files_raw.json")

# Add Content from PDFs

In [None]:
# this is used to improve single pdf imports

##paragraph=PDF_prep.extract_text_with_pyPDF(df.filepath[4914],MaxPages=150)

In [None]:
PDFs=df.loc[df.filepath.apply(lambda x: x.split(".")[-1]=="pdf"),["filepath","date"]]    
NumberOfPDFs=len(PDFs)    

print(f"the dataset containts {NumberOfPDFs} PDF-files.")

df[["text","links","mod_date"]]=PDFs.filepath.progress_apply(PDF_prep.extract_text_with_pyPDF,MaxPages=150)

# Use filemod as date.
df.loc[~df.mod_date.isna(),["date"]]=df.mod_date
df.drop(columns='mod_date',inplace=True)

#problemfile.filepath.apply(extract_text_with_pyPDF,MaxPages=10)

# Add Content from HTMLs

In [None]:
print_time() 

print("This takes about: 3 minutes.")

df["date"]=df.date.apply(pd.to_datetime,errors='coerce')
df["date"]=df.progress_apply(lambda x: HTML_prep.find_date(x.url, x.date), axis=1)
print_time()

df.date.apply(lambda x: type(x)==pd.Timestamp).value_counts()

In [None]:
df.to_json("2023-09-12_Uni_Files_raw_date.json")

In [None]:
df=pd.read_json("2023-09-12_Uni_Files_raw_date.json")

In [None]:
from nltk import sent_tokenize
sent_tokenize("Heute gehe ich heim, Dr. Klaas nla")

In [None]:
###USE NLTK or Spacy for a better tokanizer

def extract_HTML(FilePath):
    
    Title,h1,h2,h3,text,linkName,linkUrl=None,None,None,None,None,None,None
    
    if FilePath.endswith(".pdf"):
        return pd.Series([Title,h1,h2,h3,text,linkName,linkUrl])
    
    text=HTML_prep.open_html_file(FilePath)
    
    if text:
        All_divs,Title=HTML_prep.return_content_soup(text)
        h1,h2,h3,linkName,LinkUrl=HTML_prep.get_HTML_elements_from_soup(All_divs)
    
        try:
            #text=get_text_from_soup_simple_split(All_divs) 
            text=HTML_prep.get_text_from_soup_with_nltk(All_divs)
        except:
            print(f"error with: {FilePath} - read HTML only", end=". ")
            text=HTML_prep.get_text_from_html(text)

    return pd.Series([Title,h1,h2,h3,text,linkName,linkUrl])


#df.file[2:6].apply(FileInfo)
print("This takes about 3 minutes")
print_time()        
df[["HTML_Title","h1","h2","h3","HTML_text","linkName","linkUrl"]]=df.filepath.progress_apply(extract_HTML)
print_time()

In [None]:
df.to_json("2023-09-12_Uni_Files_NLP.json")

In [None]:
df=pd.read_json("2023-09-12_Uni_Files_NLP.json")


In [None]:
df["HTML_text"][0][0]

# Finalize the text before running the NLP stuff

In [None]:
# Unify & drop columns.

df.loc[df["HTML_text"].apply(lambda x: x is not None), "HTML"]=True
df["HTML"]=df["HTML"].fillna(False)

df.loc[df["HTML"], "text"]=df.HTML_text
df.loc[df["HTML"], "title"]=df.HTML_Title
df.loc[df["HTML"], "links"]=df.linkUrl

df.language=df.language.replace(["en-US","en-GB","en-us","en_US","en_US","English"],"en")
df.language=df.language.replace(["nl","nl-NL","nederlands","nl-nl"],"nl")
df.language=df.language.replace("",None)

df.drop_duplicates(subset="text",inplace=True)
df.dropna(subset="text",inplace=True)

#df.drop(columns=["HTML_text","HTML_Title","linkUrl","parentItem","h1","h2","h3"],inplace=True)


In [None]:
def Insert_title(text,Title):
    if Title!=None and type(text)==list:
        text=[Title.strip()]+text
        return text
    else:
        return text


# I dont run this #    
#df["text"]=df[["text","title"]].apply(lambda x: Insert_title(*x),axis=1)

# Edit & split text into paragraphs & sentences

In [None]:
df[df["text"].apply(lambda x: isinstance(x,list)!=True)]

In [None]:
def split_long_paragraphs(paragraphs,
                          max_char_in_paragraph=5000,
                          min_char_in_paragraph=15,
                          max_char_in_sentence=5000,
                          min_char_in_sentence=4):
    removed_sentences=[]
    removed_paragraphs=[]
    sentences_out=[]
    paragraphs_out=[]
    
    if isinstance(paragraphs,list):
        
        for paragraph in paragraphs:
            paragraph_out=[]      
            
            from nltk.tokenize import sent_tokenize  
            
            #e NLTK sentencer cannot handle A.I. 
            paragraph=paragraph.replace("A.I.","AI").replace("e.g.","eg")
            
            sentences=sent_tokenize(paragraph) # per paragraph
            
            counter = 0
            P="" # will be filled with sentences untill it has 5000 characters.
            
            for s in sentences:
                Only_string_len=len(s.replace("[^a-zA-Z]",""))
                if (Only_string_len<=min_char_in_sentence)|(Only_string_len>max_char_in_sentence):
                    removed_sentences.append(s)
                else:
                    sentences_out.append(s)

                counter += len(s) # calculate the lenght if it would be added.

                if counter > max_char_in_paragraph: # does not add the sentence to the paragraph if too long
                    
                    paragraph_out.append(P)
                    
                    L=len(paragraph_out[-1])
                    
                    print(f"{L} + {len(s)} = {counter} --> split {len(paragraph_out)} times.")
                    
                    counter=0
                    P=""
                    
                if counter < max_char_in_paragraph:
                    P=P.strip()+" "+ s.strip()
                    
            if len(P)>=min_char_in_paragraph:
                # minipal lenght for a paragraph
                paragraph_out.append(P.strip())
                paragraphs_out.extend(paragraph_out)
                
            if len(P)<=min_char_in_paragraph:
                removed_paragraphs.append(P)
    
    if False: ## Do I want this info?
    
        if len(removed_sentences)>0:
            print("removed sentences:")
            print(removed_sentences)
        if len(removed_paragraphs)>0:
            print("removed paragraphs:")
            print(removed_paragraphs)  
        
        
    sentences_len=[len(i) for i in sentences_out]
    
    paragraphs_len=[len(i) for i in paragraphs_out]
 
        
    return pd.Series([paragraphs_out,sentences_out,paragraphs_len,sentences_len,removed_paragraphs])


df[["paragraphs","sentences","paragraphs_len","sentences_len","removed_paragraphs"]]=df["text"].apply(split_long_paragraphs, max_char_in_paragraph=4300)
#df["paragraph_sum"]=df.paragraphs_len.apply(lambda x: sum(x))


In [None]:
print(df.removed_paragraphs.explode().dropna().to_list()[:40])

In [None]:
# Check what sentences are almost as long as the max setting

sss=df[df.sentences.apply(lambda x: any([len(i)>2900 for i in x]))]["sentences"]
for s in sss:
    for e,i in enumerate(s):
        if len(i)>800:
            print(e,i,len(i))

In [None]:
x=df.explode("sentences_len")

x[x.sentences_len<1000]["sentences_len"].plot.hist(bins=40)
#x["sentences_len"]

In [None]:
x=df.explode("paragraphs_len")

x[x.paragraphs_len<5000]["paragraphs_len"].plot.hist(bins=60)

In [None]:
d(20)
df[["paragraphs","sentences","paragraphs_len","sentences_len"]]

In [None]:
# Check the lenght of paragraphs
d(5000)

# manually remove this -- its too long.
df=df[df.title!='Research Posters - Faculty of Geosciences - Utrecht University']


df[df.sentences_len.apply(lambda x: any(i > 4000 for i in x))]

In [None]:
df[df["paragraphs_len"].apply(lambda x: any([p<10 for p in x]))]["paragraphs_len"]


In [None]:
# Check if there is one column longer than what google translates.
df[df[column].apply(len)>4900]

# RUN THE NLP PIPELINE

In [None]:
test=df.head(50).copy()

print("TEST RUN")

source_column="text" #or "paragraphs"
translated_column="text"#_translated"
test=test.explode(source_column)

test[["text_translated","source_language"]]=test.apply(lambda x: nlp.GoogleTrans(x[source_column],x["language"]), axis=1)

test=nlp.NLP_Pipeline(test, text_column=translated_column, sentiment=False, metaphors=False)

d(200)
test["NoStopwords"]

In [None]:
# translate

df=df.explode("paragraphs")

df[["text_translated","source_language"]]=df.progress_apply(lambda x: nlp.GoogleTrans(x["paragraphs"],x["language"]), axis=1)

df['P_counter'] = df.groupby('key').cumcount()
df["key_P"]=df.apply(lambda x: str(x.key)+"_"+str(x.P_counter), axis=1)




In [None]:
relevant_columns=["key","key_P","url","filename","Uni",'itemType',"date",\
                  "language","filepath","links","text_translated","source_language"]
df=df[relevant_columns]
df.rename(columns={"text_translated":"text"},inplace=True
          
df.reset_index(inplace=True)
df.to_json("2023-09-12_ChatGPT_translated.json")


In [2]:
df=pd.read_json("2023-09-12_ChatGPT_translated.json")

df.head()

Unnamed: 0,key,key_P,url,filename,Uni,itemType,date,language,filepath,links,text,source_language
0,KYXS34U3,KYXS34U3_0,https://www.ru.nl/en/cls/clst,clst.html,Radboud University,webpage,2023-06-28,en,C:\Users\mr\Zotero\storage\KYXS34U3\clst.html,,Centre for Language and Speech Technology We a...,en
1,KYXS34U3,KYXS34U3_1,https://www.ru.nl/en/cls/clst,clst.html,Radboud University,webpage,2023-06-28,en,C:\Users\mr\Zotero\storage\KYXS34U3\clst.html,,The amount of information available in our dig...,en
2,KYXS34U3,KYXS34U3_2,https://www.ru.nl/en/cls/clst,clst.html,Radboud University,webpage,2023-06-28,en,C:\Users\mr\Zotero\storage\KYXS34U3\clst.html,,Playfully practicing speaking and reading 28 J...,en
3,KYXS34U3,KYXS34U3_3,https://www.ru.nl/en/cls/clst,clst.html,Radboud University,webpage,2023-06-28,en,C:\Users\mr\Zotero\storage\KYXS34U3\clst.html,,Learning to read better with software that lis...,en
4,KYXS34U3,KYXS34U3_4,https://www.ru.nl/en/cls/clst,clst.html,Radboud University,webpage,2023-06-28,en,C:\Users\mr\Zotero\storage\KYXS34U3\clst.html,,HoMed (Homo Medicinalis) Research HoMed (Homo ...,en


In [3]:
relevant_columns=["key","key_P","url","filename","Uni",'itemType',"date",\
                  "language","filepath","links","text","source_language"]

df=df[relevant_columns]
df=nlp.NLP_Pipeline(df, text_column="text", sentiment=False, metaphors=False)

Current Time = 00:18:56
len:  4897
splitting to sentences.
len:  32964
Token & Lemmatizing & stopword removal & modal_word.


100%|████████████████████████████████████████████████████████████████████████████| 32964/32964 [50:23<00:00, 10.90it/s]


Current Time = 01:09:48


In [21]:
d(5)

print(url)

Series([], Name: url, dtype: object)


In [22]:
from transformers import pipeline
from nltk.corpus import stopwords

model="cardiffnlp/twitter-roberta-base-sentiment-latest"
sentiment_pipe = pipeline("sentiment-analysis", model=model, max_length=512, truncation=True)

print("Stopwording done. Next: sentiment.")                              
df["sentiment"] = df.progress_apply(nlp.roberta_sentiment,column="sentences", axis=1,pipe=sentiment_pipe)



Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Stopwording done. Next: sentiment.


100%|████████████████████████████████████████████████████████████████████████████| 32963/32963 [41:07<00:00, 13.36it/s]


In [23]:

print("Now - metaphors.")  

stop_words= list(set(stopwords.words('english')))


metaphor_pipe = pipeline("token-classification", model="CreativeLang/metaphor_detection_roberta_seq")

df["metaphors"] = df.progress_apply(nlp.classify_metaphors, axis=1,column="sentences",stop_words=stop_words,pipe=metaphor_pipe)
df["metaphors_n"] = df.metaphors.apply(lambda x: len(x))

Now - metaphors.


100%|████████████████████████████████████████████████████████████████████████████| 32963/32963 [41:27<00:00, 13.25it/s]


In [24]:

df.reset_index(inplace=True)
df.to_json("2023-09-14_ChatGPT_NLP_met_sent.json")