In [None]:
import pandas as pd

import MyLib.nlp as nlp
import MyLib.HTML_prep as HTML_prep
import MyLib.analysis as analysis 
%load_ext autoreload
%autoreload 2

## My API key  & library_Id are stored in another file
api_key,library_id = pd.read_json("Zotero_API_key.json", typ='series')

library_type="group"

def print_time():
    """Print the current local wall-clock time as ``Current Time = HH:MM:SS``."""
    from datetime import datetime

    stamp = datetime.now().strftime("%H:%M:%S")
    print("Current Time =", stamp)


from pyzotero import zotero
zot = zotero.Zotero(library_id, library_type, api_key)

ID_Universities='EWSYI3RS' # to access the files in the university subfolder.
ID_SURF='WPXRQVIU'

In [None]:
def _collection_records(collection_key, uni_name):
    """Download all items of one Zotero collection, keyed by item key.

    Each value is the item's ``data`` dict extended with a ``"Uni"`` entry that
    records which collection the item came from.
    """
    # zot.everything() keeps requesting follow-up pages; a bare
    # collection_items() call only returns the first page of results.
    items = zot.everything(zot.collection_items(collection_key))
    return {item["key"]: item["data"] | {"Uni": uni_name} for item in items}


helpDict={}

# One sub-collection per university below the "Universities" collection.
for i in zot.everything(zot.collections_sub(ID_Universities)):
    name,key=i["data"]["name"],i["data"]["key"]
    print(f"name: {name} --> key: {key}")
    helpDict.update(_collection_records(key, name))

#Add Surf
name="surf"
key=ID_SURF
print(f"name: {name} --> key: {key}")
helpDict.update(_collection_records(key, name))

In [None]:
df=pd.DataFrame(helpDict).T

In [None]:
# Keys named in the parentItem column that do not occur in df's own index.
Claimed_Parents = df["parentItem"].dropna().tolist()
Real_Parents = df.index.tolist()
WithoutParents = list(set(Claimed_Parents).difference(Real_Parents))
WithoutParents

In [None]:
Parents_columns = ['title', 'date', 'itemType', 'language']
WithParents_columns = ["key", 'parentItem', 'url', 'filename', "Uni"]

# Rows that name a parent item, and rows whose itemType is not "attachment".
has_parent = df["parentItem"].notna()
not_attachment = df["itemType"] != "attachment"
WithParents = df.loc[has_parent, WithParents_columns]
Parents = df.loc[not_attachment, Parents_columns]

# Right join on the parent key: every row of Parents is kept, joined with the
# rows of WithParents whose parentItem equals its index label.
df = pd.merge(WithParents, Parents, left_on="parentItem", right_index=True, how="right")
df.head(5)

#df2[df2.filename.isna()].itemType

In [None]:
df.Uni.value_counts()

In [None]:
# Correct the item type of UTwente theses
df.loc[df.url.apply(lambda x: "essay.utwente.nl" in str(x)),"itemType"]="thesis"
print(df.itemType.value_counts())

In [None]:
# remove not useful items
DropTypeList=["journalArticle","conferencePaper","thesis","book","bookSection","note","film","dataset"]

df=df[~df["itemType"].isin(DropTypeList)]

print(df.itemType.value_counts())

In [None]:
def fileName(key,filename,storage_dir="C:\\Users\\mr\\Zotero\\storage"):
    """Build the local path of a Zotero attachment.

    Parameters
    ----------
    key : str
        Item key; used as the name of the sub-folder inside ``storage_dir``.
    filename : object
        File name of the attachment. It is converted with ``str()``, so a
        missing value such as NaN ends up as the literal text ``nan``.
    storage_dir : str, optional
        Root folder of the local Zotero storage. Defaults to the location that
        was previously hard-coded; pass another folder to run elsewhere.

    Returns
    -------
    str
        ``storage_dir``, ``key`` and ``filename`` joined with backslashes.
    """
    # Tolerate a trailing separator on a user-supplied folder.
    root = storage_dir.rstrip("\\/")
    return root + "\\" + str(key) + "\\" + str(filename)

# NOTE(review): drop_duplicates() only shortens the Series; on assignment the
# rows whose path was a repeat end up without a filepath (NaN) — confirm that
# this is intended rather than dropping those rows from df.
df["filepath"]=df.apply(lambda x: fileName(x.key, x.filename), axis=1).drop_duplicates()

# Show the first path and the number of rows. iloc is positional, so it does
# not depend on the index labels; the row count is passed as a value instead
# of through a nested print(), which printed a stray "None".
print(df.filepath.iloc[0], len(df))

In [None]:
df.to_json("2023-07-30_Uni_Files_raw.json")
# Check for duplicates / errors in Zotero:
df[df.index.duplicated()]

In [None]:
# NOTE(review): this loads the 2023-07-27 export, whereas the preceding cell
# writes "2023-07-30_Uni_Files_raw.json" — confirm which snapshot is intended.
df=pd.read_json("2023-07-27_Uni_Files_raw.json")

# Add Content from PDFs

In [None]:
def getLinksfromPDF(page):
    """Return the '/URI' targets of the link annotations on one PDF page.

    ``page`` is a mapping that may contain an '/Annots' entry. Every annotation
    is resolved with ``get_object()``; annotations without an '/A' action, or
    whose action carries no '/URI', are ignored. The result keeps page order
    and is empty when the page has no annotations.
    """
    if '/Annots' not in page.keys():
        return []

    found = []
    for reference in page['/Annots']:
        annotation = reference.get_object()
        if '/A' not in annotation:
            continue
        action = annotation['/A']
        if '/URI' in action.keys():
            found.append(action['/URI'])
    return found

def getDate(pdf_reader):
    """Return the modification date stored in a PDF's metadata, or ``None``.

    Parameters
    ----------
    pdf_reader : object
        Reader whose ``metadata`` attribute is either ``None`` or a mapping
        that may contain a '/ModDate' entry such as ``D:20230727101500+02'00'``.

    Returns
    -------
    pandas.Timestamp or None
        The calendar date (year, month, day) of '/ModDate'. ``None`` when there
        is no metadata, no '/ModDate' entry, or the entry cannot be parsed.
    """
    metaData=pdf_reader.metadata
    # A document without an information dictionary has no metadata at all.
    if metaData is None or '/ModDate' not in metaData:
        return None

    raw=str(metaData['/ModDate'])
    # The "D:" prefix is not always present, so strip it only when it is there
    # instead of blindly skipping two characters.
    if raw.startswith("D:"):
        raw=raw[2:]
    if len(raw)<8:
        return None

    try:
        return pd.to_datetime(raw[:8],format="%Y%m%d")
    except (ValueError, TypeError):
        # One malformed date must not abort the extraction of a whole corpus.
        return None
    
def extract_text_with_pyPDF(filepath,MaxPages=20):
    """Read text, link targets and the modification date from a PDF file.

    Parameters
    ----------
    filepath : str
        Path of the PDF, handed to ``pypdf.PdfReader``.
    MaxPages : int, optional
        Only the first ``MaxPages`` pages are processed.

    Returns
    -------
    pandas.Series
        Three positional entries: the list of non-empty page texts, the list
        of link targets collected from all processed pages, and the value
        returned by ``getDate`` for the document.

    Notes
    -----
    A page whose text extraction raises is reported on stdout and skipped.
    Errors while opening the file are not caught here.
    """
    from pypdf import PdfReader

    pdf_reader = PdfReader(filepath)
    mod_date=getDate(pdf_reader)

    # Cap the work per document at the first MaxPages pages.
    S_pages=pdf_reader.pages
    if len(S_pages)>MaxPages:
        S_pages=S_pages[:MaxPages]

    pages,links=[],[]
    for page in S_pages:
        try:
            text = page.extract_text()
            if text:
                pages.append(text)
        except Exception as error:
            print("text-problems with: ", filepath)
            print("\n", error)

        # Accumulate the links of every page; a plain assignment here would
        # keep only the links of the last page.
        links.extend(getLinksfromPDF(page))

    return pd.Series([pages,links,mod_date])

    
# Select the PDF attachments by file extension. The comparison ignores case
# (".PDF" counts as well) and rows without a usable path are treated as
# non-PDF instead of raising.
is_pdf=df.filepath.str.lower().str.endswith(".pdf", na=False)
PDFs=df.loc[is_pdf,["filepath","date"]]
NumberOfPDFs=len(PDFs)

print(f"the dataset containts {NumberOfPDFs} PDF-files.")

# The three positional outputs of extract_text_with_pyPDF become three columns;
# rows that are not PDFs receive missing values through index alignment.
df[["text","links","mod_date"]]=PDFs.filepath.apply(extract_text_with_pyPDF,MaxPages=150)

# Use filemod as date.
has_mod_date=df.mod_date.notna()
df.loc[has_mod_date,"date"]=df.loc[has_mod_date,"mod_date"]
df=df.drop(columns='mod_date')

#problemfile.filepath.apply(extract_text_with_pyPDF,MaxPages=10)

# Add Content from HTMLs

In [None]:
def find_date(url, date):
    """Return ``date`` if it is already a timestamp, else look one up for ``url``.

    Parameters
    ----------
    url : object
        Address handed to ``htmldate.find_date`` when ``date`` is unusable.
    date : object
        Existing date value; returned unchanged when it is a ``pd.Timestamp``.

    Returns
    -------
    pandas.Timestamp or float
        The existing or the looked-up date; ``nan`` when the lookup fails or
        finds nothing.
    """
    # Imported under another name so that it does not shadow this function.
    from htmldate import find_date as _find_date_in_html
    from numpy import nan

    if isinstance(date, pd.Timestamp):
        return date

    try:
        found=_find_date_in_html(url)
        # Report "nothing found" the same way as a failed lookup.
        return nan if found is None else pd.to_datetime(found)
    except Exception:
        # Exception (not a bare except) keeps KeyboardInterrupt working, so
        # the long-running lookup over all rows can still be interrupted.
        return nan


print("This takes about: 3 minutes.")

df["date"]=df.date.apply(pd.to_datetime,errors='coerce')
print_time()     

df["date"]=df.apply(lambda x: find_date(x.url, x.date), axis=1)
print_time()

df.date.apply(lambda x: type(x)==pd.Timestamp).value_counts()

In [None]:
df.to_json("2023-07-30_Uni_Files_raw_date.json")

In [None]:
df=pd.read_json("2023-07-30_Uni_Files_raw_date.json")

In [None]:
df.head(3)

In [None]:
def _empty_file_info():
    """Return the seven-field FileInfo result with every field set to None."""
    return pd.Series([None] * 7)


def _collect_from_tags(containers, tag, getter):
    """Apply ``getter`` to every ``tag`` element inside each container, in order."""
    return [getter(element) for container in containers for element in container(tag)]


def FileInfo(FilePath):
    """Extract title, headings, text blocks and links from a saved ``.html`` file.

    Parameters
    ----------
    FilePath : object
        Path of the file. Anything that is not a string ending in ``.html``
        produces an all-``None`` result.

    Returns
    -------
    pandas.Series
        Always seven positional entries, in this order: Title, h1, h2, h3,
        text, linkName, linkUrl. All are ``None`` when the file is not HTML or
        cannot be read. When parsing fails, ``text`` holds the raw file content
        split on blank lines and the remaining fields keep whatever had been
        extracted up to that point.
    """
    if not isinstance(FilePath, str) or not FilePath.endswith(".html"):
        return _empty_file_info()

    try:
        with open(FilePath,"r", encoding='utf-8') as f:
            text= f.read()
    except Exception:
        print("error opening the html file. File does not exist?")
        # Return the same shape as every other path: the caller expands the
        # result into seven columns, which a bare None would break.
        return _empty_file_info()

    Title,h1,h2,h3,linkName,linkUrl=None,None,None,None,None,None

    try:
        from bs4 import BeautifulSoup

        soup = BeautifulSoup(text, "html5lib")
        # A page without <title> is still parsed; it simply has no title.
        Title=soup.title.text if soup.title is not None else None

        # Containers that directly hold paragraphs or headings.
        TextIndicators=soup("p")+soup("h5")+soup("h4")+soup("h3")+soup("h2")+soup("h1")
        container_names=["div","main","section","article","center","td"]
        All_divs=[p.parent for p in TextIndicators if p.parent.name in container_names]
        # Drop repeated containers while keeping first-seen order.
        All_divs=list(dict.fromkeys(All_divs))

        # Remove images
        try:
            IMG_filename="".join(x for x in (Title or "").strip().replace(" ","_") if x.isalnum() or x=="_")[:100]
            All_divs=[HTML_prep.removeIMGs(div,write_img=False,IMG_filename=IMG_filename) for div in All_divs]
        except Exception:
            print(f"error removing images: {FilePath}")

        h1=_collect_from_tags(All_divs,"h1",lambda tag: tag.text)
        h2=_collect_from_tags(All_divs,"h2",lambda tag: tag.text)
        h3=_collect_from_tags(All_divs,"h3",lambda tag: tag.text)

        linkName=_collect_from_tags(All_divs,"a",lambda tag: tag.text)
        linkUrl=_collect_from_tags(All_divs,"a",lambda tag: tag.get('href'))

        # All human-readable text of each container, headlines included.
        text=[i.get_text(separator=u' ').replace(".",". ").replace("\n ",". ") for i in All_divs if i.get_text()!=None]

    except Exception:
        print(f"error with: {FilePath}", end=". ")

        # Fall back to treating the file as plain text.
        try:
            text=text.split("\n\n")
            print("--> text from reading as a text file.")
        except Exception:
            print("also no text file")

    # Leave out "words" of 100 characters or more; these are typically
    # undetected embedded images or similar non-text payloads.
    text=[" ".join([y.replace("\n"," ").strip() for y in i.split(" ") if len(y)<100]) for i in text]

    return pd.Series([Title,h1,h2,h3,text,linkName,linkUrl])
        
#df.file[2:6].apply(FileInfo)
print("This takes about 4 minutes")
print_time()        
df[["HTML_Title","h1","h2","h3","HTML_text","linkName","linkUrl"]]=df.filepath.apply(FileInfo)
print_time()

In [None]:
# NOTE(review): DoubleIndex is first assigned in a later cell (from
# df.text.duplicated()), so unless it is defined elsewhere this cell raises
# NameError when the notebook is run top to bottom on a fresh kernel.
df["filepath"][DoubleIndex]#.apply(FileInfo)

In [None]:
# Unify & drop columns.

# A row is an HTML document when FileInfo produced text for it. notna() treats
# None and NaN alike, so rows whose missing value is NaN are not flagged.
df["HTML"]=df["HTML_text"].notna()

# For HTML rows the values parsed from the page replace the generic columns.
df.loc[df["HTML"], "text"]=df.HTML_text
df.loc[df["HTML"], "title"]=df.HTML_Title
df.loc[df["HTML"], "links"]=df.linkUrl

# Normalise the spellings of the language codes.
df.language=df.language.replace(["en-US","en-GB","en-us","en_US","English"],"en")
df.language=df.language.replace(["nl","nl-NL","nederlands","nl-nl"],"nl")
# The dict form states the replacement value explicitly; with a bare
# replace("", None) some pandas versions forward-fill instead.
df.language=df.language.replace({"":None})

#improve text...
df.text=df.text.fillna("").apply(lambda l: [s.replace("..",". ").replace(". . ","").lstrip(". ") for s in l if isinstance(s,str)])

df=df.drop(columns=["HTML_text","HTML_Title","linkUrl","key","parentItem","h1","h2","h3"])


In [None]:
pd.set_option('display.max_colwidth', None)

# The text column holds lists, which are unhashable, so duplicated() cannot be
# applied to it directly; compare the string form of each list instead.
DoubleIndex=df[df.text.astype(str).duplicated()].index
#df=df.drop_duplicates(subset="text")
df.filepath[DoubleIndex]

In [None]:
df.to_json("2023-07-30_Uni_Files_NLP.json")

In [None]:
df=pd.read_json("2023-07-30_Uni_Files_NLP.json")



In [None]:
def Insert_title(text,Title):
    """Put the stripped title in front of a list of text blocks.

    ``text`` is returned unchanged when it is not a list, or when ``Title`` is
    not a string (for example ``None`` or NaN), so missing titles cannot raise.
    """
    if isinstance(Title, str) and isinstance(text, list):
        return [Title.strip()]+text
    return text


df["text"]=df[["text","title"]].apply(lambda x: Insert_title(*x),axis=1)

In [None]:
def _pack_sentences(sentences, max_len):
    """Greedily join consecutive sentences into chunks of at most ``max_len`` characters.

    Sentences are joined with single spaces and never cut, so a single sentence
    longer than ``max_len`` forms an oversized chunk of its own. No sentence is
    dropped and no empty chunk is produced.
    """
    chunks=[]
    current=""
    for s in sentences:
        s=s.strip()
        if not s:
            continue
        candidate=f"{current} {s}" if current else s
        if current and len(candidate)>max_len:
            chunks.append(current)
            print(f"{len(current)} + {len(s)} = {len(candidate)} --> split {len(chunks)} times.")
            # The new chunk starts with this sentence, so its length counts.
            current=s
        else:
            current=candidate
    if current:
        chunks.append(current)
    return chunks


def Splitter(paragraphs, max_len=5000):
    """Split text blocks into sentences and into size-limited paragraphs.

    Parameters
    ----------
    paragraphs : list of str
        Text blocks of one document. Any other input yields empty results;
        non-string entries inside the list are skipped.
    max_len : int, optional
        Upper bound for the length of an output paragraph, measured on the
        joined text including separating spaces.

    Returns
    -------
    pandas.Series
        Four positional entries: output paragraphs, all sentences, the lengths
        of the paragraphs and the lengths of the sentences.
    """
    sentences=[]
    paragraphs_out=[]

    if isinstance(paragraphs,list):
        import re
        from nltk.tokenize import sent_tokenize

        # A dot directly between a letter and a capitalised word lacks a space.
        regex = r"(?<=[a-zA-Z])\.(?=[A-Z][a-z])"
        subst = ". "

        for paragraph in paragraphs:
            if not isinstance(paragraph,str):
                continue

            # Normalise line breaks and punctuation before tokenising.
            paragraph=paragraph.replace("\n",". ").replace("!"," ").replace("?"," ").replace("..",". ").replace(". . ","")
            paragraph = re.sub(regex, subst, paragraph, count=0, flags=re.MULTILINE)
            paragraph=paragraph.replace("  "," ").replace(" .",".").lstrip(". ")

            sentence=[s for s in sent_tokenize(paragraph) if len(s)>1]

            sentences+=sentence
            paragraphs_out.extend(_pack_sentences(sentence, max_len))

    sentences_len=[len(i) for i in sentences]
    paragraphs_len=[len(i) for i in paragraphs_out]

    return pd.Series([paragraphs_out,sentences,paragraphs_len,sentences_len])


df[["paragraphs","sentences","paragraphs_len","sentences_len"]]=df["text"].apply(Splitter, max_len=4500)
df["paragraph_sum"]=df.paragraphs_len.apply(lambda x: sum(x))

#df["text"].apply(Splitter)

In [None]:
# Check the length of paragraphs

df[df.paragraphs_len.apply(lambda x: any(i > 5000 for i in x))]

In [None]:

# Just checking whether the long paragraphs are necessary...
df=df[df.title!='Research Posters - Faculty of Geosciences - Utrecht University']

#df=df[df["paragraph_sum"]<100000]


pd.set_option('display.max_colwidth', 255)

df[df["paragraph_sum"]>100000].filepath

In [None]:
df.to_json("2023-07-30_Uni_Files_NLP_splitter.json")


In [None]:
df=pd.read_json("2023-07-30_Uni_Files_NLP_splitter.json")

In [None]:
df.columns

# Explode --> by sentence or by paragraph?

In [None]:
#df=df.explode("sentences").drop_duplicates(subset="sentences").reset_index(drop=True)

df.paragraphs.explode()

In [None]:
df.language.value_counts()

### I will filter in the analysis file. Therefore this code is not needed anymore...

ChatGPT_Terms="ChatGPT, Chat-GPT, GPT3, GPT-3, GPT-x, GPT-4, GPT4,\
Transformer, OpenAI, AI, hallucination, Text generation, LLM, GPT, Chatbot, Models, generative, Intelligence, Model"

import MyLib.nlp as nlp

df["AI_paragraphs"]=df.paragraphs.dropna().apply(nlp.filter_paragraphs,by=ChatGPT_Terms).dropna()

df["AI_Paragraphs_len"]=df["AI_paragraphs"].apply(lambda x: [len(i) for i in x])
df[df.AI_Paragraphs_len.apply(lambda x: any(i > 4999 for i in x))]


In [None]:
column="sentences"

#df=df[df.file.apply(lambda x: x.endswith("html"))]

df=df.explode(column).reset_index(drop=True)
df=df[df[column].apply(lambda x: type(x)==str)]
df=df.drop_duplicates(column)
df["LEN"]=df[column].apply(lambda x: len(x.split(" ")))
print(len(df))

In [None]:
df.text[:2]

In [None]:
#df[df.sentences.apply(lambda x: len(x.replace('[^a-zA-Z]', ''))<10)]["sentences"]

In [None]:
#remove tables of content
df=df[df.sentences.apply(lambda x: x.count(".")<10)]

# Remove nonsense sentences: keep only those with more than 10 ASCII letters.
# Python's str.replace() takes its first argument literally, so the character
# class has to be applied as a regular expression through the .str accessor.
letters_only=df.sentences.str.replace('[^a-zA-Z]', '', regex=True)
df=df[letters_only.str.len()>10]

df=df[df.sentences.apply(len)>15]



In [None]:
pd.set_option('display.max_colwidth', None)
PDF=df[df.HTML==False]

# test what the pdf sentences with dots look like now.

PDF[PDF.sentences.apply(lambda x: x.count(".")>8)].sentences.head(5)

In [None]:
# Check: remove short strings that dont resemble sentences.

n=16 # ab 17
c=df[df.sentences.apply(len)<n]
print("LEN: ", len(c))      
[print(i) for i in df[df.sentences.apply(len)<n].sentences]

In [None]:
df[df.sentences.apply(len)>4500]

In [None]:
df.columns

# RUN THE NLP PIPELINE

In [None]:
# test

test=pd.DataFrame(df.head(5).to_dict())
test["t"]=test.url
print(pd. __version__)

test=nlp.NLP_Pipeline(test, text_column=column, target_language="en",sentiment=False)
test["source_language"]

In [None]:
test.head(20).sentences

In [None]:
## takes approximately 120 minutes for translation (last time)
df=nlp.NLP_Pipeline(df, text_column=column, target_language="en",sentiment=False)

In [None]:
df.columns

In [None]:
df.to_json("2023-07-27_ChatGPT_Sentences_NLP-Out.json")

In [None]:
df.columns

# Column subset that is exported in the next cell.
# NOTE(review): 'linkUrl' is removed from df in the "Unify & drop columns" cell,
# and names such as 'FileKey', 'Title' or 'text_clean' are not created in this
# notebook — presumably they are added by nlp.NLP_Pipeline; verify, otherwise
# this selection raises KeyError.
df2=df[['Uni','FileKey','url','date', 'Title','linkName', 'linkUrl', 'AI_paragraphs','text_clean', 'letters_count', 'word_count',
       'language', 'source_language', 'pure_text', 'Lemmata', 'NoStopwords']]

In [None]:
df2.to_json("2023-06-06_Zotero_AI_nlp_en2.json")