In [2]:
import pandas as pd

import MyLib.nlp as nlp
import MyLib.HTML_prep as HTML_prep
import MyLib.analysis as analysis 
%load_ext autoreload
%autoreload 2

## My API key  & library_Id are stored in another file
api_key,library_id = pd.read_json("Zotero_API_key.json", typ='series')

library_type="group"

def print_time():
    from datetime import datetime
    now = datetime.now()
    current_time = now.strftime("%H:%M:%S")
    print("Current Time =", current_time)


from pyzotero import zotero
zot = zotero.Zotero(library_id, library_type, api_key)

ID_Universities='EWSYI3RS' # to access the files in the university subfolder.
ID_SURF='WPXRQVIU'

In [None]:
helpDict={}

for i in zot.collections_sub(ID_Universities):
    name,key=i["data"]["name"],i["data"]["key"]
    print(f"name: {name} --> key: {key}")
    collection_items=zot.collection_items(key)
    helpDict.update({k["key"]:k["data"]|{"Uni":name} for k in collection_items})
    
#Add Surf
name="surf"
key=ID_SURF
print(f"name: {name} --> key: {key}")
collection_items=zot.collection_items(key)
helpDict.update({k["key"]:k["data"]|{"Uni":name} for k in collection_items})

In [None]:
df=pd.DataFrame(helpDict).T

In [None]:
Claimed_Parents=df["parentItem"].dropna().to_list()
Real_Parents=df.index.to_list()
WithoutParents=list(set(Claimed_Parents)-set(Real_Parents))
WithoutParents

In [None]:
Parents_columns=['title', 'date','itemType','language']
WithParents_columns=["key",'parentItem','url','filename',"Uni"]

WithParents=df.dropna(subset="parentItem")[WithParents_columns]
Parents=df[df.itemType.apply(lambda x: x!="attachment")][Parents_columns]

df=WithParents.merge(Parents, left_on="parentItem",right_index=True, how="right")
df.head(5)

#df2[df2.filename.isna()].itemType

In [None]:
df.Uni.value_counts()

In [None]:
# Correct item type of uTwente Thesies
df.loc[df.url.apply(lambda x: "essay.utwente.nl" in str(x)),"itemType"]="thesis"
print(df.itemType.value_counts())

In [None]:
# remove not useful items
DropTypeList=["journalArticle","conferencePaper","thesis","book","bookSection","note","film","dataset"]

df=df[~df["itemType"].isin(DropTypeList)]

print(df.itemType.value_counts())

In [None]:
def fileName(key,filename):
    DIR=f"C:\\Users\\mr\\Zotero\\storage\\{key}\\"
    filename=DIR+str(filename)
    return filename

df["filepath"]=df.apply(lambda x: fileName(x.key, x.filename), axis=1).drop_duplicates()

print(df.filepath[0],print(len(df)))

In [None]:
df.to_json("2023-07-30_Uni_Files_raw.json")
# CHeck for duplicates / errors in Zotero: 
df[df.index.duplicated()]

In [None]:
df=pd.read_json("2023-07-27_Uni_Files_raw.json")

# Add Content from PDFs

In [None]:
def getLinksfromPDF(page):
    links=[]

    if '/Annots' in page.keys():
        annotations=page['/Annots']
        for a in annotations:
            a=a.get_object()
            if '/A' in a:
                A=a['/A']
                if '/URI' in A.keys():
                    link=(A['/URI'])
                    links.append(link)
    return links

def getDate(pdf_reader):
    mod_date=None
    metaData=pdf_reader.metadata
    if '/ModDate' in metaData.keys():
        #print(metaData.keys())
        mod_date=metaData['/ModDate'][2:10]
        dtformat = "%Y%m%d"
        mod_date=pd.to_datetime(mod_date,format=dtformat)
        #print(creation_date)
    return mod_date
    
def extract_text_with_pyPDF(filepath,MaxPages=20):
    
    pages,links,mod_date=[],[],None
    from pypdf import PdfReader
    
    pdf_reader = PdfReader(filepath)
    mod_date=getDate(pdf_reader)
    
    S_pages=pdf_reader.pages
    if len(S_pages)>MaxPages:
        f=filepath.split("\\")[-1]
        #print(f"{f} has more than {MaxPages} pages: {len(S_pages)}. Only processing {MaxPages} pages.")
        S_pages=S_pages[:MaxPages]

    for i, page in enumerate(S_pages):
        raw_text = ""
        try:
            text = page.extract_text()
            if text:
                raw_text += text
                pages.append(raw_text)
        except Exception as error:
            print("text-problems with: ", filepath)
            print("\n", error)
            
    for i, page in enumerate(S_pages):
        links=getLinksfromPDF(page)

    return pd.Series([pages,links,mod_date])

    
PDFs=df.loc[df.filepath.apply(lambda x: x.split(".")[-1]=="pdf"),["filepath","date"]]    
NumberOfPDFs=len(PDFs)    

print(f"the dataset containts {NumberOfPDFs} PDF-files.")

df[["text","links","mod_date"]]=PDFs.filepath.apply(extract_text_with_pyPDF,MaxPages=150)

# Use filemod as date.
df.loc[~df.mod_date.isna(),["date"]]=df.mod_date
df.drop(columns='mod_date',inplace=True)

#problemfile.filepath.apply(extract_text_with_pyPDF,MaxPages=10)

# Add Content from HTMLs

In [None]:
def find_date(url, date):
    from htmldate import find_date
    from numpy import nan
    if type(date)!=pd.Timestamp:
        try:
            date=pd.to_datetime(find_date(url))
        except:
            date=nan
    return date


print("This takes about: 3 minutes.")

df["date"]=df.date.apply(pd.to_datetime,errors='coerce')
print_time()     

df["date"]=df.apply(lambda x: find_date(x.url, x.date), axis=1)
print_time()

df.date.apply(lambda x: type(x)==pd.Timestamp).value_counts()

In [None]:
df.to_json("2023-07-30_Uni_Files_raw_date.json")

In [3]:
df=pd.read_json("2023-07-30_Uni_Files_raw_date.json")

In [None]:
df.head(3)

In [None]:
def FileInfo(FilePath):
    Title,h1,h2,h3,text,linkName,linkUrl=None,None,None,None,None,None,None

    if FilePath.endswith(".html"):
           
        try:
            #print(FilePath)

            with open(FilePath,"r", encoding='utf-8') as f:
                text= f.read()
            
        except:
            print("error opening the html file. File does not exist?")
            return
        
        
        try:
            from bs4 import BeautifulSoup

            soup = BeautifulSoup(text, "html5lib") #'html.parser')       
            Title=soup.title.text
            #soup=HTML_prep.removeIMGs(soup,IMG_filename=Title.replace(" ","_"))
            
            TextIndicators=soup("p")+soup("h5")+soup("h4")+soup("h3")+soup("h2")+soup("h1")

            All_divs=[p.parent for p in TextIndicators if p.parent.name in ["div","main","section","article","center","td"]]
     
            All_divs=list(dict.fromkeys(All_divs))
            
            
            # Remove images
            try:
                IMG_filename="".join(x for x in Title.strip().replace(" ","_") if x.isalnum() or x=="_")[:100]
                All_divs=[HTML_prep.removeIMGs(div,write_img=False,IMG_filename=IMG_filename) for div in All_divs]        
            except:
                print(f"error removing images: {FilePath}")
                
            h1=[item for sublist in [[i.text for i in div("h1")] for div in All_divs if div("h1")!=None] for item in sublist]
            h2=[item for sublist in [[i.text for i in div("h2")] for div in All_divs if div("h2")!=None] for item in sublist]
            h3=[item for sublist in [[i.text for i in div("h3")] for div in All_divs if div("h3")!=None] for item in sublist]
        

            # here only get all divs
            #text=[item for sublist in [[i.text for i in div("p")] for div in All_divs if div("p")!=None] for item in sublist]
            
            # here gets all human readable text parts --> includes headlines. the \n \n keeps distance to headlines
           
            linkName=[item for sublist in [[i.text for i in div("a")] for div in All_divs if div("a")!=None] for item in sublist]
            linkUrl=[item for sublist in [[i.get('href') for i in div("a")] for div in All_divs if div("a")!=None] for item in sublist]
            #i=i("p")+i("h5")+i("h4")+i("h3")+i("h2")+i("h1")
            text=[i.get_text(separator=u' ').replace(".",". ").replace("\n ",". ") for i in All_divs if i.get_text()!=None]

        except:
            print(f"error with: {FilePath}", end=". ")
            
            try:
                text=text.split("\n\n")
 
                print("--> text from reading as a text file.")
            except:
                print("also no text file")
                
        #leave out words longer than 100 characters to avoid undetected embedded images and other shit.
        text=[" ".join([y.replace("\n"," ").strip() for y in i.split(" ") if len(y)<100]) for i in text] 
       # text=["".join([y for y in i if len(y)<100]) for i in text]
    
    return pd.Series([Title,h1,h2,h3,text,linkName,linkUrl])
        
#df.file[2:6].apply(FileInfo)
print("This takes about 4 minutes")
print_time()        
df[["HTML_Title","h1","h2","h3","HTML_text","linkName","linkUrl"]]=df.filepath.apply(FileInfo)
print_time()

In [None]:
df["filepath"][DoubleIndex]#.apply(FileInfo)

In [None]:
# Unify & drop columns.

df.loc[df["HTML_text"].apply(lambda x: x is not None), "HTML"]=True
df["HTML"]=df["HTML"].fillna(False)

df.loc[df["HTML"], "text"]=df.HTML_text
df.loc[df["HTML"], "title"]=df.HTML_Title
df.loc[df["HTML"], "links"]=df.linkUrl

df.language=df.language.replace(["en-US","en-GB","en-us","en_US","en_US","English"],"en")
df.language=df.language.replace(["nl","nl-NL","nederlands","nl-nl"],"nl")
df.language=df.language.replace("",None)

#improve text...
df.text=df.text.fillna("").apply(lambda l: [s.replace("..",". ").replace(". . ","").lstrip(". ") for s in l if isinstance(s,str)])

df.drop(columns=["HTML_text","HTML_Title","linkUrl","key","parentItem","h1","h2","h3"],inplace=True)


In [None]:
pd.set_option('display.max_colwidth', None)

DoubleIndex=df[df.text.duplicated()].index
#df=df.drop_duplicates(subset="text")
df.filepath[DoubleIndex]

In [None]:
df.to_json("2023-07-30_Uni_Files_NLP.json")

In [4]:
df=pd.read_json("2023-07-30_Uni_Files_NLP.json")



In [5]:
def Insert_title(text,Title):
    if Title!=None and type(text)==list:
        text=[Title.strip()]+text
        return text
    else:
        return text


df["text"]=df[["text","title"]].apply(lambda x: Insert_title(*x),axis=1)

In [6]:
def Splitter(paragraphs, max_len=5000):
    
    sentences=[]
    paragraphs_out=[]
    if isinstance(paragraphs,list):
        
        for paragraph in paragraphs:
            paragraph_out=[] 
       
            from nltk.tokenize import sent_tokenize
            

            
        
            paragraph=paragraph.replace("\n",". ").replace("!"," ").replace("?"," ").replace("..",". ").replace(". . ","")
            
            
            
            
            # fill in spaces after dot if followed by capital character
            import re
            regex = r"(?<=[a-zA-Z])\.(?=[A-Z][a-z])"
            subst = ". "
            paragraph = re.sub(regex, subst, paragraph, 0, re.MULTILINE)
            paragraph=paragraph.replace("  "," ").replace(" .",".").lstrip(". ")
            
            sentence=sent_tokenize(paragraph) # per paragraph
            sentence=[s for s in sentence if len(s)>1]

            counter = 0
            P="" # will be filled with sentences untill it has 5000 characters.
            for s in sentence:
                counter += len(s) # calculate the lenght if it would be added.

                if counter > max_len: # does not add the sentence to the paragraph if too long
                    
                    paragraph_out.append(P)
                    
                    L=len(paragraph_out[-1])
                    print(f"{L} + {len(s)} = {counter} --> split {len(paragraph_out)} times.")
                
                    counter=0
                    P=""
                    
                if counter < max_len:
                    P=P.strip()+" "+ s.strip()
            
            paragraph_out.append(P)

            sentences+=sentence
            
            paragraphs_out.extend(paragraph_out)
                
                
    sentences_len=[len(i) for i in sentences]
    paragraphs_len=[len(i) for i in paragraphs_out]
 
        
    return pd.Series([paragraphs_out,sentences,paragraphs_len,sentences_len])


df[["paragraphs","sentences","paragraphs_len","sentences_len"]]=df["text"].apply(Splitter, max_len=4500)
df["paragraph_sum"]=df.paragraphs_len.apply(lambda x: sum(x))

#df["text"].apply(Splitter)

4037 + 580 = 4598 --> split 1 times.
5013 + 321 = 4725 --> split 2 times.
4521 + 49 = 4533 --> split 1 times.
4125 + 580 = 4684 --> split 1 times.
5013 + 321 = 4725 --> split 2 times.
4823 + 38 = 4509 --> split 3 times.
4554 + 125 = 4602 --> split 4 times.
4478 + 115 = 4553 --> split 1 times.
4456 + 111 = 4526 --> split 1 times.
4537 + 46 = 4539 --> split 1 times.
4519 + 140 = 4632 --> split 1 times.
4442 + 102 = 4506 --> split 1 times.
4539 + 47 = 4546 --> split 1 times.
4478 + 244 = 4684 --> split 1 times.
4407 + 250 = 4626 --> split 1 times.
4387 + 189 = 4552 --> split 1 times.
4511 + 41 = 4502 --> split 1 times.
4363 + 258 = 4600 --> split 1 times.
4522 + 39 = 4515 --> split 1 times.
4536 + 55 = 4542 --> split 1 times.
4467 + 100 = 4527 --> split 1 times.
4403 + 339 = 4711 --> split 1 times.
4539 + 455 = 4611 --> split 2 times.
4505 + 142 = 4610 --> split 1 times.
4423 + 189 = 4576 --> split 1 times.
4491 + 125 = 4570 --> split 1 times.
4511 + 121 = 4586 --> split 1 times.
4512 + 8

4479 + 94 = 4537 --> split 1 times.
4582 + 29 = 4524 --> split 1 times.
4373 + 179 = 4512 --> split 1 times.
4674 + 189 = 4643 --> split 2 times.
4482 + 138 = 4580 --> split 1 times.
4514 + 97 = 4570 --> split 1 times.
4424 + 167 = 4559 --> split 1 times.
4551 + 149 = 4510 --> split 2 times.
4454 + 149 = 4582 --> split 1 times.
4533 + 75 = 4560 --> split 1 times.
4450 + 110 = 4514 --> split 1 times.
4542 + 196 = 4692 --> split 1 times.
4709 + 50 = 4515 --> split 2 times.
4443 + 144 = 4554 --> split 1 times.
4443 + 144 = 4554 --> split 1 times.
4531 + 53 = 4548 --> split 1 times.
4344 + 214 = 4518 --> split 1 times.
4622 + 177 = 4538 --> split 2 times.
4662 + 137 = 4580 --> split 3 times.
4518 + 191 = 4661 --> split 1 times.
4488 + 302 = 4766 --> split 1 times.
4798 + 120 = 4572 --> split 2 times.
4384 + 315 = 4675 --> split 1 times.
4829 + 109 = 4577 --> split 2 times.
4300 + 302 = 4579 --> split 1 times.
4798 + 120 = 4572 --> split 2 times.
4490 + 88 = 4542 --> split 1 times.
4494 + 8

In [7]:
# Check the lenght of paragraphs

df[df.paragraphs_len.apply(lambda x: any(i > 5000 for i in x))]

Unnamed: 0,url,filename,Uni,title,date,itemType,language,publicationTitle,filepath,text,links,linkName,HTML,paragraphs,sentences,paragraphs_len,sentences_len,paragraph_sum
RAM5QEQA,https://neerlandistiek.nl/2023/01/taalkundig-r...,taalkundig-redeneren-met-chatgpt.html,Radboud Nijmegen,Taalkundig redeneren met ChatGPT,2023-01-14 10:00:00,webpage,nl,,C:\Users\mr\Zotero\storage\RAM5QEQA\taalkundig...,"[Taalkundig redeneren met ChatGPT, Neerlandist...","[https://neerlandistiek.nl/, https://neerlandi...","[, , hier op Neerlandistiek, , , , , , , , , ,...",True,"[ Taalkundig redeneren met ChatGPT, Neerlandi...","[Taalkundig redeneren met ChatGPT, Neerlandist...","[33, 71, 4037, 5013, 2170, 379, 249, 144, 29, ...","[32, 70, 45, 125, 73, 301, 79, 214, 117, 364, ...",47859
C6PW5SJM,https://www.uu.nl/en/organisation/in-depth/unb...,unboxing-the-black-box-of-ai.html,Utrecht University,Unboxing the black box of AI - In depth - Utre...,2023-06-02 00:00:00,webpage,en,,C:\Users\mr\Zotero\storage\C6PW5SJM\unboxing-t...,[Unboxing the black box of AI - In depth - Utr...,"[https://www.uu.nl/staff/SRNyholm, https://www...","[Sven Nyholm\n \n \n external link, Philosophe...",True,[ Unboxing the black box of AI - In depth - Ut...,[Unboxing the black box of AI - In depth - Utr...,"[61, 4366, 4442, 4356, 5117, 2762, 2090, 35, 7...","[60, 66, 155, 87, 327, 171, 259, 94, 152, 266,...",44591
JCB8V5T7,https://www.uu.nl/en/organisation/faculty-of-g...,research-posters.html,Utrecht University,Research Posters - Faculty of Geosciences - Ut...,2023-07-17 00:00:00,webpage,en,,C:\Users\mr\Zotero\storage\JCB8V5T7\research-p...,[Research Posters - Faculty of Geosciences - U...,"[#1, #2, #3, #4, None, http://posters.geo.uu.n...","[Department Earth Sciences , Department Physic...",True,[ Research Posters - Faculty of Geosciences - ...,[Research Posters - Faculty of Geosciences - U...,"[63, 4431, 4762, 4585, 4942, 4592, 4659, 4798,...","[62, 420, 243, 215, 212, 157, 253, 165, 466, 2...",515421


In [8]:

# just checkin if the long paragraphs are necessary...
df=df[df.title!='Research Posters - Faculty of Geosciences - Utrecht University']

#df=df[df["paragraph_sum"]<100000]


pd.set_option('display.max_colwidth', 255)

df[df["paragraph_sum"]>100000].filepath

VRYGVYFT                   C:\Users\mr\Zotero\storage\VRYGVYFT\20230607-jaarverslag-2022-cooperatie-surf-u.a.-gewaarmerkt.pdf
KB9PEGPE                                                 C:\Users\mr\Zotero\storage\KB9PEGPE\dpia-zoom-25-february-2022_0.pdf
WX4G68MK                                C:\Users\mr\Zotero\storage\WX4G68MK\hosa-domainarchitecture-iam-v1.0-eng-gb-final.pdf
ALDWBD7G                                   C:\Users\mr\Zotero\storage\ALDWBD7G\promises-of-ai-in-education-june-2022-def2.pdf
8AWPG27Z                                          C:\Users\mr\Zotero\storage\8AWPG27Z\sf_trendrapport_nl_v4-gecomprimeerd.pdf
WE6Q67A7                                               C:\Users\mr\Zotero\storage\WE6Q67A7\sf_trendrapport_v10_compressed.pdf
KBLFTTUZ    C:\Users\mr\Zotero\storage\KBLFTTUZ\surf-rapport-herijking-van-de-visie-op-de-digitale-leeromgeving_webversie.pdf
Name: filepath, dtype: object

In [9]:
df.to_json("2023-07-30_Uni_Files_NLP_splitter.json")


In [None]:
df=pd.read_json("2023-07-30_Uni_Files_NLP_splitter.json")

In [10]:
df.columns

Index(['url', 'filename', 'Uni', 'title', 'date', 'itemType', 'language',
       'publicationTitle', 'filepath', 'text', 'links', 'linkName', 'HTML',
       'paragraphs', 'sentences', 'paragraphs_len', 'sentences_len',
       'paragraph_sum'],
      dtype='object')

# Explode --> by sentence or by paragraph?

In [None]:
#df=df.explode("sentences").drop_duplicates(subset="sentences").reset_index(drop=True)

df.paragraphs.explode()

In [None]:
df.language.value_counts()

### I will filter in the analysis file. Therefore this code is not needed anymore...

ChatGPT_Terms="ChatGPT, Chat-GPT, GPT3, GPT-3, GPT-x, GPT-4, GPT4,\
Transformer, OpenAI, AI, hallucination, Text generation, LLM, GPT, Chatbot, Models, generative, Intelligence, Model"

import MyLib.nlp as nlp

df["AI_paragraphs"]=df.paragraphs.dropna().apply(nlp.filter_paragraphs,by=ChatGPT_Terms).dropna()

df["AI_Paragraphs_len"]=df["AI_paragraphs"].apply(lambda x: [len(i) for i in x])
df[df.AI_Paragraphs_len.apply(lambda x: any(i > 4999 for i in x))]


In [11]:
column="sentences"

#df=df[df.file.apply(lambda x: x.endswith("html"))]

df=df.explode(column).reset_index(drop=True)
df=df[df[column].apply(lambda x: type(x)==str)]
df=df.drop_duplicates(column)
df["LEN"]=df[column].apply(lambda x: len(x.split(" ")))
print(len(df))

39930


In [None]:
df.text[:2]

In [None]:
#df[df.sentences.apply(lambda x: len(x.replace('[^a-zA-Z]', ''))<10)]["sentences"]

In [13]:
#remove tables of content
df=df[df.sentences.apply(lambda x: x.count(".")<10)]

#remove non-sense sentences by: if sentence is shorter than 10 characters without numbers.
df=df[df.sentences.apply(lambda x: len(x.replace('[^a-zA-Z]', ''))>10)]

df=df[df.sentences.apply(len)>15]



In [None]:
pd.set_option('display.max_colwidth', None)
PDF=df[df.HTML==False]

# test what the pdf sentences with dots look like now.

PDF[PDF.sentences.apply(lambda x: x.count(".")>8)].sentences.head(5)

In [14]:
# Check: remove short strings that dont resemble sentences.

n=16 # ab 17
c=df[df.sentences.apply(len)<n]
print("LEN: ", len(c))      
[print(i) for i in df[df.sentences.apply(len)<n].sentences]

LEN:  0


[]

In [None]:
df[df.sentences.apply(len)>4500]

In [None]:
df.columns

# RUN THE NLP PIPELINE

In [15]:
# test

test=pd.DataFrame(df.head(5).to_dict())
test["t"]=test.url
print(pd. __version__)

test=nlp.NLP_Pipeline(test, text_column=column, target_language="en",sentiment=False)
test["source_language"]

2.0.3
Current Time = 13:02:09
Lenght:  5
cleaning done.
Current Time = 13:02:09
next: language.
language detection done.
Current Time = 13:02:09
Next: Translating...
Current Time = 13:02:09
pure english text done. Next: Token & Lemmatizing.
Current Time = 13:02:09
Token & Lemmatizing done. Next: Remove Stopwords.
Current Time = 13:02:14


0    en
1    en
2    en
3    en
4    en
Name: source_language, dtype: object

In [None]:
test.head(20).sentences

In [16]:
## takes approximately 120 minutes for translation (last time)
df=nlp.NLP_Pipeline(df, text_column=column, target_language="en",sentiment=False)

Current Time = 13:02:26
Lenght:  35755
cleaning done.
Current Time = 13:02:27
next: language.
language detection done.
Current Time = 13:04:45
Next: Translating...


NOLA. The . It m. Teac. We t. The . At s. Teac. The . Ther. The . The . The . This. In d. What. In S. A no. The . And . Aren. AI c. AI i. So w. The . Ther. Also. Ther. What. We h. Is t. How . Co-c. In o. How . Whic. What. Do w. To w. We k. What. It u. Do y. Sign. Cont. Addr. We u. Do y. Nati. NOLA. Ling. Neer. “Tha. Seve. Appa. Alth. But . I pu. Part. But . Char. For . The . It's. What. I ca. What. The . It i. And . But . You'. And . Seve. Let'. That. It p. But . But . The . The . But . What. Acco. And . Okay. Anyw. A co. The . If o. Beca. Sorr. Foll. My c. The . The . In t. With. If y. Stud. It h. To s. It i. But . And . I've. For . I wi. Shar. The . Sinc. Lect. 24 M. Dece. Sinc. Chat. 24 M. What. Did . Hone. I wa. And . If y. I be. I gu. But . I've. It c. Try . You . It m. It i. You . We'v. I wa. Ther. Ther. “The. Benu. So I. I do. From. The . Ther. I've. Yes,. Ther. I do. Tryi. Coin. Comp. Undo. Beca. And . The . In s. Yes,. Your. I su. This. Do n. Let . I im. With. For . The . The 

Fear. Hals. Reme. If t. ’ Hu. Stud. Agai. Or, . Hals. So t. Of w. ’ In. Fabi. I di. Conc. Arti. Read. | Te. What. Open. What. Rese. By l. Soci. Thes. The . Thin. At t. More. ICT . In t. In a. They. The . The . Lect. prof. Open. Info. Lega. They. Swit. But . An e. Oppo. This. By i. They. By l. Empl. Info. Regi. Know. In d. But . The . Walk. 00 h. Wort. 30 h. Cogn. Tim . Chat. 25 h. Stud. Ster. Retr. 8 pm. What. Walk. 00 h. Chec. Plea. Symp. Keyn. Wort. Para. Chat. Para. Retr. Keyn. What. Inau. Regi. Chat. H. J. With. This. The . To d. But . Chat. Neve. For . I kn. An o. Howe. • Th. Supp. When. You . H. J. This. • Ch. You . You . • No. On C. • Wh. With. Espe. • Wi. Cogn. Chat. H. J. But . The . Shor. Chat. The . The . But . Unfo. And . Belo. What. The . • A . It i. Howe. H. J. Chat. And . Chec. It's. Then. Howe. • As. This. This. The . In s. Just. So w. Belo. Thin. Or m. Chat. But . • Su. Chat. You . It c. But . H. J. Usua. Stil. You . And . Anot. • A . You . Exac. This. dysl. Chat. Then

” Zo. “Fru. This. An o. The . “Bra. )”, . “Too. Rese. Drea. We c. Poor. So t. And . The . ” Al. Ther. Ther. Inte. The . The . Chat. Chat. Toni. PhD . Brid. This. Kore. “For. ” Le. New . “Cha. You . Howe. Cryp. Lee'. Trus. “At . But . Beca. ” On. “Onl. But . “Int. ” Le. “Thi. I th. The . 00 i. Afte. On W. At E. We i. For . Duri. The . This. This. Mini. No p. Post. The . They. As a. Do y. WHAT. SIGN. Betw. Betw. Curs. That. The . Mayb. User. They. User. What. Rose. User. I wa. Now . What. Rose. User. Rose. User. User. Wisd. The . And . > He. > Ho. What. > Ho. > Wh. Unle. > Do. Sudd. How . Thos. And . Han . Han . 08/0. A fe. Stev. My f. 06/0. Ther. I fe. In a. No l. 22/0. Grou. I se. The . 15/1. Curs. Next. The . Also. nl w. The . Mean. Mean. One . They. So l. The . But . If y. We l. We s. Will. We s. This. As t. Furt. To g. Chat. For . 5 ba. Two . The . This. Last. AI -. It h. Yes,. We h. 16/0. One . Now,. From. Caps. Caps. The . As a. phot. Minh. That. It's. Mirr. “Tec. She . “I a. Ther

The . This. In a. Data. Anot. ‘We . The . It l. “If . Data. Digi. Alla. For . ‘The. 000 . ‘How. This. One . For . This. Some. We w. The . Toge. ‘We . The . Subs. Arti. This. “The. “It . The . ’ Th. The . ‘The. We s. Resp. When. At t. Than. Cros. The . Her . ‘We . They. The . If t. We t. Beum. ‘We . When. Howe. We a. Shou. Link. Anot. Thes. Beum. We a. When. So t. Pers. 'The. ‘It . Peop. ’ Be. ‘In . We u. For . It i. Alla. Unti. If y. Asso. Arti. 'In . Then. We w. Beca. xCur. Crea. By b. ‘We . Ther. This. In a. “The. ’ Fo. “We . This. The . Lect. Lect. It's. #lin. #dig. Dr. . Dr. . by l. http. Cha . http. Cha . http. http. http. http. Text. Bett. Spec. http. prof. http. Mode. http. ● Re. ● Ov. ● Me. ● Su. ask1. ●~ 1. ● Us. ● It. ● Im. ● Le. http. http. http. http. d/em. Some. Unde. ● Ha. … to. A tr. We v. We c. Curi. Curi. Time. http. or c. Facu. Scie. In t. nl a. The . Expe. nl s. ..... nl... Musi. Here. Whic. Lies. A gr. The . The . At t. But . De V. One . Amer. Acco. Het . Vari. nl a

they. The . part. with. huma. Surv. Euro. XR E. The . peop. othe. shar. crea. “An . to e. Euro. to q. with. And . we a. Euro. Thom. The . It's. litt. “It . get . inte. own . That. facu. That. Acco. is c. “We . Deve. They. to b. We o. unfo. educ. ther. inst. “On . the . HAN . soci. ask . at t. Conv. New . Also. (HAN. each. it w. work. save. That. plac. Univ. top . univ. brin. retr. scal. It's. “It'. trai. issu. Advi. (SUR. to f. low-. “The. gica. to s. Also. and . Mana. The . the . enou. to b. time. “You. That. we f. XR D. At t. also. And . on t. ques. crea. “Bot. earn. atte. SURF. Coll. Belo. The . peop. virt. The . educ. The . a te. lear. Cont. The . Digi. comm. supp. • Th. XR i. abou. mode. in e. ethi. ESTI. of e. dent. (SUR. the . Acco. area. User. When. a wo. inst. we r. whil. gove. ” A . Big . are . A ni. “Sim. usin. comp. next. the . Very. quit. This. but . Drow. Your. Wher. trea. “We . beca. How . you . You . agre. regu. of t. “Whe. come. they. dete. In d. shar. can . Fort. Big 

In t. risk. Thes. miti. 22/8. The . This. decl. inno. prob. conf. Exte. SURF. .. I. coop. With. memb. C. S. The . Port. eval. regu. star. D. S. the . cons. with. ease. Moni. occu. ad E. work. This. The . Inci. 23/8. rate. gap . Alth. that. vaca. This. serv. tigh. SURF. Of c. corr. In 2. In a. To t. (Maz. cost. In a. (inc. for . The . anal. to t. orga. of w. The . exis. All . made. some. meas. gesi. Frau. Frau. as a. fina. resp. to t. prev. (rep. frau. This. dete. expl. will. coun. More. frau. 24/8. Cont. the . work. (the. The . proc. Only. paym. prev. Thes. obli. fina. SURF. inte. meas. In a. good. A li. SURF. data. In a. SURF. For . of t. comm. Esta. Frau. part. only. 2023. The . Natu. Ther. occu. The . inco. Agre. serv. Inco. memb. ther. Give. SURF. with. With. fina. deri. 25/8. sell. SURF. ensu. legi. This. 1) t. comp. 2) t. cont. - Su. SURF. leve. midd. The . serv. The . of t. SURF. cont. be c. The . safe. majo. For . acti. The . 84% . In m. unde. curi. trus. term. help. In 2. with

coup. The . four. In a. fibe. This. of s. 60/8. cont. The . amou. .. •. A mu. The . The . the . The . 15 y. acqu. user. char. In t. year. SURF. have. Ther. beca. In p. ther. In t. Arnh. The . forw. In 2. agai. This. the . cont. Vari. Ther. conc. The . ends. Expl. The . Subs. Subs. EU g. 62/8. EUR . Educ. With. Acce. gear. Supp. supp. Prog. the . EPIC. high. FAQ . ... . SURF. The . Acce. and . The . diss. The . inte. orga. we a. In J. The . High. • In. the . • In. van . stee. • We. crea. • Bl. To t. To h. tips. • Th. • Th. of t. 63/8. This. with. All . set . can . This. it w. The . of t. From. Of t. 17.5. The . In t. assu. The . pass. inno. Netw. The . Tota. Less. .. T. on n. Lice. The . Cont. Medi. Serv. .. T. Less. Turn. Less. 65/8. Turn. Less. Coll. The . Univ. High. Seco. Non-. To p. The . Cost. The . EUR . Serv. Subs. Othe. Resu. Pers. The . Wage. Soci. Pens. 39,1. Outs. Othe. Rele. ... . At t. This. Mana. Team. .. T. Over. The . Hous. Offi. Auto. Cost. Conf. Othe. Depr. Inta. Conc

addi. mati. and . mode. poss. mode. the . prob. Non-. thes. incr. beco. Ther. that. thre. expe. pre-. will. Deep. Flam. deve. It c. lang. rega. Flam. a fe. It's. swit. vide. This. in c. educ. Mult. The . extr. labe. is p. work. lear. We c. comb. know. ring. mult. open. Phys. For . laws. of t. redu. That. auto. for . trai. AI S. As t. syst. impl. equa. data. and . By e. cons. larg. econ. mode. know. in a. A re. emer. netw. prom. clas. fewe. This. insp. redu. and . shif. to t. The . cert. deve. open. skil. More. of A. maki. have. to A. In a. the . adva. and . new . poss. hard. Grap. GNN . data. The . The . but . send. natu. Towa. With. repr. to t. deep. Sinc. many. effi. open. Fede. In a. heal. cruc. to h. This. netw. part. mode. have. data. Cent. is n. comp. exce. wrap. from. This. thro. lear. eith. data. and . to a. is r. The . trai. mode. to g. less. This. orga. of d. the . skil. #Pri. econ. #Aut. This. oppo. so f. make. data. heal. By f. rese. lead. risk. data. Zodr. and . Synt. prot

The . Qubi. pron. Deco. To m. In q. quan. so t. This. reve. dete. Curr. Quan. tech. The . To p. dram. With. To m. Ther. For . The . open. “The. [You. 129,. #Con. Most. bill. Only. resi. How . erro. This. when. Not . Thes. The . with. Duri. a nu. We a. beco. grad. and . Vent. Larg. Comp. curi. skil. #Dat. Quan. micr. Quan. euro. The . If a. QAL . deve. cont. opti. QAL . QAL . of p. More. with. Incr. open. Capg. to b. This. from. The . By p. open. SMEs. Quan. More. Aria. Cons. Rela. Koen. Serv. →A f. →Vir. →New. →An . XR, . tech. is u. othe. XR c. Educ. Whet. to b. find. The . ke-v. This. inte. comp. In t. Web3. The . Larg. appe. inst. Inte. Soci. fend. Educ. ment. cons. Rese. expe. Stud. Lear. This. Huma. XR m. or t. New . Tech. Tech. logi. Beco. very. At M. They. whil. The . open. #Dig. #Con. With. an X. With. Rese. Soci. Gene. of a. By c. XR c. Poin. It c. Remo. reli. With. powe. Some. open. XR p. virt. to c. In t. One-. This. For . inte. Virt. Ten . coll. Meta. skil. Huma. #Rig. #Ope

high. tain. The . We h. faci. stud. With. keep. FLEX. Sinc. univ. Ther. repl. to d. of t. Flex. it a. It's. syst. One . beca. All . ther. An e. whic. The . butt. In t. in s. One . what. how . and . stan. Fore. If f. majo. poin. A co. betw. the . lear. mayb. lear. mult. lear. It m. chai. link. from. face. lari. flex. dist. face. test. sati. Ther. graf. can . Zone. Acce. Pilo. REVI. 3LIT. The . lear. ICT . emer. cent. educ. expe. incr. deca. Stim. for . But . usin. beca. And . the . The . Ever. In d. ask . The . for . spec. a co. The . new . And . peri. stud. This. coop. In t. to i. And . old . depa. educ. has . foun. The . last. supp. beco. hope. of I. “One. this. lead. pref. good. can . depe. the . Mari. REVI. 4MIC. Micr. have. They. to m. A st. acqu. also. Micr. rubr. It m. must. Whet. cont. a di. cert. some. The . seem. badg. As a. (for. The . badg. this. open. Ther. at h. The . stil. Micr. The . At t. comp. Cont. will. poss. Such. of e. REVI. 5HYB. EDUC. Blen. mix . 'ble. subs. of l

This. lear. educ. educ. REVI. 2Exp. Cybe. nic . and . it i. That. have. ning. also. as a. on p. the . part. Nice. tric. more. educ. agai. It l. info. addr. This. the . publ. knoc. educ. can . thre. Sour. 'Hig. 3PHA. What. rega. ings. prun. must. And . In t. of n. If a. can . Just. an o. With. held. And . data. Idea. out . imag. phas. Stor. a ch. A se. kept. This. let . Info. For . by t. syst. usef. indi. REVI. 45. . This. purc. For . tati. This. pers. for . are . Thin. went. With. shou. Sett. arch. usin. impo. to m. It i. to c. As a. The . The . Data. this. That. info. tric. The . info. litt. Howe. educ. almo. gene. Cons. for . The . REVI. REVI. TO C. OR R. For . are . aspe. draw. In t. two . them. of G. It p. wher. of n. a li. Inno. a po. The . usab. Peak. enth. Thes. know. Thro. high. Expe. peri. REVI. Slop. clea. With. Ther. Plat. more. A pe. grow. Swam. slow. In a. obso. This. show. reco. On t. will. THE . Purp. The . posi. are . This. a co. lear. • 28. • Em. • Hy. • St. Disc. Use 

Thin. So i. Aren. Cont. Auth. Cont. Tips. We f. Do y. Hybr. Saxi. In E. The . But . Chat. With. Open. Read. This. Teac. Inte. What. Succ. At l. In t. (lin. Podc. Blen. Some. In a. They. How . He l. “Stu. A po. Thes. Thes. They. The . Many. A lo. But . From. Whic. Char. What. How . Hard. How . How . Impl. More. But . How . Do s. In t. Stre. Nowa. A so. But . Chat. You . Is t. Read. The . Are . Watc. Othe. You . Ther. Ever. We f. Be i. Avat. At H. HAN . The . How . Afte. Bann. Rath. This. “ Ev. We'd. As l. TU D. The . TU D. The . Nice. Do y. The . Judi. “You. This. Thes. #1 H. A po. In t. She . Current Time = 15:05:30
pure english text done. Next: Token & Lemmatizing.
Current Time = 15:05:31
Token & Lemmatizing done. Next: Remove Stopwords.
Current Time = 15:10:01


In [None]:
df.columns

In [17]:
df.to_json("2023-07-27_ChatGPT_Sentences_NLP-Out.json")

In [None]:
df.columns

df2=df[['Uni','FileKey','url','date', 'Title','linkName', 'linkUrl', 'AI_paragraphs','text_clean', 'letters_count', 'word_count',
       'language', 'source_language', 'pure_text', 'Lemmata', 'NoStopwords']]

In [None]:
df2.to_json("2023-06-06_Zotero_AI_nlp_en2.json")