In [1]:
#loading python require packages
from string import punctuation
from nltk.corpus import stopwords
import nltk
from nltk.stem import WordNetLemmatizer
from IPython.display import display
from nltk.stem import PorterStemmer
import pandas as pd
import os

In [2]:
#use to remove stop words from sentences
stop_words = set(stopwords.words('english'))
#apply lemmatizer
lemmatizer = WordNetLemmatizer()
#apply stemming on words
ps = PorterStemmer()

In [3]:
#function to clean documents
def cleanData(doc):
    tokens = doc.split() #split document into words
    table = str.maketrans('', '', punctuation) #remove punctuation
    tokens = [w.translate(table) for w in tokens]
    tokens = [word for word in tokens if word.isalpha()] #check only for alphabets
    tokens = [w for w in tokens if not w in stop_words] #remove stop words 
    tokens = [word for word in tokens if len(word) > 1] #apply on words whose length > 1
    tokens = [ps.stem(token) for token in tokens] #apply stemming on words
    tokens = [lemmatizer.lemmatize(token) for token in tokens] #apply lemmatization on words
    tokens = ' '.join(tokens) #join all words as single document
    return tokens #return cleaned document


In [5]:
path = 'GoogleScholar' #here is the location
clean = []
for root, dirs, directory in os.walk(path):#load all documents from GoogleScholar folder
    for j in range(len(directory)):
        document = pd.read_csv(root+"/"+directory[j]) #Read contents from each document 
        document.fillna("N/A", inplace = True)
        for i in range(len(document)): #read all rows from dataset
            titles = str(document.get_value(i, 'titles')) #read each column
            authors = str(document.get_value(i, 'authors'))
            date = str(document.get_value(i, 'date'))
            source = str(document.get_value(i, 'source'))
            desc = str(document.get_value(i, 'descriptions'))
            citations = str(document.get_value(i, 'citations'))
            if len(titles) > 10:
                titles = cleanData(titles) #clean the titles
            if len(source) > 10:    
                source = cleanData(source) #clean source
            if len(desc) > 10:
                desc = cleanData(desc)
            clean.append([titles, authors, date, source, desc, citations]) #add clean data to array    
        print("Cleaned Document Name : "+directory[j]+" "+str(document.shape)+" "+str(len(clean)))          
#create data frame with all cleaned documents    
df = pd.DataFrame(clean, columns = ['titles','authors','date','source','descriptions','citations']) 
#save and display cleaned document
df.to_csv("Cleaned.csv", index=False)   
print()
print("Cleaned Document")
print()
display(df)        

  
  if __name__ == "__main__":
  # Remove the CWD from sys.path while we load stuff.
  # This is added back by InteractiveShellApp.init_path()
  if sys.path[0] == "":
  del sys.path[0]


Cleaned Document Name : Abolfazl Safikhani.csv (44, 6) 44
Cleaned Document Name : Aditya Johri.csv (209, 6) 253
Cleaned Document Name : Ahmed Bin Zaman.csv (13, 6) 266
Cleaned Document Name : Alexander Brodsky.csv (199, 6) 465
Cleaned Document Name : Alexander Levis.csv (376, 6) 841
Cleaned Document Name : Ali Beheshti.csv (41, 6) 882
Cleaned Document Name : Amarda Shehu.csv (253, 6) 1135
Cleaned Document Name : Ariela Sofer.csv (52, 6) 1187
Cleaned Document Name : Ben Seiyon Lee.csv (17, 6) 1204
Cleaned Document Name : Bernd-Peter Paris.csv (48, 6) 1252
Cleaned Document Name : Bijan Jabbari.csv (227, 6) 1479
Cleaned Document Name : Bo Han.csv (230, 6) 1709
Cleaned Document Name : Brian L. Mark.csv (183, 6) 1892
Cleaned Document Name : Burak Tanyu.csv (82, 6) 1974
Cleaned Document Name : Cameron Nowzari.csv (65, 6) 2039
Cleaned Document Name : Carlotta Domeniconi.csv (239, 6) 2278
Cleaned Document Name : Caroline D Hoemann.csv (140, 6) 2418
Cleaned Document Name : Colin Reagle.csv (10,

Unnamed: 0,titles,authors,date,source,descriptions,citations
0,joint structur break detect paramet estim high...,Abolfazl Safikhani,1/2/2022,journal journal american statist associ,assum stationar unrealist mani time seri appli...,50.0
1,spatiotempor model yellow taxi demand new york...,Abolfazl Safikhani,7/1/2020,journal intern journal forecast,the spatiotempor variat demand transport parti...,37.0
2,investig rang anxieti safeti buffer batteri el...,Abolfazl Safikhani,6/13/2018,journal journal advanc transport,driver tend rang anxieti compar drive tradit f...,34.0
3,cyclelength predict actuat trafficsign control...,Abolfazl Safikhani,3/1/2018,journal journal comput civil engin,In urban transport system traffic signal main ...,32.0
4,predict shortterm uber demand new york citi us...,Abolfazl Safikhani,5/1/2019,journal journal comput civil engin,the demand ehail servic grow rapidli especi la...,17.0
...,...,...,...,...,...,...
12332,proceed workshop structur unstructur knowledg ...,Ziyu Yao,2022/7,confer proceed workshop structur unstructur kn...,,
12333,code edit few exemplar adapt multiext composit,Ziyu Yao,3/4/2022,confer deep learn code workshop,thi paper consid comput sourc code edit exempl...,
12334,proceed workshop natur languag process program,Ziyu Yao,2021/8,confer proceed workshop natur languag process ...,the prolifer programmingrel platform github st...,
12335,On advanc natur languag interfac data collect ...,Ziyu Yao,2021,institut the ohio state univers,natur languag provid univers effici way human ...,
