# Pipeline

# Packages

In [None]:
import pandas as pd
import plotly.express as px
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
import nltk
%matplotlib inline

# Load data

In [None]:
# Read the raw DBLP export in one go. The encoding is passed explicitly so the
# non-ASCII characters in author names decode the same way on every platform
# instead of depending on the locale default.
# NOTE(review): assumes the export is UTF-8 -- confirm against the data source.
with open("DBLP_Subset.txt", encoding="utf-8") as f:
  data = f.read()

# Each record is introduced by "#*"; whatever precedes the first marker is dropped.
docs = data.split("#*")[1:]

# Report label -> field marker whose absence from a record is counted.
field_markers = {
  "authors": "#@",
  "years": "#t",
  "publications": "#c",
  "index": "#index",
  "references": "#%",
  "abstracts": "#!",
}
missing_counts = {
  label: sum(marker not in doc for doc in docs)
  for label, marker in field_markers.items()
}
# Keep the individual counters available under their established names.
na_authors, na_years, na_pubs, na_index, na_refs, na_abstracts = missing_counts.values()

print("The number of documents: ", len(docs), "\n")
for label, count in missing_counts.items():
  print(f"The number of missing {label}: ", count, "\n")

The number of documents:  37963 

The number of missing authors:  0 

The number of missing years:  0 

The number of missing publications:  0 

The number of missing index:  0 

The number of missing references:  19217 

The number of missing abstracts:  17406 



In [None]:
RECORD_COLUMNS = ["Title", "Authors", "Year", "Publication", "Index", "References", "Abstract"]


def _parse_record(doc):
  """Split one raw record (the text that follows a "#*" marker) into its fields.

  The record is expected to contain, in this order, the markers "#@" (authors),
  "#t" (year), "#c" (publication), "#index", and then optionally "#%"
  (references, one marker per reference) followed by "#!" (abstract).

  Returns a list ordered like RECORD_COLUMNS. References and Abstract are
  np.nan when the record has no "#%" / "#!" marker.

  Raises ValueError when a mandatory marker is missing or when the year or the
  index is not an integer.
  """
  # Every marker is searched from the previous one onwards, so marker-like
  # text in an earlier field cannot be mistaken for a later field's marker.
  authors_at = doc.find("#@")
  year_at = doc.find("#t", authors_at + 2)
  venue_at = doc.find("#c", year_at + 2)
  index_at = doc.find("#index", venue_at + 2)
  if min(authors_at, year_at, venue_at, index_at) < 0:
    raise ValueError("record is missing one of the markers #@, #t, #c, #index")

  title = doc[:authors_at].replace("\n", "")
  # NOTE(review): a trailing newline in the author field yields a trailing
  # ", " here; left as is because later cells may rely on that format.
  authors = ", ".join(doc[authors_at + 2:year_at].split("\n"))
  year = int(doc[year_at + 2:venue_at].replace("\n", ""))
  publication_revue = doc[venue_at + 2:index_at].replace("\n", "")

  tail_start = index_at + 6  # first character after "#index"
  refs_at = doc.find("#%", tail_start)
  abstract_at = doc.find("#!", tail_start)
  if abstract_at != -1 and refs_at > abstract_at:
    # References are parsed as preceding the abstract, so a "#%" found after
    # "#!" is treated as abstract text rather than as a reference marker.
    refs_at = -1

  # The index runs up to the first optional marker, or to the end of the record
  # when there are neither references nor an abstract.
  optional_markers = [pos for pos in (refs_at, abstract_at) if pos != -1]
  index_end = min(optional_markers, default=len(doc))
  index = int(doc[tail_start:index_end].replace("\n", ""))

  references = np.nan
  if refs_at != -1:
    # Without an abstract the references extend to the end of the record
    # (a find() result of -1 must not be used as the slice end).
    refs_end = abstract_at if abstract_at != -1 else len(doc)
    raw_refs = doc[refs_at + 2:refs_end].replace("\n", "")
    references = ", ".join(raw_refs.split("#%"))

  abstracts = np.nan
  if abstract_at != -1:
    abstracts = doc[abstract_at + 2:].replace("\n", "")

  return [title, authors, year, publication_revue, index, references, abstracts]


# Row position -> parsed fields, one entry per record.
docs_dict = {position: _parse_record(doc) for position, doc in enumerate(docs)}
num_doc = len(docs_dict)

# One row per record. dtype=object stores the parsed Python values unchanged in
# every column (no numeric inference); convert Year/Index with astype where
# numeric dtypes are wanted.
df = pd.DataFrame.from_dict(docs_dict, orient="index", columns=RECORD_COLUMNS, dtype=object)
df

Unnamed: 0,Title,Authors,Year,Publication,Index,References,Abstract
0,Improved Channel Routing by Via Minimization a...,"Chung-Kuan Cheng, David N. Deutsch,",1988,DAC,131751,"133716, 133521, 134343",Channel routing area improvement by means of v...
1,A fast simultaneous input vector generation an...,"Lei Cheng, Liang Deng, Deming Chen, Martin D. ...",2006,DAC,131752,"132550, 530568, 436486, 134259, 283007, 134422...",Input vector control (IVC) technique is based ...
2,On the Over-Specification Problem in Sequentia...,"Kwang-Ting Cheng, Hi-Keung Tony Ma,",1992,DAC,131756,"455537, 1078626, 131745",The authors show that some ATPG (automatic tes...
3,Device and architecture co-optimization for FP...,"Lerong Cheng, Phoebe Wong, Fei Li, Yan Lin, Le...",2005,DAC,131759,"214244, 215701, 214503, 282575, 214411, 214505...",Device optimization considering supply voltage...
4,Differential Fault Simulation - a Fast Method ...,"Wu-Tung Cheng, Meng-Lin Yu,",1989,DAC,131760,"131744, 806030",A new fast fault simulator called differential...
...,...,...,...,...,...,...,...
37958,Selection of strategies in judgment-based effo...,"Magne Jørgensen,",2010,Journal of Systems and Software,1600529,"996865, 601059, 361510, 492786, 997634, 112851...",We currently know little about the factors tha...
37959,Performance analysis of opportunistic broadcas...,"Abbas Nayebi, Hamid Sarbazi-Azad, Gunnar Karls...",2010,Journal of Systems and Software,1600531,"505564, 1114157, 412964, 588689, 53668, 692884...",This paper investigates a class of mobile wire...
37960,Quality adaptive end-to-end packet scheduling ...,"Rossella Fortuna, Luigi Alfredo Grieco, Gennar...",2010,Journal of Systems and Software,1600532,"396959, 794228, 588835, 997951",In Internet multimedia streaming the quality o...
37961,Design and evaluation of a novel MAC layer han...,"Richard Werner Nelem Pazzi, Zhenxia Zhang, Azz...",2010,Journal of Systems and Software,1600537,"666821, 784037, 506991, 505779, 1247751",In recent years the IEEE 802.11 wireless netwo...


In [None]:
# Number of missing (NaN) entries in each column of the parsed frame.
missing_per_column = df.isna().sum()
missing_per_column

Title              0
Authors            0
Year               0
Publication        0
Index              0
References     19217
Abstract       17406
dtype: int64

# Descriptive analysis before preprocessing

In [None]:
# Summary statistics of the parsed frame before any preprocessing.
summary_stats = df.describe()
summary_stats

Unnamed: 0,Title,Authors,Year,Publication,Index,References,Abstract
count,37963,37963,37963,37963,37963,18746.0,20557
unique,36925,32658,75,284,21199,2128.0,20525
top,Preface.,",",2009,J. Symb. Log.,0,,This paper presents the findings of a five-yea...
freq,123,414,4866,3402,16765,16579.0,3


## Abstracts description

In [None]:
# Word cloud built from every available abstract.
# Records without an abstract hold NaN (a float), which str.join rejects,
# so they are dropped before the texts are concatenated.
corpus = df["Abstract"].dropna().astype(str)
wordcloud = WordCloud(background_color="white")
text = " ".join(corpus)
wordcloud.generate(text)
# Hand plotly an explicit image array rather than the WordCloud object itself.
px.imshow(wordcloud.to_array())

In [None]:
# Frequency of each term per document, for the first ten available abstracts.
# Missing abstracts are NaN and cannot be tokenised, so they are skipped
# before the sample is taken.
corpus = df["Abstract"].dropna().head(10)
# `texts` is not used in this cell; it is kept because later cells may append to it.
texts = ''
vectorizer = CountVectorizer()
doc_vec = vectorizer.fit_transform(corpus)
words = doc_vec.toarray()
# get_feature_names() is gone from recent scikit-learn releases; prefer its
# replacement and fall back only when running on an older version.
if hasattr(vectorizer, "get_feature_names_out"):
  feature_names = vectorizer.get_feature_names_out()
else:
  feature_names = vectorizer.get_feature_names()
# Rows are terms, columns are documents; the plot shows documents as rows.
df_freq = pd.DataFrame(words.transpose(), index=feature_names)
px.imshow(df_freq.transpose(), title="Frequency per document")

In [None]:
# Word counts over the fields of one record, shown as a bar chart.
# NOTE(review): iloc[0, :] selects the first record (all of its columns), so
# the chart titled "Word frequency on corpus" reflects that single record --
# confirm this is the intended scope.
corpus = df.iloc[0, :]

# Build the text from scratch on every run so that re-executing the cell does
# not keep appending to a string left over from a previous run.
texts = ''.join(' ' + str(field) for field in corpus)

corp = texts.split()
freq = nltk.FreqDist(corp)

# word -> [word, count]; one row per distinct word, indexed by the word.
dict_freq = {word: [word, count] for word, count in freq.items()}
data_frame_freq = pd.DataFrame.from_dict(dict_freq, orient='index', columns=['word', 'count'])

px.bar(data_frame_freq, x='word', y='count', title='Word frequency on corpus',
template='plotly_white', labels={'word':'Words', 'count':'Count'})

## Year description

In [None]:
# Distribution of the records over their publication year.
year_histogram = px.histogram(
  data_frame=df,
  x="Year",
  title="Number of publication per Year",
)
year_histogram