# Library Import

In [43]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
import ast
from collections import Counter
import spacy
from textblob import TextBlob
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.corpus import stopwords
import re
import string

# Data Import

Dataset URL: https://www.aminer.org/citation

In [44]:
data = pd.read_csv('DBLP_citation.csv')

In [45]:
data.head(3)

Unnamed: 0,title,authors,year,venue,id,references,abstract,citation_count
0,OQL[C++]: Extending C++ with an Object Query C...,['José A. Blakeley'],1995,Modern Database Systems,0,[],,5
1,Transaction Management in Multidatabase Systems.,"['Yuri Breitbart', 'Hector Garcia-Molina', 'Ab...",1995,Modern Database Systems,1,[],,0
2,Overview of the ADDS System.,"['Yuri Breitbart', 'Tom C. Reyes']",1995,Modern Database Systems,2,[],,0


In [46]:
# Function to convert string-lists to Python lists
def convert_to_list(data):
    try:
        return ast.literal_eval(data)
    except:
        return []  # Returns an empty list if there's an error in conversion

# Convert the string representations to lists
data['authors'] = data['authors'].apply(convert_to_list)
data['references'] = data['references'].apply(convert_to_list)

# Convert id to string
data['id'] = data['id'].astype(str)

In [47]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1632442 entries, 0 to 1632441
Data columns (total 8 columns):
 #   Column          Non-Null Count    Dtype 
---  ------          --------------    ----- 
 0   title           1632442 non-null  object
 1   authors         1632442 non-null  object
 2   year            1632442 non-null  int64 
 3   venue           1630753 non-null  object
 4   id              1632442 non-null  object
 5   references      1632442 non-null  object
 6   abstract        653506 non-null   object
 7   citation_count  1632442 non-null  int64 
dtypes: int64(2), object(6)
memory usage: 99.6+ MB


# Preprocessing

## Missing values

In [48]:
data.isna().sum()

title                  0
authors                0
year                   0
venue               1689
id                     0
references             0
abstract          978936
citation_count         0
dtype: int64

In [49]:
preprocessed_data = data.copy()
preprocessed_data['abstract'] = preprocessed_data['abstract'].fillna('')
preprocessed_data['venue'] = preprocessed_data['venue'].fillna('')
preprocessed_data.isna().sum()

title             0
authors           0
year              0
venue             0
id                0
references        0
abstract          0
citation_count    0
dtype: int64

## Create New Feature

In [50]:
preprocessed_data['AUTHOR_COUNT'] = preprocessed_data['authors'].apply(lambda x: len(x))
preprocessed_data['REF_COUNT'] = preprocessed_data['references'].apply(lambda x: len(x))

## Rename the column

In [51]:
preprocessed_data.rename(columns={'citation_count': 'ISCITED_COUNT'}, inplace=True)

## Delete Invalid Rows

In [52]:
preprocessed_data['year'].describe()

count    1.632442e+06
mean     1.801409e+03
std      6.012202e+02
min     -1.000000e+00
25%      1.995000e+03
50%      2.003000e+03
75%      2.007000e+03
max      2.011000e+03
Name: year, dtype: float64

In [53]:
preprocessed_data = preprocessed_data[preprocessed_data['year'] >= 1800] # Remove the rows with year = -1

In [54]:
preprocessed_data.describe()

Unnamed: 0,year,ISCITED_COUNT,AUTHOR_COUNT,REF_COUNT
count,1469015.0,1469015.0,1469015.0,1469015.0
mean,2001.926,1.580486,2.612094,1.583045
std,7.600157,11.03237,1.54726,4.705101
min,1936.0,0.0,1.0,0.0
25%,1999.0,0.0,2.0,0.0
50%,2004.0,0.0,2.0,0.0
75%,2007.0,0.0,3.0,0.0
max,2011.0,4474.0,114.0,617.0


## Check Language

In [55]:
non_eng_chars = set(''.join(preprocessed_data['title'].str.replace(r'[a-zA-Z\s]+', '')))

print("Non-English characters used in the 'title' column:", non_eng_chars)

Non-English characters used in the 'title' column: {'č', '®', '½', '扩', 'æ', 'c', '>', ']', '·', 'T', 'ﬁ', '…', '"', '_', '制', '超', 'j', 'Ö', '中', '下', 'U', 'd', 'm', '*', '及', '方', '“', 'I', 'Δ', '1', 'ß', '展', '服', 'v', '析', '：', '8', "'", '(', '`', 'à', '关', '本', '5', 'Ê', '发', 'í', '药', 'ℓ', '匹', 'V', '@', '\x93', '<', 'è', '\\', '配', 'Ç', '系', 'ç', '&', 'ä', '³', '¹', '体', '\x97', '逻', '\x92', 'o', '¾', 'a', '络', 'µ', 'N', 'Y', 'Z', 'L', 'ê', '组', 'μ', '于', 'D', 'A', 'R', 'l', '}', '面', 'J', '-', '≠', '基', '研', 'Ó', '合', '资', 'Í', '述', 'ô', 'λ', 'ú', 'e', '\x91', '演', '«', 'Ï', 'î', 'S', 'Ô', 'Û', 'C', '学', '取', 'Ò', '护', '\x80', 'Å', '/', 'Ì', 'M', 'k', 'r', ';', '识', 'q', '{', '´', ' ', 'F', '源', '户', 'x', '过', '医', '–', '究', 'ć', '协', '现', '流', '©', '|', '理', '定', '\x9c', '义', '±', '程', '‘', 'Ü', 'g', '?', '4', 'Ω', 'W', 'π', 'å', '号', 'Q', 'Ä', 'u', 'ª', '•', '—', '息', '\x99', 'b', 'Î', '~', 'ö', '容', 'ε', 'ý', '习', '点', '建', 'é', '#', 'È', '¬', '”', 'Á', '辑', '用', 'â', '的', '

In [56]:
english_chars = set(string.ascii_letters + string.punctuation + ' ')

# Function to check if all characters in a title are English
def is_english(col):
    return all(char in english_chars for char in col)

# Filter the DataFrame to keep only rows with English titles
english_only_data = preprocessed_data[preprocessed_data['title'].apply(is_english)]
english_only_data = english_only_data[english_only_data['abstract'].apply(is_english)]
english_only_data = english_only_data[english_only_data['venue'].apply(is_english)]

## Check Punctuation

In [57]:
non_alphabetic_chars_set = set(re.findall(r'[^a-zA-Z\s]', ''.join(english_only_data['title'])))
non_alphabetic_chars_list = list(non_alphabetic_chars_set)
print("List of unique non-alphabetic characters:", non_alphabetic_chars_list)

List of unique non-alphabetic characters: ['>', ']', '"', '_', '}', '~', '-', '*', '+', '#', ',', '!', "'", '`', '(', '^', '/', ')', ';', '{', '$', '@', '<', '\\', '|', ':', '[', '.', '%', '=', '&', '?']


In [58]:
for char in non_alphabetic_chars_list:
    titles_with_char = english_only_data[english_only_data['title'].str.contains(re.escape(char))]
    print(f'{char} Example:')
    print(titles_with_char['title'].iloc[3]) 

> Example:
Turn it <u>this</u> way: grounding collaborative action with remote gestures.
] Example:
Operating System Support [for Multimedia Databases].
" Example:
"Kage no Sekai": interactive animation of shadow based on physical action.
_ Example:
SA_MetaMatch: relevant document discovery through document metadata and indexing.
} Example:
A Lazy and Layered SMT($\mathcal{BV}$) Solver for Hard Industrial Verification Problems.
~ Example:
Segmentation of Range Images in a~Quadtree.
- Example:
Requirements for a Performance Benchmark for Object-Oriented Database Systems.
* Example:
Leveraging Distance Table to Accelerate Data Location in Large-Scaled Data Intensive Grid Environment*.
+ Example:
OZ+: An Object-Oriented Database System.
# Example:
Generic Programming for Scientific Computing in C++, Java, and C#.
, Example:
Title, Preface, Contents.
! Example:
Like Rome, a mobile operator's empire wasn't built in a day!: a journey through the rise and fall of mobile network operators.
' E

In [59]:
def clean_column(column):
    # Use raw strings for regex. Protect 'C++' and 'C#'
    column = column.str.replace(r'C\+\+', 'TEMPORARYCPP', regex=True)
    column = column.str.replace(r'C#', 'TEMPORARYCSHARP', regex=True)
    
    # Remove all non-alphabetic characters except spaces
    column = column.str.replace(r'[^a-zA-Z\s]', '', regex=True)
    
    # Restore 'C++' and 'C#'
    column = column.str.replace('TEMPORARYCPP', 'C++', regex=False)
    column = column.str.replace('TEMPORARYCSHARP', 'C#', regex=False)
    
    return column

# Clean the 'title', 'abstract', and 'venue' columns
clean_data = english_only_data.copy()
clean_data['title'] = clean_column(clean_data['title'])
clean_data['abstract'] = clean_column(clean_data['abstract'])
clean_data['venue'] = clean_column(clean_data['venue'])

In [60]:
non_alphabetic_chars_set = set(re.findall(r'[^a-zA-Z\s]', ''.join(clean_data['title'])))
non_alphabetic_chars_list = list(non_alphabetic_chars_set)
print("List of unique non-alphabetic characters:", non_alphabetic_chars_list)

List of unique non-alphabetic characters: ['+', '#']


In [61]:
for char in non_alphabetic_chars_list:
    titles_with_char = clean_data[clean_data['title'].str.contains(re.escape(char))]
    print(f'{char} Example:')
    print(titles_with_char['title']) 

+ Example:
0          OQLC++ Extending C++ with an Object Query Capa...
7          An ObjectOriented DBMS War Story Developing a ...
18                        C++ Bindings to an Object Database
99                The C++ Programming Language First Edition
101              The C++ Programming Language Second Edition
114                Advanced C++ Programming Syles and Idioms
134                       The Annotated C++ Reference Manual
156                              Inside the C++ Object Model
1657               WebPowered Databases The Low Level in C++
6527                       Programming languages OOP and C++
8367          Transformation from Test Language ATLAS to C++
9376       Transforming RTPA Mathematical Models of Syste...
20620      FSA An Efficient and Flexible C++ Toolkit for ...
25584      Abstract Interface Types in GNAT Conversions D...
25922              A Preprocessor Approach to Persistent C++
26719      Development and Performance Analysis of a Temp...
33477      PA

In [62]:
non_alphabetic_chars_set = set(re.findall(r'[^a-zA-Z\s]', ''.join(clean_data['abstract'])))
non_alphabetic_chars_list = list(non_alphabetic_chars_set)
print("List of unique non-alphabetic characters:", non_alphabetic_chars_list)

List of unique non-alphabetic characters: ['+', '#']


In [63]:
for char in non_alphabetic_chars_list:
    titles_with_char = clean_data[clean_data['abstract'].str.contains(re.escape(char))]
    print(f'{char} Example:')
    print(titles_with_char['abstract']) 

+ Example:
156        Inside the C++ Object Model focuses on the und...
4571       This paper deals with genetic algorithm implem...
6551       Aristotle provides program analysis informatio...
6687       This panel will examine issues related to the ...
6789       This paper proposes an objectoriented developm...
8413       Determining interclass test order is one of th...
8741       Resource reservation is a vital issue for grid...
9943       With the advent of chipmultiprocessors we are ...
11465      The Circuit Object Organization Library is a C...
16477      This paper proposes a new practical automatic ...
22891      An object oriented relational database managem...
24221      Traditionally intrusion detection systems dete...
29249      Practical experience in porting a large virtua...
36763      We discuss scheduling techniques to be used fo...
42053      We present in this paper an expandable simulat...
42217      MC++ is a multigroup Monte Carlo neutron trans...
42272      Mo

In [64]:
non_alphabetic_chars_set = set(re.findall(r'[^a-zA-Z\s]', ''.join(clean_data['venue'])))
non_alphabetic_chars_list = list(non_alphabetic_chars_set)
print("List of unique non-alphabetic characters:", non_alphabetic_chars_list)

List of unique non-alphabetic characters: ['+']


In [65]:
for char in non_alphabetic_chars_list:
    titles_with_char = clean_data[clean_data['venue'].str.contains(re.escape(char))]
    print(f'{char} Example:')
    print(titles_with_char['venue']) 

+ Example:
70104      C++ Workshop
70105    C++ Conference
70106    C++ Conference
70107    C++ Conference
70108    C++ Conference
70109    C++ Conference
70110    C++ Conference
70112    C++ Conference
70113      C++ Workshop
70114    C++ Conference
70115    C++ Conference
70116    C++ Conference
70117    C++ Conference
70118    C++ Conference
70119    C++ Conference
70120      C++ Workshop
70121    C++ Conference
70122    C++ Conference
70123    C++ Conference
70124    C++ Conference
70125    C++ Conference
70126      C++ Workshop
70127    C++ Conference
70128      C++ Workshop
70129    C++ Conference
70130    C++ Conference
70131    C++ Conference
70132    C++ Conference
70134    C++ Conference
70135    C++ Conference
70136    C++ Conference
70137      C++ Workshop
70138    C++ Conference
70139      C++ Workshop
70140    C++ Conference
70141    C++ Conference
70142    C++ Conference
70144    C++ Conference
70145    C++ Conference
70146    C++ Conference
70147    C++ Conference
70148

## Add Main Author

In [66]:
def get_main_author(authors_list):
    return authors_list[0]

# Function to extract secondary authors
def get_secondary_authors(authors_list):
    if len(authors_list) > 1:
        return authors_list[1:]  # Return all authors except the first
    else:
        return []  # Return an empty list if there's only one author

# Apply functions to create new columns
clean_data['MAIN_AUTHOR'] = clean_data['authors'].apply(get_main_author)
clean_data['SECONDARY_AUTHORS'] = clean_data['authors'].apply(get_secondary_authors)

In [67]:
clean_data.head(5)

Unnamed: 0,title,authors,year,venue,id,references,abstract,ISCITED_COUNT,AUTHOR_COUNT,REF_COUNT,MAIN_AUTHOR,SECONDARY_AUTHORS
0,OQLC++ Extending C++ with an Object Query Capa...,[José A. Blakeley],1995,Modern Database Systems,0,[],,5,1,0,José A. Blakeley,[]
1,Transaction Management in Multidatabase Systems,"[Yuri Breitbart, Hector Garcia-Molina, Abraham...",1995,Modern Database Systems,1,[],,0,3,0,Yuri Breitbart,"[Hector Garcia-Molina, Abraham Silberschatz]"
2,Overview of the ADDS System,"[Yuri Breitbart, Tom C. Reyes]",1995,Modern Database Systems,2,[],,0,2,0,Yuri Breitbart,[Tom C. Reyes]
3,Multimedia Information Systems Issues and Appr...,"[Stavros Christodoulakis, Leonidas Koveos]",1995,Modern Database Systems,3,[],,2,2,0,Stavros Christodoulakis,[Leonidas Koveos]
4,Active Database Systems,"[Umeshwar Dayal, Eric N. Hanson, Jennifer Widom]",1995,Modern Database Systems,4,[995520],,16,3,1,Umeshwar Dayal,"[Eric N. Hanson, Jennifer Widom]"


## Tokenize Title & Abstract

In [68]:
clean_data['TITLE_TOKEN'] = clean_data['title'].str.split()
clean_data['ABSTRACT_TOKEN'] = clean_data['abstract'].str.split()

# Insights

## Author Related

In [71]:
author_citations_sum = clean_data.groupby('MAIN_AUTHOR')['ISCITED_COUNT'].sum().reset_index()
author_citations_sum_sorted = author_citations_sum.sort_values(by='ISCITED_COUNT', ascending=False)
author_citations_sum_sorted.head(50)

Unnamed: 0,MAIN_AUTHOR,ISCITED_COUNT
278997,Rakesh Agrawal,6156
208727,M. R. Garey,4553
43909,C. A. R. Hoare,3160
198347,Leslie Lamport,2968
110103,Gerard Salton,2891
13647,Alfred V. Aho,2775
305814,Serge Abiteboul,2487
291830,Ronald Fagin,2457
232844,Michael Stonebraker,2376
334609,Thomas H. Cormen,2320


## Venue Related

In [72]:
venue_citations_sum = clean_data.groupby('venue')['ISCITED_COUNT'].sum().reset_index()
venue_citations_sum_sorted = venue_citations_sum.sort_values(by='ISCITED_COUNT', ascending=False)
venue_citations_sum_sorted.head(50)

Unnamed: 0,venue,ISCITED_COUNT
989,Commun ACM,54891
4658,SIGMOD Conference,34937
5310,VLDB,34921
0,,34632
4634,SIGGRAPH,28782
3231,J ACM,24647
500,Artif Intell,22622
2577,IEEE Trans Software Eng,19873
2576,IEEE Trans Pattern Anal Mach Intell,18502
2520,IEEE Computer,17918


## Title Related

In [76]:
id_citations_sum = clean_data.groupby('id')['ISCITED_COUNT'].sum().reset_index()

# Step 2: Sort the IDs based on their sum of ISCITED_COUNT in descending order
id_citations_sum_sorted = id_citations_sum.sort_values(by='ISCITED_COUNT', ascending=False)

# Step 3: Select the top 50 IDs
top_50_ids = id_citations_sum_sorted.head(50)['id']

# Step 4: Use the top 50 IDs to retrieve corresponding titles from the 'data' DataFrame
top_50_titles = data[data['id'].isin(top_50_ids)][['id', 'title']]
top_50_titles_with_count = pd.merge(data[data['id'].isin(top_50_ids)], id_citations_sum_sorted, on='id')[['title', 'ISCITED_COUNT']]
top_50_titles_with_count.sort_values(by='ISCITED_COUNT', ascending=False)

Unnamed: 0,title,ISCITED_COUNT
7,Computers and Intractability: A Guide to the T...,4474
8,Introduction to Algorithms,2292
12,Communicating Sequential Processes,1573
0,"Compilers: Princiles, Techniques, and Tools.",1555
26,Mining Association Rules between Sets of Items...,1436
31,Fast Algorithms for Mining Association Rules i...,1415
9,Introduction to Modern Information Retrieval.,1353
1,Modern Information Retrieval,1177
16,Computational Geometry - An Introduction.,1121
39,The Anatomy of a Large-Scale Hypertextual Web ...,1107


# Export CSV

In [77]:
clean_data.to_csv('clean_data.csv', index=False)