# Library Import

In [94]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
import ast
from collections import Counter
import spacy
from textblob import TextBlob
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.corpus import stopwords
import re
import string

# Data Import

Dataset URL: https://www.aminer.org/citation

In [95]:
data = pd.read_csv('DBLP_citation.csv')

In [96]:
data.head(3)

Unnamed: 0,title,authors,year,venue,id,references,abstract,citation_count
0,OQL[C++]: Extending C++ with an Object Query C...,['José A. Blakeley'],1995,Modern Database Systems,0,[],,5
1,Transaction Management in Multidatabase Systems.,"['Yuri Breitbart', 'Hector Garcia-Molina', 'Ab...",1995,Modern Database Systems,1,[],,0
2,Overview of the ADDS System.,"['Yuri Breitbart', 'Tom C. Reyes']",1995,Modern Database Systems,2,[],,0


In [97]:
# Function to convert string-lists to Python lists
def convert_to_list(data):
    try:
        return ast.literal_eval(data)
    except:
        return []  # Returns an empty list if there's an error in conversion

# Convert the string representations to lists
data['authors'] = data['authors'].apply(convert_to_list)
data['references'] = data['references'].apply(convert_to_list)

# Convert id to string
data['id'] = data['id'].astype(str)

In [98]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1632442 entries, 0 to 1632441
Data columns (total 8 columns):
 #   Column          Non-Null Count    Dtype 
---  ------          --------------    ----- 
 0   title           1632442 non-null  object
 1   authors         1632442 non-null  object
 2   year            1632442 non-null  int64 
 3   venue           1630753 non-null  object
 4   id              1632442 non-null  object
 5   references      1632442 non-null  object
 6   abstract        653506 non-null   object
 7   citation_count  1632442 non-null  int64 
dtypes: int64(2), object(6)
memory usage: 99.6+ MB


# Preprocessing

## Missing values

In [99]:
data.isna().sum()

title                  0
authors                0
year                   0
venue               1689
id                     0
references             0
abstract          978936
citation_count         0
dtype: int64

In [122]:
preprocessed_data = data.copy()
preprocessed_data['abstract'] = preprocessed_data['abstract'].fillna('')
preprocessed_data['venue'] = preprocessed_data['venue'].fillna('')
preprocessed_data.isna().sum()

title             0
authors           0
year              0
venue             0
id                0
references        0
abstract          0
citation_count    0
dtype: int64

## Create New Feature

In [123]:
preprocessed_data['Author_Count'] = preprocessed_data['authors'].apply(lambda x: len(x))
preprocessed_data['Ref_Count'] = preprocessed_data['references'].apply(lambda x: len(x))

## Rename the column

In [124]:
preprocessed_data.rename(columns={'citation_count': 'IsCited_Count'}, inplace=True)

## Delete Invalid Rows

In [None]:
preprocessed_data['year'].describe()

count    1.632442e+06
mean     1.801409e+03
std      6.012202e+02
min     -1.000000e+00
25%      1.995000e+03
50%      2.003000e+03
75%      2.007000e+03
max      2.011000e+03
Name: year, dtype: float64

In [126]:
preprocessed_data = preprocessed_data[preprocessed_data['year'] <= 1800] # Remove the rows with year = -1

In [None]:
preprocessed_data['authors'].value_counts().sort_values(ascending=False)

Author_Count
2     49182
1     43427
3     32278
4     17863
5      9058
      ...  
41        1
67        1
55        1
68        1
84        1
Name: count, Length: 65, dtype: int64

## Check Language

In [103]:
non_eng_chars = set(''.join(preprocessed_data['title'].str.replace(r'[a-zA-Z\s]+', '')))

print("Non-English characters used in the 'title' column:", non_eng_chars)

Non-English characters used in the 'title' column: {'≤', '±', '过', '超', '/', '算', 'B', 'R', '源', '知', '«', '̌', 'É', '识', 'Ζ', '护', 'Ô', 'c', '理', 'ε', 'ë', '内', '匹', '2', '‘', '¾', '\\', 'û', 'a', '务', '\xad', 'W', '0', 'μ', '组', 'Ì', 'Ö', 'à', '^', '`', 'ö', '“', '合', 'Q', '+', ']', '法', '™', '9', '§', '习', '×', 'ì', 'ù', '\x97', '¥', '现', 'α', 'ó', 'H', 'º', ';', 't', '于', 'X', 'ô', '协', '”', '系', '\x91', '°', '用', 'ß', ' ', '析', '基', '调', '提', '\x99', '程', '络', 'e', 'T', '换', 'Û', 'Ð', 'M', 'y', 'C', '¬', '扩', 'ﬁ', 'i', 'ø', 'Ê', 'Æ', '容', 'Ø', 'Ü', 'P', '¹', 'I', '点', '框', '种', '\x94', 'Î', 'Ξ', '1', 'L', '#', '!', '研', '面', '7', 'é', 'È', '\xa0', '中', '及', 'j', 'ç', 'Ł', '符', '$', '方', 'F', '∞', 'U', '药', '息', '≠', 'Ç', '~', '@', 'Ó', '辑', 'ã', '可', '³', 'Δ', 'J', '4', 'w', '义', '©', 'Õ', '号', 'Φ', '̇', 'å', 'O', '*', 'Ñ', 'G', 'γ', '发', 'o', 'ê', '演', 'è', '流', 'l', '>', 's', ':', 'õ', '述', '信', '保', 'π', '5', 'A', 'θ', 'Í', 'β', '|', 'S', 'Ω', '\x9c', '学', 'ℓ', ',', 'â', '逻', '

In [104]:
english_chars = set(string.ascii_letters + string.punctuation + ' ')

# Function to check if all characters in a title are English
def is_english(col):
    return all(char in english_chars for char in col)

# Filter the DataFrame to keep only rows with English titles
english_only_data = preprocessed_data[preprocessed_data['title'].apply(is_english)]
english_only_data = english_only_data[english_only_data['abstract'].apply(is_english)]
english_only_data = english_only_data[english_only_data['venue'].apply(is_english)]

## Check Punctuation

In [105]:
non_alphabetic_chars_set = set(re.findall(r'[^a-zA-Z\s]', ''.join(english_only_data['title'])))
non_alphabetic_chars_list = list(non_alphabetic_chars_set)
print("List of unique non-alphabetic characters:", non_alphabetic_chars_list)

List of unique non-alphabetic characters: ['%', '~', '@', '|', '{', ',', '?', '/', '=', '>', '^', '.', '<', '`', '[', ')', '$', '#', ':', '!', '}', '&', '-', ';', '+', '(', ']', '*', '_', "'", '"', '\\']


In [106]:
for char in non_alphabetic_chars_list:
    titles_with_char = english_only_data[english_only_data['title'].str.contains(re.escape(char))]
    print(f'{char} Example:')
    print(titles_with_char['title'].iloc[3]) 

% Example:
%T Museum and Interactive Multimedia Information Service.
~ Example:
Segmentation of Range Images in a~Quadtree.
@ Example:
Labeled @-Calculus: Formalism for Time-Concerned Human Factors.
| Example:
An O(sqrt(|v|) |E|) Algorithm for Finding Maximum Matching in General Graphs
{ Example:
A Lazy and Layered SMT($\mathcal{BV}$) Solver for Hard Industrial Verification Problems.
, Example:
Title, Preface, Contents.
? Example:
Logic Programming - Past or Future?
/ Example:
TCP/IP Architecture, Protocols, and Services.
= Example:
A Parallel Time/Processor Tradeoff T.P=O(n^(log M)/M) for the Subset-Sum Problem.
> Example:
Turn it <u>this</u> way: grounding collaborative action with remote gestures.
^ Example:
Software Offshoring ?^(a) Risks and Opportunities for Software Engineering Programs.
. Example:
Multimedia Information Systems: Issues and Approaches.
< Example:
"<head>, <body>, Links and Code: An Introduction to Using HTML to Present Your Data".
` Example:
Using ``Live Informa

In [107]:
def clean_column(column):
    # Use raw strings for regex. Protect 'C++' and 'C#'
    column = column.str.replace(r'C\+\+', 'TEMPORARYCPP', regex=True)
    column = column.str.replace(r'C#', 'TEMPORARYCSHARP', regex=True)
    
    # Remove all non-alphabetic characters except spaces
    column = column.str.replace(r'[^a-zA-Z\s]', '', regex=True)
    
    # Restore 'C++' and 'C#'
    column = column.str.replace('TEMPORARYCPP', 'C++', regex=False)
    column = column.str.replace('TEMPORARYCSHARP', 'C#', regex=False)
    
    return column

# Clean the 'title', 'abstract', and 'venue' columns
clean_data = english_only_data.copy()
clean_data['title'] = clean_column(clean_data['title'])
clean_data['abstract'] = clean_column(clean_data['abstract'])
clean_data['venue'] = clean_column(clean_data['venue'])

In [108]:
non_alphabetic_chars_set = set(re.findall(r'[^a-zA-Z\s]', ''.join(clean_data['title'])))
non_alphabetic_chars_list = list(non_alphabetic_chars_set)
print("List of unique non-alphabetic characters:", non_alphabetic_chars_list)

List of unique non-alphabetic characters: ['#', '+']


In [110]:
for char in non_alphabetic_chars_list:
    titles_with_char = clean_data[clean_data['title'].str.contains(re.escape(char))]
    print(f'{char} Example:')
    print(titles_with_char['title']) 

# Example:
35309      Automatic Generation of the C# Code for Securi...
46809      Generic Programming for Scientific Computing i...
51838       Specification and Implementation Problems for C#
51891      An ASM Specification of C# Threads and the NET...
198679                        Adding Context Awareness to C#
                                 ...                        
1579518    The Implemention of ChineseTai Lue Electronic ...
1597330    Realization of UML Class and State Machine Mod...
1600252    Darwins World Simulation in C# The ModelView C...
1600254        Darwins World Simulation in C# An Interpreter
1613717    Code C# for chaos analysis of relativistic man...
Name: title, Length: 73, dtype: object
+ Example:
0          OQLC++ Extending C++ with an Object Query Capa...
7          An ObjectOriented DBMS War Story Developing a ...
18                        C++ Bindings to an Object Database
99                The C++ Programming Language First Edition
101              The C++

In [111]:
non_alphabetic_chars_set = set(re.findall(r'[^a-zA-Z\s]', ''.join(clean_data['abstract'])))
non_alphabetic_chars_list = list(non_alphabetic_chars_set)
print("List of unique non-alphabetic characters:", non_alphabetic_chars_list)

List of unique non-alphabetic characters: ['#', '+']


In [112]:
for char in non_alphabetic_chars_list:
    titles_with_char = clean_data[clean_data['abstract'].str.contains(re.escape(char))]
    print(f'{char} Example:')
    print(titles_with_char['abstract']) 

# Example:
16477      This paper proposes a new practical automatic ...
24694      The term metaprogramming language is used to d...
30159      Presenter First PF is a technique for organizi...
30228      One day our customer a famous chip producer su...
32488      Dealing with crosscutting requirements in soft...
                                 ...                        
1545994    We present a new programming model GUEESSTIMAT...
1545999    Execution order constraints imposed by depende...
1547553    Specifying application interfaces APIs with in...
1579518    Dictionary is one of the most important charac...
1579790    Web Services for Automated Fault Analysis WSAF...
Name: abstract, Length: 145, dtype: object
+ Example:
156        Inside the C++ Object Model focuses on the und...
4571       This paper deals with genetic algorithm implem...
6551       Aristotle provides program analysis informatio...
6687       This panel will examine issues related to the ...
6789       This pape

In [113]:
non_alphabetic_chars_set = set(re.findall(r'[^a-zA-Z\s]', ''.join(clean_data['venue'])))
non_alphabetic_chars_list = list(non_alphabetic_chars_set)
print("List of unique non-alphabetic characters:", non_alphabetic_chars_list)

List of unique non-alphabetic characters: ['+']


In [114]:
for char in non_alphabetic_chars_list:
    titles_with_char = clean_data[clean_data['venue'].str.contains(re.escape(char))]
    print(f'{char} Example:')
    print(titles_with_char['venue']) 

+ Example:
70104      C++ Workshop
70105    C++ Conference
70106    C++ Conference
70107    C++ Conference
70108    C++ Conference
              ...      
70236    C++ Conference
70237    C++ Conference
70238    C++ Conference
70239      C++ Workshop
70240    C++ Conference
Name: venue, Length: 129, dtype: object


In [115]:
clean_data.head(5)

Unnamed: 0,title,authors,year,venue,id,references,abstract,IsCited_Count,Author_Count,Ref_Count
0,OQLC++ Extending C++ with an Object Query Capa...,[José A. Blakeley],1995,Modern Database Systems,0,[],,5,1,0
1,Transaction Management in Multidatabase Systems,"[Yuri Breitbart, Hector Garcia-Molina, Abraham...",1995,Modern Database Systems,1,[],,0,3,0
2,Overview of the ADDS System,"[Yuri Breitbart, Tom C. Reyes]",1995,Modern Database Systems,2,[],,0,2,0
3,Multimedia Information Systems Issues and Appr...,"[Stavros Christodoulakis, Leonidas Koveos]",1995,Modern Database Systems,3,[],,2,2,0
4,Active Database Systems,"[Umeshwar Dayal, Eric N. Hanson, Jennifer Widom]",1995,Modern Database Systems,4,[995520],,16,3,1


In [None]:
import pandas as pd
import re

def clean_column(column):
    # Step 1: Temporarily replace 'C++' and 'C#' to protect them
    column = column.str.replace(r'C\+\+', 'TEMP_CPP', regex=True)
    column = column.str.replace(r'C#', 'TEMP_CSHARP', regex=True)
    
    # Step 2: Remove all other non-alphabetic characters except spaces
    column = column.str.replace(r'[^a-zA-Z\s]', '', regex=True)
    
    # Step 3: Restore 'C++' and 'C#'
    column = column.str.replace('TEMP_CPP', 'C++', regex=False)
    column = column.str.replace('TEMP_CSHARP', 'C#', regex=False)
    
    return column

# Example DataFrame
data = {'title': ["C++ Programming Basics", "New in C#: Features", "Python-3.8.1"]}
english_only_data = pd.DataFrame(data)

# Clean the 'title' column
clean_data = english_only_data.copy()
clean_data['title'] = clean_column(english_only_data['title'])

# Check for non-alphabetic characters after cleaning
non_alphabetic_chars_set = set(re.findall(r'[^a-zA-Z\s]', ''.join(clean_data['title'])))
non_alphabetic_chars_list = list(non_alphabetic_chars_set)
print("List of unique non-alphabetic characters:", non_alphabetic_chars_list)

# Display cleaned data
print(clean_data)


List of unique non-alphabetic characters: []
                        title
0  TEMPCPP Programming Basics
1  New in TEMPCSHARP Features
2                      Python
