In [1]:
import pandas as pd

In [2]:
def parse_citation_file(file_path, encoding='utf-8'):
    """Parse a prefix-tagged citation dump into a list of paper dictionaries.

    The file is read line by line. Each line is stripped and dispatched on its
    prefix:

    * ``#*``     -> ``title``; also marks the start of a new paper record
    * ``#@``     -> ``authors``: comma-separated names, stored as a list
    * ``#t``     -> ``year``: stored as ``int``, or ``None`` when empty
    * ``#c``     -> ``venue``: stored as ``str``, or ``None`` when empty
    * ``#index`` -> ``id``: kept as a string
    * ``#!``     -> ``abstract``
    * ``#%``     -> one cited paper id; a record may contain several such lines

    Lines with any other prefix (including blank lines) are ignored.

    Parameters
    ----------
    file_path : str or path-like
        Path of the citation text file.
    encoding : str, default 'utf-8'
        Text encoding used to decode the file. It is passed explicitly so the
        result does not depend on the platform's locale default.

    Returns
    -------
    list of dict
        One dict per paper, in file order. Every dict has a ``references`` key
        (a possibly empty list of id strings); the other keys are present only
        if the corresponding line appeared in the record.

    Raises
    ------
    OSError
        If the file cannot be opened.
    ValueError
        If a ``#t`` line carries a non-empty value that is not an integer.
    """
    papers = []      # completed records, in file order
    paper = {}       # fields of the record currently being read
    references = []  # ids cited by the current record

    with open(file_path, 'r', encoding=encoding) as file:
        for line in file:
            line = line.strip()
            if line.startswith('#*'):
                # A title line opens a new record, so flush the previous one first.
                if paper:
                    paper['references'] = references
                    papers.append(paper)
                    paper = {}
                    references = []
                paper['title'] = line[2:]
            elif line.startswith('#@'):
                # Trim each name and drop empty fragments so an empty author
                # line gives [] instead of [''].
                names = (name.strip() for name in line[2:].split(','))
                paper['authors'] = [name for name in names if name]
            elif line.startswith('#t'):
                # Treat an empty year like an empty venue: missing, not an error.
                year_text = line[2:].strip()
                paper['year'] = int(year_text) if year_text else None
            elif line.startswith('#c'):
                paper['venue'] = line[2:] if line[2:] else None
            elif line.startswith('#index'):
                # Strip so ids compare equal to the (stripped) reference ids.
                paper['id'] = line[6:].strip()
            elif line.startswith('#!'):
                paper['abstract'] = line[2:]
            elif line.startswith('#%'):
                # Ignore an empty reference marker rather than recording '' as an id.
                ref_id = line[2:].strip()
                if ref_id:
                    references.append(ref_id)

    # The last record has no following title line to trigger its flush.
    if paper:
        paper['references'] = references
        papers.append(paper)

    return papers

# Raw citation dump, resolved relative to the notebook's working directory
file_path = 'DBLPOnlyCitationOct19.txt'
papers = parse_citation_file(file_path)

# One row per paper; a key missing from a record becomes NaN in that row
papers_df = pd.DataFrame(papers)

# Defensive normalisation: make sure every 'references' cell is a list so the
# column can be exploded below
papers_df['references'] = papers_df['references'].apply(lambda x: x if isinstance(x, list) else [])

# Count how often each paper id is cited. explode() flattens all reference
# lists into one Series (an empty list becomes a single NaN), and
# value_counts() tallies the ids while dropping those NaN placeholders.
# Kept as a plain dict {cited_id: count}.
reference_count = papers_df['references'].explode().value_counts().to_dict()

# Attach the counts to each paper by id; papers that are never cited get 0
papers_df['citation_count'] = papers_df['id'].map(reference_count).fillna(0).astype(int)


In [3]:
# Preview the first 50 rows of the parsed table as a sanity check
papers_df.head(50)

Unnamed: 0,title,authors,year,venue,id,references,abstract,citation_count
0,OQL[C++]: Extending C++ with an Object Query C...,[José A. Blakeley],1995,Modern Database Systems,0,[],,5
1,Transaction Management in Multidatabase Systems.,"[Yuri Breitbart, Hector Garcia-Molina, Abraham...",1995,Modern Database Systems,1,[],,0
2,Overview of the ADDS System.,"[Yuri Breitbart, Tom C. Reyes]",1995,Modern Database Systems,2,[],,0
3,Multimedia Information Systems: Issues and App...,"[Stavros Christodoulakis, Leonidas Koveos]",1995,Modern Database Systems,3,[],,2
4,Active Database Systems.,"[Umeshwar Dayal, Eric N. Hanson, Jennifer Widom]",1995,Modern Database Systems,4,[995520],,16
5,Where Object-Oriented DBMSs Should Do Better: ...,"[Angelika Kotz Dittrich, Klaus R. Dittrich]",1995,Modern Database Systems,5,[],,1
6,Distributed Databases.,"[Hector Garcia-Molina, Meichun Hsu]",1995,Modern Database Systems,6,[],,0
7,An Object-Oriented DBMS War Story: Developing ...,[Nathan Goodman],1995,Modern Database Systems,7,[],,4
8,Cooperative Transactions for Multiuser Environ...,[Gail E. Kaiser],1995,Modern Database Systems,8,[],,12
9,Schema Architecture of the UniSQL/M Multidatab...,"[William Kelley, Sunit K. Gala, Won Kim, Tom C...",1995,Modern Database Systems,9,[],,7


In [4]:
# Summarise column dtypes, non-null counts and memory usage
papers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1632442 entries, 0 to 1632441
Data columns (total 8 columns):
 #   Column          Non-Null Count    Dtype 
---  ------          --------------    ----- 
 0   title           1632442 non-null  object
 1   authors         1632442 non-null  object
 2   year            1632442 non-null  int64 
 3   venue           1630753 non-null  object
 4   id              1632442 non-null  object
 5   references      1632442 non-null  object
 6   abstract        653510 non-null   object
 7   citation_count  1632442 non-null  int64 
dtypes: int64(2), object(6)
memory usage: 99.6+ MB


In [5]:
# Lift the row-display limit (None = no truncation) so the whole distribution is shown.
# NOTE: set_option is global for the session, so every later display is also
# untruncated; pd.option_context('display.max_rows', None) would scope it to one cell.
pd.set_option('display.max_rows', None)
# Number of papers for each distinct citation count, most frequent first
papers_df['citation_count'].value_counts()

citation_count
0       1293828
1        116791
2         56802
3         34477
4         23418
5         16801
6         12605
7          9826
8          8097
9          6440
10         5478
11         4569
12         3923
13         3394
14         2883
15         2593
16         2312
17         2061
18         1736
19         1633
20         1458
21         1307
22         1122
23         1090
24          969
25          884
26          799
27          740
28          681
29          665
30          571
31          558
32          537
33          498
34          458
35          415
37          400
36          397
38          346
39          342
40          337
41          298
42          292
44          290
43          253
47          232
46          230
48          209
45          209
51          203
49          193
52          179
53          172
50          171
54          147
56          142
55          140
58          128
57          121
61          119
62          117
64       

In [6]:
# Number of distinct citation_count values present in the table
papers_df['citation_count'].nunique()

452

In [7]:
# Persist the enriched table; index=False leaves the row index out of the file.
# NOTE(review): 'authors' and 'references' hold Python lists, which CSV stores as
# their string form, so they will need parsing when read back; confirm this is intended.
papers_df.to_csv('DBLP_citation.csv', index=False)