# Library Import

In [62]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
import ast
from collections import Counter
import spacy
from textblob import TextBlob
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.corpus import stopwords
import re
import string
from itertools import combinations
from collections import defaultdict

# Data Import

Dataset URL: https://www.aminer.org/citation

In [2]:
data = pd.read_csv('DBLP_citation.csv')

In [3]:
data.head(3)

Unnamed: 0,title,authors,year,venue,id,references,abstract,citation_count
0,OQL[C++]: Extending C++ with an Object Query C...,['José A. Blakeley'],1995,Modern Database Systems,0,[],,5
1,Transaction Management in Multidatabase Systems.,"['Yuri Breitbart', 'Hector Garcia-Molina', 'Ab...",1995,Modern Database Systems,1,[],,0
2,Overview of the ADDS System.,"['Yuri Breitbart', 'Tom C. Reyes']",1995,Modern Database Systems,2,[],,0


In [4]:
# Function to convert string-lists to Python lists
def convert_to_list(data):
    try:
        return ast.literal_eval(data)
    except:
        return []  # Returns an empty list if there's an error in conversion

# Convert the string representations to lists
data['authors'] = data['authors'].apply(convert_to_list)
data['references'] = data['references'].apply(convert_to_list)

# Convert id to string
data['id'] = data['id'].astype(str)

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1632442 entries, 0 to 1632441
Data columns (total 8 columns):
 #   Column          Non-Null Count    Dtype 
---  ------          --------------    ----- 
 0   title           1632442 non-null  object
 1   authors         1632442 non-null  object
 2   year            1632442 non-null  int64 
 3   venue           1630753 non-null  object
 4   id              1632442 non-null  object
 5   references      1632442 non-null  object
 6   abstract        653506 non-null   object
 7   citation_count  1632442 non-null  int64 
dtypes: int64(2), object(6)
memory usage: 99.6+ MB


# Preprocessing

## Missing values

In [6]:
data.isna().sum()

title                  0
authors                0
year                   0
venue               1689
id                     0
references             0
abstract          978936
citation_count         0
dtype: int64

In [7]:
preprocessed_data = data.copy()
preprocessed_data['abstract'] = preprocessed_data['abstract'].fillna('')
preprocessed_data['venue'] = preprocessed_data['venue'].fillna('')
preprocessed_data.isna().sum()

title             0
authors           0
year              0
venue             0
id                0
references        0
abstract          0
citation_count    0
dtype: int64

## Create New Feature

In [8]:
preprocessed_data['AUTHOR_COUNT'] = preprocessed_data['authors'].apply(lambda x: len(x))
preprocessed_data['REF_COUNT'] = preprocessed_data['references'].apply(lambda x: len(x))

## Rename the column

In [9]:
preprocessed_data.rename(columns={'citation_count': 'ISCITED_COUNT'}, inplace=True)

## Delete Invalid Rows

In [10]:
preprocessed_data['year'].describe()

count    1.632442e+06
mean     1.801409e+03
std      6.012202e+02
min     -1.000000e+00
25%      1.995000e+03
50%      2.003000e+03
75%      2.007000e+03
max      2.011000e+03
Name: year, dtype: float64

In [11]:
preprocessed_data = preprocessed_data[preprocessed_data['year'] >= 1800] # Remove the rows with year = -1

In [12]:
preprocessed_data.describe()

Unnamed: 0,year,ISCITED_COUNT,AUTHOR_COUNT,REF_COUNT
count,1469015.0,1469015.0,1469015.0,1469015.0
mean,2001.926,1.580486,2.612094,1.583045
std,7.600157,11.03237,1.54726,4.705101
min,1936.0,0.0,1.0,0.0
25%,1999.0,0.0,2.0,0.0
50%,2004.0,0.0,2.0,0.0
75%,2007.0,0.0,3.0,0.0
max,2011.0,4474.0,114.0,617.0


## Check Language

In [13]:
non_eng_chars = set(''.join(preprocessed_data['title'].str.replace(r'[a-zA-Z\s]+', '')))

print("Non-English characters used in the 'title' column:", non_eng_chars)

Non-English characters used in the 'title' column: {'n', '药', 'o', '自', '³', '取', '«', '\x9c', '¥', '-', 'Ê', 'e', '调', 'ú', '>', '过', ')', 'i', '动', '©', 'y', '¯', 'ò', '\xa0', 'Ω', '知', '流', 'Ö', '识', 'µ', ',', '≠', 'S', '医', 'â', 'ℓ', 'α', '推', '¬', '…', '~', '法', 'Ø', '号', ' ', 'Ï', '研', '“', 'J', '中', 'Σ', '架', '\x99', '方', 'Û', 'Á', '»', 'k', '¹', '<', '辑', '习', '7', 'μ', '扩', 'F', '0', '程', '\x92', '±', 'ü', 'B', '(', '$', '´', '×', 'Â', '™', '模', '务', 'Z', '®', '£', 'ê', 'ô', '*', 'H', 'K', '6', '下', '源', 'ª', '匹', 's', 'Æ', '§', '–', '的', '本', '框', 'z', '于', 'ó', 'x', '4', 'Δ', '：', '网', 'Ò', 'G', '超', '义', 'V', 'í', '基', '可', '理', '.', 'E', '协', 'h', '及', '服', 'U', '‘', '°', '"', ';', '描', '@', 'û', 'D', '面', 'M', '8', 'Ô', '文', '述', '\xad', '\x94', '#', '符', '体', 'É', 'ë', '系', '究', '配', 'ä', '9', '2', '½', 'Ł', '变', 'q', 'ε', '+', 'ﬁ', 'd', 'A', 'ć', '`', '建', 'π', 'à', 'L', '[', '资', '?', 'λ', 'ø', '\x84', 'g', '演', '²', 't', '制', 'p', 'á', '一', '逻', '容', '•', 'ý', '定', 'W

In [14]:
english_chars = set(string.ascii_letters + string.punctuation + ' ')

# Function to check if all characters in a title are English
def is_english(col):
    return all(char in english_chars for char in col)

# Filter the DataFrame to keep only rows with English titles
english_only_data = preprocessed_data[preprocessed_data['title'].apply(is_english)]
english_only_data = english_only_data[english_only_data['abstract'].apply(is_english)]
english_only_data = english_only_data[english_only_data['venue'].apply(is_english)]

## Check Punctuation

In [15]:
non_alphabetic_chars_set = set(re.findall(r'[^a-zA-Z\s]', ''.join(english_only_data['title'])))
non_alphabetic_chars_list = list(non_alphabetic_chars_set)
print("List of unique non-alphabetic characters:", non_alphabetic_chars_list)

List of unique non-alphabetic characters: ['(', '$', '+', '`', '*', '-', '!', '[', '?', '>', ')', ':', ',', '^', '~', "'", '.', '=', '"', ';', '@', '/', '&', '<', '_', '\\', '#', '%', ']', '{', '|', '}']


In [16]:
for char in non_alphabetic_chars_list:
    titles_with_char = english_only_data[english_only_data['title'].str.contains(re.escape(char))]
    print(f'{char} Example:')
    print(titles_with_char['title'].iloc[3]) 

( Example:
Open System Architecture for Controls within Automation Systems (OSACA).
$ Example:
An average running time analysis of a backtracking algorithm to calculate the measure of the union of hyperrectangles in $d$ dimensions.
+ Example:
OZ+: An Object-Oriented Database System.
` Example:
Using ``Live Information'' in a Multimedia Framework.
* Example:
Leveraging Distance Table to Accelerate Data Location in Large-Scaled Data Intensive Grid Environment*.
- Example:
Requirements for a Performance Benchmark for Object-Oriented Database Systems.
! Example:
Like Rome, a mobile operator's empire wasn't built in a day!: a journey through the rise and fall of mobile network operators.
[ Example:
Operating System Support [for Multimedia Databases].
? Example:
Logic Programming - Past or Future?
> Example:
Turn it <u>this</u> way: grounding collaborative action with remote gestures.
) Example:
Open System Architecture for Controls within Automation Systems (OSACA).
: Example:
An Object-Ori

In [17]:
def clean_column(column):
    # Use raw strings for regex. Protect 'C++' and 'C#'
    column = column.str.replace(r'C\+\+', 'TEMPORARYCPP', regex=True)
    column = column.str.replace(r'C#', 'TEMPORARYCSHARP', regex=True)
    
    # Remove all non-alphabetic characters except spaces
    column = column.str.replace(r'[^a-zA-Z\s]', '', regex=True)
    
    # Restore 'C++' and 'C#'
    column = column.str.replace('TEMPORARYCPP', 'C++', regex=False)
    column = column.str.replace('TEMPORARYCSHARP', 'C#', regex=False)
    
    return column

# Clean the 'title', 'abstract', and 'venue' columns
clean_data = english_only_data.copy()
clean_data['title'] = clean_column(clean_data['title'])
clean_data['abstract'] = clean_column(clean_data['abstract'])
clean_data['venue'] = clean_column(clean_data['venue'])

In [18]:
non_alphabetic_chars_set = set(re.findall(r'[^a-zA-Z\s]', ''.join(clean_data['title'])))
non_alphabetic_chars_list = list(non_alphabetic_chars_set)
print("List of unique non-alphabetic characters:", non_alphabetic_chars_list)

List of unique non-alphabetic characters: ['#', '+']


In [19]:
for char in non_alphabetic_chars_list:
    titles_with_char = clean_data[clean_data['title'].str.contains(re.escape(char))]
    print(f'{char} Example:')
    print(titles_with_char['title'].iloc[3]) 

# Example:
An ASM Specification of C# Threads and the NET Memory Model
+ Example:
The C++ Programming Language First Edition


In [20]:
non_alphabetic_chars_set = set(re.findall(r'[^a-zA-Z\s]', ''.join(clean_data['abstract'])))
non_alphabetic_chars_list = list(non_alphabetic_chars_set)
print("List of unique non-alphabetic characters:", non_alphabetic_chars_list)

List of unique non-alphabetic characters: ['#', '+']


In [21]:
for char in non_alphabetic_chars_list:
    titles_with_char = clean_data[clean_data['abstract'].str.contains(re.escape(char))]
    print(f'{char} Example:')
    print(titles_with_char['abstract'].iloc[3]) 

# Example:
One day our customer a famous chip producer suggested a project which was very unusual for us We had to develop a system with PDA client using C# Microsoft Compact NET library and SQL CE Application data was stored in a huge thirdpart master database with hundreds of tables We had never worked with PDAs before and our customer itself was not sure about the whole idea To complicate things further we were living in a different continent from our customers We cautiously started with a two week technology spike using Extreme Programming This experience report describes the challenges we faced applying XP to offshore development and how we overcame those challenges to deliver a successful product to our customers
+ Example:
This panel will examine issues related to the integration of the Ada programming language into the undergraduate computer science curriculum Topics will include the followingbull The use of Ada versus other languages eg C++ in CS I and IIbull Approaches to mak

In [22]:
non_alphabetic_chars_set = set(re.findall(r'[^a-zA-Z\s]', ''.join(clean_data['venue'])))
non_alphabetic_chars_list = list(non_alphabetic_chars_set)
print("List of unique non-alphabetic characters:", non_alphabetic_chars_list)

List of unique non-alphabetic characters: ['+']


In [23]:
for char in non_alphabetic_chars_list:
    titles_with_char = clean_data[clean_data['venue'].str.contains(re.escape(char))]
    unique_venues = titles_with_char['venue'].unique()
    print(f'{char} Example:')
    print(unique_venues)

+ Example:
['C++ Workshop' 'C++ Conference']


## Add Main Author

In [24]:
def get_main_author(authors_list):
    return authors_list[0]

# Function to extract secondary authors
def get_secondary_authors(authors_list):
    if len(authors_list) > 1:
        return authors_list[1:]  # Return all authors except the first
    else:
        return []  # Return an empty list if there's only one author

# Apply functions to create new columns
clean_data['MAIN_AUTHOR'] = clean_data['authors'].apply(get_main_author)
clean_data['SECONDARY_AUTHORS'] = clean_data['authors'].apply(get_secondary_authors)

In [25]:
clean_data.head(5)

Unnamed: 0,title,authors,year,venue,id,references,abstract,ISCITED_COUNT,AUTHOR_COUNT,REF_COUNT,MAIN_AUTHOR,SECONDARY_AUTHORS
0,OQLC++ Extending C++ with an Object Query Capa...,[José A. Blakeley],1995,Modern Database Systems,0,[],,5,1,0,José A. Blakeley,[]
1,Transaction Management in Multidatabase Systems,"[Yuri Breitbart, Hector Garcia-Molina, Abraham...",1995,Modern Database Systems,1,[],,0,3,0,Yuri Breitbart,"[Hector Garcia-Molina, Abraham Silberschatz]"
2,Overview of the ADDS System,"[Yuri Breitbart, Tom C. Reyes]",1995,Modern Database Systems,2,[],,0,2,0,Yuri Breitbart,[Tom C. Reyes]
3,Multimedia Information Systems Issues and Appr...,"[Stavros Christodoulakis, Leonidas Koveos]",1995,Modern Database Systems,3,[],,2,2,0,Stavros Christodoulakis,[Leonidas Koveos]
4,Active Database Systems,"[Umeshwar Dayal, Eric N. Hanson, Jennifer Widom]",1995,Modern Database Systems,4,[995520],,16,3,1,Umeshwar Dayal,"[Eric N. Hanson, Jennifer Widom]"


## Tokenize Title & Abstract

In [26]:
clean_data['TITLE_TOKEN'] = clean_data['title'].str.split().apply(lambda x: [word.lower() for word in x])
clean_data['ABSTRACT_TOKEN'] = clean_data['abstract'].str.split().apply(lambda x: [word.lower() for word in x])

In [27]:
clean_data.head(5)

Unnamed: 0,title,authors,year,venue,id,references,abstract,ISCITED_COUNT,AUTHOR_COUNT,REF_COUNT,MAIN_AUTHOR,SECONDARY_AUTHORS,TITLE_TOKEN,ABSTRACT_TOKEN
0,OQLC++ Extending C++ with an Object Query Capa...,[José A. Blakeley],1995,Modern Database Systems,0,[],,5,1,0,José A. Blakeley,[],"[oqlc++, extending, c++, with, an, object, que...",[]
1,Transaction Management in Multidatabase Systems,"[Yuri Breitbart, Hector Garcia-Molina, Abraham...",1995,Modern Database Systems,1,[],,0,3,0,Yuri Breitbart,"[Hector Garcia-Molina, Abraham Silberschatz]","[transaction, management, in, multidatabase, s...",[]
2,Overview of the ADDS System,"[Yuri Breitbart, Tom C. Reyes]",1995,Modern Database Systems,2,[],,0,2,0,Yuri Breitbart,[Tom C. Reyes],"[overview, of, the, adds, system]",[]
3,Multimedia Information Systems Issues and Appr...,"[Stavros Christodoulakis, Leonidas Koveos]",1995,Modern Database Systems,3,[],,2,2,0,Stavros Christodoulakis,[Leonidas Koveos],"[multimedia, information, systems, issues, and...",[]
4,Active Database Systems,"[Umeshwar Dayal, Eric N. Hanson, Jennifer Widom]",1995,Modern Database Systems,4,[995520],,16,3,1,Umeshwar Dayal,"[Eric N. Hanson, Jennifer Widom]","[active, database, systems]",[]


# Insights

## Author Related

In [28]:
author_citations_sum = clean_data.groupby('MAIN_AUTHOR')['ISCITED_COUNT'].sum().reset_index()
author_citations_sum_sorted = author_citations_sum.sort_values(by='ISCITED_COUNT', ascending=False)
author_citations_sum_sorted.head(50)

Unnamed: 0,MAIN_AUTHOR,ISCITED_COUNT
278997,Rakesh Agrawal,6156
208727,M. R. Garey,4553
43909,C. A. R. Hoare,3160
198347,Leslie Lamport,2968
110103,Gerard Salton,2891
13647,Alfred V. Aho,2775
305814,Serge Abiteboul,2487
291830,Ronald Fagin,2457
232844,Michael Stonebraker,2376
334609,Thomas H. Cormen,2320


## Venue Related

In [29]:
venue_citations_sum = clean_data.groupby('venue')['ISCITED_COUNT'].sum().reset_index()
venue_citations_sum_sorted = venue_citations_sum.sort_values(by='ISCITED_COUNT', ascending=False)
venue_citations_sum_sorted.head(50)

Unnamed: 0,venue,ISCITED_COUNT
989,Commun ACM,54891
4658,SIGMOD Conference,34937
5310,VLDB,34921
0,,34632
4634,SIGGRAPH,28782
3231,J ACM,24647
500,Artif Intell,22622
2577,IEEE Trans Software Eng,19873
2576,IEEE Trans Pattern Anal Mach Intell,18502
2520,IEEE Computer,17918


## Title Related

In [30]:
id_citations_sum = clean_data.groupby('id')['ISCITED_COUNT'].sum().reset_index()

# Step 2: Sort the IDs based on their sum of ISCITED_COUNT in descending order
id_citations_sum_sorted = id_citations_sum.sort_values(by='ISCITED_COUNT', ascending=False)

# Step 3: Select the top 50 IDs
top_50_ids = id_citations_sum_sorted.head(50)['id']

# Step 4: Use the top 50 IDs to retrieve corresponding titles from the 'data' DataFrame
top_50_titles = data[data['id'].isin(top_50_ids)][['id', 'title']]
top_50_titles_with_count = pd.merge(data[data['id'].isin(top_50_ids)], id_citations_sum_sorted, on='id')[['title', 'ISCITED_COUNT']]
top_50_titles_with_count.sort_values(by='ISCITED_COUNT', ascending=False)

Unnamed: 0,title,ISCITED_COUNT
7,Computers and Intractability: A Guide to the T...,4474
8,Introduction to Algorithms,2292
12,Communicating Sequential Processes,1573
0,"Compilers: Princiles, Techniques, and Tools.",1555
26,Mining Association Rules between Sets of Items...,1436
31,Fast Algorithms for Mining Association Rules i...,1415
9,Introduction to Modern Information Retrieval.,1353
1,Modern Information Retrieval,1177
16,Computational Geometry - An Introduction.,1121
39,The Anatomy of a Large-Scale Hypertextual Web ...,1107


# Export CSV

In [31]:
clean_data.to_csv('clean_data.csv', index=False)

# Sample: Top 20 paper from sample dataset

In [100]:
data = pd.read_csv('/Users/qianlou/Documents/GitHub/Social-Media-Analysis-Project/Group Project/Sampling Data/citation_sample.csv')

In [101]:
data['id'] = data['id'].astype(str)

In [102]:
citations_sum = data.groupby('id')['ISCITED_COUNT'].sum().reset_index()
citations_sum_sorted = citations_sum.sort_values(by='ISCITED_COUNT', ascending=False)
top_20_list = citations_sum_sorted['id'].head(20).tolist()
top_20_list

['1118192',
 '145',
 '1122129',
 '20383',
 '774896',
 '613277',
 '511048',
 '53801',
 '621950',
 '598655',
 '642502',
 '832504',
 '1090119',
 '96007',
 '744738',
 '642447',
 '598853',
 '514059',
 '593638',
 '418675']

In [103]:
data['references'] = data['references'].apply(ast.literal_eval)

In [104]:
filtered_by_id = data[data['id'].isin(top_20_list)]

# Filter based on any 'reference' id being in interested_ids
filtered_by_reference = data[data['references'].apply(lambda refs: any(ref in top_20_list for ref in refs))]


In [105]:
unique_references = set(ref for sublist in filtered_by_id['references'] for ref in sublist)

# Convert the set back to a list if you need it as a list
unique_references_list = list(unique_references)

top20_reference = data[data['references'].apply(lambda refs: any(ref in unique_references_list for ref in refs))]


In [107]:
# Combine
combined_df = pd.concat([filtered_by_id, filtered_by_reference, top20_reference])

In [108]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 171 entries, 291 to 12080
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   index              171 non-null    int64 
 1   title              171 non-null    object
 2   authors            171 non-null    object
 3   year               171 non-null    int64 
 4   venue              170 non-null    object
 5   id                 171 non-null    object
 6   references         171 non-null    object
 7   abstract           118 non-null    object
 8   ISCITED_COUNT      171 non-null    int64 
 9   AUTHOR_COUNT       171 non-null    int64 
 10  REF_COUNT          171 non-null    int64 
 11  MAIN_AUTHOR        171 non-null    object
 12  SECONDARY_AUTHORS  171 non-null    object
 13  TITLE_TOKEN        171 non-null    object
 14  ABSTRACT_TOKEN     171 non-null    object
dtypes: int64(5), object(10)
memory usage: 21.4+ KB


In [109]:
combined_df['abstract'] = combined_df['abstract'].fillna('')
combined_df['venue'] = combined_df['venue'].fillna('')

In [110]:
combined_df.to_csv('top_20_sample.csv', index=False)

## Author Connection From Sample Dataset

In [111]:
combined_df['authors'] = combined_df['authors'].apply(ast.literal_eval)

# Flatten the list of all authors and get unique authors
all_authors = set(author for sublist in combined_df['authors'] for author in sublist)


In [112]:
authors_list = list(all_authors)

In [113]:
collab_matrix = pd.DataFrame(index=authors_list, columns=authors_list).fillna(0)

# Function to update the matrix for each list of authors in a paper
def update_collab_matrix(row):
    authors = row['authors']
    for a1, a2 in combinations(authors, 2):
        collab_matrix.at[a1, a2] += 1
        collab_matrix.at[a2, a1] += 1

# Apply the function to each row in the DataFrame
combined_df.apply(update_collab_matrix, axis=1)

# Set diagonal (self-collaboration) to NaN
pd.options.mode.chained_assignment = None  # to suppress SettingWithCopyWarning
for author in authors_list:
    collab_matrix.at[author, author] = None

  collab_matrix = pd.DataFrame(index=authors_list, columns=authors_list).fillna(0)


In [117]:
collab_matrix.to_csv('collab_matrix_sample.csv')

# Sample: Top 3 paper from preprocessed dataset

In [118]:
data = pd.read_csv('clean_data.csv')

In [119]:
data['id'] = data['id'].astype(str)

In [120]:
citations_sum = data.groupby('id')['ISCITED_COUNT'].sum().reset_index()
citations_sum_sorted = citations_sum.sort_values(by='ISCITED_COUNT', ascending=False)
top_3_list = citations_sum_sorted['id'].head(3).tolist()
top_3_list

['759', '2020', '2595']

In [121]:
data['references'] = data['references'].apply(ast.literal_eval)

In [122]:
filtered_by_id = data[data['id'].isin(top_3_list)]

# Filter based on any 'reference' id being in interested_ids
filtered_by_reference = data[data['references'].apply(lambda refs: any(ref in top_3_list for ref in refs))]


In [123]:
unique_references = set(ref for sublist in filtered_by_id['references'] for ref in sublist)

# Convert the set back to a list if you need it as a list
unique_references_list = list(unique_references)

top3_reference = data[data['references'].apply(lambda refs: any(ref in unique_references_list for ref in refs))]


In [124]:
# Combine
combined_df = pd.concat([filtered_by_id, filtered_by_reference, top3_reference])

In [125]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5416 entries, 661 to 1001297
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   title              5416 non-null   object
 1   authors            5416 non-null   object
 2   year               5416 non-null   int64 
 3   venue              5404 non-null   object
 4   id                 5416 non-null   object
 5   references         5416 non-null   object
 6   abstract           4360 non-null   object
 7   ISCITED_COUNT      5416 non-null   int64 
 8   AUTHOR_COUNT       5416 non-null   int64 
 9   REF_COUNT          5416 non-null   int64 
 10  MAIN_AUTHOR        5416 non-null   object
 11  SECONDARY_AUTHORS  5416 non-null   object
 12  TITLE_TOKEN        5416 non-null   object
 13  ABSTRACT_TOKEN     5416 non-null   object
dtypes: int64(4), object(10)
memory usage: 634.7+ KB


In [128]:
combined_df['abstract'] = combined_df['abstract'].fillna('')
combined_df['venue'] = combined_df['venue'].fillna('')

In [131]:
combined_df.to_csv('top_3_paper.csv', index=False)

## Author Connection From Sample Dataset

In [132]:
combined_df['authors'] = combined_df['authors'].apply(ast.literal_eval)

# Flatten the list of all authors and get unique authors
all_authors = set(author for sublist in combined_df['authors'] for author in sublist)


In [133]:
authors_list = list(all_authors)

In [136]:
collab_matrix = pd.DataFrame(index=authors_list, columns=authors_list).fillna(0)

# Function to update the matrix for each list of authors in a paper
def update_collab_matrix(row):
    authors = row['authors']
    for a1, a2 in combinations(authors, 2):
        collab_matrix.at[a1, a2] += 1
        collab_matrix.at[a2, a1] += 1

# Apply the function to each row in the DataFrame
combined_df.apply(update_collab_matrix, axis=1)

# Set diagonal (self-collaboration) to NaN
pd.options.mode.chained_assignment = None  # to suppress SettingWithCopyWarning
for author in authors_list:
    collab_matrix.at[author, author] = None

  collab_matrix = pd.DataFrame(index=authors_list, columns=authors_list).fillna(0)


In [137]:
collab_matrix.to_csv('collab_matrix_top3.csv')

In [138]:
collab_matrix

Unnamed: 0,W. L. Yeung,Adam Kasperski,Mohammad Kaykobad,Cyrus Shahabi,Peter Ross,Miodrag Potkonjak,Maria J. Serna,Yahui Lu,Avenir Kobetski,Vincent A. Cicirello,...,Takumi Okamoto,Peter Triantafillou,Thomas Santen,Chen-Shang Lin,Per Nyblom,João P. Marques Silva,Johannes Hatzl,Subhashis Majumder,Philip W. Trinder,Javier Larrosa
W. L. Yeung,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Adam Kasperski,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Mohammad Kaykobad,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Cyrus Shahabi,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Peter Ross,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
João P. Marques Silva,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
Johannes Hatzl,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
Subhashis Majumder,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
Philip W. Trinder,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


# Use all sample

In [141]:
data = pd.read_csv('/Users/qianlou/Documents/GitHub/Social-Media-Analysis-Project/Group Project/Sampling Data/citation_sample.csv')
data['references'] = data['references'].apply(ast.literal_eval)
data['authors'] = data['authors'].apply(ast.literal_eval)
data['id'] = data['id'].astype(str)

In [142]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12101 entries, 0 to 12100
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   index              12101 non-null  int64 
 1   title              12101 non-null  object
 2   authors            12101 non-null  object
 3   year               12101 non-null  int64 
 4   venue              12096 non-null  object
 5   id                 12101 non-null  object
 6   references         12101 non-null  object
 7   abstract           3283 non-null   object
 8   ISCITED_COUNT      12101 non-null  int64 
 9   AUTHOR_COUNT       12101 non-null  int64 
 10  REF_COUNT          12101 non-null  int64 
 11  MAIN_AUTHOR        12012 non-null  object
 12  SECONDARY_AUTHORS  12101 non-null  object
 13  TITLE_TOKEN        12101 non-null  object
 14  ABSTRACT_TOKEN     12101 non-null  object
dtypes: int64(5), object(10)
memory usage: 1.4+ MB


In [144]:
data.to_csv('full_sample.csv', index=False)

In [146]:
all_authors = set(author for sublist in data['authors'] for author in sublist)
authors_list = list(all_authors)
collab_matrix = pd.DataFrame(index=authors_list, columns=authors_list).fillna(0)

# Function to update the matrix for each list of authors in a paper
def update_collab_matrix(row):
    authors = row['authors']
    for a1, a2 in combinations(authors, 2):
        collab_matrix.at[a1, a2] += 1
        collab_matrix.at[a2, a1] += 1

# Apply the function to each row in the DataFrame
data.apply(update_collab_matrix, axis=1)

# Set diagonal (self-collaboration) to NaN
pd.options.mode.chained_assignment = None  # to suppress SettingWithCopyWarning
for author in authors_list:
    collab_matrix.at[author, author] = None

  collab_matrix = pd.DataFrame(index=authors_list, columns=authors_list).fillna(0)


In [147]:
collab_matrix.to_csv('collab_matrix_fullsample.csv', index=False)

In [148]:
collab_matrix

Unnamed: 0,Unnamed: 1,Luigia Carlucci,Vivian Lord,Dimitrij Surmeli,C. R. Wan,D. Todd Nay,Kuan-Ching Li,Miodrag Potkonjak,Johanna D. Moore,Baoqi Jiang,...,Susy Ragazzini,Lutz Voigt,Tobias Scheffer,François Bodart,Yunhao Liu,Chai Quek,Kim Steenstrup Pedersen,Françoise Peyrin,Chiu-Lan Hsieh,Philip W. Trinder
,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Luigia Carlucci,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Vivian Lord,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Dimitrij Surmeli,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
C. R. Wan,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Chai Quek,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
Kim Steenstrup Pedersen,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
Françoise Peyrin,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
Chiu-Lan Hsieh,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0
