# Import packages and load the dataset

In [1]:
import numpy as np
import pandas as pd
import scipy.stats as st
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

import re

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk import pos_tag

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV
import random
random.seed(10)

import datetime as dt
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
import spacy
from wordcloud import WordCloud

In [2]:
# Load the citation table from clean_data.csv (path is relative to the
# notebook's working directory; the file name suggests it was cleaned upstream).
citation = pd.read_csv("clean_data.csv")

In [3]:
# Preview the first rows to confirm the file parsed into the expected columns.
citation.head()

Unnamed: 0,title,authors,year,venue,id,references,abstract,ISCITED_COUNT,AUTHOR_COUNT,REF_COUNT,MAIN_AUTHOR,SECONDARY_AUTHORS,TITLE_TOKEN,ABSTRACT_TOKEN
0,OQLC++ Extending C++ with an Object Query Capa...,['José A. Blakeley'],1995,Modern Database Systems,0,[],,5,1,0,José A. Blakeley,[],"['oqlc++', 'extending', 'c++', 'with', 'an', '...",[]
1,Transaction Management in Multidatabase Systems,"['Yuri Breitbart', 'Hector Garcia-Molina', 'Ab...",1995,Modern Database Systems,1,[],,0,3,0,Yuri Breitbart,"['Hector Garcia-Molina', 'Abraham Silberschatz']","['transaction', 'management', 'in', 'multidata...",[]
2,Overview of the ADDS System,"['Yuri Breitbart', 'Tom C. Reyes']",1995,Modern Database Systems,2,[],,0,2,0,Yuri Breitbart,['Tom C. Reyes'],"['overview', 'of', 'the', 'adds', 'system']",[]
3,Multimedia Information Systems Issues and Appr...,"['Stavros Christodoulakis', 'Leonidas Koveos']",1995,Modern Database Systems,3,[],,2,2,0,Stavros Christodoulakis,['Leonidas Koveos'],"['multimedia', 'information', 'systems', 'issu...",[]
4,Active Database Systems,"['Umeshwar Dayal', 'Eric N. Hanson', 'Jennifer...",1995,Modern Database Systems,4,['995520'],,16,3,1,Umeshwar Dayal,"['Eric N. Hanson', 'Jennifer Widom']","['active', 'database', 'systems']",[]


In [4]:
# (rows, columns) of the loaded frame.
citation.shape

(1083641, 14)

In [5]:
# Per-column dtype and non-null count, to see which columns have missing values.
citation.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1083641 entries, 0 to 1083640
Data columns (total 14 columns):
 #   Column             Non-Null Count    Dtype 
---  ------             --------------    ----- 
 0   title              1083606 non-null  object
 1   authors            1083641 non-null  object
 2   year               1083641 non-null  int64 
 3   venue              1083107 non-null  object
 4   id                 1083641 non-null  int64 
 5   references         1083641 non-null  object
 6   abstract           294358 non-null   object
 7   ISCITED_COUNT      1083641 non-null  int64 
 8   AUTHOR_COUNT       1083641 non-null  int64 
 9   REF_COUNT          1083641 non-null  int64 
 10  MAIN_AUTHOR        1074140 non-null  object
 11  SECONDARY_AUTHORS  1083641 non-null  object
 12  TITLE_TOKEN        1083641 non-null  object
 13  ABSTRACT_TOKEN     1083641 non-null  object
dtypes: int64(5), object(9)
memory usage: 115.7+ MB


In [6]:
# Summary statistics of the citation-count column; its standard deviation is
# reused for the sample-size calculation.
citation["ISCITED_COUNT"].describe()

count    1.083641e+06
mean     1.562385e+00
std      1.122518e+01
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      4.474000e+03
Name: ISCITED_COUNT, dtype: float64

In [10]:
# Sample size from n = z^2 * s^2 / E^2, where
#   z = 97.5th percentile of the standard normal (two-sided alpha = 0.05),
#   s = standard deviation of ISCITED_COUNT over the whole table,
#   E = 0.2, the margin of error.
z_crit = st.norm.ppf(.975)
margin_of_error = 0.2
std = citation["ISCITED_COUNT"].std()
n = z_crit**2 * std**2 / margin_of_error**2
print("Sample size:", round(n))

Sample size: 12101


In [11]:
# Seed NumPy's global generator so the same rows are drawn on every run.
np.random.seed(10)
# Draw round(n) integers uniformly from [0, number of rows).
# NOTE(review): randint samples with replacement, so the same row can be
# drawn more than once — confirm that is acceptable for this sample.
rand_index = np.random.randint(low=0, high=len(citation), size=round(n))

In [12]:
# Show the drawn indices as a quick sanity check.
rand_index

array([1048548,  617841,  804336, ...,  879802,  303645,  217263])

In [13]:
# Select the drawn rows. `.loc` is label-based, so this relies on `citation`
# keeping its default 0..N-1 index (labels == positions); a label that occurs
# several times in rand_index yields the row several times.
citation_sample = citation.loc[rand_index]

In [14]:
# Rebuild the sample from `citation` and `rand_index` instead of calling
# reset_index(inplace=True): the in-place form adds another index column each
# time the cell is re-run (and eventually raises), whereas this form yields
# the same frame on every run. The original row labels end up in the "index"
# column, exactly as before.
citation_sample = citation.loc[rand_index].reset_index()
# Save the sample next to the notebook; index=False because the fresh 0..n-1
# index carries no information.
citation_sample.to_csv("citation_sample.csv", index = False)

In [15]:
def categorize_citations(citation_count):
    """Map a citation count to one of four citation-frequency labels.

    Parameters
    ----------
    citation_count : int or float
        Number of times a paper has been cited. Must be >= 0.

    Returns
    -------
    str
        'Seldom Cited'     if the count is 0,
        'Moderately Cited' if 0 < count <= 10,
        'Frequently Cited' if 10 < count <= 100,
        'Highly Cited'     if count > 100.

    Raises
    ------
    ValueError
        If citation_count is negative or NaN. Previously such values fell
        through every range check and were labelled 'Highly Cited'.
    """
    # `not (x >= 0)` is True for negative numbers and also for NaN, because
    # every ordering comparison against NaN evaluates to False.
    if not citation_count >= 0:
        raise ValueError(
            f"citation_count must be a non-negative number, got {citation_count!r}"
        )
    if citation_count == 0:
        return 'Seldom Cited'
    if citation_count <= 10:
        return 'Moderately Cited'
    if citation_count <= 100:
        return 'Frequently Cited'
    return 'Highly Cited'

In [16]:
# Inspect the sampled frame (pandas truncates the rendering of long frames).
citation_sample

Unnamed: 0,index,title,authors,year,venue,id,references,abstract,ISCITED_COUNT,AUTHOR_COUNT,REF_COUNT,MAIN_AUTHOR,SECONDARY_AUTHORS,TITLE_TOKEN,ABSTRACT_TOKEN
0,1048548,Securing communication using function extracti...,['K. Vimal Kumar'],2009,Computers Security,1587640,[],,0,1,0,K. Vimal Kumar,[],"['securing', 'communication', 'using', 'functi...",[]
1,617841,Compressing Data Cube in Parallel OLAP Systems,"['Frank K. H. A. Dehne', 'Todd Eavis', 'Boyong...",2007,Data Science Journal,839971,[],,0,3,0,Frank K. H. A. Dehne,"['Todd Eavis', 'Boyong Liang']","['compressing', 'data', 'cube', 'in', 'paralle...",[]
2,804336,Distributed Resource Administration Using Cfen...,"['Mark Burgess', 'Ricky Ralston']",1997,Softw Pract Exper,1073637,[],,15,2,0,Mark Burgess,['Ricky Ralston'],"['distributed', 'resource', 'administration', ...",[]
3,960506,Existence of positive solutions of BVPs for se...,"['Wan-Tong Li', 'Ming-Fei Niu', 'Jian-Ping Sun']",2004,Applied Mathematics and Computation,1280289,[],,0,3,0,Wan-Tong Li,"['Ming-Fei Niu', 'Jian-Ping Sun']","['existence', 'of', 'positive', 'solutions', '...",[]
4,710912,Integrating Searching and Authoring in Mizar,"['Paul A. Cairns', 'Jeremy Gow']",2007,J Autom Reasoning,955214,"['595127', '954818', '612606', '893021', '5009...",The vision of a computerized assistant to math...,0,2,10,Paul A. Cairns,['Jeremy Gow'],"['integrating', 'searching', 'and', 'authoring...","['the', 'vision', 'of', 'a', 'computerized', '..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12096,894852,Editorial,['Richard Zurawski'],2007,IEEE Trans Industrial Informatics,1187416,[],,0,1,0,Richard Zurawski,[],['editorial'],[]
12097,662346,HDR Image Compression by Local Adaptation for ...,"['Lijie Wang', 'Takahiko Horiuchi', 'Hiroaki K...",2007,IEICE Transactions,896194,[],Adaptation process of retina helps human visua...,0,3,0,Lijie Wang,"['Takahiko Horiuchi', 'Hiroaki Kotera']","['hdr', 'image', 'compression', 'by', 'local',...","['adaptation', 'process', 'of', 'retina', 'hel..."
12098,879802,Quantifying and setting off network performance,['Erik Hofmann'],2006,IJNVO,1170232,[],While many aspects of the network and the coll...,0,1,0,Erik Hofmann,[],"['quantifying', 'and', 'setting', 'off', 'netw...","['while', 'many', 'aspects', 'of', 'the', 'net..."
12099,303645,HardwareMeasurements of Storage Access Conflic...,"['Uwe Hercksen', 'Rainer Klar', 'Wolfgang Klei...",1980,ISCA,418893,[],,0,3,0,Uwe Hercksen,"['Rainer Klar', 'Wolfgang Kleinöder']","['hardwaremeasurements', 'of', 'storage', 'acc...",[]


In [17]:
# Label every sampled paper with its citation category.
cite_counts = citation_sample["ISCITED_COUNT"].apply(categorize_citations)

In [18]:
# Number of sampled papers in each citation category.
cite_counts.value_counts()

ISCITED_COUNT
Seldom Cited        9417
Moderately Cited    2317
Frequently Cited     354
Highly Cited          13
Name: count, dtype: int64

In [1]:
# Share of the sample in each citation category, computed from `cite_counts`
# itself rather than from counts copied by hand out of another cell's output
# (hand-copied numbers go stale as soon as the sample changes).
sample_props = cite_counts.value_counts(normalize=True)
# .get(..., 0.0) keeps the cell working when a category is absent from the
# sample; float() keeps the printed values plain Python floats.
sel_cite_prop = float(sample_props.get('Seldom Cited', 0.0))
mod_cite_prop = float(sample_props.get('Moderately Cited', 0.0))
fre_cite_prop = float(sample_props.get('Frequently Cited', 0.0))
high_cite_prop = float(sample_props.get('Highly Cited', 0.0))
print(sel_cite_prop, mod_cite_prop, fre_cite_prop, high_cite_prop)

0.7782001487480373 0.19147177919180233 0.029253780679282703 0.0010742913808776135


In [35]:
# Label every paper in the full table with its citation category, for
# comparison against the sample.
cite_counts_full = citation["ISCITED_COUNT"].apply(categorize_citations)

In [36]:
# Number of papers in each citation category across the full table.
cite_counts_full.value_counts()

ISCITED_COUNT
Seldom Cited        840792
Moderately Cited    208274
Frequently Cited     33213
Highly Cited          1362
Name: count, dtype: int64

In [40]:
# Share of the full table in each citation category, computed from
# `cite_counts_full` itself rather than from counts copied by hand out of
# another cell's output (hand-copied numbers go stale when the data changes).
# NOTE: this reuses the *_cite_prop names, so the sample proportions computed
# earlier are overwritten from here on.
full_props = cite_counts_full.value_counts(normalize=True)
# .get(..., 0.0) keeps the cell working when a category is absent;
# float() keeps the printed values plain Python floats.
sel_cite_prop = float(full_props.get('Seldom Cited', 0.0))
mod_cite_prop = float(full_props.get('Moderately Cited', 0.0))
fre_cite_prop = float(full_props.get('Frequently Cited', 0.0))
high_cite_prop = float(full_props.get('Highly Cited', 0.0))
print(sel_cite_prop, mod_cite_prop, fre_cite_prop, high_cite_prop)

0.7758953380316913 0.19219833874871844 0.03064944940252353 0.0012568738170667224
