In [1]:
import pandas as pd
import collections
import os
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Tokens removed before matching: NLTK's English stop words plus punctuation
# and generic institution words listed here.
punc = {',', '-', '/', '.', '  ', 'university', 'research', 'center', 'institute'}
stop_words = set(stopwords.words('english')) | punc


# Creating a fuzzy string-matching function

In [3]:
from fuzzywuzzy import fuzz

# Sample pair of differently formatted names for one institution.
key, value = (
    'University of Illinois Urbana - Champaign',
    'University of Illinois at Urbana, Champaign',
)

def clean(name):
    """Lower-case ``name`` and drop every token that is in ``stop_words``.

    The lower-cased text is split with ``word_tokenize``; the tokens that
    survive the filter are joined back together with single spaces.
    """
    tokens = word_tokenize(name.lower())
    kept = [tok for tok in tokens if tok not in stop_words]
    return " ".join(kept)


def fuzzy(key, value):
    """Return the ``fuzz.token_set_ratio`` score of two names.

    Both arguments are passed through ``clean`` before they are compared.
    """
    return fuzz.token_set_ratio(clean(key), clean(value))

# Quick check of the matcher on an abbreviated organisation name.
fuzzy(key='University of British Columbia', value='U British Columbia')



100

# Reading all the data

In [4]:
# Load, for each conference, the cleaned TPC file and the cleaned authors file.
# Each pair is read in the order (TPC, authors).

ipsn_tpc, ipsn_auth = (
    pd.read_csv('data/Clean_TPC/ipsn_clean.csv'),
    pd.read_csv('data/Clean_Authors/ipsn_authors.csv'),
)

mobicom_tpc, mobicom_auth = (
    pd.read_csv('data/Clean_TPC/mobicom_clean.csv'),
    pd.read_csv('data/Clean_Authors/mobicom_authors.csv'),
)

mobihoc_tpc, mobihoc_auth = (
    pd.read_csv('data/Clean_TPC/mobihoc_clean.csv'),
    pd.read_csv('data/Clean_Authors/mobihoc_authors.csv'),
)

sensys_tpc, sensys_auth = (
    pd.read_csv('data/Clean_TPC/sensys_clean.csv'),
    pd.read_csv('data/Clean_Authors/sensys_authors.csv'),
)

sigcomm_tpc, sigcomm_auth = (
    pd.read_csv('data/Clean_TPC/sigcomm_clean.csv'),
    pd.read_csv('data/Clean_Authors/sigcomm_authors.csv'),
)

In [42]:
def tpc_auth_pub(author_df, tpc_df):
    """Build a per-organisation summary for one conference.

    For every organisation present in both inputs, report how many rows it has
    in ``tpc_df``, how many rows it has in ``author_df``, and how many distinct
    paper titles it is attached to in ``author_df``.

    Parameters
    ----------
    author_df : pandas.DataFrame
        Must contain the columns 'University/Organization' and 'Paper_title'.
    tpc_df : pandas.DataFrame
        Must contain the column 'University/Organization'.

    Returns
    -------
    pandas.DataFrame
        Columns 'University/Organization', 'No_of_tpc', 'No_of_authors' and
        'No_of_publications'. Both merges are inner joins, so an organisation
        missing from either input is dropped. Rows keep the order in which
        organisations first appear in ``tpc_df``.

    Raises
    ------
    KeyError
        If a required column is missing from either frame.
    """
    org_col = 'University/Organization'

    def count_frame(organisations, count_col):
        # Counter keeps first-seen order, so the resulting frame is deterministic.
        counts = collections.Counter(organisations)
        return pd.DataFrame(list(counts.items()), columns=[org_col, count_col])

    tpc_count_df = count_frame(tpc_df[org_col], 'No_of_tpc')
    auth_count_df = count_frame(author_df[org_col], 'No_of_authors')

    # Several rows can share the same (organisation, paper) pair; collapse them
    # so each paper counts once per organisation.
    # The pairs are read from the author_df argument rather than from a notebook
    # global, so each conference is summarised from its own data. Zipping the
    # columns also avoids assuming a default 0..n-1 index.
    paper_uni = set(zip(author_df[org_col], author_df['Paper_title']))
    pub_count = count_frame((uni for uni, _ in paper_uni), 'No_of_publications')

    tpc_auth = pd.merge(left=tpc_count_df, right=auth_count_df, on=org_col)
    final_df = pd.merge(left=tpc_auth, right=pub_count, on=org_col)

    return final_df

In [46]:
# Summarise IPSN per organisation and write the table under data/Bubble.
ipsn_bubble = tpc_auth_pub(author_df=ipsn_auth, tpc_df=ipsn_tpc)
ipsn_bubble.to_csv('data/Bubble/ipsn_bubble.csv', header=True, index=None)

In [47]:
# Summarise SenSys per organisation and write the table under data/Bubble.
sensys_bubble = tpc_auth_pub(author_df=sensys_auth, tpc_df=sensys_tpc)
sensys_bubble.to_csv('data/Bubble/sensys_bubble.csv', header=True, index=None)

In [48]:
# Summarise SIGCOMM per organisation and write the table under data/Bubble.
sigcomm_bubble = tpc_auth_pub(author_df=sigcomm_auth, tpc_df=sigcomm_tpc)
sigcomm_bubble.to_csv('data/Bubble/sigcomm_bubble.csv', header=True, index=None)

In [49]:
# Summarise MobiCom per organisation and write the table under data/Bubble.
mobicom_bubble = tpc_auth_pub(author_df=mobicom_auth, tpc_df=mobicom_tpc)
mobicom_bubble.to_csv('data/Bubble/mobicom_bubble.csv', header=True, index=None)

In [50]:
# Summarise MobiHoc per organisation and write the table under data/Bubble.
mobihoc_bubble = tpc_auth_pub(author_df=mobihoc_auth, tpc_df=mobihoc_tpc)
mobihoc_bubble.to_csv('data/Bubble/mobihoc_bubble.csv', header=True, index=None)