Load data from dataset

In [4]:
import copy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
from pandas.io.json import json_normalize
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import warnings
from colabcode import ColabCode

# Silence library warnings so the cell output stays readable.
warnings.filterwarnings('ignore')

# Raw training papers; this frame is kept as loaded for later lookups.
orign_train_data = pd.read_json('train.json')

# Working copy that receives the derived columns below.
train_data = copy.deepcopy(orign_train_data)

# One list of author ids per paper.
train_data_authors = train_data['authors']

# Split each paper's author list in two: ids below 100 are the prolific
# authors (the training targets), every other id is kept as a coauthor.
prolific_authors = []
coauthors = []
for author in train_data_authors:
    p_authors = []
    np_authors = []
    for name in author:
        (p_authors if name < 100 else np_authors).append(name)
    prolific_authors.append(p_authors)
    coauthors.append(np_authors)

# Attach both derived columns and drop the original combined 'authors' column.
train_data = train_data.assign(
    coauthors=coauthors,
    prolific_authors=prolific_authors,
).drop(columns=['authors'])

# Target column: the prolific author ids of every paper.
p_a = train_data['prolific_authors']

# Papers to predict on.
test_data = pd.read_json('test.json')

#Pack the prediction result into a csv file
def pack_result(result, file_name):
    """Write prediction results to ``file_name`` as a two-column CSV.

    Parameters
    ----------
    result : anything ``pd.DataFrame`` accepts
        Must produce columns named ``'ID'`` and ``'Predict'``; their order in
        the input does not matter and any other columns are left out.
    file_name : str or path-like
        Destination handed to ``DataFrame.to_csv``.

    Raises
    ------
    KeyError
        If ``'ID'`` or ``'Predict'`` is missing from ``result``.
    """
    frame = pd.DataFrame(result)
    # Pick the columns by name instead of relabelling them by position:
    # relabelling would silently swap the two columns whenever the input
    # lists 'Predict' before 'ID'. Selecting also yields a new frame, so the
    # int32 cast is never written back to the caller's data.
    submission = frame[['ID', 'Predict']].astype('int32')
    submission.to_csv(file_name, index=False)





Split the data into training and validation sets, and give papers that have no prolific author the label -1

In [13]:

#Hold out 20% of the papers for validation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    train_data.drop(columns=['prolific_authors']),
    p_a,
    test_size=0.2,
    random_state=42,
)

#show which rows ended up in the training set
print(X_train.index)

#the matching rows of the original (unprocessed) train data
X_train_orgin = orign_train_data.loc[X_train.index]

# A paper without any prolific author gets the placeholder label -1.
# The label lists are extended in place, exactly as stored in each series.
for split_labels in (y_train, y_test):
    for paper_labels in split_labels:
        if not paper_labels:
            paper_labels.append(-1)




Int64Index([15889,  1205, 11909,  7138, 24813, 11313, 19843, 18356, 24884,
             6921,
            ...
            16850,  6265, 22118, 11284, 11964, 21575,  5390,   860, 15795,
            23654],
           dtype='int64', length=20634)


Functions used to find essential information in the dataset

In [9]:
# Per-paper author id lists taken from the original training frame; the lookup
# helpers in this cell read it by position (``all_authors.iloc[...]``).
all_authors = orign_train_data['authors']

#count how often every other author shares a paper with author x
def find_author_x_work_with(author):
    """Return ``{other_author: shared_paper_count}`` for ``author``.

    Every paper position reported by ``find_author_x_paper`` is visited and
    each listed author other than ``author`` is tallied once per appearance.
    """
    collaboration_counts = {}
    for paper_pos in find_author_x_paper(author):
        for other in all_authors.iloc[paper_pos]:
            if other == author:
                continue
            collaboration_counts[other] = collaboration_counts.get(other, 0) + 1
    return collaboration_counts

#locate the papers that author x participated in
def find_author_x_paper(author):
    """Return the positions in ``all_authors`` of every paper listing ``author``.

    Positions are 0-based row positions (not index labels), in ascending order.
    """
    return [
        position
        for position, paper_authors in enumerate(all_authors)
        if author in paper_authors
    ]

#did author x and author y ever share a paper?
def is_author_x_work_with_author_y(author_x, author_y):
    """Return True when at least one paper lists both authors, else False."""
    papers_of_x = find_author_x_paper(author_x)
    papers_of_y = find_author_x_paper(author_y)
    return any(paper in papers_of_y for paper in papers_of_x)

#the distinct years in which author x and author y shared a paper
def year_author_x_work_with_author_y(author_x, author_y):
    """Return the de-duplicated ``year`` values of all papers shared by both authors.

    The result is a list built from a set, so its order is not meaningful.
    """
    papers_of_x = find_author_x_paper(author_x)
    papers_of_y = find_author_x_paper(author_y)
    shared_years = [
        orign_train_data.iloc[paper]['year']
        for paper in papers_of_x
        if paper in papers_of_y
    ]
    # going through a set drops repeated years
    return list(set(shared_years))

#get the prolific authors that these authors all worked with
def get_prolific_authors(authors):
    """Return the prolific authors that every one of ``authors`` has worked with.

    Parameters
    ----------
    authors : iterable of author ids

    Returns
    -------
    list
        Ids below 100 found among the collaborators of *all* given authors.
        The order follows set iteration and is not meaningful. An empty
        ``authors`` yields an empty list.
    """
    collaborator_maps = [find_author_x_work_with(author) for author in authors]
    # set.intersection() needs at least one operand; with no authors there is
    # nothing to intersect, so report "no common prolific authors" instead of
    # raising TypeError.
    if not collaborator_maps:
        return []

    #collaborators shared by every given author (the dict keys are author ids)
    shared_collaborators = set.intersection(*map(set, collaborator_maps))

    #keep only the prolific ones (ids below 100)
    return [candidate for candidate in shared_collaborators if candidate < 100]
#find the prolific authors that the authors worked with (at least one of given authors worked with)
def get_authors_prolific_atleast_one(authors):
    """Return the prolific authors that at least one of ``authors`` has worked with.

    Parameters
    ----------
    authors : iterable of author ids

    Returns
    -------
    list
        Ids below 100 found among the collaborators of *any* given author.
        The order follows set iteration and is not meaningful. An empty
        ``authors`` yields an empty list.
    """
    collaborator_maps = [find_author_x_work_with(author) for author in authors]
    # set.union() needs at least one operand; with no authors there are no
    # collaborators at all, so return an empty result instead of raising
    # TypeError.
    if not collaborator_maps:
        return []

    #every collaborator of any given author (the dict keys are author ids)
    any_collaborators = set.union(*map(set, collaborator_maps))

    #keep only the prolific ones (ids below 100)
    return [candidate for candidate in any_collaborators if candidate < 100]

#enumerate every non-empty subset of the given number list
def get_all_sub_set(nums):
    """Return all non-empty subsets of ``nums`` as a list of lists.

    Subsets come out in binary-counting order: bit ``i`` of the running
    counter decides whether ``nums[i]`` is part of the subset, so
    ``[1, 2, 3]`` gives ``[[1], [2], [1, 2], [3], [1, 3], [2, 3], [1, 2, 3]]``.
    The empty subset is left out.
    """
    items = list(nums)
    size = len(items)
    subsets = []
    # start at 1 to skip the empty set
    for mask in range(1, 1 << size):
        subsets.append([items[bit] for bit in range(size) if (mask >> bit) & 1])
    return subsets

#Author analysis
def author_analysis(author):
    """Print a collaboration summary for ``author``.

    The report lists the number of papers and collaborators, the most frequent
    collaborator, up to five top collaborators with their share of the
    author's papers, and every prolific collaborator (id below 100) together
    with the years of collaboration.

    Authors with fewer than five collaborators (or none at all) are reported
    with as many entries as exist instead of failing with IndexError.
    """
    author_x_paper = find_author_x_paper(author)
    # (collaborator, shared paper count) pairs, most frequent collaborator first
    ranked_collaborators = sorted(
        find_author_x_work_with(author).items(), key=lambda x: x[1], reverse=True
    )
    paper_count = len(author_x_paper)

    print('Author', author, 'has', paper_count, 'papers')
    print('Author', author, 'has worked with', len(ranked_collaborators), 'authors')
    if ranked_collaborators:
        print('Author', author, 'has worked with author - ', ranked_collaborators[0][0], 'the most times\n')
    print('the paper that author', author, 'has participated', author_x_paper)

    print('--top 5 author that author(include non-prolific authors)', author, 'has worked with the most times--')

    # The slice tolerates authors with fewer than five collaborators. Whenever
    # the loop body runs the author has at least one paper, so the division
    # cannot hit zero.
    for rank, (partner, times) in enumerate(ranked_collaborators[:5]):
        print('the ratio of this author work with', partner, 'is', times/paper_count, 'the number of collaboration is', times)
        if rank == 0:
            # the collaboration years are only reported for the top collaborator
            print('the year that author', author, 'work with author', partner, 'is', year_author_x_work_with_author_y(author, partner),'\n')

    print('Prolific authors that worked with and the time they worked togeth', author, '\n')
    for partner, times in ranked_collaborators:
        if partner < 100:
            print('author id:',partner,'  times:' ,times)
            # the value printed here is a list of years, so label it as such
            print('the year that author', author, 'work with author', partner, 'is', year_author_x_work_with_author_y(author, partner),'\n')
            


In [7]:
# Sanity check: a list of n items has 2**n - 1 non-empty subsets.
for sample in ([1,2,3,4], [1,2,3,4,5]):
    print(len(get_all_sub_set(sample)))

15
31


Feature engineering process for the logistic regression model training

In [8]:
from sklearn.linear_model import LogisticRegression

def feature_engineering(train,test):
    """Build per-candidate features for the logistic regression model (work in progress).

    Parameters
    ----------
    train : DataFrame with a ``'coauthors'`` column holding one list per paper
    test : currently unused

    Returns
    -------
    dict
        The ``possible_prolific_authors_rate`` mapping of the last paper that
        had coauthors, or an empty dict when no paper has any.

    NOTE(review): nothing in this function fills
    ``possible_prolific_authors_rate``, so as written the mapping is always
    empty, no feature rows are produced, and the assembled feature frame is
    never returned. The rate computation appears to be missing.
    """
    new_featurs = {}
    #how many authors that a prolific author has worked with in the same paper
    new_featurs['rate'] = []
    #how many paper that the prolific author has published on the given venue with the respect to the all papers that the prolific author has published
    new_featurs['same_venue'] = []
    #the count of total collaboration of prolific author with all the coauthors
    new_featurs['total_col'] = []
    #how many coauthors in the given data
    new_featurs['coauthors_count'] = []
    new_featurs['is_author'] = []

    # Bound before the loop so the final return is defined even when ``train``
    # is empty or no paper has coauthors (previously an UnboundLocalError).
    possible_prolific_authors_rate = {}

    for i in range(len(train)):
        authors = train.iloc[i]['coauthors']
        #the paper that have only one author and it is also a prolific author will have no coauthors
        if len(authors) != 0:
            possible_prolific_authors_rate = {}
            # NOTE(review): computed but not used yet, presumably the input of
            # the missing rate computation.
            all_prolific_authors = get_authors_prolific_atleast_one(authors)


            #make sure don't add empty rows to the new features
            if len(possible_prolific_authors_rate) != 0:
                for prolific_author in possible_prolific_authors_rate.keys():
                    new_featurs['rate'].append(possible_prolific_authors_rate[prolific_author])
                    new_featurs['same_venue'].append(0)
                    new_featurs['total_col'].append(0)
                    new_featurs['coauthors_count'].append(len(authors))
                    new_featurs['is_author'].append(1)

            print('possible_prolific_authors_rate', possible_prolific_authors_rate)
        print('the i entry has been featured', i)

    new_featurs = pd.DataFrame(new_featurs)
    return possible_prolific_authors_rate

#create a logistic regression model
def logistic_regression_model(X_train, y_train, penalty='l2', C=1.0, solver='liblinear', max_iter=100, l1_ratio=None):
    """Fit and return a ``LogisticRegression`` classifier.

    Parameters
    ----------
    X_train, y_train : training features and labels handed to ``fit``
    penalty, C, solver, max_iter, l1_ratio :
        Forwarded to the ``LogisticRegression`` parameters of the same name.

    Returns
    -------
    The fitted ``LogisticRegression`` instance.
    """
    # Pass every hyper-parameter by keyword: the estimator's parameters after
    # ``penalty`` are keyword-only, and in releases where positional use was
    # still accepted the positional order differed, so positional passing
    # bound the values to the wrong parameters.
    logistic_regression = LogisticRegression(
        penalty=penalty,
        C=C,
        solver=solver,
        max_iter=max_iter,
        l1_ratio=l1_ratio,
    )
    logistic_regression.fit(X_train, y_train)
    return logistic_regression

# Placeholder for the logistic-regression check of whether the author wrote the paper
def is_prolific_author(author,paper_info):
    """Stub: always returns 1.

    ``paper_info`` must provide the ``'coauthors'`` and ``'venue'`` entries;
    both are read (a missing one raises from the lookup) but not used yet.
    """
    for required_field in ('coauthors', 'venue'):
        _ = paper_info[required_field]
    return 1

#
    


possible_prolific_authors_rate {70: 0.5, 87: 0.5, 39: 0.5}
the i entry has been featured 0
possible_prolific_authors_rate {}
the i entry has been featured 1
possible_prolific_authors_rate {36: 0.5, 10: 0.5}
the i entry has been featured 2
possible_prolific_authors_rate {32: 0.3333333333333333, 59: 0.6666666666666666, 6: 0.3333333333333333, 17: 0.3333333333333333, 18: 0.3333333333333333, 36: 0.3333333333333333, 48: 0.3333333333333333, 49: 0.3333333333333333, 51: 0.3333333333333333, 58: 0.3333333333333333, 57: 0.3333333333333333, 62: 0.3333333333333333, 72: 0.3333333333333333, 74: 0.3333333333333333, 78: 0.3333333333333333, 81: 0.3333333333333333, 83: 0.3333333333333333, 84: 0.3333333333333333, 92: 0.3333333333333333, 99: 0.3333333333333333, 70: 0.3333333333333333}
the i entry has been featured 3
possible_prolific_authors_rate {6: 0.5, 8: 0.5, 14: 0.5, 29: 0.5, 32: 0.5, 37: 0.5, 51: 0.5, 57: 0.5, 59: 0.5, 60: 0.5, 62: 0.5, 69: 0.5, 73: 0.5, 87: 0.5}
the i entry has been featured 4
possib

KeyboardInterrupt: 