Load data from dataset

In [None]:
import copy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
from pandas.io.json import json_normalize
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings('ignore')

# Load the training papers from train.json.
orign_train_data = pd.read_json('train.json')

# Work on a copy so the frame that was read from disk keeps its 'authors' column.
train_data = copy.deepcopy(orign_train_data)

# One collection of author ids per paper.
train_data_authors = train_data['authors']

# Partition every paper's author ids: ids below 100 are the prolific authors
# (the labels to learn), every other id is kept as a coauthor (model input).
prolific_authors = [
    [name for name in paper_authors if name < 100]
    for paper_authors in train_data_authors
]
coauthors = [
    [name for name in paper_authors if not name < 100]
    for paper_authors in train_data_authors
]

# Attach both partitions as new columns ...
train_data['coauthors'] = coauthors
train_data['prolific_authors'] = prolific_authors

# ... and drop the combined author column they were derived from.
train_data = train_data.drop(columns=['authors'])

# Label column: the prolific authors of each paper.
p_a = train_data['prolific_authors']

# Load the test papers from test.json.
test_data = pd.read_json('test.json')

#Pack the prediction result into a csv file
def pack_result(result, file_name):
    """Write predictions to ``file_name`` as a two-column CSV.

    Parameters
    ----------
    result : anything accepted by ``pd.DataFrame``
        Must provide an ``'ID'`` and a ``'Predict'`` column.
    file_name : str or path-like
        Destination of the CSV file.

    Raises
    ------
    KeyError
        If ``'ID'`` or ``'Predict'`` is missing from ``result``.

    The file contains exactly the columns ``ID`` and ``Predict`` (in that
    order), both cast to int32, and no index column.
    """
    result = pd.DataFrame(result)
    # Pick the two columns by label. Assigning to ``result.columns`` would
    # only relabel the columns positionally, which swaps the data when the
    # input order is Predict, ID and fails when extra columns are present.
    result = result[['ID', 'Predict']].astype({'ID': 'int32', 'Predict': 'int32'})
    result.to_csv(file_name, index=False)





Split the data into training and testing sets, and give every paper that has no prolific author the label -1

In [None]:

# Hold out 20% of the papers; the features are every column except the
# prolific-author labels.
X_train, X_test, y_train, y_test = train_test_split(
    train_data.drop(columns=['prolific_authors']),
    p_a,
    test_size=0.2,
    random_state=42,
)

# A paper without any prolific author gets the sentinel label -1.
# The label lists are extended in place.
for label_lists in (y_train, y_test):
    for labels in label_lists:
        if not labels:
            labels.append(-1)


In [None]:
#split the training set again into a smaller training set and a validation set
# The features passed here must be X_train (the rows that belong to y_train).
# Passing the full train_data gives train_test_split inputs of different
# lengths, which it rejects.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

Helper functions used to extract essential information from the dataset

In [225]:
# Per-paper author collections taken from the training frame as it was loaded
# (it still has its 'authors' column). The lookup helpers defined in this cell
# read this name as a module-level global.
all_authors = orign_train_data['authors']

#count how often every other author appears on a paper together with author x
def find_author_x_work_with(author):
    """Return ``{other_author: shared_paper_count}`` for ``author``.

    Walks over every paper position reported by ``find_author_x_paper`` and
    tallies each listed author other than ``author`` itself.
    """
    collaboration_counts = {}
    for paper_position in find_author_x_paper(author):
        for listed_author in all_authors.iloc[paper_position]:
            if listed_author == author:
                continue
            collaboration_counts[listed_author] = collaboration_counts.get(listed_author, 0) + 1
    return collaboration_counts

#find the papers that author x participated in
def find_author_x_paper(author):
    """Return the positions in ``all_authors`` of the papers listing ``author``."""
    return [
        position
        for position, paper_authors in enumerate(all_authors)
        if author in paper_authors
    ]


#the years in which author x and author y appear on the same paper
def year_author_x_work_with_author_y(author_x, author_y):
    """Return the distinct ``'year'`` values of the papers shared by both authors.

    The result is a list built from a set, so it holds no duplicates and its
    order is not sorted.
    """
    papers_of_x = find_author_x_paper(author_x)
    papers_of_y = find_author_x_paper(author_y)

    distinct_years = set()
    for paper_position in papers_of_x:
        if paper_position not in papers_of_y:
            continue
        distinct_years.add(orign_train_data.iloc[paper_position]['year'])
    return list(distinct_years)

#get the prolific authors that these authors all worked with
def get_prolific_authors(authors):
    """Return the prolific authors who have worked with every one of ``authors``.

    Parameters
    ----------
    authors : iterable of author ids

    Returns
    -------
    list
        Ids below 100 that occur among the collaborators of *each* given
        author, in ascending order. Empty when ``authors`` is empty.
    """
    collaborator_sets = [set(find_author_x_work_with(author)) for author in authors]
    # set.intersection needs at least one set; with no authors there is no
    # shared collaborator to report.
    if not collaborator_sets:
        return []

    shared_collaborators = set.intersection(*collaborator_sets)
    # Sorting makes the result independent of set iteration order.
    return sorted(author for author in shared_collaborators if author < 100)

#group the authors into consecutive chunks of every size below the full length
def get_all_sub_set(authors):
    """Return consecutive, non-overlapping chunks of ``authors``.

    For every chunk size from 1 up to ``len(authors) - 1`` the list is cut
    from the front into chunks of that size; a shorter leftover at the end is
    dropped. Chunks are listed by increasing size.

    NOTE(review): despite the name this is not the power set, and the complete
    author list is never returned (a single author yields ``[]``) - confirm
    that this is what the feature code is meant to use.
    """
    author_count = len(authors)
    chunks = []
    for chunk_size in range(1, author_count):
        # Only positions covered by complete chunks are visited.
        covered = (author_count // chunk_size) * chunk_size
        for start in range(0, covered, chunk_size):
            chunks.append([authors[position] for position in range(start, start + chunk_size)])
    return chunks

#Author analysis
def author_analysis(author):
    """Print a collaboration report for ``author``.

    The report shows the number of papers and collaborators, the most frequent
    collaborator, up to five top collaborators with their share of the
    author's papers, and every prolific collaborator (id below 100) together
    with the years of the shared papers.

    Authors with fewer than five collaborators are reported with as many
    entries as exist; an author without collaborators gets a short notice
    instead of the detailed sections.
    """
    author_x_paper = find_author_x_paper(author)
    # (collaborator, shared paper count) pairs, most frequent first.
    ranked = sorted(find_author_x_work_with(author).items(), key=lambda item: item[1], reverse=True)
    paper_count = len(author_x_paper)

    print('Author', author, 'has', paper_count, 'papers')
    print('Author', author, 'has worked with', len(ranked), 'authors')
    if not ranked:
        # Nothing to rank; every section below would index into an empty list.
        print('Author', author, 'has no recorded collaborators')
        return

    print('Author', author, 'has worked with author - ', ranked[0][0], 'the most times\n')
    print('the paper that author', author, 'has participated', author_x_paper)

    print('--top 5 author that author(include non-prolific authors)', author, 'has worked with the most times--')

    # Slicing copes with authors that have fewer than five collaborators.
    for rank, (partner, times) in enumerate(ranked[:5]):
        print('the ratio of this author work with', partner, 'is', times/paper_count, 'the number of collaboration is', times)
        if rank == 0:
            # The collaboration years are only listed for the top collaborator.
            print('the year that author', author, 'work with author', partner, 'is', year_author_x_work_with_author_y(author, partner),'\n')

    print('Prolific authors that worked with and the time they worked togeth', author, '\n')
    for partner, times in ranked:
        if partner < 100:
            print('author id:', partner, '  times:', times)
            # The printed value is the list of years, so the label says "year".
            print('the year that author', author, 'work with author', partner, 'is', year_author_x_work_with_author_y(author, partner),'\n')
            


Feature engineering process for the logistic regression model training

In [231]:
#if the author never worked with prolific authors or the recent collaboration is less than 5 years, then predict -1

from sklearn.linear_model import LogisticRegression


# Inspect one held-out paper: list its coauthor chunks and, for each chunk,
# the prolific authors who have worked with every member of it.
authors = X_test.iloc[1]['coauthors']
print(authors)
sample_sub_sets = get_all_sub_set(authors)
print('get sub set of authors', sample_sub_sets)
for sub_set in sample_sub_sets:
    print('sub set', sub_set)
    print('sub_set length',len(sub_set),get_prolific_authors(sub_set))
    print()

def rate_feature_engineering(train,test):
    """Compute, per paper, the best coauthor coverage of each candidate prolific author.

    Parameters
    ----------
    train : pandas.DataFrame
        Needs a ``'coauthors'`` column holding one collection of author ids
        per row.
    test : any
        Not used; kept so existing calls keep working.

    Returns
    -------
    list of dict
        One dict per row of ``train`` (same order) mapping a prolific author
        id to the largest ``len(sub_set) / len(coauthors)`` among the chunks
        from ``get_all_sub_set`` whose members have all worked with that
        author. Rows without coauthors get an empty dict.
    """
    rates_per_paper = []
    for i in range(len(train)):
        authors = train.iloc[i]['coauthors']
        possible_prolific_authors = {}
        #the paper that have only one author and it is also a prolific author will have no coauthors
        if len(authors) != 0:
            for sub_set in get_all_sub_set(authors):
                p_rate = len(sub_set)/len(authors)
                for prolific_author in get_prolific_authors(sub_set):
                    # p_rate is always positive, so 0 works as "not seen yet".
                    if possible_prolific_authors.get(prolific_author, 0) < p_rate:
                        possible_prolific_authors[prolific_author] = p_rate
        # The mapping is a dict, so it is collected per row rather than
        # "appended to", which dicts do not support.
        rates_per_paper.append(possible_prolific_authors)
    return rates_per_paper

def feature_engineering(train,test):
    """Build a feature table with one row per (paper, candidate prolific author).

    Parameters
    ----------
    train : pandas.DataFrame
        Needs a ``'coauthors'`` column (collection of author ids per row) and
        a ``'venue'`` column.
    test : pandas.Series
        Assumed to hold, position by position, the prolific-author labels of
        the rows in ``train`` (as in ``feature_engineering(X_train, y_train)``)
        - TODO confirm.

    Returns
    -------
    pandas.DataFrame
        Columns ``rate``, ``same_venue``, ``total_col``, ``coauthors_count``
        and ``is_author``. Papers without coauthors, or without any candidate
        prolific author, contribute no rows.

    NOTE(review): the feature definitions follow the inline comments below;
    ``same_venue`` assumes the training frame has a ``'venue'`` column - verify.
    """
    new_featurs = {}
    #largest share of the paper's coauthors (over the chunks from get_all_sub_set) that have all worked with the prolific author
    new_featurs['rate'] = []
    #share of the prolific author's papers that were published at the given venue
    new_featurs['same_venue'] = []
    #the count of total collaboration of prolific author with all the coauthors
    new_featurs['total_col'] = []
    #how many coauthors in the given data
    new_featurs['coauthors_count'] = []
    #1 when the prolific author is among the labels of the paper, otherwise 0
    new_featurs['is_author'] = []

    # Each per-author lookup walks over every paper, so results are reused
    # across rows.
    collaboration_cache = {}
    paper_cache = {}

    for i in range(len(train)):
        paper_info = train.iloc[i]
        authors = paper_info['coauthors']
        #the paper that have only one author and it is also a prolific author will have no coauthors
        if len(authors) == 0:
            continue

        # Best coverage per candidate prolific author.
        possible_prolific_authors = {}
        for sub_set in get_all_sub_set(authors):
            p_rate = len(sub_set)/len(authors)
            for prolific_author in get_prolific_authors(sub_set):
                # p_rate is always positive, so 0 works as "not seen yet".
                if possible_prolific_authors.get(prolific_author, 0) < p_rate:
                    possible_prolific_authors[prolific_author] = p_rate

        for prolific_author, p_rate in possible_prolific_authors.items():
            if prolific_author not in collaboration_cache:
                collaboration_cache[prolific_author] = find_author_x_work_with(prolific_author)
                paper_cache[prolific_author] = find_author_x_paper(prolific_author)
            collaborations = collaboration_cache[prolific_author]
            author_papers = paper_cache[prolific_author]

            venue_hits = sum(
                1 for paper in author_papers
                if orign_train_data.iloc[paper]['venue'] == paper_info['venue']
            )

            new_featurs['rate'].append(p_rate)
            # Guard the division even though a candidate is expected to have papers.
            new_featurs['same_venue'].append(venue_hits/len(author_papers) if author_papers else 0.0)
            new_featurs['total_col'].append(sum(collaborations.get(name, 0) for name in authors))
            new_featurs['coauthors_count'].append(len(authors))
            new_featurs['is_author'].append(1 if prolific_author in test.iloc[i] else 0)

    return pd.DataFrame(new_featurs)

#create a logistic regression model
def logistic_regression_model(X_train, y_train, penalty='l2', C=1.0, solver='liblinear', max_iter=100, l1_ratio=None):
    """Fit and return a ``LogisticRegression`` classifier.

    Parameters
    ----------
    X_train, y_train
        Training features and labels, handed to ``fit`` unchanged.
    penalty, C, solver, max_iter, l1_ratio
        Forwarded to the ``LogisticRegression`` constructor under the same
        names.

    Returns
    -------
    The fitted estimator.
    """
    # Forward by keyword: passed positionally, these values would not land on
    # the parameters they are named after.
    logistic_regression = LogisticRegression(
        penalty=penalty,
        C=C,
        solver=solver,
        max_iter=max_iter,
        l1_ratio=l1_ratio,
    )
    logistic_regression.fit(X_train, y_train)
    return logistic_regression

# Decide whether the author is an author of the paper (placeholder, see below)
def is_prolific_author(author,paper_info):
    """Placeholder decision function that currently always answers 1.

    ``paper_info`` must provide the keys ``'coauthors'`` and ``'venue'``; both
    are read but not used yet, and ``author`` is not inspected at all.
    """
    _coauthors, _venue = paper_info['coauthors'], paper_info['venue']
    return 1


[14034, 7787, 17767, 12861, 9924, 7810]
get sub set of authors [[14034], [7787], [17767], [12861], [9924], [7810], [14034, 7787], [17767, 12861], [9924, 7810], [14034, 7787, 17767], [12861, 9924, 7810], [14034, 7787, 17767, 12861], [14034, 7787, 17767, 12861, 9924]]
sub set [14034]
sub_set length 1 [41]

sub set [7787]
sub_set length 1 []

sub set [17767]
sub_set length 1 []

sub set [12861]
sub_set length 1 [44]

sub set [9924]
sub_set length 1 []

sub set [7810]
sub_set length 1 [7, 19, 41, 44]

sub set [14034, 7787]
sub_set length 2 []

sub set [17767, 12861]
sub_set length 2 []

sub set [9924, 7810]
sub_set length 2 []

sub set [14034, 7787, 17767]
sub_set length 3 []

sub set [12861, 9924, 7810]
sub_set length 3 []

sub set [14034, 7787, 17767, 12861]
sub_set length 4 []

sub set [14034, 7787, 17767, 12861, 9924]
sub_set length 5 []



In [None]:
# Build the feature table once and take both the model inputs and the labels
# from it, so every feature row stays paired with its own label. Labels taken
# from the held-out split would not correspond to the training rows.
train_features = feature_engineering(X_train, y_train)
lr_training = train_features.drop(['is_author'], axis=1)
# The name is kept for compatibility; it holds the training labels.
lr_testing = train_features['is_author']

#create the logistic regression model
LR = logistic_regression_model(lr_training, lr_testing)

In [233]:
# Show the first test paper and the prolific authors shared by all of its coauthors.
first_test_paper = test_data.iloc[0]
print(first_test_paper)
print('prolific authors', get_prolific_authors(first_test_paper['coauthors']))

identifier                                                    0
coauthors                            [16336, 1762, 4357, 12564]
year                                                         19
abstract      [37, 1662, 3207, 10, 33, 2037, 1738, 1642, 155...
venue                                                       223
title         [3207, 24, 1798, 1738, 37, 2375, 1568, 11, 53,...
Name: 0, dtype: object
prolific authors [92]
