Load data from dataset and preprocess it

In [1]:
import copy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
from pandas.io.json import json_normalize
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings('ignore')

# Load the raw training set
orign_train_data = pd.read_json('train.json')

# Work on a copy so the raw frame stays available for later lookups
train_data = copy.deepcopy(orign_train_data)

# One list of author ids per paper
train_data_authors = train_data['authors']

# Partition every paper's author list: ids below 100 are the prolific authors
# (the prediction target); all remaining ids are kept as coauthors.
prolific_authors = [
    [author_id for author_id in paper_authors if author_id < 100]
    for paper_authors in train_data_authors
]
coauthors = [
    [author_id for author_id in paper_authors if not (author_id < 100)]
    for paper_authors in train_data_authors
]

# Attach both lists as new columns and drop the combined 'authors' column
train_data['coauthors'] = coauthors
train_data['prolific_authors'] = prolific_authors
train_data = train_data.drop(columns=['authors'])

# Target column: the prolific authors of every paper
p_a = train_data['prolific_authors']

# Load the test set
test_data = pd.read_json('test.json')

#Pack the prediction result into a csv file
def pack_result(result, file_name):
    """Write prediction results to ``file_name`` as a two-column CSV.

    Parameters
    ----------
    result : dict or DataFrame-like
        Anything ``pd.DataFrame`` accepts that yields an 'ID' and a 'Predict'
        column. Extra columns are ignored.
    file_name : str
        Path of the CSV file to write (without the index column).

    Raises
    ------
    KeyError
        If 'ID' or 'Predict' is missing from ``result``.
    """
    result = pd.DataFrame(result)
    # Select the columns by name instead of relabelling them by position:
    # assigning ``result.columns = ['ID', 'Predict']`` would silently swap the
    # two columns whenever the input happens to be ordered Predict, ID.
    result = result[['ID', 'Predict']].astype({'ID': 'int32', 'Predict': 'int32'})
    result.to_csv(file_name, index=False)





Split the data into training and validation sets, and give papers that have no prolific author the label -1

In [2]:

# Hold out 20% of the papers for validation; the features are every column
# except the target column
X_train, X_test, y_train, y_test = train_test_split(
    train_data.drop(['prolific_authors'], axis=1),
    p_a,
    test_size=0.2,
    random_state=42,
)

# Rows of the untouched original frame (still holding the full 'authors' lists)
# that belong to the training split, renumbered from 0
X_train_orgin = orign_train_data.loc[X_train.index].reset_index(drop=True)

# Renumber the other splits from 0 as well (X_train itself keeps its index)
y_train = y_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

# Papers without any prolific author get the placeholder label -1.
# The label lists are extended in place.
for labels in (y_train, y_test):
    for author_list in labels:
        if len(author_list) == 0:
            author_list.append(-1)



Functions used to look up information in the dataset

In [42]:
# Per-paper author lists of the training split (index renumbered from 0)
all_authors = X_train_orgin['authors']

def find_author_x_work_with(author):
    """Count how often every other author shares a paper with ``author``.

    Looks at the papers returned by ``find_author_x_paper(author)`` and returns
    a dict mapping each other author id to the number of times it occurs in the
    author lists of those papers. ``author`` itself is never a key.
    """
    collaboration_counts = {}
    for paper_id in find_author_x_paper(author):
        for other in all_authors.loc[paper_id]:
            if other == author:
                continue
            collaboration_counts[other] = collaboration_counts.get(other, 0) + 1
    return collaboration_counts

#find which paper that author x participated
def find_author_x_paper(author, authors_by_paper=None):
    """Return the index labels of all papers whose author list contains ``author``.

    This replaces two consecutive definitions of the same function (the second
    one silently shadowed the first) with a single one.

    Parameters
    ----------
    author : int
        Author id to look for.
    authors_by_paper : pandas.Series, optional
        Series of author lists, one entry per paper. Defaults to the 'authors'
        column of ``X_train_orgin``, read at call time.

    Returns
    -------
    list
        Index labels of the matching papers, in index order (empty if none).
    """
    if authors_by_paper is None:
        authors_by_paper = X_train_orgin['authors']
    # A single pass over (label, author list) pairs avoids one .loc lookup per paper
    return [
        paper_id
        for paper_id, paper_authors in authors_by_paper.items()
        if author in paper_authors
    ]

def find_author_venue(author):
    """Count, per venue, the training papers that ``author`` appears on.

    Returns a dict mapping each venue to the number of papers found by
    ``find_author_x_paper(author)`` that were published there.
    """
    venue_counts = {}
    for paper_id in find_author_x_paper(author):
        paper_venue = X_train_orgin.loc[paper_id]['venue']
        venue_counts[paper_venue] = venue_counts.get(paper_venue, 0) + 1
    return venue_counts


def is_author_x_work_with_author_y(author_x, author_y):
    """Return True when the two authors share at least one training paper."""
    papers_of_x = find_author_x_paper(author_x)
    papers_of_y = find_author_x_paper(author_y)
    return any(paper_id in papers_of_y for paper_id in papers_of_x)
    
def get_authors_prolific_atleast_one(authors):
    """Return the prolific authors that at least one of ``authors`` has worked with.

    Collects everyone who shares a paper with any of the given authors (via
    ``find_author_x_work_with``), removes duplicates and keeps only the ids
    below 100, i.e. the prolific authors. Returns an empty list when nobody
    was found.
    """
    # Flatten the collaborators of every given author into one list
    collaborators = []
    for author in authors:
        collaborators.extend(find_author_x_work_with(author).keys())

    if not collaborators:
        return []

    # De-duplicate first, then keep only the prolific ids
    return [candidate for candidate in list(set(collaborators)) if candidate < 100]

# Quick check: print the coauthor list of the first test paper, then the
# training papers found for author 16336
print(test_data['coauthors'].iloc[0])
print(find_author_x_paper(16336))

def is_author_x_published(author):
    """Return True when ``author`` appears on at least one training paper.

    Scans the author lists directly and stops at the first hit, instead of
    collecting every matching paper through ``find_author_x_paper``.
    """
    return any(
        author in X_train_orgin['authors'].loc[paper_id]
        for paper_id in range(len(all_authors))
    )

# Coauthors of the first 100 test papers that never appear on a training paper
# (an author listed on several test papers is reported once per occurrence)
not_published = [
    author
    for coauthor in test_data['coauthors'][0:100]
    for author in coauthor
    if not is_author_x_published(author)
]

print(not_published)


[16336, 1762, 4357, 12564]
[]
[16336, 1762, 4357, 12564, 21189, 794, 2749, 19810, 15307, 16229, 15313, 15688, 1130, 3077, 19345, 10600, 7441, 2853, 2408, 805, 19908, 914, 16964, 935, 6301, 8842, 6165, 12639, 7539, 15378, 12891, 18730, 957, 16102, 20460, 4932, 5264, 20327, 17607, 12279, 9658, 3417, 7711, 16036, 4644, 15298, 11297, 10012, 16034, 3785, 3631, 14489, 16229, 16227, 1707, 11281, 12795, 10012, 2614, 13576, 15689, 19182, 17872, 20760, 1980, 19354, 16560, 1075, 3977, 283, 13756, 8142, 7021, 14987, 15645, 7098, 14284, 18463, 12772, 4971, 4695, 14883, 16814, 1323, 19990, 17160, 19330, 13081, 1846, 3830, 12279, 7540, 17022, 540, 994, 14773, 5974, 7669, 21148, 4004, 17849, 972, 14388, 8425, 4790, 9248, 1474, 9647, 6461, 890, 19267, 16137, 18927]


Feature engineering for training the logistic regression model

In [4]:
from sklearn.linear_model import LogisticRegression

def feature_engineering(train,test):
    """Build one feature row per (paper, candidate prolific author) pair.

    Parameters
    ----------
    train : pandas.DataFrame
        Papers to featurise; the 'coauthors' and 'venue' columns are read.
    test : pandas.Series
        Despite its name this holds the true labels: ``test.iloc[i]`` is the
        collection of prolific authors of ``train.iloc[i]``.

    Returns
    -------
    pandas.DataFrame
        Columns: work_ratio (share of the paper's coauthors the candidate has
        worked with), same_venue (number of the candidate's training papers in
        the paper's venue), coauthors_count, prolific_count (number of
        candidates for the paper) and is_p_author (1 if the candidate is a true
        author of the paper, else 0). The frame is also written to
        'new_featurs.json'.

    Papers without coauthors produce no rows.
    """
    # NOTE(review): the lookup helpers read X_train_orgin. If `train` contains
    # those same papers, each paper contributes to its own co-authorship and
    # venue counts (possible label leakage) - confirm against the caller.

    # column name -> values, one entry per (paper, candidate) pair
    new_featurs = {
        'work_ratio': [],
        'same_venue': [],
        'coauthors_count': [],
        'prolific_count': [],
        'is_p_author': [],
    }

    for i in range(len(train)):
        paper = train.iloc[i]
        authors = paper['coauthors']

        # a paper written only by prolific authors has no coauthors to work from
        if len(authors) == 0:
            continue

        true_lable = test.iloc[i]

        # these two features are the same for every candidate of this paper
        candidates = get_authors_prolific_atleast_one(authors)
        prolific_count = len(candidates)
        coauthors_count = len(authors)

        # candidate -> {coauthor of this paper: number of shared papers}
        worked_authors_dic = {}
        for p_author in candidates:
            worked_authors_dic[p_author] = {
                partner: times
                for partner, times in find_author_x_work_with(p_author).items()
                if partner in authors
            }

        for wad, shared_coauthors in worked_authors_dic.items():
            work_ratio = len(shared_coauthors)/coauthors_count

            # number of papers the candidate published in this paper's venue
            ven = find_author_venue(wad)
            paper_venue = paper['venue']
            same_count = sum(count for v, count in ven.items() if v == paper_venue)

            new_featurs['work_ratio'].append(work_ratio)
            new_featurs['same_venue'].append(same_count)
            new_featurs['coauthors_count'].append(coauthors_count)
            new_featurs['prolific_count'].append(prolific_count)
            new_featurs['is_p_author'].append(1 if wad in true_lable else 0)

        # progress marker
        print('== ',i,'======================='*5)

    # return the new features as a pandas dataframe, persisting a json copy
    new_featurs = pd.DataFrame(new_featurs)
    new_featurs.to_json('new_featurs.json')

    return new_featurs

#create a logistic regression model
def logistic_regression_model(X_train, y_train, penalty='l2', C=1.0, solver='liblinear', max_iter=100, l1_ratio=None):
    """Fit a scikit-learn ``LogisticRegression`` and return the fitted model.

    Parameters
    ----------
    X_train, y_train
        Training features and labels, passed to ``fit`` unchanged.
    penalty, C, solver, max_iter
        Forwarded to ``LogisticRegression``. Previously these arguments were
        accepted but ignored, so every model was built with the library
        defaults regardless of what the caller asked for.
    l1_ratio : float, optional
        Forwarded only when it is not None.
    """
    params = {'penalty': penalty, 'C': C, 'solver': solver, 'max_iter': max_iter}
    if l1_ratio is not None:
        params['l1_ratio'] = l1_ratio
    logistic_regression = LogisticRegression(**params)
    logistic_regression.fit(X_train, y_train)
    return logistic_regression


In [5]:
#remove all the , and [] in the given csv file
def csv_generate(csv_file,file_name):
    """Strip list punctuation from the 'Predict' column of a prediction CSV.

    Reads ``csv_file``, removes every '[', ']' and ',' character from the
    'Predict' column (so "[1, 2]" becomes "1 2") and writes the result to
    ``file_name`` without the index column.

    Parameters
    ----------
    csv_file : str
        Path of the CSV to read; its 'Predict' column must hold strings.
    file_name : str
        Path of the cleaned CSV to write.
    """
    prediction = pd.read_csv(csv_file)

    # regex=False: the characters are literals. '[' on its own is not a valid
    # regular expression, and the default of `regex` differs between pandas
    # versions, so spell it out.
    cleaned = prediction['Predict']
    for char in '[],':
        cleaned = cleaned.str.replace(char, '', regex=False)
    prediction['Predict'] = cleaned

    #save the prediction into a csv file
    prediction.to_csv(file_name, index=False)


In [13]:
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score

# Pre-computed feature tables, one JSON file per chunk
# (presumably saved from earlier feature_engineering runs - confirm)
new_features_1 = pd.read_json('features_data_0_5000.json')
new_features_2 = pd.read_json('features_data_5000_10000.json')
new_features_3 = pd.read_json('new_featurs_10000_20000.json')

new_features_origin = pd.concat([new_features_1, new_features_2, new_features_3], ignore_index=True)

#split the new_features where the is_p_author is 1 and 0
new_features_true = new_features_origin[new_features_origin['is_p_author'] == 1]
new_features_false = new_features_origin[new_features_origin['is_p_author'] == 0]

# Re-balanced training sets: all positive rows plus a seeded sample of negatives
#add same number of false data to the true data (1:1)
training_data_1_1 = pd.concat([new_features_true, new_features_false.sample(n=len(new_features_true), random_state=1)], ignore_index=True)

#add twice as many false data to the true data (1:2)
training_data_1_2 = pd.concat([new_features_true, new_features_false.sample(n=len(new_features_true)*2, random_state=1)], ignore_index=True)

#add half as many false data to the true data (1:0.5)
training_data_1_05 = pd.concat([new_features_true, new_features_false.sample(n=int(len(new_features_true)/2), random_state=1)], ignore_index=True)

# Shuffle with a fixed seed so the fitted models are reproducible across runs
# (the unseeded shuffle produced a different row order on every execution)
training_data_1_1 = shuffle(training_data_1_1, random_state=1)
training_data_1_2 = shuffle(training_data_1_2, random_state=1)
training_data_1_05 = shuffle(training_data_1_05, random_state=1)

# Model inputs; 'is_p_author' is the target
feature_columns = ['work_ratio', 'same_venue', 'coauthors_count', 'prolific_count']

#training data: the original (unbalanced) feature table
X_train_lr = new_features_origin[feature_columns]
y_train_lr = new_features_origin['is_p_author']

#training data: 1:1
X_train_lr_11 = training_data_1_1[feature_columns]
y_train_lr_11 = training_data_1_1['is_p_author']

#training data: 1:2
X_train_lr_12 = training_data_1_2[feature_columns]
y_train_lr_12 = training_data_1_2['is_p_author']

#training data: 1:0.5
X_train_lr_105 = training_data_1_05[feature_columns]
y_train_lr_105 = training_data_1_05['is_p_author']

# One logistic regression per training set
LR = logistic_regression_model(X_train_lr, y_train_lr)
LR11 = logistic_regression_model(X_train_lr_11, y_train_lr_11)
LR12 = logistic_regression_model(X_train_lr_12, y_train_lr_12)
LR105 = logistic_regression_model(X_train_lr_105, y_train_lr_105)

# Show the first test paper as a reminder of the available columns
print(test_data.iloc[0])

# Collected predictions: one ID and one list of predicted authors per paper
prediction = {'ID': [], 'Predict': []}

# Drop the validation papers that have no coauthors and keep y_test aligned
print('the length of the test data before removing the rows that have no coauthors', len(X_test))
has_coauthors = X_test['coauthors'].map(len) != 0
X_test = X_test[has_coauthors]
y_test = y_test[X_test.index]
print('the length of the test data after removing the rows that have no coauthors', len(X_test))

# Range of test papers to predict and the model used for it.
# NOTE(review): label_to_predict slices the validation labels (y_test) with
# bounds taken from test_data; the two are different datasets - confirm intent.
start, end = 0, len(test_data)
data_to_predict = test_data[start:end]
label_to_predict = y_test[start:end]
select_model = LR105

# Predict the prolific authors of every selected test paper
for i in range(len(data_to_predict)):
    paper = data_to_predict.iloc[i]
    authors = paper['coauthors']
    # Offset by `start` so the IDs stay correct when only a slice of test_data
    # (start > 0) is predicted
    prediction['ID'].append(start + i)

    #all possible prolific authors of the paper
    apa = get_authors_prolific_atleast_one(authors)
    print('='*10, i, '='*10)

    prediction_ls = []
    coauthors_count = len(authors)
    prolific_count = len(apa)
    paper_venue = paper['venue']

    for p_author in apa:
        # work_ratio: share of this paper's coauthors the candidate has worked with
        shared_coauthors = [a for a in find_author_x_work_with(p_author) if a in authors]
        ratio = len(shared_coauthors)/coauthors_count
        print(p_author,'ratio: ',ratio)

        # same_venue: number of the candidate's training papers in this paper's
        # venue. It has to be the paper count, as in feature_engineering; the
        # previous 0/1 flag did not match the feature the model was trained on.
        p_author_venue = find_author_venue(p_author)
        venue_count = sum(count for venue, count in p_author_venue.items() if venue == paper_venue)
        print(p_author,'venue_count: ',venue_count)

        # same feature order as the training columns:
        # work_ratio, same_venue, coauthors_count, prolific_count
        result = select_model.predict([[ratio, venue_count, coauthors_count, prolific_count]])
        if result[0] == 1:
            print('the author is the possible author of the paper', p_author)
            print('ratio', ratio)
            prediction_ls.append(p_author)

    #if no prolific author is a good candidate, then predict no prolific author
    prediction['Predict'].append(prediction_ls if prediction_ls else [-1])

print(prediction['Predict'])

# Convert the collected predictions into a dataframe
prediction = pd.DataFrame(prediction)

#save the prediction into a csv file
prediction.to_csv('test_prediction.csv', index=False)

identifier                                                    0
coauthors                            [16336, 1762, 4357, 12564]
year                                                         19
abstract      [37, 1662, 3207, 10, 33, 2037, 1738, 1642, 155...
venue                                                       223
title         [3207, 24, 1798, 1738, 37, 2375, 1568, 11, 53,...
Name: 0, dtype: object
the length of the test data before removing the rows that have no coauthors 5079
the length of the test data after removing the rows that have no coauthors 5079
2 ratio:  0.5
2 venue_count:  0
6 ratio:  0.5
6 venue_count:  0


KeyboardInterrupt: 