Load data from dataset and preprocess it

In [2]:
import copy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
from pandas.io.json import json_normalize
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings('ignore')

# Load the labelled training papers
orign_train_data = pd.read_json('train.json')

# Work on a deep copy so the raw frame keeps its original columns
train_data = copy.deepcopy(orign_train_data)

# Per-paper lists of author ids
train_data_authors = train_data['authors']

# Split every paper's author list in two: ids below 100 are the prolific
# authors (the prediction targets), every other id is kept as a coauthor
prolific_authors = [
    [name for name in author if name < 100]
    for author in train_data_authors
]
coauthors = [
    [name for name in author if not (name < 100)]
    for author in train_data_authors
]

# Attach both lists as new columns and drop the combined 'authors' column
train_data = (
    train_data
    .assign(coauthors=coauthors, prolific_authors=prolific_authors)
    .drop(columns='authors')
)

# Label column (list of prolific author ids per paper)
p_a = train_data['prolific_authors']

# Load the unlabelled test papers
test_data = pd.read_json('test.json')

def pack_result(result, file_name):
    """Write prediction results to a CSV file with columns ``ID`` and ``Predict``.

    Parameters
    ----------
    result : anything accepted by ``pd.DataFrame``
        Must provide an ``'ID'`` and a ``'Predict'`` column; any other
        columns are ignored.
    file_name : str or path-like
        Destination of the CSV file.

    Raises
    ------
    KeyError
        If ``'ID'`` or ``'Predict'`` is missing from ``result``.
    """
    result = pd.DataFrame(result)
    # Select the two columns by name instead of relabelling them by position:
    # relabelling silently swaps the values when the input happens to be
    # ordered ('Predict', 'ID').
    result = result[['ID', 'Predict']].astype({'ID': 'int32', 'Predict': 'int32'})
    result.to_csv(file_name, index=False)





Split the data into training and validation sets, and give papers that have no prolific author the label -1

In [110]:

# Hold out 20% of the papers for validation; the features are every column
# except the prolific-author labels
X_train, X_test, y_train, y_test = train_test_split(
    train_data.drop(columns='prolific_authors'),
    p_a,
    test_size=0.2,
    random_state=42,
)

# Take the same rows from the raw training data (which still has the full
# 'authors' lists) and renumber them 0..n-1
X_train_orgin = orign_train_data.loc[X_train.index].reset_index(drop=True)

# Renumber the training labels the same way
y_train = y_train.reset_index(drop=True)

# A paper without any prolific author gets the sentinel label -1.
# The label lists are extended in place.
for label_series in (y_train, y_test):
    for author_ids in label_series:
        if not author_ids:
            author_ids.append(-1)




Functions used to look up information in the dataset

In [111]:
# Author-id lists of the training papers; the lookup helpers below address
# this Series by integer label, so it must carry a 0..n-1 index
all_authors = X_train_orgin.loc[:, 'authors']

def find_author_x_work_with(author):
    """Count how often every other author shares a paper with ``author``.

    Returns a dict mapping each co-occurring author id to the number of
    papers (as reported by ``find_author_x_paper``) it shares with ``author``.
    """
    collaboration_counts = {}
    for paper_idx in find_author_x_paper(author):
        for other in all_authors.loc[paper_idx]:
            if other == author:
                continue
            collaboration_counts[other] = collaboration_counts.get(other, 0) + 1
    return collaboration_counts

def find_author_x_paper(author):
    """Return the index labels (0..n-1) of the papers that list ``author``.

    Scans every entry of the module-level ``all_authors`` Series.
    """
    return [
        paper_idx
        for paper_idx in range(len(all_authors))
        if author in all_authors.loc[paper_idx]
    ]

def find_author_venue(author):
    """Count the papers of ``author`` per venue.

    Returns a dict mapping each venue found on the author's papers in
    ``X_train_orgin`` to the number of those papers published there.
    """
    venue_counts = {}
    for paper_idx in find_author_x_paper(author):
        paper_venue = X_train_orgin.loc[paper_idx]['venue']
        venue_counts[paper_venue] = venue_counts.get(paper_venue, 0) + 1
    return venue_counts


def is_author_x_work_with_author_y(author_x, author_y):
    """Return True when the two authors appear together on at least one paper."""
    papers_of_x = find_author_x_paper(author_x)
    papers_of_y = find_author_x_paper(author_y)
    return any(paper_idx in papers_of_y for paper_idx in papers_of_x)
    
def get_authors_prolific_atleast_one(authors):
    """Return the prolific authors that at least one of ``authors`` worked with.

    Parameters
    ----------
    authors : iterable of author ids
        The authors whose collaborators are looked up.

    Returns
    -------
    list
        Ids below 100 (the prolific authors) taken from the union of all
        collaborators of the given authors. The list is empty when
        ``authors`` is empty or nobody collaborated with a prolific author.
        The order follows set iteration and carries no meaning.
    """
    collaborator_sets = [set(find_author_x_work_with(author)) for author in authors]

    # set.union() needs at least one set, so an empty input is answered here
    if not collaborator_sets:
        return []

    # Union, not intersection: one shared paper with any given author is enough
    all_collaborators = set.union(*collaborator_sets)

    # Author ids below 100 are the prolific authors
    return [author for author in all_collaborators if author < 100]

#Author analysis
# def author_analysis(author):
#     author_x_work_with = find_author_x_work_with(author)
#     author_x_paper = find_author_x_paper(author)
#     author_x_work_with = sorted(author_x_work_with.items(), key=lambda x: x[1], reverse=True)

#     print('Author', author, 'has', len(author_x_paper), 'papers')
#     print('Author', author, 'has worked with', len(author_x_work_with), 'authors')
#     print('Author', author, 'has worked with author - ', author_x_work_with[0][0], 'the most times\n')
#     print('the paper that author', author, 'has participated', author_x_paper)
    
#     print('--top 5 author that author(include non-prolific authors)', author, 'has worked with the most times--')

#     print('the ratio of this author work with', author_x_work_with[0][0], 'is', author_x_work_with[0][1]/len(author_x_paper), 'the number of collaboration is', author_x_work_with[0][1])
#     print('the year that author', author, 'work with author', author_x_work_with[0][0], 'is', year_author_x_work_with_author_y(author, author_x_work_with[0][0]),'\n')
#     print('the ratio of this author work with', author_x_work_with[1][0], 'is', author_x_work_with[1][1]/len(author_x_paper), 'the number of collaboration is', author_x_work_with[1][1])
#     print('the ratio of this author work with', author_x_work_with[2][0], 'is', author_x_work_with[2][1]/len(author_x_paper), 'the number of collaboration is', author_x_work_with[2][1])
#     print('the ratio of this author work with', author_x_work_with[3][0], 'is', author_x_work_with[3][1]/len(author_x_paper), 'the number of collaboration is', author_x_work_with[3][1])
#     print('the ratio of this author work with', author_x_work_with[4][0], 'is', author_x_work_with[4][1]/len(author_x_paper), 'the number of collaboration is', author_x_work_with[4][1])

#     print('Prolific authors that worked with and the time they worked togeth', author, '\n')
#     for i in range(len(author_x_work_with)):
#         if author_x_work_with[i][0] < 100:
#             print('author id:',author_x_work_with[i][0],'  times:' ,author_x_work_with[i][1])
#             print('the venue of author', author, 'work with author', author_x_work_with[i][0], 'is', year_author_x_work_with_author_y(author, author_x_work_with[i][0]),'\n')
            


Feature engineering process for the logistic regression model training

In [117]:
from sklearn.linear_model import LogisticRegression

def feature_engineering(train, test, output_path='new_featurs.json', verbose=True):
    """Build one feature row per (paper, candidate prolific author) pair.

    For every paper in ``train`` that has at least one coauthor, the
    candidate prolific authors are those returned by
    ``get_authors_prolific_atleast_one`` for the paper's coauthors. Each
    candidate contributes one row with the columns:

    - ``work_ratio``: share of the paper's coauthors the candidate has
      worked with;
    - ``same_venue``: number of the candidate's papers published in this
      paper's venue;
    - ``coauthors_count``: number of coauthors of the paper;
    - ``prolific_count``: number of candidates for the paper;
    - ``is_p_author``: 1 if the candidate is in the paper's label list,
      otherwise 0.

    Parameters
    ----------
    train : pandas.DataFrame
        Papers with a ``'coauthors'`` and a ``'venue'`` column; rows are
        read by position.
    test : pandas.Series
        Label lists (prolific author ids) aligned by position with ``train``.
    output_path : str or path-like, default 'new_featurs.json'
        File the resulting frame is written to as JSON.
    verbose : bool, default True
        Print a separator line after each processed paper.

    Returns
    -------
    pandas.DataFrame
        The feature table described above.
    """
    new_featurs = {
        'work_ratio': [],
        'same_venue': [],
        'coauthors_count': [],
        'prolific_count': [],
        'is_p_author': [],
    }

    # The two per-author lookups each scan the whole training set and their
    # result depends on the author only, so compute them once per author
    # instead of once per (paper, author) pair.
    worked_with_cache = {}
    venue_cache = {}

    for i in range(len(train)):
        paper = train.iloc[i]
        authors = paper['coauthors']

        # A paper written only by prolific authors has no coauthors, hence
        # nothing to derive candidates from
        if len(authors) == 0:
            continue

        true_lable = test.iloc[i]
        paper_venue = paper['venue']

        all_prolific_authors = get_authors_prolific_atleast_one(authors)

        # These two features are identical for every candidate of this paper
        prolific_count = len(all_prolific_authors)
        coauthors_count = len(authors)

        for p_author in all_prolific_authors:
            if p_author not in worked_with_cache:
                worked_with_cache[p_author] = find_author_x_work_with(p_author)
            if p_author not in venue_cache:
                venue_cache[p_author] = find_author_venue(p_author)

            # Count the candidate's collaborators that are coauthors of this
            # paper; the cached dict is only read, never modified
            shared_coauthors = sum(
                1 for collaborator in worked_with_cache[p_author]
                if collaborator in authors
            )
            work_ratio = shared_coauthors / coauthors_count

            # Number of the candidate's papers published in this paper's venue
            same_count = sum(
                count for venue, count in venue_cache[p_author].items()
                if venue == paper_venue
            )

            new_featurs['work_ratio'].append(work_ratio)
            new_featurs['same_venue'].append(same_count)
            new_featurs['coauthors_count'].append(coauthors_count)
            new_featurs['prolific_count'].append(prolific_count)
            new_featurs['is_p_author'].append(1 if p_author in true_lable else 0)

        if verbose:
            print('== ',i,'======================='*5)

    # Return the new features as a pandas DataFrame
    new_featurs = pd.DataFrame(new_featurs)

    # Save the data into a json file
    new_featurs.to_json(output_path)

    return new_featurs

def logistic_regression_model(X_train, y_train, penalty='l2', C=1.0, solver='liblinear', max_iter=100, l1_ratio=None):
    """Create a ``LogisticRegression`` classifier and fit it.

    Parameters
    ----------
    X_train, y_train
        Training features and labels, handed to ``fit`` unchanged.
    penalty, C, solver, max_iter, l1_ratio
        Forwarded to the ``LogisticRegression`` constructor under the same
        names.

    Returns
    -------
    The fitted ``LogisticRegression`` instance.
    """
    # Pass every hyper-parameter by keyword: the estimator's parameters after
    # ``penalty`` are keyword-only, and positional passing would otherwise
    # bind the values to the wrong parameters (e.g. ``C`` to ``dual``).
    logistic_regression = LogisticRegression(
        penalty=penalty,
        C=C,
        solver=solver,
        max_iter=max_iter,
        l1_ratio=l1_ratio,
    )
    logistic_regression.fit(X_train, y_train)
    return logistic_regression

def is_prolific_author(author, paper_info):
    """Decide whether ``author`` wrote the paper described by ``paper_info``.

    Placeholder: the paper's 'coauthors' and 'venue' entries are read but not
    used yet, and the function always returns 1.
    """
    # Both lookups stay so that a record missing either key raises KeyError
    coauthor_ids, paper_venue = paper_info['coauthors'], paper_info['venue']
    return 1


# Build features for the first 1000 training papers; both slices are
# positional, so each paper stays paired with its label list
feature_engineering(X_train.iloc[:1000], y_train.iloc[:1000])



KeyboardInterrupt: 