In [8]:
import itertools

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error,accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

### 1. Data loading and data frame creation

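The cells in this section must be run together and in order, even though the code is split across several cells: later cells overwrite `df_train` and `df_test` in place, so re-running a single cell on its own can fail.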

In [9]:
# Read the author ids and the h-index target for training, plus the test set.
# The target column is called 'hindex' (it is accessed under that name when the
# design matrices are built), so the dtype mapping has to use that exact key for
# the float32 cast to be applied to it.
df_train = pd.read_csv('../Data/train.csv', dtype={'author': np.int64, 'hindex': np.float32})
# The first column of test.csv is used as the row index.
df_test = pd.read_csv('../Data/test.csv', index_col=0, dtype={'author': np.int64})
n_train = df_train.shape[0]
n_test = df_test.shape[0]
print("n_train: {0}, n_test: {1}".format(n_train,n_test))

n_train: 174241, n_test: 43560


In [10]:
# Read the author embeddings learned from the graph nodes.
# The columns are named n_embedding_0 ... n_embedding_19 (see the column listing
# printed when the training frame is built), so the dtype keys are generated
# from the same names; otherwise the float32 cast is not applied to them.
dim_embeddings = 20
cols_embeddings = ["n_embedding_"+str(i) for i in range(dim_embeddings)]
dict_dtype = {'author_id': np.int64}
dict_dtype.update({col: np.float32 for col in cols_embeddings})
df_node_emb = pd.read_csv("../Graph features/node_embeddings_20.csv", sep=',', dtype=dict_dtype)

# Read the author embeddings learned from the abstracts (columns at_embedding_<i>).
dim_embeddings = 64
cols_embeddings = ["at_embedding_"+str(i) for i in range(dim_embeddings)]
dict_dtype = {'author_id': np.int64}
dict_dtype.update({col: np.float32 for col in cols_embeddings})
df_author_emb = pd.read_csv("../Text features/author_embeddings_64.csv", dtype=dict_dtype)

# Read the hand-crafted graph features (second version of the file, which adds
# degree, weighted_degree and onion_number to the first version's columns).
dict_dtype = {'author_id':np.int64,'core_number':np.float32,'clustering_coef':np.float32,
             'betweeness_coef':np.float32,'centrality':np.float32,'page_rank':np.float32,
              'clustering_coef_coauthorship':np.float32,'betweeness_coef_coauthorship':np.float32,
              'centrality_coauthorship':np.float32,'page_rank_coauthorship':np.float32,
              'degree':np.int32,'weighted_degree':np.float32,'onion_number':np.float32
              }
df_graph_feat = pd.read_csv("../Graph features/graph_features_v2.csv", dtype=dict_dtype)

print("Loaded data.")

Loaded data.


In [11]:
# Columns that must not reach the model: identifiers, the target, the key
# columns left behind by the three merges (author_id_x / author_id_y /
# author_id), and the co-authorship graph features that are excluded here.
features_to_drop = ["author","hindex","author_id_x","author_id_y",
                    "clustering_coef_coauthorship","betweeness_coef_coauthorship",
                    "centrality_coauthorship","page_rank_coauthorship","author_id"]


def _build_design_matrix(df_base, label):
    """Join every feature table onto df_base and split it into ids, target and features.

    df_base: frame with at least the columns 'author' and 'hindex'.
    label: short name used only in the warning message.

    Returns (author ids, hindex values, feature frame without features_to_drop).
    """
    merged = df_base
    # The merge order matters: it determines which key columns receive the
    # _x / _y suffixes that are listed in features_to_drop.
    for df_feat in (df_node_emb, df_author_emb, df_graph_feat):
        merged = merged.merge(df_feat, left_on="author", right_on="author_id")

    # The merges are inner joins, so authors missing from a feature table are
    # dropped (and duplicated keys would add rows). Make that visible.
    if len(merged) != len(df_base):
        print("Warning: merging the features changed the number of {0} rows from {1} to {2}.".format(
            label, len(df_base), len(merged)))

    ids = merged["author"].values
    target = merged["hindex"].values
    return ids, target, merged.drop(features_to_drop, axis=1)


# create the training dataframe.
# NOTE: df_train / df_test are overwritten by their feature-only versions, so
# this cell can only be re-run after the loading cells have been re-run.
X_id_train, y_train, df_train = _build_design_matrix(df_train, "train")
X_train = df_train.values
print("Column names X_train: \n",df_train.columns.tolist())
print("Created X,y for training.")
print("Dimensions of X_train: ",X_train.shape)
print("Dimensions of y_train: ",y_train.shape)

# create the test dataframe.
X_id_test, y_test, df_test = _build_design_matrix(df_test, "test")
X_test = df_test.values
print("\nColumn names X_test: \n",df_test.columns.tolist())
print("Created X,y for testing.")
print("Dimensions of X_test: ",X_test.shape)
print("Dimensions of y_test: ",y_test.shape)

Column names X_train: 
 ['n_embedding_0', 'n_embedding_1', 'n_embedding_2', 'n_embedding_3', 'n_embedding_4', 'n_embedding_5', 'n_embedding_6', 'n_embedding_7', 'n_embedding_8', 'n_embedding_9', 'n_embedding_10', 'n_embedding_11', 'n_embedding_12', 'n_embedding_13', 'n_embedding_14', 'n_embedding_15', 'n_embedding_16', 'n_embedding_17', 'n_embedding_18', 'n_embedding_19', 'at_embedding_0', 'at_embedding_1', 'at_embedding_2', 'at_embedding_3', 'at_embedding_4', 'at_embedding_5', 'at_embedding_6', 'at_embedding_7', 'at_embedding_8', 'at_embedding_9', 'at_embedding_10', 'at_embedding_11', 'at_embedding_12', 'at_embedding_13', 'at_embedding_14', 'at_embedding_15', 'at_embedding_16', 'at_embedding_17', 'at_embedding_18', 'at_embedding_19', 'at_embedding_20', 'at_embedding_21', 'at_embedding_22', 'at_embedding_23', 'at_embedding_24', 'at_embedding_25', 'at_embedding_26', 'at_embedding_27', 'at_embedding_28', 'at_embedding_29', 'at_embedding_30', 'at_embedding_31', 'at_embedding_32', 'at_embe

In [18]:
## lightgbm

def lightgbm(X_train, y_train,metrics:list=['mean_squared_error'],
                  transformation=None,transformation_inv=None,
                  n_splits=3,seed=2,n_jobs=1,colsample_bytree=0.4,
                  alpha=0.05,learning_rate=0.025,n_estimators=6000):

    '''
    Cross-validate a LightGBM regressor with a shuffled stratified K-fold split.

    X_train: training features (independent variables). DataFrame or numpy matrix;
        it is converted to a numpy array so that the folds can be indexed by position.
    y_train: training target (h-index, the dependent variable). DataFrame/Series or numpy array.
    metrics: list of metric names to collect. 'mean_squared_error' (alias 'mse') and
        'accuracy' are supported. Accuracy compares the targets with the predictions
        rounded to the nearest integer, so it only makes sense for integer-valued targets.
    transformation: optional function applied to y_train before fitting. If it raises,
        a warning is printed, the model is trained on the untransformed target and
        transformation_inv is ignored.
    transformation_inv: optional inverse of transformation. When given, it is applied to
        the predictions and the metrics are computed against the untransformed target;
        otherwise the metrics are computed on the scale the model was trained on.
    n_splits: number of cross-validation folds.
    seed: random number seed, used for the fold shuffling and for the regressor.
    n_jobs: number of CPUs used by the regressor.
    colsample_bytree: subsample ratio of columns (features) when constructing each tree.
    alpha: parameter for Huber loss and Quantile regression (forwarded to the regressor).
    learning_rate: boosting learning rate.
    n_estimators: number of boosted trees to fit.

    Returns (reg, list_mse_train, list_mse_val, config):
        reg: the regressor as fitted on the LAST fold's training split only
            (it is not refitted on the complete training set).
        list_mse_train / list_mse_val: one mean squared error per fold; both are empty
            when the MSE metric is not requested.
        config: string describing the hyper-parameters of this run.
    '''

    X_all = np.asarray(X_train)
    y_raw = np.asarray(y_train)

    y_fit = y_raw
    if(transformation is not None):
        try:
            y_fit = np.asarray(transformation(y_raw))
        except Exception as err:
            # Best effort, but no longer silent: train on the raw target and skip
            # the inverse so that predictions and targets stay on the same scale.
            print("Warning: transformation failed ({0}); training on the untransformed target.".format(err))
            y_fit = y_raw
            transformation_inv = None

    want_mse = ('mean_squared_error' in metrics) or ('mse' in metrics)
    want_accuracy = 'accuracy' in metrics

    print("Start lightgbm fitting:\n")
    # is_unbalance was dropped from the arguments: it is a classification-only
    # LightGBM option and has no effect on a regression objective.
    reg = lgb.LGBMRegressor(colsample_bytree=colsample_bytree,
                        importance_type='gain', alpha=alpha,
                        objective='mse', learning_rate=learning_rate,
                        n_estimators=n_estimators,
                        n_jobs=n_jobs,
                        random_state=seed
                        )

    skf = StratifiedKFold(n_splits=n_splits,shuffle=True,random_state=seed)
    list_mse_train = []
    list_mse_val = []
    list_accuracy_train = []
    list_accuracy_val = []

    # Stratify on the untransformed target: StratifiedKFold needs discrete
    # labels, which a transformed (e.g. log-scaled) target no longer provides.
    for count, (train_index, val_index) in enumerate(skf.split(X_all, y_raw), start=1):
        X_tr, X_val = X_all[train_index], X_all[val_index]
        reg.fit(X_tr, y_fit[train_index])

        y_pred_train = reg.predict(X_tr)
        y_pred_val = reg.predict(X_val)

        if(transformation_inv is not None):
            # Bring the predictions back to the original scale and score them
            # against the original target.
            y_pred_train = transformation_inv(y_pred_train)
            y_pred_val = transformation_inv(y_pred_val)
            y_tr, y_val = y_raw[train_index], y_raw[val_index]
        else:
            y_tr, y_val = y_fit[train_index], y_fit[val_index]

        mse_train = mean_squared_error(y_tr,y_pred_train)
        mse_val = mean_squared_error(y_val,y_pred_val)

        if(want_mse):
            list_mse_train.append(mse_train)
            list_mse_val.append(mse_val)
        if(want_accuracy):
            # accuracy_score rejects continuous predictions, so round them first.
            list_accuracy_train.append(accuracy_score(y_tr,np.rint(y_pred_train)))
            list_accuracy_val.append(accuracy_score(y_val,np.rint(y_pred_val)))
        print("Round: {0}, mean_squared_error on train: {1}, mean_squared_error on test: {2}".format(count,mse_train,mse_val))


    config = "learning_rate: "+str(learning_rate)+", n_splits: "+str(n_splits)
    config += ", colsample_bytree: "+str(colsample_bytree)+", alpha: "
    config += str(alpha)+", n_estimators: "+str(n_estimators)
    print("\nConfig:",config)

    if(want_mse):
        print("Lightgbm mean squared error on train:",np.mean(list_mse_train))
        print("Lightgbm mean squared error on test:",np.mean(list_mse_val))
    if(want_accuracy):
        print("Lightgbm accuracy on train:",np.mean(list_accuracy_train))
        print("Lightgbm accuracy on test with:",np.mean(list_accuracy_val))

    # Sanity check: raw model output for an all-zero feature vector.
    print("Prediction of null features: ",reg.predict(np.zeros((1,X_all.shape[1])))[0])

    return reg,list_mse_train,list_mse_val,config

#best_model = lightgbm(X_train, y_train)
#lightgbm(X_train, y_train, transformation=np.log10,transformation_inv=lambda array: np.power(10,array))

In [19]:
# Hyper-parameter grid: every combination of the values below is cross-validated.
learning_rate = [0.05]
n_splits = [9]
colsample_bytree = [0.4]
alpha = [0.05]
n_estimators = [10000]

# Wider grids that can be swapped in:
#colsample_bytree = [0.2,0.4]
#alpha = [0.01,0.05,0.1]
#n_estimators=[4000,6000,8000]

list_config = []
list_mse_tr = []
list_mse_val = []

min_mse_val = np.inf
best_model = None
best_config = None

# itertools.product iterates in the same order as the equivalent nested loops
# (the right-most grid varies fastest).
for l_r, n_s, csbt, alp, n_e in itertools.product(learning_rate, n_splits, colsample_bytree,
                                                  alpha, n_estimators):
    model,mse_tr,mse_val,config = lightgbm(X_train,y_train,n_splits=n_s,learning_rate=l_r,
                                           colsample_bytree=csbt,alpha=alp,
                                           n_estimators=n_e,n_jobs=4)

    # Score a configuration by its mean error over all folds; a single fold
    # (previously the last one) is too noisy to compare configurations.
    mean_mse_tr = float(np.mean(mse_tr))
    mean_mse_val = float(np.mean(mse_val))

    if(mean_mse_val < min_mse_val):
        min_mse_val = mean_mse_val
        best_model = model
        best_config = config

    list_config.append(config)
    list_mse_tr.append(mean_mse_tr)
    list_mse_val.append(mean_mse_val)


Start lightgbm fitting:





Round: 1, mean_squared_error on train: 4.298185080599955, mean_squared_error on test: 49.52260932079628
Round: 2, mean_squared_error on train: 4.332692514483274, mean_squared_error on test: 49.366949688926255
Round: 3, mean_squared_error on train: 4.357161506309602, mean_squared_error on test: 47.24373397721559
Round: 4, mean_squared_error on train: 4.306208181907501, mean_squared_error on test: 48.71051594550257
Round: 5, mean_squared_error on train: 4.319044267470873, mean_squared_error on test: 49.68514837673011
Round: 6, mean_squared_error on train: 4.352845557290925, mean_squared_error on test: 50.09833710118123
Round: 7, mean_squared_error on train: 4.324301059464739, mean_squared_error on test: 48.92794191938997
Round: 8, mean_squared_error on train: 4.43883596863128, mean_squared_error on test: 47.97913364701657
Round: 9, mean_squared_error on train: 4.367181769606134, mean_squared_error on test: 48.72138360611254

Config: learning_rate: 0.05, n_splits: 9, colsample_bytree: 0.4

In [15]:
# Cross-validate one hand-picked configuration with 7 folds. Keeps the fitted
# regressor and the per-fold train/validation MSE lists; the config string
# returned last is not needed here.
model, list_losses_tr, list_losses_val, _ = lightgbm(
    X_train,
    y_train,
    n_splits=7,
    learning_rate=0.05,
    colsample_bytree=0.4,
    alpha=0.05,
    n_estimators=10000,
    n_jobs=4,
)

Start lightgbm fitting:





Round: 1, mean_squared_error on train: 4.0302409349629436, mean_squared_error on test: 54.12540992740144
Round: 2, mean_squared_error on train: 4.126217089698763, mean_squared_error on test: 48.46673601194754
Round: 3, mean_squared_error on train: 4.106172070769131, mean_squared_error on test: 47.759054141192145
Round: 4, mean_squared_error on train: 4.1085154827925425, mean_squared_error on test: 52.32106331406688
Round: 5, mean_squared_error on train: 4.054965685226876, mean_squared_error on test: 50.552161299282886
Round: 6, mean_squared_error on train: 4.107422803002861, mean_squared_error on test: 49.087722647223
Round: 7, mean_squared_error on train: 4.056721266597967, mean_squared_error on test: 43.72697983372407

Config: learning_rate: 0.05, n_splits: 7, colsample_bytree: 0.4, alpha: 0.05, n_estimators: 10000
Lightgbm mean squared error on train: 4.084322190435869
Lightgbm mean squared error on test: 49.43416102497685
Prediction of null features:  0.5902981019968244


In [14]:
# Predict the h-index of every test author with the fitted model and write
# the (author, hindex) pairs to the submission file.
h_test = model.predict(X_test)
predictions = pd.DataFrame({'author': X_id_test, 'hindex': h_test})
predictions.to_csv('submission.csv', index=False)

In [None]:
# Plot the per-fold mean squared error returned by the cross-validation run.
# The losses are plain floats (one per fold), so they are plotted directly;
# they are not tensors and have no .cpu() method.
fig, ax1 = plt.subplots(1, 1, figsize=(16,8))
folds_tr = range(1, len(list_losses_tr) + 1)
folds_val = range(1, len(list_losses_val) + 1)
ax1.plot(folds_tr, list_losses_tr, c='b', label='train')
ax1.plot(folds_val, list_losses_val, c='r', label='test')
ax1.set_title("Train and validation mean squared error per fold")
ax1.set_xlabel("Fold")
ax1.set_ylabel("Mean squared error")
ax1.legend()
plt.show()