In [11]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Lasso, LassoCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error,accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import StratifiedKFold
import xgboost as xgb
import lightgbm as lgb
import torch
import torch.nn as nn

### 1. Data loading and data frame creation

The code in this section must be executed together, even if it is divided into different cells.

In [4]:
# read author id and h_index for training
df_train = pd.read_csv('../Data/train.csv', dtype={'author': np.int64,'h_index': np.float32})
df_test = pd.read_csv('../Data/test.csv', index_col=0, dtype={'author': np.int64})
n_train = df_train.shape[0]
n_test = df_test.shape[0]
print("n_train: {0}, n_test: {1}".format(n_train,n_test))

n_train: 174241, n_test: 43560


In [5]:
# read author embedding from graph nodes
dim_embeddings = 20
cols_embeddings = ["n_embedding_"+str(i) for i in range(dim_embeddings)]
dict_dtype = {'author_id':np.int64}
for i in range(dim_embeddings):
    dict_dtype['n_embeddings_'+str(i)] = np.float32
df_node_emb=pd.read_csv("../Graph features/node_embeddings_20.csv",sep=',',dtype=dict_dtype)

# read author embedding from abstracts
dim_embeddings = 64
cols_embeddings = ["at_embedding_"+str(i) for i in range(dim_embeddings)]
dict_dtype = {'author_id':np.int64}
for i in range(dim_embeddings):
    dict_dtype['at_embeddings_'+str(i)] = np.float32
df_author_emb=pd.read_csv("../Text features/author_embeddings_64.csv",dtype=dict_dtype)

dict_dtype = {'author_id':np.int64,'core_number':np.float32,'clustering_coef':np.float32,
             'betweeness_coef':np.float32,'centrality':np.float32,'page_rank':np.float32,
              'clustering_coef_coauthorship':np.float32,'betweeness_coef_coauthorship':np.float32,
              'centrality_coauthorship':np.float32,'page_rank_coauthorship':np.float32,
              'degree':np.int32,'weighted_degree':np.float32,'onion_number':np.float32
              }
df_graph_feat=pd.read_csv("../Graph features/graph_features_v2.csv", dtype=dict_dtype)
print("Loaded data.")

Loaded data.


In [6]:
# create the training dataframe.
df_train = df_train.merge(df_node_emb, left_on="author", right_on="author_id")
df_train = df_train.merge(df_author_emb, left_on="author", right_on="author_id")
df_train = df_train.merge(df_graph_feat, left_on="author", right_on="author_id")

X_id_train = df_train["author"].values
y_train = df_train["hindex"].values

'''
features_to_drop = ["author","hindex","author_id_x","author_id_y",
                    "clustering_coef_coauthorship","betweeness_coef_coauthorship",
                    "centrality_coauthorship","page_rank_coauthorship","author_id"]
'''
features_to_drop = ["author","hindex","author_id_x","author_id_y",
                    "betweeness_coef_coauthorship",
                    "author_id"]

df_train = df_train.drop(features_to_drop,axis=1)
X_train = df_train.values
print("Column names X_train: \n",df_train.columns.tolist())
print("Created X,y for training.")
print("Dimensions of X_train: ",X_train.shape)
print("Dimensions of y_train: ",y_train.shape)

# create the test dataframe. 
df_test = df_test.merge(df_node_emb, left_on="author", right_on="author_id")
df_test = df_test.merge(df_author_emb, left_on="author", right_on="author_id")
df_test = df_test.merge(df_graph_feat, left_on="author", right_on="author_id")

X_id_test = df_test["author"].values
y_test = df_test["hindex"].values

df_test = df_test.drop(features_to_drop,axis=1)
X_test = df_test.values
print("\nColumn names X_test: \n",df_test.columns.tolist())
print("Created X,y for testing.")
print("Dimensions of X_test: ",X_test.shape)
print("Dimensions of y_test: ",y_test.shape)

Column names X_train: 
 ['n_embedding_0', 'n_embedding_1', 'n_embedding_2', 'n_embedding_3', 'n_embedding_4', 'n_embedding_5', 'n_embedding_6', 'n_embedding_7', 'n_embedding_8', 'n_embedding_9', 'n_embedding_10', 'n_embedding_11', 'n_embedding_12', 'n_embedding_13', 'n_embedding_14', 'n_embedding_15', 'n_embedding_16', 'n_embedding_17', 'n_embedding_18', 'n_embedding_19', 'at_embedding_0', 'at_embedding_1', 'at_embedding_2', 'at_embedding_3', 'at_embedding_4', 'at_embedding_5', 'at_embedding_6', 'at_embedding_7', 'at_embedding_8', 'at_embedding_9', 'at_embedding_10', 'at_embedding_11', 'at_embedding_12', 'at_embedding_13', 'at_embedding_14', 'at_embedding_15', 'at_embedding_16', 'at_embedding_17', 'at_embedding_18', 'at_embedding_19', 'at_embedding_20', 'at_embedding_21', 'at_embedding_22', 'at_embedding_23', 'at_embedding_24', 'at_embedding_25', 'at_embedding_26', 'at_embedding_27', 'at_embedding_28', 'at_embedding_29', 'at_embedding_30', 'at_embedding_31', 'at_embedding_32', 'at_embe

### 2. Regression and Classification and Regression Trees (CART)

In [7]:
## lasso linear model with iterative fitting along a regularization path

def lasso_regression(X_train, y_train,metrics:list=['mean_squared_error'], 
                     transformation=None,transformation_inv=None,
                     val_size=0.25,seed=2):
    '''
    X_train:training data with dependent variables. It can be a dataframe or a numpy matrix.
    y_train: training data with independent variable (h-index). It can be a dataframe or a numpy matrix.
    metrics: metrics to calculate the model performance. It is a list of strings. Only 'mse' and 'accuracy' supported.
    transformation: a function to apply to the partitions of y_train.
    transformation_inv: the inverse of transformation. It is applied to the predictions of the y_train partitions.
    val_size: proportion of the y_train to include y_val.
    seed: random number seed.
    '''
    
    
    X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=val_size, random_state=seed) 
    if(transformation is not None):
        try:
            y_tr = transformation(y_tr)
            y_val = transformation(y_val)
        except:
            pass
            
    reg = make_pipeline(StandardScaler(), LassoCV())
    print("Start lasso regression fitting")
    reg.fit(X_tr, y_tr)
    y_pred_tr = reg.predict(X_tr)
    y_pred_val = reg.predict(X_val)
        
    if(transformation_inv is not None):
        try:
            y_pred_tr = transformation_inv(y_pred_tr)
            y_pred_val = transformation_inv(y_pred_val)
        except:
            pass
    
    if('mean_squared_error' in metrics):
        print("Lasso mean_squared_error on train:",mean_squared_error(y_tr, y_pred_tr))
        print("Lasso mean_squared_error on test:",mean_squared_error(y_val, y_pred_val))
    if('accuracy' in metrics):
        print("Lasso accuracy on train:",accuracy_score(y_tr, y_pred_tr))
        print("Lasso accuracy on test with:",accuracy_score(y_val, y_pred_val))
    
    print("Prediction of null features: ",reg.predict(np.zeros((1,len(X_tr[0]))))[0])
    
    return reg

lasso_regression(X_train, y_train)
#lasso_regression(X_train, y_train, transformation=np.log10,transformation_inv=lambda array: np.power(10,array))

Start lasso regression fitting
Lasso mean_squared_error on train: 76.67554851815049
Lasso mean_squared_error on test: 76.04040684516922
Prediction of null features:  -1.7138465265660834


Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('lassocv',
                 LassoCV(alphas=None, copy_X=True, cv=None, eps=0.001,
                         fit_intercept=True, max_iter=1000, n_alphas=100,
                         n_jobs=None, normalize=False, positive=False,
                         precompute='auto', random_state=None,
                         selection='cyclic', tol=0.0001, verbose=False))],
         verbose=False)

In [8]:
## knn

def knn(X_train, y_train,metrics:list=['mean_squared_error'], 
                  transformation=None,transformation_inv=None, 
                  val_size=0.25,seed=2,n_jobs=1,neighbours=7):
    
    '''
    X_train:training data with dependent variables. It can be a dataframe or a numpy matrix.
    y_train: training data with independent variable (h-index). It can be a dataframe or a numpy matrix.
    metrics: metrics to calculate the model performance. It is a list of strings. Only 'mse' and 'accuracy' supported.
    transformation: a function to apply to the partitions of y_train.
    transformation_inv: the inverse of transformation. It is applied to the predictions of the y_train partitions.
    val_size: proportion of the y_train to include y_val.
    seed: random number seed.
    '''
    
    
    X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=val_size, random_state=seed) 
    if(transformation is not None):
        try:
            y_tr = transformation(y_tr)
            y_val = transformation(y_val)
        except:
            pass
            
    reg = reg = KNeighborsClassifier(n_neighbors=neighbours, n_jobs=n_jobs)
    print("Start lasso regression fitting")
    reg.fit(X_tr, y_tr)
    y_pred_tr = reg.predict(X_tr)
    y_pred_val = reg.predict(X_val)
        
    if(transformation_inv is not None):
        try:
            y_pred_tr = transformation_inv(y_pred_tr)
            y_pred_val = transformation_inv(y_pred_val)
        except:
            pass
    
    if('mean_squared_error' in metrics):
        print("KNN mean_squared_error on train:",mean_squared_error(y_tr, y_pred_tr))
        print("KNN mean_squared_error on test:",mean_squared_error(y_val, y_pred_val))
    if('accuracy' in metrics):
        print("KNN accuracy on train:",accuracy_score(y_tr, y_pred_tr))
        print("KNN accuracy on test with:",accuracy_score(y_val, y_pred_val))
    
    print("Prediction of null features: ",reg.predict(np.zeros((1,len(X_tr[0]))))[0])
    
    return reg

knn(X_train, y_train)
#knn(X_train, y_train, transformation=np.log10,transformation_inv=lambda array: np.power(10,array))

Start lasso regression fitting
KNN mean_squared_error on train: 111.62180899908172
KNN mean_squared_error on test: 119.50079199283763
Prediction of null features:  1.0


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=1, n_neighbors=7, p=2,
                     weights='uniform')

In [15]:
## feed-forward neural network (ffnn)

def ffnn(X_train, y_train,metrics:list=['mean_squared_error'], 
                     transformation=None,transformation_inv=None,
                     val_size=0.25,seed=2,units_hidden_layers:list=[25,5],
                     activation_function=nn.LeakyReLU(0.1),loss_function=nn.MSELoss(),
                     learning_rate=0.01,dropout_rate=0.2,num_epochs=10000):
    '''
    X_train:training data with dependent variables. It can be a dataframe or a numpy matrix.
    y_train: training data with independent variable (h-index). It can be a dataframe or a numpy matrix.
    metrics: metrics to calculate the model performance. It is a list of strings. Only 'mse' and 'accuracy' supported.
    transformation: a function to apply to the partitions of y_train.
    transformation_inv: the inverse of transformation. It is applied to the predictions of the y_train partitions.
    val_size: proportion of the y_train to include y_val.
    seed: random number seed.
    units_hidden_layers: list of units in each hidden layer. Currently two hidden layers supported.
    activation_function: activation function to be used for the output of all the hidden layers.
    dropout_rate: droptout rate. It turn off some neurons randomly.
    num_epochs: number of epochs in the training phase.
    '''
    
    y_train = y_train.reshape(-1,1)
    
    

    # split training data for validation purposes
    X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=val_size, random_state=seed) 
    
    # save a copy of the data
    X_tr_orig = np.copy(X_tr)
    X_val_orig = np.copy(X_val)
    y_tr_orig = np.copy(y_tr)
    y_val_orig = np.copy(y_val)
    
    # standardize features by removing the mean and scaling to unit variance
    
    scaler_x = StandardScaler()
    scaler_x.fit(X_tr)
    X_tr = scaler_x.transform(X_tr)
    X_val = scaler_x.transform(X_val)
    
    scaler_y = StandardScaler()
    scaler_y.fit(y_train)
    y_tr = scaler_y.transform(y_tr)
    y_val = scaler_y.transform(y_val)
    
    if(transformation is not None):
        try:
            y_tr = transformation(y_tr)
            y_val = transformation(y_val)
        except:
            pass
    
    # speed the computations using a GPU if available
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    
    # convert the data from Numpy arrays to Tensors
    X_tr = torch.from_numpy(X_tr).float().to(device)
    X_val = torch.from_numpy(X_val).float().to(device)
    y_tr = torch.from_numpy(y_tr).float().to(device)
    y_val = torch.from_numpy(y_val).float().to(device)
    
    # create the model and send it to the fastest available device
    model = nn.Sequential(
    nn.Linear(X_train.shape[1], units_hidden_layers[0], bias=True),
    activation_function,
    nn.Dropout(dropout_rate),
    nn.Linear(units_hidden_layers[0], units_hidden_layers[1], bias=True),
    activation_function,
    nn.Linear(units_hidden_layers[1], 1, bias=True),
    nn.ReLU()
    )
    model.to(device)
    
    # initialize weights randomly
    def init_weights(mod):
        if isinstance(mod, nn.Linear):
            nn.init.xavier_uniform_(mod.weight)
            mod.bias.data.fill_(0.01)
    
    model.apply(init_weights)
    
    # choose the loss function and the optimizer
    loss_function = loss_function
    #optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=momentum)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    
    # init performance measures
    list_losses_tr = []
    list_losses_val = []
    list_accuracy_tr = []
    list_accuracy_val = []
    
    for epoch in range(num_epochs):
        y_pred_tr=model.forward(X_tr)
        loss_tr = loss_function(y_pred_tr, y_tr)
        optimizer.zero_grad() # required since pytorch accumulates the gradients
        loss_tr.backward() # backpropagation step
        optimizer.step() # update the parameters
            
        # update loss and accuracy on train and test
        y_pred_val=model.forward(X_val)
        loss_val = loss_function(y_pred_val, y_val)
        
        with torch.no_grad():
            if('mean_squared_error' in metrics):
                list_losses_tr.append(loss_tr.data)
                list_losses_val.append(loss_val.data)
            if('accuracy' in metrics):
                list_accuracy_tr.append(accuracy_score(y_tr.cpu(), y_pred_tr.cpu()))
                list_accuracy_val.append(accuracy_score(y_val.cpu(), y_pred_val.cpu()))
        
        # print results from time to time (fixed frequency)
        if(epoch % int(num_epochs/5) == 0):
            print('Epoch: {0},\tLoss train: {1},\tLoss val: {2}'.format(epoch,round(loss_tr.cpu().data.item(),5),round(loss_val.cpu().data.item(),5)))
        
    
    with torch.no_grad():
    
        if(transformation_inv is not None):
            try:
                y_pred_tr = transformation_inv(y_pred_tr.cpu())
                y_pred_val = transformation_inv(y_pred_val.cpu())
            except:
                pass
        config = "learning_rate: "+str(learning_rate)+", val_size: "+str(val_size)
        config += ", dropout_rate: "+str(dropout_rate)+", activation_function: "
        config += str(activation_function)+", units_hidden_layers: "+str(units_hidden_layers)
        
        mse_tr = -1
        mse_val = -1
        
        print("\nConfig:",config)
        if('mean_squared_error' in metrics):
            mse_tr = mean_squared_error(y_tr_orig,scaler_y.inverse_transform(y_pred_tr.cpu()))
            mse_val = mean_squared_error(y_val_orig,scaler_y.inverse_transform(y_pred_val.cpu()))
            print("FFNN mean_squared_error on train:",mse_tr)
            print("FFNN mean_squared_error on test:",mse_val)
        if('accuracy' in metrics):
            print("FFNN accuracy on train:",accuracy_score(y_tr.cpu(), y_pred_tr.cpu()))
            print("FFNN accuracy on test with:",accuracy_score(y_val.cpu(), y_pred_val.cpu()))
        
        null_vect = torch.Tensor(np.zeros((1,len(X_tr[0])))).float().to(device)
        print("Prediction of null features: ",model.forward(null_vect).cpu()[0][0])
    
    return model,list_losses_tr,list_losses_val,config,mse_tr,mse_val

ffnn(X_train, y_train)
#ffnn(X_train, y_train, transformation=np.log10,transformation_inv=lambda array: np.power(10,array))

Epoch: 0,	Loss train: 2.01766,	Loss val: 1.37733
Epoch: 2000,	Loss train: 0.47395,	Loss val: 0.52425
Epoch: 4000,	Loss train: 0.46915,	Loss val: 0.51809
Epoch: 6000,	Loss train: 0.46741,	Loss val: 0.52688
Epoch: 8000,	Loss train: 0.46914,	Loss val: 0.52445

Config: learning_rate: 0.01, val_size: 0.25, dropout_rate: 0.2, activation_function: LeakyReLU(negative_slope=0.1), units_hidden_layers: [25, 5]
FFNN mean_squared_error on train: 73.91699369211902
FFNN mean_squared_error on test: 83.24880453022253
Prediction of null features:  tensor(0.3996)


(Sequential(
   (0): Linear(in_features=95, out_features=25, bias=True)
   (1): LeakyReLU(negative_slope=0.1)
   (2): Dropout(p=0.2, inplace=False)
   (3): Linear(in_features=25, out_features=5, bias=True)
   (4): LeakyReLU(negative_slope=0.1)
   (5): Linear(in_features=5, out_features=1, bias=True)
   (6): ReLU()
 ),
 [tensor(2.0177, device='cuda:0'),
  tensor(1.3771, device='cuda:0'),
  tensor(1.0396, device='cuda:0'),
  tensor(0.8686, device='cuda:0'),
  tensor(0.7912, device='cuda:0'),
  tensor(0.7529, device='cuda:0'),
  tensor(0.7294, device='cuda:0'),
  tensor(0.7148, device='cuda:0'),
  tensor(0.7010, device='cuda:0'),
  tensor(0.6867, device='cuda:0'),
  tensor(0.6765, device='cuda:0'),
  tensor(0.6626, device='cuda:0'),
  tensor(0.6567, device='cuda:0'),
  tensor(0.6499, device='cuda:0'),
  tensor(0.6448, device='cuda:0'),
  tensor(0.6383, device='cuda:0'),
  tensor(0.6331, device='cuda:0'),
  tensor(0.6277, device='cuda:0'),
  tensor(0.6244, device='cuda:0'),
  tensor(0.6193

In [8]:
## random forest

def random_forest(X_train, y_train,metrics:list=['mean_squared_error'], 
                  transformation=None,transformation_inv=None, 
                  n_splits=3,seed=2,n_jobs=1):
    
    '''
    X_train:training data with dependent variables. It can be a dataframe or a numpy matrix.
    y_train: training data with independent variable (h-index). It can be a dataframe or a numpy matrix.
    metrics: metrics to calculate the model performance. It is a list of strings. Only 'mse' and 'accuracy' supported.
    transformation: a function to apply to the partitions of y_train.
    transformation_inv: the inverse of transformation. It is applied to the predictions of the y_train partitions.
    n_splits: number of folds. It must be at least 2.
    seed: random number seed.
    n_jobs: number of CPUs to use during the cross validation
    '''
    
    print("Start random forest fitting:\n")
    reg = RandomForestRegressor(n_jobs=n_jobs)
    
    skf = StratifiedKFold(n_splits=n_splits)
    list_mse_tr = []
    list_mse_val = []
    list_accuracy_tr = []
    list_accuracy_val = []

    count = 0
    for train_index, test_index in skf.split(X_train, y_train):
        count += 1
        X_tr, X_val = X_train[train_index], X_train[test_index]
        y_tr, y_val = y_train[train_index], y_train[test_index]
        reg.fit(X_tr, y_tr)
        
        # predict y and calculate the loss (MSE)
        y_pred_tr = reg.predict(X_tr)
        y_pred_val = reg.predict(X_val)
        mse_tr = mean_squared_error(y_tr,y_pred_tr)
        mse_val = mean_squared_error(y_val,y_pred_val)
        
        if(transformation_inv is not None):
            try:
                y_pred_tr = transformation_inv(y_pred_tr)
                y_pred_val = transformation_inv(y_pred_val)
            except:
                pass
        
        if('mean_squared_error' in metrics):
            list_mse_tr.append(mse_tr)
            list_mse_val.append(mse_val)
        if('accuracy' in metrics):
            list_accuracy_tr.append(accuracy_score(y_tr,reg.predict(X_tr)))
            list_accuracy_val.append(accuracy_score(y_val,reg.predict(X_val)))
        print("Round: {0}, mean_squared_error on train: {1}, mean_squared_error on test: {2}".format(count,mse_tr,mse_val));
    
    
    if('mean_squared_error' in metrics):
        print("Random forest regressor mean squared error on train:",np.mean(list_mse_tr))
        print("Random forest regressor mean squared error on test:",np.mean(list_mse_val))
    if('accuracy' in metrics):
        print("Random forest accuracy on train:",np.mean(list_accuracy_tr))
        print("Random forest accuracy on test with:",np.mean(list_accuracy_val))
    
    print("Prediction of null features: ",reg.predict(np.zeros((1,len(X_tr[0]))))[0])
    
    return reg

random_forest(X_train, y_train, n_jobs=4)
#random_forest(X_train, y_train, transformation=np.log10,transformation_inv=lambda array: np.power(10,array))

Start random forest fitting:





Round: 1, mean_squared_error on train: 8.635870579373279, mean_squared_error on test: 65.32265243711369
Round: 2, mean_squared_error on train: 8.593233572369384, mean_squared_error on test: 66.43851847623966
Round: 3, mean_squared_error on train: 9.234641759282377, mean_squared_error on test: 58.91511822658401
Random forest regressor mean squared error on train: 8.821248637008345
Random forest regressor mean squared error on test: 63.55876304664579
Prediction of null features:  1.72


RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=4, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [9]:
## xgboost

def xgboost(X_train, y_train,metrics:list=['mean_squared_error'], 
                  transformation=None,transformation_inv=None, 
                  n_splits=3,seed=2,n_jobs=1,n_estimators:int=50):
    ''''
    X_train:training data with dependent variables. It can be a dataframe or a numpy matrix.
    y_train: training data with independent variable (h-index). It can be a dataframe or a numpy matrix.
    metrics: metrics to calculate the model performance. It is a list of strings. Only 'mse' and 'accuracy' supported.
    transformation: a function to apply to the partitions of y_train.
    transformation_inv: the inverse of transformation. It is applied to the predictions of the y_train partitions.
    n_splits: number of folds. It must be at least 2.
    seed: random number seed.
    n_jobs: number of CPUs to use during the cross validation
    n_estimators : Number of gradient boosted trees. Equivalent to number of boosting rounds.
    '''
    
           
    print("Start xgboost fitting:\n")
    reg = xgb.XGBRegressor(n_estimators = n_estimators, n_jobs=n_jobs)
    
    skf = StratifiedKFold(n_splits=n_splits)
    list_mse_tr = []
    list_mse_val = []
    list_accuracy_tr = []
    list_accuracy_val = []

    count = 0
    for train_index, test_index in skf.split(X_train, y_train):
        count += 1
        X_tr, X_val = X_train[train_index], X_train[test_index]
        y_tr, y_val = y_train[train_index], y_train[test_index]
        reg.fit(X_tr, y_tr)
        
        # predict y and calculate the loss (MSE)
        y_pred_tr = reg.predict(X_tr)
        y_pred_val = reg.predict(X_val)
        mse_tr = mean_squared_error(y_tr,y_pred_tr)
        mse_val = mean_squared_error(y_val,y_pred_val)
        
        if(transformation_inv is not None):
            try:
                y_pred_tr = transformation_inv(y_pred_tr)
                y_pred_val = transformation_inv(y_pred_val)
            except:
                pass
        
        if('mean_squared_error' in metrics):
            list_mse_tr.append(mse_tr)
            list_mse_val.append(mse_val)
        if('accuracy' in metrics):
            list_accuracy_tr.append(accuracy_score(y_tr,reg.predict(X_tr)))
            list_accuracy_val.append(accuracy_score(y_val,reg.predict(X_val)))
        print("Round: {0}, mean_squared_error on train: {1}, mean_squared_error on test: {2}".format(count,mse_tr,mse_val));
    
    
    if('mean_squared_error' in metrics):
        print("XGBoost mean squared error on train:",np.mean(list_mse_tr))
        print("XGBoost mean squared error on test:",np.mean(list_mse_val))
    if('accuracy' in metrics):
        print("XGBoost accuracy on train:",np.mean(list_accuracy_tr))
        print("XGBoost accuracy on test with:",np.mean(list_accuracy_val))
        
    print("Prediction of null features: ",reg.predict(np.zeros((1,len(X_tr[0]))))[0])
    
    return reg

xgboost(X_train, y_train, n_jobs=4)
#xgboost(X_train, y_train, transformation=np.log10,transformation_inv=lambda array: np.power(10,array))

Start xgboost fitting:





Round: 1, mean_squared_error on train: 43.18323071927775, mean_squared_error on test: 62.00633096318887
Round: 2, mean_squared_error on train: 43.26151467388286, mean_squared_error on test: 63.38441260473924
Round: 3, mean_squared_error on train: 46.01613559868656, mean_squared_error on test: 56.47981878385642
XGBoost mean squared error on train: 44.153626997282394
XGBoost mean squared error on test: 60.62352078392818
Prediction of null features:  1.2112536


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=50, n_jobs=4,
             num_parallel_tree=1, objective='reg:squarederror',
             predictor='auto', random_state=0, reg_alpha=0, reg_lambda=1,
             scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [7]:
## lightgbm

def lightgbm(X_train, y_train,metrics:list=['mean_squared_error'], 
                  transformation=None,transformation_inv=None, 
                  n_splits=3,seed=2,n_jobs=1,colsample_bytree=0.4,
                  alpha=0.05,learning_rate=0.025,n_estimators=6000):
    
    '''
    X_train:training data with dependent variables. It can be a dataframe or a numpy matrix.
    y_train: training data with independent variable (h-index). It can be a dataframe or a numpy matrix.
    metrics: metrics to calculate the model performance. It is a list of strings. Only 'mse' and 'accuracy' supported.
    transformation: a function to apply to the partitions of y_train.
    transformation_inv: the inverse of transformation. It is applied to the predictions of the y_train partitions.
    n_splits: number of folds. It must be at least 2.
    seed: random number seed.
    n_jobs: number of CPUs to use during the cross validation
    colsample_bytree: subsample ratio of columns(features) when constructing each tree.
    alpha: parameter for Huber loss and Quantile regression.
    learning_rate: boosting learning rate.
    n_estimators: number of boosted trees to fit
    '''
    
    if(transformation is not None):
        try:
            y_train = transformation(y_train)
        except:
            pass
            
    print("Start lightgbm fitting:\n")
    reg = lgb.LGBMRegressor(is_unbalance=True,
                        colsample_bytree=colsample_bytree, 
                        importance_type='gain', alpha=alpha,
                        objective='mse', learning_rate=learning_rate,
                        n_estimators=n_estimators,
                        n_jobs=n_jobs
                        )
    
    skf = StratifiedKFold(n_splits=n_splits)
    list_mse_tr = []
    list_mse_val = []
    list_accuracy_tr = []
    list_accuracy_val = []

    count = 0
    for train_index, test_index in skf.split(X_train, y_train):
        count += 1
        X_tr, X_val = X_train[train_index], X_train[test_index]
        y_tr, y_val = y_train[train_index], y_train[test_index]
        reg.fit(X_tr, y_tr)
        y_pred_tr = reg.predict(X_tr)
        y_pred_val = reg.predict(X_val)
        mse_tr = mean_squared_error(y_tr,y_pred_tr))
        mse_val = mean_squared_error(y_val,reg.predict(X_val))
        
        if(transformation_inv is not None):
            try:
                y_pred_tr = transformation_inv(y_pred_tr)
                y_pred_val = transformation_inv(y_pred_val)
            except:
                pass
        
        if('mean_squared_error' in metrics):
            list_mse_tr.append(mse_tr)
            list_mse_val.append(mse_val)
        if('accuracy' in metrics):
            list_accuracy_tr.append(accuracy_score(y_tr,reg.predict(X_tr)))
            list_accuracy_val.append(accuracy_score(y_val,reg.predict(X_val)))
        print("Round: {0}, mean_squared_error on train: {1}, mean_squared_error on test: {2}".format(count,mse_train,mse_val));
    
    
    if('mean_squared_error' in metrics):
        print("Lightgbm mean squared error on train:",np.mean(list_mse_tr))
        print("Lightgbm mean squared error on test:",np.mean(list_mse_val))
    if('accuracy' in metrics):
        print("Lightgbm accuracy on train:",np.mean(list_accuracy_tr))
        print("Lightgbm accuracy on test with:",np.mean(list_acuracy_val))
    
    print("Prediction of null features: ",reg.predict(np.zeros((1,len(X_tr[0]))))[0])
    
    return reg

best_model = lightgbm(X_train, y_train, n_jobs=4)
#lightgbm(X_train, y_train, transformation=np.log10,transformation_inv=lambda array: np.power(10,array))

Start lightgbm fitting:





Round: 1, mean_squared_error on train: 13.694545744795674, mean_squared_error on test: 52.453976893227996
Round: 2, mean_squared_error on train: 13.61434806840507, mean_squared_error on test: 53.26295298326549
Round: 3, mean_squared_error on train: 14.13177640416875, mean_squared_error on test: 46.47593186666799
Lightgbm mean squared error on train: 13.813556739123165
Lightgbm mean squared error on test: 50.73095391438716
Prediction of null features:  0.8627312457832965


In [12]:
# saving predictions for the test data

h_test = best_model.predict(X_test)
predictions = pd.DataFrame()
predictions['author'] = X_id_test
predictions['hindex'] = h_test
predictions.to_csv('submission.csv',index=False)