In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
import lightgbm as lgb

from sklearn.metrics import roc_auc_score,accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import time as time
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold, RandomizedSearchCV

In [30]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

In [31]:
count_vectorizer = feature_extraction.text.CountVectorizer()

In [32]:
train_vectors = count_vectorizer.fit_transform(train_df["text"])

## note that we're NOT using .fit_transform() here. Using just .transform() makes sure
# that the tokens in the train vectors are the only ones mapped to the test vectors - 
# i.e. that the train and test vectors use the same set of tokens.
test_vectors = count_vectorizer.transform(test_df["text"])

In [6]:
params = {
          'boosting_type': 'gbdt',
          'objective': 'binary',
          'metric': 'f1',
          'verbosity': -1,
          'random_state': 42, 
          'reg_alpha': 0, 
          'reg_lambda': 0, 
          'colsample_bytree': 0.5, 
          'max_depth': 15, 
          'num_leaves': 60, 
          'min_child_samples': 30, 
          'min_data_per_groups': 15
         }

In [7]:
# Define the parameter grid for RandomizedSearchCV
param_grid = {
    'n_estimators': np.arange(50, 500, 50),
    'learning_rate': np.linspace(0.01, 0.3, 30),
    'subsample': np.linspace(0.5, 1.0, 6),
}

In [8]:
# Set up the XGBClassifier
lgb_model = lgb.LGBMClassifier(**params)

# Set up KFold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

setup=time.time()
# Set up the RandomizedSearchCV with k-fold cross-validation
random_search = RandomizedSearchCV(
    estimator=lgb_model,
    param_distributions=param_grid,
    n_iter=50,
    scoring='f1',
    cv=kf,
    verbose=1,
    random_state=42,
    n_jobs=-1,
    error_score='raise'
)
setup_end=time.time()
print(setup_end-setup)


0.0


In [9]:
# Fit the RandomizedSearchCV with the training data
fitting_start=time.time()
X1=train_vectors.astype(np.float64)
random_search.fit(X1, train_df['target'])
fitting_end=time.time()
print(fitting_end-fitting_start)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
84.81478261947632


In [10]:
# Get the best parameters
bparam_start=time.time()
best_params = random_search.best_params_
print("Best parameters found: ", best_params)
bparam_end=time.time()
print(bparam_end-bparam_start)

Best parameters found:  {'subsample': 0.7, 'n_estimators': 450, 'learning_rate': 0.08999999999999998}
0.0007195472717285156


In [50]:
def cross_val_train(X, y, test, params):
    
    spl=10   # Number of folders
    
    # Inititate arrays with predictions and oof predictions
    test_preds = np.zeros((len(test),2))
    val_preds = np.zeros((len(X),2))
    val_scores, train_scores = [],[]
    
    # perform cross-validation split
    cv = KFold(spl, shuffle=True, random_state=42)
    
    # "for" cycle to train for each fold
    for fold, (train_ind, valid_ind) in enumerate(cv.split(X,y)):
        
        # divide train and validation data
        X_train = np.asarray(X[train_ind])
        y_train = y[train_ind]
        X_val = np.asarray(X[valid_ind])
        y_val = y[valid_ind]
        
        # Initiate model lightGBM 
        model = lgb.LGBMClassifier(**params,**best_params)
        
        # fit the model
        model.fit(X_train, y_train,
                    eval_set=[(X_val, y_val)],
                    callbacks=[lgb.early_stopping(stopping_rounds=70), lgb.log_evaluation(100)], eval_metric='logloss') 
        
        # predictions on train and validation data
        y_pred_trn = model.predict_proba(X_train)
        y_pred_val = model.predict_proba(X_val)
        
        # compute accuracy
        train_acc = accuracy_score(y_train, np.argmax(y_pred_trn, axis=1))
        val_acc = accuracy_score(y_val, np.argmax(y_pred_val, axis=1))
        
        # print partial results for the fold
        print("Fold:",fold, " Train R2:",np.round(train_acc,5), " Val R2:",np.round(val_acc,5))
        
        # compute test predictions and oof predictions
        test=np.asarray(test)
        test_preds += model.predict_proba(test)/spl
        val_preds[valid_ind] = model.predict_proba(X_val)
        val_scores.append(val_acc)
        print("-"*50)
        
    return val_scores, val_preds, test_preds

In [56]:
len(test_vectors)

TypeError: sparse array length is ambiguous; use getnnz() or shape[0]

In [58]:
val_scores, val_preds, test_preds = cross_val_train(train_vectors.todense(), train_df['target'], test_vectors.todense(), params)

Training until validation scores don't improve for 70 rounds
[100]	valid_0's binary_logloss: 0.512653
[200]	valid_0's binary_logloss: 0.511972
Early stopping, best iteration is:
[181]	valid_0's binary_logloss: 0.509478
Fold: 0  Train R2: 0.83243  Val R2: 0.76772
--------------------------------------------------
Training until validation scores don't improve for 70 rounds
[100]	valid_0's binary_logloss: 0.482039
[200]	valid_0's binary_logloss: 0.475863
Early stopping, best iteration is:
[189]	valid_0's binary_logloss: 0.475135
Fold: 1  Train R2: 0.83856  Val R2: 0.77822
--------------------------------------------------
Training until validation scores don't improve for 70 rounds
[100]	valid_0's binary_logloss: 0.464153
[200]	valid_0's binary_logloss: 0.444705
[300]	valid_0's binary_logloss: 0.442124
Early stopping, best iteration is:
[277]	valid_0's binary_logloss: 0.441476
Fold: 2  Train R2: 0.8463  Val R2: 0.80052
--------------------------------------------------
Training until val

In [59]:
val_preds_out = np.argmax(val_preds, axis=1)
accuracy_score(train_df['target'], val_preds_out)

0.7828714041770656

In [60]:
test_preds_out=np.argmax(test_preds, axis=1)

In [61]:
sample_submission = pd.read_csv("sample_submission.csv")
sample_submission["target"] = np.argmax(test_preds, axis=1)
sample_submission.head()

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,0
4,11,1


In [62]:
sample_submission.to_csv("submission.csv", index=False)