In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
#from sklearn.linear_model import SGDClassifier
#from sklearn.svm import LinearSVC
from sklearn.metrics import recall_score, precision_score, f1_score
from scipy.sparse import hstack, csr_matrix

import gc
import os
import pickle
import time

In [2]:
# Root directory that holds train.csv / test.csv, the pickled feature matrices and
# the predictions/ output folder. Set the CFT_DATA_DIR environment variable to run
# the notebook on another machine; the default is the original local path.
PATH_TO_DATA = os.environ.get('CFT_DATA_DIR', 'D:/Py/DataFrames/CFT_Contest(Datasouls)/')

In [16]:
# Load the raw competition tables from PATH_TO_DATA.
train = pd.read_csv(os.path.join(PATH_TO_DATA, 'train.csv'))
# NOTE(review): `test` is not referenced again in the visible cells - confirm it is needed.
test = pd.read_csv(os.path.join(PATH_TO_DATA, 'test.csv'))

# Labels as a NumPy array; later passed as `y` to run_calculations.
y_train_values = train['target'].values
print(train.shape)

(1991104, 5)


# Determining the correctness of a name

### Data loading

In [3]:
# Load the pre-computed feature objects for train and test; they are hstack-ed
# with the TF-IDF matrices in the following cells.
# NOTE: pickle.load can execute arbitrary code - only load files you produced yourself.
with open(os.path.join(PATH_TO_DATA, 'train_features.pkl'), 'rb') as train_pkl:
    train_features = pickle.load(train_pkl)
with open(os.path.join(PATH_TO_DATA, 'test_features.pkl'), 'rb') as test_pkl:
    test_features = pickle.load(test_pkl)

In [6]:
# [LOAD] TEST TF-IDF pickle file
with open(os.path.join(PATH_TO_DATA, '26_10/test_tfidf.pkl'), 'rb') as test_tfidf_pkl:
    csr_test = pickle.load(test_tfidf_pkl)
    
# Place the extra feature columns next to the TF-IDF columns and store the result as CSR.
s_test = csr_matrix(hstack([csr_test, test_features]))

# s_test now holds the data: drop the inputs and the (already closed) file handles,
# including `test_pkl` from the feature-loading cell.
del csr_test, test_features, test_pkl, test_tfidf_pkl
gc.collect()

0

In [9]:
# [LOAD] TRAIN TF-IDF pickle file
with open(os.path.join(PATH_TO_DATA, '26_10/train_tfidf.pkl'), 'rb') as train_tfidf_pkl:
    csr_train = pickle.load(train_tfidf_pkl)
    
# Place the extra feature columns next to the TF-IDF columns and store the result as CSR;
# run_calculations later selects fold rows from this matrix with train[train_index].
s_train = csr_matrix(hstack([csr_train, train_features]))

# s_train now holds the data: drop the inputs and the (already closed) file handles,
# including `train_pkl` from the feature-loading cell.
del csr_train, train_features, train_pkl, train_tfidf_pkl
gc.collect()

106

## Learning (classification into 3 classes)

In [13]:
def run_calculations(train, y, test, func_name=None):
    """Run 5-fold stratified cross-validation with one of the model runners.

    On every fold the selected ``run_<func_name>`` function is fitted on the
    training part and scores both the held-out part and the full test matrix.
    Held-out scores are written into a single out-of-fold (OOF) array; test
    scores are averaged over the folds.

    Parameters
    ----------
    train : row-indexable matrix
        Training features; must support ``train[index_array]``.
    y : numpy.ndarray
        Labels aligned with the rows of ``train``.
    test : matrix
        Test features, handed unchanged to the runner on every fold.
    func_name : str
        One of 'logreg', 'ridge', 'bernoulli_nb', 'multinomial_nb',
        'sgd_classifier', 'linear_svc'. The runner is looked up as the global
        ``run_<func_name>`` at call time, so it may be defined in a later cell.

    Returns
    -------
    tuple of numpy.ndarray
        ``(y_oof_prob, avg_test_preds_prob)``: OOF scores with one row per
        training sample, and the fold-averaged test scores. The trailing shape
        is taken from the runner's output, so 3-column ``predict_proba``
        results as well as 2-column or 1-D ``decision_function`` results fit.

    Raises
    ------
    ValueError
        If ``func_name`` is empty or not one of the supported names.
    NameError
        If the runner for a supported name has not been defined.
    """
    supported = ('logreg', 'ridge', 'bernoulli_nb', 'multinomial_nb',
                 'sgd_classifier', 'linear_svc')

    # Validate before any expensive work. The previous `return print(...)`
    # returned None, which only surfaced at the call site as an unpacking error.
    if not func_name:
        raise ValueError('The function to run is not defined')
    if func_name not in supported:
        raise ValueError('The function to run is not correct')

    # Late lookup keeps the notebook behaviour: runners live in the same global
    # namespace and may be (re)defined after this function.
    runner = globals().get('run_' + func_name)
    if runner is None:
        raise NameError("runner 'run_{}' is not defined".format(func_name))

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    y_oof_prob = None
    test_preds_prob = []
    for train_index, val_index in skf.split(train, y):
        X_train, X_val = train[train_index], train[val_index]
        y_train, y_val = y[train_index], y[val_index]

        pred_test_prob, pred_oof_prob = runner(X_train, y_train, X_val, y_val, test)
        pred_oof_prob = np.asarray(pred_oof_prob)

        if y_oof_prob is None:
            # Size the OOF buffer from the first fold instead of hard-coding
            # the number of score columns.
            y_oof_prob = np.zeros((y.shape[0],) + pred_oof_prob.shape[1:])
        y_oof_prob[val_index] = pred_oof_prob
        test_preds_prob.append(np.asarray(pred_test_prob))

    # Element-wise mean of the per-fold test scores.
    avg_test_preds_prob = np.mean(test_preds_prob, axis=0)

    return y_oof_prob, avg_test_preds_prob

In [23]:
def run_multinomial_nb(train_X, train_y, val_X, val_y, test_X):
    """Fit Multinomial Naive Bayes (alpha=1e-4) on one fold and score test and validation data.

    ``val_y`` is accepted to match the signature run_calculations uses for all
    runners; it is not read here. The fit time is printed.

    Returns ``(test_probabilities, validation_probabilities)`` as produced by
    ``predict_proba``.
    """
    fit_started = time.time()
    model = MultinomialNB(alpha=1e-4)
    model.fit(train_X, train_y)
    fit_seconds = time.time() - fit_started
    print('Model training done in {} seconds.'.format(fit_seconds))

    # The test matrix is scored first, then the held-out fold.
    return model.predict_proba(test_X), model.predict_proba(val_X)

In [25]:
%%time
# Cross-validate Multinomial NB on the stacked features; yields the OOF scores
# for train and the fold-averaged scores for test.
train_oof_prob, test_pred_prob_list = run_calculations(s_train, y_train_values, s_test, 'multinomial_nb')

Model training done in 2.1001393795013428 seconds.
Model training done in 2.3646233081817627 seconds.
Model training done in 6.3093931674957275 seconds.
Model training done in 3.7238175868988037 seconds.
Model training done in 3.964337110519409 seconds.
Wall time: 5min 5s


In [31]:
# Wrap the NumPy outputs in DataFrames so they can be written as CSV.
train_oof = pd.DataFrame(train_oof_prob)
test_pred = pd.DataFrame(test_pred_prob_list)

# Persist OOF and test predictions (index=False: the row index is not written).
# NOTE(review): assumes the predictions/ folder already exists under PATH_TO_DATA.
train_oof.to_csv(os.path.join(PATH_TO_DATA, 'predictions/multinomialnb_alpha1e-4_train.csv'), index=False)
test_pred.to_csv(os.path.join(PATH_TO_DATA, 'predictions/multinomialnb_alpha1e-4_test.csv'), index=False)

In [35]:
# The predictions are on disk: drop the in-memory copies before the next model.
del train_oof_prob, train_oof, test_pred_prob_list, test_pred
gc.collect()

5329

**Multinomial NB scores:**
- F1: 0.8583698938317763
- Precision 0.8496888639117771
- Recall 0.871720463690965

In [36]:
def run_bernoulli_nb(train_X, train_y, val_X, val_y, test_X):
    """Fit Bernoulli Naive Bayes (alpha=1e-5) on one fold and score test and validation data.

    ``val_y`` exists only to keep the runner signature uniform and is unused.
    The fit time is printed.

    Returns ``(test_probabilities, validation_probabilities)`` as produced by
    ``predict_proba``.
    """
    t0 = time.time()
    nb_model = BernoulliNB(alpha=1e-5)
    nb_model.fit(train_X, train_y)
    elapsed = time.time() - t0
    print('Model training done in {} seconds.'.format(elapsed))

    # Score the test matrix first, then the held-out fold.
    test_scores = nb_model.predict_proba(test_X)
    val_scores = nb_model.predict_proba(val_X)
    return test_scores, val_scores

In [37]:
%%time
# Cross-validate Bernoulli NB on the same stacked features and labels.
train_oof_prob, test_pred_prob_list = run_calculations(s_train, y_train_values, s_test, 'bernoulli_nb')

Model training done in 2.1167805194854736 seconds.
Model training done in 2.6519722938537598 seconds.
Model training done in 2.255692481994629 seconds.
Model training done in 2.493311882019043 seconds.
Model training done in 2.9912667274475098 seconds.
Wall time: 4min 15s


In [38]:
# Convert the Bernoulli NB outputs to DataFrames for CSV export.
train_oof = pd.DataFrame(train_oof_prob)
test_pred = pd.DataFrame(test_pred_prob_list)

# Write OOF and test predictions without the row index.
# NOTE(review): assumes the predictions/ folder already exists under PATH_TO_DATA.
train_oof.to_csv(os.path.join(PATH_TO_DATA, 'predictions/bernoullinb_alpha1e-5_train.csv'), index=False)
test_pred.to_csv(os.path.join(PATH_TO_DATA, 'predictions/bernoullinb_alpha1e-5_test.csv'), index=False)

In [39]:
# Bernoulli NB predictions are saved: release them.
del train_oof_prob, train_oof, test_pred_prob_list, test_pred
gc.collect()

0

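**Bernoulli NB scores:**
- F1: 0.8583698938317763
- Precision 0.8496888639117771
- Recall 0.871720463690965

> **Note:** these values are digit-for-digit identical to the Multinomial NB scores above, which suggests they were copied rather than recomputed. Re-evaluate the Bernoulli NB out-of-fold predictions to confirm.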

In [14]:
def run_logreg(train_X, train_y, val_X, val_y, test_X):
    """Fit logistic regression (C=50, random_state=42) on one fold and score test and validation data.

    ``val_y`` is part of the shared runner signature and is not used. The fit
    time is printed.

    Returns ``(test_probabilities, validation_probabilities)`` as produced by
    ``predict_proba``.
    """
    fit_started = time.time()
    logreg = LogisticRegression(C=50, random_state=42)
    logreg.fit(train_X, train_y)
    fit_seconds = time.time() - fit_started
    print('Model training done in {} seconds.'.format(fit_seconds))

    # Only probabilities are returned; the test matrix is scored before the held-out fold.
    test_probabilities = logreg.predict_proba(test_X)
    val_probabilities = logreg.predict_proba(val_X)
    return test_probabilities, val_probabilities

In [17]:
%%time
# Cross-validate logistic regression (results are saved below under the C50 file names).
train_oof_prob, test_pred_prob_list = run_calculations(s_train, y_train_values, s_test, 'logreg')

Model training done in 890.8674981594086 seconds.
Model training done in 899.4130616188049 seconds.
Model training done in 748.1651532649994 seconds.


  np.exp(prob, prob)


Model training done in 774.2191443443298 seconds.
Model training done in 813.3616940975189 seconds.
Wall time: 1h 14min 9s


In [18]:
# Convert the logistic-regression outputs to DataFrames for CSV export.
train_oof = pd.DataFrame(train_oof_prob)
test_pred = pd.DataFrame(test_pred_prob_list)

# Write OOF and test predictions for the C=50 run without the row index.
# NOTE(review): assumes the predictions/ folder already exists under PATH_TO_DATA.
train_oof.to_csv(os.path.join(PATH_TO_DATA, 'predictions/logreg_C50_train.csv'), index=False)
test_pred.to_csv(os.path.join(PATH_TO_DATA, 'predictions/logreg_C50_test.csv'), index=False)

**Logistic regression scores (C=50):**
- F1: 0.922858258574291
- Precision 0.9302636490669167
- Recall 0.9168207153656756

In [49]:
%%time
# Second logistic-regression run; its results are saved below as 'C1e-1'.
# NOTE(review): the visible run_logreg definition uses C=50 - presumably the function was
# edited to C=1e-1 before this run and the edit was not kept; confirm before re-running.
train_oof_prob, test_pred_prob_list = run_calculations(s_train, y_train_values, s_test, 'logreg')

Model training done in 720.9846198558807 seconds.
Model training done in 669.339376449585 seconds.
Model training done in 759.0707349777222 seconds.
Model training done in 618.4439723491669 seconds.
Model training done in 735.4050974845886 seconds.
Wall time: 1h 1min 31s


In [50]:
# Convert the outputs of the second logistic-regression run to DataFrames.
train_oof = pd.DataFrame(train_oof_prob)
test_pred = pd.DataFrame(test_pred_prob_list)

# Write OOF and test predictions under the C1e-1 file names, without the row index.
# NOTE(review): assumes the predictions/ folder already exists under PATH_TO_DATA.
train_oof.to_csv(os.path.join(PATH_TO_DATA, 'predictions/logreg_C1e-1_train.csv'), index=False)
test_pred.to_csv(os.path.join(PATH_TO_DATA, 'predictions/logreg_C1e-1_test.csv'), index=False)

In [51]:
# Logistic-regression predictions are saved: release them.
del train_oof_prob, train_oof, test_pred_prob_list, test_pred
gc.collect()

0

**Logistic regression scores (C=1e-1):**
- F1: 0.88032509496412
- Precision 0.9034610530262324
- Recall 0.8621863857216052

In [52]:
def run_linear_svc(train_X, train_y, val_X, val_y, test_X):
    """Fit a linear SVM (C=1e-4, random_state=42) on one fold and score test and validation data.

    ``val_y`` is part of the shared runner signature and is not used. The fit
    time is printed.

    Returns ``(test_scores, validation_scores)`` as produced by
    ``decision_function``. These are decision-function outputs rather than
    probabilities, even though the shared variable naming says "prob".
    """
    # LinearSVC is only present as a commented-out line in the import cell, so
    # import it here; without this the call below raises NameError.
    from sklearn.svm import LinearSVC

    start_time = time.time()
    classifier = LinearSVC(C=1e-4, random_state=42)
    classifier.fit(train_X, train_y)
    print('Model training done in {} seconds.'.format(time.time() - start_time))

    pred_test_prob = classifier.decision_function(test_X)

    pred_oof_prob = classifier.decision_function(val_X)
    return pred_test_prob, pred_oof_prob

In [79]:
def run_ridge(train_X, train_y, val_X, val_y, test_X):
    """Fit a ridge classifier (alpha=1, random_state=42) on one fold and score test and validation data.

    ``val_y`` is part of the shared runner signature and is not used. The fit
    time is printed.

    Returns ``(test_scores, validation_scores)`` as produced by
    ``decision_function``. These are decision-function outputs rather than
    probabilities, even though the shared variable naming says "prob".
    """
    start_time = time.time()
    classifier = RidgeClassifier(alpha=1, random_state=42)
    classifier.fit(train_X, train_y)
    print('Model training done in {} seconds.'.format(time.time() - start_time))

    # A stray trailing `i` after this call made the whole cell a SyntaxError.
    pred_test_prob = classifier.decision_function(test_X)

    pred_oof_prob = classifier.decision_function(val_X)
    return pred_test_prob, pred_oof_prob

## Learning (binary classification)

#### I decided to merge target=0 and target=1 into a single class, turning the task into binary classification (is it a full name or not)

In [81]:
# Only the label column is needed here, so read just that column instead of the
# whole table.
train = pd.read_csv(os.path.join(PATH_TO_DATA, 'train.csv'), usecols=['target'])
# Binary label: 1 where the original target is 2, 0 for targets 0 and 1.
# The vectorised comparison replaces a per-row Python lambda.
y_train_values = (train['target'] == 2).astype('int64').values

del train
gc.collect()

150

In [97]:
def run_calculations(train, y, test, func_name=None):
    """Run 5-fold stratified cross-validation with one of the model runners.

    Re-definition of run_calculations for the binary task. The number of score
    columns is no longer hard-coded: it is taken from the runner's output.

    On every fold the selected ``run_<func_name>`` function is fitted on the
    training part and scores both the held-out part and the full test matrix.
    Held-out scores are written into a single out-of-fold (OOF) array; test
    scores are averaged over the folds.

    Parameters
    ----------
    train : row-indexable matrix
        Training features; must support ``train[index_array]``.
    y : numpy.ndarray
        Labels aligned with the rows of ``train``.
    test : matrix
        Test features, handed unchanged to the runner on every fold.
    func_name : str
        One of 'logreg', 'ridge', 'bernoulli_nb', 'multinomial_nb',
        'sgd_classifier', 'linear_svc'. The runner is looked up as the global
        ``run_<func_name>`` at call time.

    Returns
    -------
    tuple of numpy.ndarray
        ``(y_oof_prob, avg_test_preds_prob)``: OOF scores with one row per
        training sample, and the fold-averaged test scores.

    Raises
    ------
    ValueError
        If ``func_name`` is empty or not one of the supported names.
    NameError
        If the runner for a supported name has not been defined.
    """
    supported = ('logreg', 'ridge', 'bernoulli_nb', 'multinomial_nb',
                 'sgd_classifier', 'linear_svc')

    # Validate up front. The previous `return print(...)` returned None, which
    # only surfaced at the call site as an unpacking error.
    if not func_name:
        raise ValueError('The function to run is not defined')
    if func_name not in supported:
        raise ValueError('The function to run is not correct')

    # Resolve the runner from the global namespace at call time.
    runner = globals().get('run_' + func_name)
    if runner is None:
        raise NameError("runner 'run_{}' is not defined".format(func_name))

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    y_oof_prob = None
    test_preds_prob = []
    for train_index, val_index in skf.split(train, y):
        X_train, X_val = train[train_index], train[val_index]
        y_train, y_val = y[train_index], y[val_index]

        pred_test_prob, pred_oof_prob = runner(X_train, y_train, X_val, y_val, test)
        pred_oof_prob = np.asarray(pred_oof_prob)

        if y_oof_prob is None:
            # Allocate once, shaped after the first fold's output. 1-D
            # decision_function results, which a fixed 2-column buffer rejects
            # for binary labels, fit as well.
            y_oof_prob = np.zeros((y.shape[0],) + pred_oof_prob.shape[1:])
        y_oof_prob[val_index] = pred_oof_prob
        test_preds_prob.append(np.asarray(pred_test_prob))

    # Element-wise mean of the per-fold test scores.
    avg_test_preds_prob = np.mean(test_preds_prob, axis=0)

    return y_oof_prob, avg_test_preds_prob

In [65]:
%%time
# Cross-validate Multinomial NB on the binary labels (target == 2 vs. the rest).
train_oof_prob, test_pred_prob_list = run_calculations(s_train, y_train_values, s_test, 'multinomial_nb')

Model training done in 1.2310693264007568 seconds.
Model training done in 5.194786548614502 seconds.
Model training done in 1.6613967418670654 seconds.
Model training done in 1.6013519763946533 seconds.
Model training done in 1.6900887489318848 seconds.
Wall time: 7min 10s


In [66]:
# Convert the binary Multinomial NB outputs to DataFrames for CSV export.
train_oof = pd.DataFrame(train_oof_prob)
test_pred = pd.DataFrame(test_pred_prob_list)

# These files are read back later and used as extra features.
# NOTE(review): assumes predictions/binary_classification/ already exists under PATH_TO_DATA.
train_oof.to_csv(os.path.join(PATH_TO_DATA, 'predictions/binary_classification/binary_multinomialnb_alpha1e-4_train.csv'), index=False)
test_pred.to_csv(os.path.join(PATH_TO_DATA, 'predictions/binary_classification/binary_multinomialnb_alpha1e-4_test.csv'), index=False)

In [67]:
# Binary Multinomial NB predictions are saved: release them.
del train_oof_prob, train_oof, test_pred_prob_list, test_pred
gc.collect()

125

**Multinomial NB scores:**
- F1: 0.9566913660488069
- Precision 0.9788190319296961
- Recall 0.936598856357207

In [71]:
%%time
# Cross-validate Bernoulli NB on the binary labels.
train_oof_prob, test_pred_prob_list = run_calculations(s_train, y_train_values, s_test, 'bernoulli_nb')

Model training done in 2.7638161182403564 seconds.
Model training done in 2.791532278060913 seconds.
Model training done in 2.650484323501587 seconds.
Model training done in 2.599717617034912 seconds.
Model training done in 2.2613184452056885 seconds.
Wall time: 11min 19s


In [72]:
# Convert the binary Bernoulli NB outputs to DataFrames for CSV export.
train_oof = pd.DataFrame(train_oof_prob)
test_pred = pd.DataFrame(test_pred_prob_list)

# These files are read back later and used as extra features.
# NOTE(review): assumes predictions/binary_classification/ already exists under PATH_TO_DATA.
train_oof.to_csv(os.path.join(PATH_TO_DATA, 'predictions/binary_classification/binary_bernoullinb_alpha1e-5_train.csv'), index=False)
test_pred.to_csv(os.path.join(PATH_TO_DATA, 'predictions/binary_classification/binary_bernoullinb_alpha1e-5_test.csv'), index=False)

In [73]:
# Binary Bernoulli NB predictions are saved: release them.
del train_oof_prob, train_oof, test_pred_prob_list, test_pred
gc.collect()

0

**Bernoulli NB scores:**
- F1: 0.9421458133718718
- Precision 0.9244386242294811
- Recall 0.9614800538545566

In [68]:
%%time
# Cross-validate logistic regression on the binary labels; results are saved below as 'C1e-1'.
# NOTE(review): the visible run_logreg definitions use C=50 - confirm which C produced this run.
train_oof_prob, test_pred_prob_list = run_calculations(s_train, y_train_values, s_test, 'logreg')

Model training done in 141.974791765213 seconds.
Model training done in 100.13427305221558 seconds.
Model training done in 101.38522458076477 seconds.
Model training done in 81.07326221466064 seconds.
Model training done in 78.34653067588806 seconds.
Wall time: 18min 38s


In [69]:
# Convert the binary logistic-regression outputs to DataFrames for CSV export.
train_oof = pd.DataFrame(train_oof_prob)
test_pred = pd.DataFrame(test_pred_prob_list)

# These files are read back later and used as extra features.
# NOTE(review): assumes predictions/binary_classification/ already exists under PATH_TO_DATA.
train_oof.to_csv(os.path.join(PATH_TO_DATA, 'predictions/binary_classification/binary_logreg_C1e-1_train.csv'), index=False)
test_pred.to_csv(os.path.join(PATH_TO_DATA, 'predictions/binary_classification/binary_logreg_C1e-1_test.csv'), index=False)

In [70]:
# Binary logistic-regression predictions are saved: release them.
del train_oof_prob, train_oof, test_pred_prob_list, test_pred
gc.collect()

0

**Logistic Regression scores (C=1e-1):**
- F1: 0.9706390851188221
- Precision 0.9954900684218698
- Recall 0.9482566797125438

----

**Here I use the predicted probabilities from binary classification as additional features in Logistic Regression**
- This trick has improved the quality of the classification into 3 classes

In [28]:
# Read back the binary-classification predictions written by the save cells:
# one train (out-of-fold) and one test (fold-averaged) file per model.
train_multinomial = pd.read_csv(os.path.join(PATH_TO_DATA, 'predictions/binary_classification/binary_multinomialnb_alpha1e-4_train.csv'))
test_multinomial = pd.read_csv(os.path.join(PATH_TO_DATA, 'predictions/binary_classification/binary_multinomialnb_alpha1e-4_test.csv'))

train_bern = pd.read_csv(os.path.join(PATH_TO_DATA, 'predictions/binary_classification/binary_bernoullinb_alpha1e-5_train.csv'))
test_bern = pd.read_csv(os.path.join(PATH_TO_DATA, 'predictions/binary_classification/binary_bernoullinb_alpha1e-5_test.csv'))

train_logreg = pd.read_csv(os.path.join(PATH_TO_DATA, 'predictions/binary_classification/binary_logreg_C1e-1_train.csv'))
test_logreg = pd.read_csv(os.path.join(PATH_TO_DATA, 'predictions/binary_classification/binary_logreg_C1e-1_test.csv'))

In [35]:
# One array per binary model, in the order multinomial NB, Bernoulli NB, logistic
# regression. Column '0' is the first score column of each saved file
# (presumably P(class 0), i.e. "not a full name" - confirm against classifier.classes_).
train_biclassification_pred = [
    preds['0'].values
    for preds in (train_multinomial, train_bern, train_logreg)
]

test_biclassification_pred = [
    preds['0'].values
    for preds in (test_multinomial, test_bern, test_logreg)
]

In [38]:
# The needed columns are copied into the *_biclassification_pred lists: drop the DataFrames.
del train_multinomial, test_multinomial, train_bern, test_bern, train_logreg, test_logreg
gc.collect()

1212

In [44]:
# Turn the list of per-model arrays into one column per model (via .T) and append
# those columns to the existing sparse features; the result is stored as CSR.
csr_train = csr_matrix(hstack([s_train, np.array(train_biclassification_pred).T]))
csr_test = csr_matrix(hstack([s_test, np.array(test_biclassification_pred).T]))

In [45]:
# csr_train / csr_test replace the previous matrices; s_train and s_test are gone
# after this cell, so earlier cells that use them cannot be re-run without reloading.
del s_train, train_biclassification_pred, s_test, test_biclassification_pred
gc.collect()

0

In [48]:
def run_logreg(train_X, train_y, val_X, val_y, test_X):
    """Train logistic regression (C=50, random_state=42) on a fold; return test and validation probabilities.

    Re-definition of the earlier run_logreg cell. ``val_y`` is unused and kept
    only for the shared runner signature. The fit time is printed.

    Returns a ``(test, validation)`` pair of ``predict_proba`` outputs.
    """
    t0 = time.time()
    clf = LogisticRegression(C=50, random_state=42)
    clf.fit(train_X, train_y)
    print('Model training done in {} seconds.'.format(time.time() - t0))

    # Test matrix first, held-out fold second.
    return clf.predict_proba(test_X), clf.predict_proba(val_X)

In [50]:
%%time
# Cross-validate logistic regression on the features extended with the binary-model scores.
# NOTE(review): read top to bottom, y_train_values was last rebound to the binary labels
# (target == 2), while the markdown above describes a 3-class run - confirm that the
# 3-class labels are restored before this cell is executed.
train_oof_prob, test_pred_prob_list = run_calculations(csr_train, y_train_values, csr_test, 'logreg')

Model training done in 973.0003640651703 seconds.


  np.exp(prob, prob)


Model training done in 942.7906486988068 seconds.
Model training done in 1030.9394733905792 seconds.
Model training done in 825.2373912334442 seconds.
Model training done in 871.2575132846832 seconds.
Wall time: 1h 21min 41s


In [51]:
print('Logistic Regression + вероятности 3х моделей по бинарной классификации')
# Predicted class = column with the highest OOF score. Computed once and reused
# by all three metrics instead of repeating the argmax for each of them.
oof_labels = np.argmax(train_oof_prob, axis=1)
# Macro averaging: the metric is computed per class and averaged without weighting.
f1 = f1_score(y_train_values, oof_labels, average='macro')
precision = precision_score(y_train_values, oof_labels, average='macro')
recall = recall_score(y_train_values, oof_labels, average='macro')
print('F1:', f1)
print('Precision', precision)
print('Recall', recall)

Logistic Regression + вероятности 3х моделей по бинарной классификации
F1: 0.9245931659010105
Precision 0.9308536468556395
Recall 0.9195525481308776


In [52]:
# Convert the stacked logistic-regression outputs to DataFrames for CSV export.
train_oof = pd.DataFrame(train_oof_prob)
test_pred = pd.DataFrame(test_pred_prob_list)

# Write OOF and test predictions without the row index.
# NOTE(review): assumes the predictions/ folder already exists under PATH_TO_DATA.
train_oof.to_csv(os.path.join(PATH_TO_DATA, 'predictions/logreg_and_binary_preds_train.csv'), index=False)
test_pred.to_csv(os.path.join(PATH_TO_DATA, 'predictions/logreg_and_binary_preds_test.csv'), index=False)