# Binary PLSR

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import nltk
from sklearn import preprocessing, model_selection, metrics, feature_selection, ensemble, linear_model, cross_decomposition, feature_extraction, decomposition
import time
from sklearn.externals import joblib
import re
%matplotlib inline

In [2]:
train = pd.read_csv('../../../train.csv.zip',compression='zip')
test = pd.read_csv('../../../test.csv.zip',compression='zip')

In [3]:
variables = ['param_1','param_2','param_3','region','city','parent_category_name','category_name']
compnames = ['p1plsr','p2plsr','p3plsr','regplsr','cityplsr','ptcatplsr','catnamplsr'] # custom name for component

In [4]:
for var,compname in zip(variables,compnames):
    print(var,compname)
    def joiner(string):
        pattern = "[\[].,/-# <>`~+!*?[\]]"
        string = string.replace(' ','_').replace(',','_')
        return re.sub(pattern, "", string)

    vec = feature_extraction.text.CountVectorizer(
        lowercase=True,
        binary=True,
        preprocessor=joiner)
    # Fitting on train and test as merged lists
    vec.fit(train[var].astype(str).tolist() + test[var].astype(str).tolist())
    print('N tokens:',len(vec.get_feature_names())) 
    
    counts_train = vec.transform(train[var].astype(str).tolist())
    counts_test = vec.transform(test[var].astype(str).tolist())

    reduced_train = pd.DataFrame(index=train.index)
    reduced_test = pd.DataFrame(index=test.index) 
 
    # Reduce all CSR values in batches
    t = time.time()
    start_col = 0
    varname = var[:5]
    n_comp = 3
    ##########################################
    n_cols = counts_train.shape[1]
    col_step = 1500
    col_end = n_cols + col_step
    ##########################################
    # Start iteration with columns
    low_col = start_col
    corrupted = False
    for col in np.arange(0,col_end,col_step):
        # Limiting the edge case of the last values
        if col > n_cols:
            col = n_cols
        up_col = col

        if up_col > low_col:
            ###########################################
            # Train PLSR on a large sample of train vectors
            print('Columns: {}-{}'.format(low_col,up_col))
            index = np.random.choice(len(train),size=int(4e5))
            sample = counts_train[index,low_col:up_col].toarray()
            reduce = cross_decomposition.PLSRegression(n_components=n_comp)
            reduce.fit(sample,train.iloc[index].deal_probability)
            print('Prelim score for column range:',reduce.score(sample,train.iloc[index].deal_probability))
            ##########################################
            # (TRAIN) Nested indexes iteration
            # Initial values:
            n_rows = len(train)
            row_step = int(2.5e5)
            row_end = n_rows + row_step
            components = pd.DataFrame()
            low_idx = 0
            ###########
            for idx in np.arange(0,row_end,row_step):
                # Limiting the edge case of the last values
                if idx > n_rows:
                    idx = n_rows
                up_idx = idx

                if up_idx > low_idx:
                    sample = counts_train[low_idx:up_idx,low_col:up_col].toarray()
                    sample = reduce.transform(sample)
                    components = components.append(pd.DataFrame(sample))
                    low_idx = idx
            components.reset_index(drop=True,inplace=True)
            components.columns = ['col_{}-{}_{}'.format(low_col,up_col,i) for i in range(0,n_comp)]

            # Cross-validate and check for corruptions before joining
            cv = model_selection.cross_val_score(
                cv=4,estimator=linear_model.LinearRegression(),
                X=reduced_train.join(components),y=train.deal_probability)

            print('Aggregate cv:',cv)        

            if sum([score < 0 for score in cv]) > 0:
                print('Reached corruption.\n Final decomposition without joining...')
                print(reduced_train.shape,reduced_test.shape)
                reduce = cross_decomposition.PLSRegression(n_components=n_comp)
                reduce.fit(reduced_train,train.deal_probability)
                reduced_train = pd.DataFrame(
                    reduce.transform(reduced_train),
                    columns=['{}_{}_{}_{}'.format(varname,compname,i,up_col) for i in range(0,n_comp)])
                reduced_test = pd.DataFrame(
                    reduce.transform(reduced_test),
                    columns=['{}_{}_{}_{}'.format(varname,compname,i,up_col) for i in range(0,n_comp)])
                print('Aggregate cv after decomposition:',model_selection.cross_val_score(
                    cv=4,estimator=linear_model.LinearRegression(),
                    X=reduced_train,y=train.deal_probability))
                print('Minutes:',(time.time()-t)/60)
                break

            # Join if it wasn't corrupted..    
            reduced_train = reduced_train.join(components)
            ###########################################
            # (TEST) Nested indexes iteration
            # Initial values:
            n_rows = len(test)
            row_step = int(2e5)
            row_end = n_rows + row_step
            components = pd.DataFrame()
            low_idx = 0
            ###########
            for idx in np.arange(0,row_end,row_step):
                if idx > n_rows:
                    idx = n_rows
                up_idx = idx

                if up_idx > low_idx:
                    sample = counts_test[low_idx:up_idx,low_col:up_col].toarray()
                    sample = reduce.transform(sample)
                    components = components.append(pd.DataFrame(sample))
                    low_idx = idx
            components.reset_index(drop=True,inplace=True)
            components.columns = ['col_{}-{}_{}'.format(low_col,up_col,i) for i in range(0,n_comp)]
            reduced_test = reduced_test.join(components)
            print(reduced_train.shape,reduced_test.shape)
            #####################################
            # Prepare for next column range
            low_col = col     
            #####################################    
            # Interval decompositions
            if up_col%(col_step*10) == 0:
                print('Interval decomposition...')
                print(reduced_train.shape,reduced_test.shape)
                reduce = cross_decomposition.PLSRegression(n_components=n_comp)
                reduce.fit(reduced_train,train.deal_probability)
                reduced_train = pd.DataFrame(
                    reduce.transform(reduced_train),
                    columns=['{}_{}_{}_{}'.format(varname,compname,i,up_col) for i in range(0,n_comp)])
                reduced_test = pd.DataFrame(
                    reduce.transform(reduced_test),
                    columns=['{}_{}_{}_{}'.format(varname,compname,i,up_col) for i in range(0,n_comp)])
                print('Aggregate cv after decomposition:',model_selection.cross_val_score(
                    cv=4,estimator=linear_model.LinearRegression(),
                    X=reduced_train,y=train.deal_probability))
            #####################################    
            # Save progress every n steps
            if up_col%(col_step*5) == 0:
                print('Interval save to disk...')
                joblib.dump(reduced_train,'train_{}_{}.sav'.format(varname,compname))
                joblib.dump(reduced_test,'test_{}_{}.sav'.format(varname,compname))   
    #####################################
    # Final round of decomposition
    print('Final decomposition...')
    reduce = cross_decomposition.PLSRegression(n_components=n_comp)
    reduce.fit(reduced_train,train.deal_probability)
    reduced_train = pd.DataFrame(
        reduce.transform(reduced_train),
        columns=['{}_{}_{}_{}'.format(varname,compname,i,up_col) for i in range(0,n_comp)])
    reduced_test = pd.DataFrame(
        reduce.transform(reduced_test),
        columns=['{}_{}_{}_{}'.format(varname,compname,i,up_col) for i in range(0,n_comp)])
    print('Aggregate cv after decomposition:',model_selection.cross_val_score(
        cv=4,estimator=linear_model.LinearRegression(),
        X=reduced_train,y=train.deal_probability))
    #########
    # Last save to disk
    joblib.dump(reduced_train,'train_{}_{}.sav'.format(varname,compname))
    joblib.dump(reduced_test,'test_{}_{}.sav'.format(varname,compname))
    #########
    print('Minutes:',(time.time()-t)/60)

param_1 p1plsr
N tokens: 387
Columns: 0-387
Prelim score for column range: 0.1585976928629911
Aggregate cv: [0.15679294 0.15318207 0.15617345 0.15660435]
(1503424, 3) (508438, 3)
Final decomposition...
Aggregate cv after decomposition: [0.15679294 0.15318207 0.15617345 0.15660435]
Minutes: 0.2599694490432739
param_2 p2plsr
N tokens: 280
Columns: 0-280
Prelim score for column range: 0.1236005744489529
Aggregate cv: [0.12185657 0.11980314 0.1214753  0.12158728]
(1503424, 3) (508438, 3)
Final decomposition...
Aggregate cv after decomposition: [0.12185657 0.11980314 0.1214753  0.12158728]
Minutes: 0.2080145796140035
param_3 p3plsr
N tokens: 1269
Columns: 0-1269
Prelim score for column range: 0.06571525287454738
Aggregate cv: [0.06351728 0.06121546 0.06336003 0.06263238]
(1503424, 3) (508438, 3)
Final decomposition...
Aggregate cv after decomposition: [0.06351728 0.06121546 0.06336003 0.06263238]
Minutes: 0.7164393464724222
region regplsr
N tokens: 29
Columns: 0-29
Prelim score for column r

# Popularity Encoder

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import nltk
from sklearn import preprocessing, model_selection, metrics, feature_selection, ensemble, linear_model, cross_decomposition, feature_extraction, decomposition
import time
from sklearn.externals import joblib
import re
%matplotlib inline

In [2]:
train = pd.read_csv('../../../train.csv.zip',compression='zip')
test = pd.read_csv('../../../test.csv.zip',compression='zip')

In [447]:
variables = ['param_1','param_2','param_3','region','city','parent_category_name','category_name']

train_codes = pd.DataFrame()
test_codes = pd.DataFrame()

for var in variables:
    # Target-sorted unique values of train&test, no empties, no weird types.
    train_values = train[[var]].fillna('Blank').astype(str).join(train['deal_probability']).groupby(var).mean().sort_values('deal_probability',ascending=False).reset_index()
    # If values are in test only, fill with median
    med = train_values.median()[0]
    train_unique = train_values[var]
    test_unique = pd.Series(test[[var]].fillna('Blank').astype(str)[var].unique())
    miss_idx = [i[0] for i in np.argwhere([unique not in train_unique.tolist() for unique in test_unique])]
    for idx in miss_idx:
        new = pd.DataFrame({var:test_unique[idx],'deal_probability':med},index=[len(train_values)])
        train_values = train_values.append(new)
    train_values = train_values.sort_values('deal_probability',ascending=False).reset_index(drop=True)
    values = train_values[var].tolist()
        
    # Dict: Keys are sorted-uniques, dict-values are indices.
    val_dict = {}
    for i,val in enumerate(values):
        val_dict[val] = i
    
    # Train codes ###########
    codes = []
    for val in train[var].fillna('Blank').astype(str):
        codes.append(val_dict[val])
    train_codes['code_'+str(var)] = codes
    
    # Test codes ############
    codes = []
    for val in test[var].fillna('Blank').astype(str):
        codes.append(val_dict[val])
    test_codes['code_'+str(var)] = codes

In [450]:
joblib.dump(train_codes,'train_codes.sav')
joblib.dump(test_codes,'test_codes.sav')

['test_codes.sav']