# Binary PLSR

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import nltk
from sklearn import preprocessing, model_selection, metrics, feature_selection, ensemble, linear_model, cross_decomposition, feature_extraction, decomposition
import time
from sklearn.externals import joblib
import re
%matplotlib inline

In [2]:
train = pd.read_csv('../../../train.csv.zip',compression='zip')
test = pd.read_csv('../../../test.csv.zip',compression='zip')

In [7]:
variables = ['param_1','param_2','param_3','region','city','category_name']
compnames = ['p1plsr','p2plsr','p3plsr','regplsr','cityplsr','catnamplsr'] # custom name for component

In [49]:
t = time.time()

reduced_train = pd.DataFrame(index=train.index)
reduced_test = pd.DataFrame(index=test.index) 

for var,compname in zip(variables,compnames):
    print(var,compname)
    def joiner(string):
        pattern = "[\[].,/-# <>`~+!*?[\]]"
        string = string.replace(' ','_').replace(',','_')
        return re.sub(pattern, "_", string)

    vec = feature_extraction.text.CountVectorizer(
        lowercase=True,
        binary=True,
        preprocessor=joiner)
    # Fitting on train and test as merged lists
    vec.fit(train[var].astype(str).tolist() + test[var].astype(str).tolist())
    print('N tokens:',len(vec.get_feature_names())) 
    
    counts_train = vec.transform(train[var].astype(str).tolist())
    counts_test = vec.transform(test[var].astype(str).tolist())

    # Reduce all CSR values in batches
    start_col = 0
    varname = var[:5]
    n_comp = 10
    ##########################################
    n_cols = counts_train.shape[1]
    col_step = 1000
    col_end = n_cols + col_step
    ##########################################
    # Start iteration with columns
    low_col = start_col
    for col in np.arange(0,col_end,col_step):
        # Limiting the edge case of the last values
        if col > n_cols:
            col = n_cols
        up_col = col

        if up_col > low_col:
            ###########################################
            # Train PLSR on a large sample of train vectors
            print('Columns: {}-{}'.format(low_col,up_col))
            index = np.random.choice(len(train),size=int(4e5))
            sample = counts_train[index,low_col:up_col].toarray()
            reduce = cross_decomposition.PLSRegression(n_components=n_comp)
            reduce.fit(sample,train.iloc[index].deal_probability)
            print('Prelim score for column range:',reduce.score(sample,train.iloc[index].deal_probability))
            ##########################################
            # (TRAIN) Nested indexes iteration
            # Initial values:
            n_rows = len(train)
            row_step = int(2.5e5)
            row_end = n_rows + row_step
            components = pd.DataFrame()
            low_idx = 0
            ###########
            for idx in np.arange(0,row_end,row_step):
                # Limiting the edge case of the last values
                if idx > n_rows:
                    idx = n_rows
                up_idx = idx

                if up_idx > low_idx:
                    sample = counts_train[low_idx:up_idx,low_col:up_col].toarray()
                    sample = reduce.transform(sample)
                    components = components.append(pd.DataFrame(sample))
                    low_idx = idx
            components.reset_index(drop=True,inplace=True)
            components.columns = ['{}_col_{}-{}_{}'.format(compname,low_col,up_col,i) for i in range(0,n_comp)]

            # Cross-validate and check for corruptions before joining
            cv = model_selection.cross_val_score(
                cv=4,estimator=linear_model.LinearRegression(),
                X=reduced_train.join(components),y=train.deal_probability)
            print('Aggregate cv:',cv) 
            
            reduced_train = reduced_train.join(components)
            ###########################################
            # (TEST) Nested indexes iteration
            # Initial values:
            n_rows = len(test)
            row_step = int(2e5)
            row_end = n_rows + row_step
            components = pd.DataFrame()
            low_idx = 0
            ###########
            for idx in np.arange(0,row_end,row_step):
                if idx > n_rows:
                    idx = n_rows
                up_idx = idx

                if up_idx > low_idx:
                    sample = counts_test[low_idx:up_idx,low_col:up_col].toarray()
                    sample = reduce.transform(sample)
                    components = components.append(pd.DataFrame(sample))
                    low_idx = idx
            components.reset_index(drop=True,inplace=True)
            components.columns = ['{}_col_{}-{}_{}'.format(compname,low_col,up_col,i) for i in range(0,n_comp)]
            reduced_test = reduced_test.join(components)
            print(reduced_train.shape,reduced_test.shape)
            #####################################
            # Prepare for next column range
            low_col = col     
            #####################################    
# Last save to disk
joblib.dump(reduced_train,'train_categorical.sav')
joblib.dump(reduced_test,'test_categorical.sav')
#########
print('Minutes:',(time.time()-t)/60,'\n')

param_1 p1plsr
N tokens: 387
Columns: 0-387
Prelim score for column range: 0.15655498110674948
Aggregate cv: [0.15676329 0.15322856 0.15637188 0.15657073]
(1503424, 10) (508438, 10)
param_2 p2plsr
N tokens: 280
Columns: 0-280
Prelim score for column range: 0.12558359907063832
Aggregate cv: [0.16264688 0.15924979 0.16186965 0.16246216]
(1503424, 20) (508438, 20)
param_3 p3plsr
N tokens: 1269
Columns: 0-1000
Prelim score for column range: 0.057518731333971
Aggregate cv: [0.16334192 0.15999341 0.16264773 0.16316857]
(1503424, 30) (508438, 30)
Columns: 1000-1269
Prelim score for column range: 0.0318208386201323
Aggregate cv: [0.16356205 0.16024803 0.16291684 0.16344107]
(1503424, 40) (508438, 40)
region regplsr
N tokens: 29
Columns: 0-29
Prelim score for column range: 0.0013247104814475552
Aggregate cv: [0.16397374 0.16063732 0.16330568 0.16378928]
(1503424, 50) (508438, 50)
city cityplsr
N tokens: 1777
Columns: 0-1000
Prelim score for column range: 0.004507537758539626
Aggregate cv: [0.16

In [52]:
y = train.deal_probability
X_dev,X_val,y_dev,y_val = model_selection.train_test_split(reduced_train,y)

def rmse(y,pred):
    return metrics.mean_squared_error(y,pred)**0.5

score = metrics.make_scorer(rmse)
linear = linear_model.LinearRegression()

cv = model_selection.cross_val_score(
    X=X_dev,y=y_dev,estimator=linear,
    cv=5,scoring=score
)
print('CV Scores:\n',cv)
print('Mean CV score:\n',cv.mean())

linear = linear.fit(X_dev,y_dev)

pred = linear.predict(X_val)
print('Less than zero:',sum(pred<0))
print('Over one:',sum(pred>1))
print('RMSE without modification:',metrics.mean_squared_error(y_val,pred)**0.5)
pred[pred>1] = 1
pred[pred<0] = 0
print('RMSE fit-to-range:',metrics.mean_squared_error(y_val,pred)**0.5)

print('TEST preds:')
pred = linear.predict(reduced_train)
print('Less than zero:',sum(pred<0))
print('Over one:',sum(pred>1))

CV Scores:
 [0.23745429 0.23745653 0.2377352  0.23809592 0.23817744]
Mean CV score:
 0.23778387528549189
Less than zero: 1121
Over one: 0
RMSE without modification: 0.23746276229960936
RMSE fit-to-range: 0.23745955927295329
TEST preds:
Less than zero: 4574
Over one: 0


# Popularity Encoder

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import nltk
from sklearn import preprocessing, model_selection, metrics, feature_selection, ensemble, linear_model, cross_decomposition, feature_extraction, decomposition
import time
from sklearn.externals import joblib
import re
%matplotlib inline

In [2]:
train = pd.read_csv('../../../train.csv.zip',compression='zip')
test = pd.read_csv('../../../test.csv.zip',compression='zip')

In [447]:
variables = ['param_1','param_2','param_3','region','city','parent_category_name','category_name']

train_codes = pd.DataFrame()
test_codes = pd.DataFrame()

for var in variables:
    # Target-sorted unique values of train&test, no empties, no weird types.
    train_values = train[[var]].fillna('Blank').astype(str).join(train['deal_probability']).groupby(var).mean().sort_values('deal_probability',ascending=False).reset_index()
    # If values are in test only, fill with median
    med = train_values.median()[0]
    train_unique = train_values[var]
    test_unique = pd.Series(test[[var]].fillna('Blank').astype(str)[var].unique())
    miss_idx = [i[0] for i in np.argwhere([unique not in train_unique.tolist() for unique in test_unique])]
    for idx in miss_idx:
        new = pd.DataFrame({var:test_unique[idx],'deal_probability':med},index=[len(train_values)])
        train_values = train_values.append(new)
    train_values = train_values.sort_values('deal_probability',ascending=False).reset_index(drop=True)
    values = train_values[var].tolist()
        
    # Dict: Keys are sorted-uniques, dict-values are indices.
    val_dict = {}
    for i,val in enumerate(values):
        val_dict[val] = i
    
    # Train codes ###########
    codes = []
    for val in train[var].fillna('Blank').astype(str):
        codes.append(val_dict[val])
    train_codes['code_'+str(var)] = codes
    
    # Test codes ############
    codes = []
    for val in test[var].fillna('Blank').astype(str):
        codes.append(val_dict[val])
    test_codes['code_'+str(var)] = codes

In [450]:
joblib.dump(train_codes,'train_codes.sav')
joblib.dump(test_codes,'test_codes.sav')

['test_codes.sav']

train_f = joblib.load('feature_dumps1/train_codes.sav')

test_f = joblib.load('feature_dumps1/test_codes.sav')

In [43]:
y = train.deal_probability
X_dev,X_val,y_dev,y_val = model_selection.train_test_split(train_f,y)

def rmse(y,pred):
    return metrics.mean_squared_error(y,pred)**0.5

score = metrics.make_scorer(rmse)
linear = linear_model.LinearRegression()

cv = model_selection.cross_val_score(
    X=X_dev,y=y_dev,estimator=linear,
    cv=5,scoring=score
)
print('CV Scores:\n',cv)
print('Mean CV score:\n',cv.mean())

linear = linear.fit(X_dev,y_dev)

pred = linear.predict(X_val)
print('Less than zero:',sum(pred<0))
print('Over one:',sum(pred>1))
print('RMSE without modification:',metrics.mean_squared_error(y_val,pred)**0.5)
pred[pred>1] = 1
pred[pred<0] = 0
print('RMSE fit-to-range:',metrics.mean_squared_error(y_val,pred)**0.5)

print('TEST preds:')
pred = linear.predict(test_f)
print('Less than zero:',sum(pred<0))
print('Over one:',sum(pred>1))

CV Scores:
 [0.23893316 0.23920064 0.23952821 0.23933496 0.23907038]
Mean CV score:
 0.23921346870628413
Less than zero: 672
Over one: 0
RMSE without modification: 0.2386217698214643
RMSE fit-to-range: 0.23862049461858315
TEST preds:
Less than zero: 673
Over one: 0
