# Cross-Decomposition of TF-IDF Vectors with BiGrams

- Apply to train and test data.
- Retain separate sets of components for titles and descriptions.

# Titles

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import nltk
from sklearn import preprocessing, model_selection, metrics, feature_selection, ensemble, linear_model, cross_decomposition, feature_extraction, decomposition
import time
from sklearn.externals import joblib
%matplotlib inline

In [2]:
# Russian stopwords from the NLTK corpus (passed to the vectorizer below)
ru_stop = nltk.corpus.stopwords.words('russian')
# Train and test frames are stored as zip-compressed pickles.
# NOTE: unpickling executes code embedded in the file -- only load trusted files.
train, test = (
    pd.read_pickle(path, compression='zip')
    for path in ('../../train.pkl', '../../test.pkl')
)

In [3]:
# Text column processed in this section; its first five characters are also
# used further down to name the output columns and the saved files.
var = 'title'

In [4]:
# TF-IDF over unigrams and bigrams. Case is preserved (lowercase=False) and
# Russian stopwords are dropped. A float min_df is a document *proportion*:
# terms present in a smaller share of the documents are discarded.
vec = feature_extraction.text.TfidfVectorizer(
    stop_words=ru_stop,
    lowercase=False,
    ngram_range=(1,2),
    min_df=0.000005)
# Fitting on train and test as merged lists, so both share one vocabulary
vec.fit(train[var].astype(str).tolist() + test[var].astype(str).tolist())
# Count the vocabulary through the fitted mapping: same number as
# len(get_feature_names()), without building the full name list, and it works
# on every scikit-learn version (get_feature_names was removed in 1.2).
print('N tokens:',len(vec.vocabulary_))

N tokens: 67138


In [5]:
#### vectors for train and test
# Both frames go through the same fitted vectorizer, so their columns line up.
counts_train, counts_test = (
    vec.transform(frame[var].astype(str).tolist())
    for frame in (train, test)
)

In [6]:
#### To start from zero...
# Column-less accumulators that carry the row index of train / test;
# the reduction cell below appends component columns to them.
reduced_train, reduced_test = (
    pd.DataFrame(index=frame.index) for frame in (train, test)
)

In [7]:
# Reduce all CSR values in batches
#
# The TF-IDF matrix is processed in slices of `col_step` columns. For each
# slice a PLS regression against `deal_probability` is fitted on a random
# sample of train rows, then applied to every train and test row in row
# batches (only one batch is densified at a time). The resulting component
# columns accumulate in `reduced_train` / `reduced_test`; every 10 slices the
# accumulated columns are themselves compressed with PLS, and every 5 slices
# the current state is written to disk.
t = time.time()
start_col = 0                 # slices ending at or before this column are skipped
varname = var[:5]
compname = 'idfngram'
n_components = 5              # components kept per column slice and per aggregate pass
fit_sample_size = int(4e5)    # rows drawn (with replacement) to fit each slice model
train_row_step = int(2.5e5)   # train rows densified at once when transforming
test_row_step = int(2e5)      # test rows densified at once when transforming
rng = np.random.RandomState(0)  # fixed seed so the row samples repeat across runs
##########################################
n_cols = counts_train.shape[1]
col_step = 1500
col_end = n_cols + col_step
##########################################


def _transform_in_row_batches(model, matrix, low_col, up_col, row_step):
    """Project matrix[:, low_col:up_col] with a fitted model, row_step rows at a time.

    Only one dense row batch exists at any moment. Returns a single 2-D array
    with one row per row of `matrix`, in the original row order.
    """
    n_rows = matrix.shape[0]
    parts = [
        model.transform(
            matrix[low_idx:min(low_idx + row_step, n_rows), low_col:up_col].toarray())
        for low_idx in range(0, n_rows, row_step)
    ]
    return np.vstack(parts)


def _decompose_aggregate(train_frame, test_frame, suffix):
    """Compress the accumulated component columns to `n_components` PLS components.

    The model is fitted on `train_frame` against train.deal_probability and
    applied to both frames. The returned frames keep the row index of their
    inputs; `suffix` is appended to the new column names.
    """
    model = cross_decomposition.PLSRegression(n_components=n_components)
    model.fit(train_frame,train.deal_probability)
    names = ['{}_{}_{}_{}'.format(varname,compname,i,suffix) for i in range(n_components)]
    return (
        pd.DataFrame(model.transform(train_frame),columns=names,index=train_frame.index),
        pd.DataFrame(model.transform(test_frame),columns=names,index=test_frame.index))


# Start iteration with columns
low_col = start_col
for col in np.arange(0,col_end,col_step):
    # The range overshoots n_cols by up to one step so that the trailing
    # partial slice is visited; clamp its upper bound to the matrix width.
    up_col = min(col,n_cols)
    if up_col <= low_col:
        continue
    ###########################################
    # Train PLSR on a large sample of train vectors
    print('Columns: {}-{}'.format(low_col,up_col))
    index = rng.choice(len(train),size=fit_sample_size)
    sample = counts_train[index,low_col:up_col].toarray()
    # Select the target column first: avoids copying every column of the sampled rows.
    target = train.deal_probability.iloc[index]
    reduce = cross_decomposition.PLSRegression(n_components=n_components)
    reduce.fit(sample,target)
    # Scored on the same rows the model was fitted on (in-sample).
    print('Prelim score for column range:',reduce.score(sample,target))
    # Release the dense fit sample before the row batches are densified.
    del sample
    slice_names = ['col_{}-{}_{}'.format(low_col,up_col,i) for i in range(n_components)]
    ##########################################
    # (TRAIN) transform every row of this column slice
    # The components are given the accumulator's own index so the join below
    # pairs rows by position, whatever index train.pkl was saved with.
    components = pd.DataFrame(
        _transform_in_row_batches(reduce,counts_train,low_col,up_col,train_row_step),
        index=reduced_train.index,columns=slice_names)
    reduced_train = reduced_train.join(components)
    print('Aggregate cv:',model_selection.cross_val_score(
        cv=4,estimator=linear_model.LinearRegression(),
        X=reduced_train,y=train.deal_probability))
    ###########################################
    # (TEST) transform every row of this column slice
    components = pd.DataFrame(
        _transform_in_row_batches(reduce,counts_test,low_col,up_col,test_row_step),
        index=reduced_test.index,columns=slice_names)
    reduced_test = reduced_test.join(components)
    #####################################
    # Prepare for next column range
    low_col = up_col
    #####################################
    # Decompose aggregate every n steps
    if up_col%(col_step*10) == 0:
        print('Decomposing Aggregate...')
        reduced_train, reduced_test = _decompose_aggregate(reduced_train,reduced_test,up_col)
        print('Aggregate cv after decomposition:',model_selection.cross_val_score(
            cv=4,estimator=linear_model.LinearRegression(),
            X=reduced_train,y=train.deal_probability))
    #####################################
    # Save progress every n steps
    if up_col%(col_step*5) == 0:
        joblib.dump(reduced_train,'train_{}_{}.sav'.format(varname,compname))
        joblib.dump(reduced_test,'test_{}_{}.sav'.format(varname,compname))
#####################################
# One final round of decomposition to n components
if reduced_train.shape[1] > n_components:
    print('Decomposing Aggregate...')
    reduced_train, reduced_test = _decompose_aggregate(reduced_train,reduced_test,up_col)
    print('Aggregate cv after decomposition:',model_selection.cross_val_score(
        cv=4,estimator=linear_model.LinearRegression(),
        X=reduced_train,y=train.deal_probability))
#########
# Last save to disk
joblib.dump(reduced_train,'train_{}_{}.sav'.format(varname,compname))
joblib.dump(reduced_test,'test_{}_{}.sav'.format(varname,compname))
#########
print('Minutes:',(time.time()-t)/60)

Columns: 0-1500
Prelim score for column range: 0.009183509675312895
Aggregate cv: [0.00505399 0.00456241 0.00495345 0.0049719 ]
Columns: 1500-3000
Prelim score for column range: 0.020489688454986776
Aggregate cv: [0.01907627 0.01772109 0.01797458 0.01845972]
Columns: 3000-4500
Prelim score for column range: 0.009821800559869809
Aggregate cv: [0.02258087 0.02117297 0.02089647 0.02195212]
Columns: 4500-6000
Prelim score for column range: 0.00799019550929092
Aggregate cv: [0.02426916 0.02302251 0.02233436 0.02385494]
Columns: 6000-7500
Prelim score for column range: 0.00797682269629596
Aggregate cv: [0.02591112 0.02443876 0.02357445 0.02554189]
Columns: 7500-9000
Prelim score for column range: 0.008547466768183076
Aggregate cv: [0.02748956 0.02611181 0.02527265 0.02740797]
Columns: 9000-10500
Prelim score for column range: 0.012916359682723089
Aggregate cv: [0.0305337  0.02883387 0.02845827 0.0305729 ]
Columns: 10500-12000
Prelim score for column range: 0.008462633644602868
Aggregate cv: 

---
>**YOU MAY RESTART THE KERNEL AT THIS POINT**
---

# Descriptions

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import nltk
from sklearn import preprocessing, model_selection, metrics, feature_selection, ensemble, linear_model, cross_decomposition, feature_extraction, decomposition
import time
from sklearn.externals import joblib
%matplotlib inline

In [2]:
# Russian stopwords from the NLTK corpus (passed to the vectorizer below)
ru_stop = nltk.corpus.stopwords.words('russian')
# Train and test frames are stored as zip-compressed pickles.
# NOTE: unpickling executes code embedded in the file -- only load trusted files.
train, test = (
    pd.read_pickle(path, compression='zip')
    for path in ('../../train.pkl', '../../test.pkl')
)

In [3]:
# Text column processed in this section; its first five characters are also
# used further down to name the output columns and the saved files.
var = 'description'

In [4]:
# TF-IDF over unigrams and bigrams. Case is preserved (lowercase=False) and
# Russian stopwords are dropped. A float min_df is a document *proportion*:
# terms present in a smaller share of the documents are discarded.
vec = feature_extraction.text.TfidfVectorizer(
    stop_words=ru_stop,
    lowercase=False,
    ngram_range=(1,2),
    min_df=0.00005)
# Fitting on train and test as merged lists, so both share one vocabulary
vec.fit(train[var].astype(str).tolist() + test[var].astype(str).tolist())
# Count the vocabulary through the fitted mapping: same number as
# len(get_feature_names()), without building the full name list, and it works
# on every scikit-learn version (get_feature_names was removed in 1.2).
print('N tokens:',len(vec.vocabulary_))

N tokens: 64727


In [5]:
#### vectors for train and test
# Both frames go through the same fitted vectorizer, so their columns line up.
counts_train, counts_test = (
    vec.transform(frame[var].astype(str).tolist())
    for frame in (train, test)
)

In [6]:
#### To start from zero...
# Column-less accumulators that carry the row index of train / test;
# the reduction cell below appends component columns to them.
reduced_train, reduced_test = (
    pd.DataFrame(index=frame.index) for frame in (train, test)
)

In [7]:
# Reduce all CSR values in batches
#
# The TF-IDF matrix is processed in slices of `col_step` columns. For each
# slice a PLS regression against `deal_probability` is fitted on a random
# sample of train rows, then applied to every train and test row in row
# batches (only one batch is densified at a time). The resulting component
# columns accumulate in `reduced_train` / `reduced_test`; every 10 slices the
# accumulated columns are themselves compressed with PLS, and every 5 slices
# the current state is written to disk.
t = time.time()
start_col = 0                 # slices ending at or before this column are skipped
varname = var[:5]
compname = 'idfngram'
n_components = 5              # components kept per column slice and per aggregate pass
fit_sample_size = int(4e5)    # rows drawn (with replacement) to fit each slice model
train_row_step = int(2.5e5)   # train rows densified at once when transforming
test_row_step = int(2e5)      # test rows densified at once when transforming
rng = np.random.RandomState(0)  # fixed seed so the row samples repeat across runs
##########################################
n_cols = counts_train.shape[1]
col_step = 1500
col_end = n_cols + col_step
##########################################


def _transform_in_row_batches(model, matrix, low_col, up_col, row_step):
    """Project matrix[:, low_col:up_col] with a fitted model, row_step rows at a time.

    Only one dense row batch exists at any moment. Returns a single 2-D array
    with one row per row of `matrix`, in the original row order.
    """
    n_rows = matrix.shape[0]
    parts = [
        model.transform(
            matrix[low_idx:min(low_idx + row_step, n_rows), low_col:up_col].toarray())
        for low_idx in range(0, n_rows, row_step)
    ]
    return np.vstack(parts)


def _decompose_aggregate(train_frame, test_frame, suffix):
    """Compress the accumulated component columns to `n_components` PLS components.

    The model is fitted on `train_frame` against train.deal_probability and
    applied to both frames. The returned frames keep the row index of their
    inputs; `suffix` is appended to the new column names.
    """
    model = cross_decomposition.PLSRegression(n_components=n_components)
    model.fit(train_frame,train.deal_probability)
    names = ['{}_{}_{}_{}'.format(varname,compname,i,suffix) for i in range(n_components)]
    return (
        pd.DataFrame(model.transform(train_frame),columns=names,index=train_frame.index),
        pd.DataFrame(model.transform(test_frame),columns=names,index=test_frame.index))


# Start iteration with columns
low_col = start_col
for col in np.arange(0,col_end,col_step):
    # The range overshoots n_cols by up to one step so that the trailing
    # partial slice is visited; clamp its upper bound to the matrix width.
    up_col = min(col,n_cols)
    if up_col <= low_col:
        continue
    ###########################################
    # Train PLSR on a large sample of train vectors
    print('Columns: {}-{}'.format(low_col,up_col))
    index = rng.choice(len(train),size=fit_sample_size)
    sample = counts_train[index,low_col:up_col].toarray()
    # Select the target column first: avoids copying every column of the sampled rows.
    target = train.deal_probability.iloc[index]
    reduce = cross_decomposition.PLSRegression(n_components=n_components)
    reduce.fit(sample,target)
    # Scored on the same rows the model was fitted on (in-sample).
    print('Prelim score for column range:',reduce.score(sample,target))
    # Release the dense fit sample before the row batches are densified.
    del sample
    slice_names = ['col_{}-{}_{}'.format(low_col,up_col,i) for i in range(n_components)]
    ##########################################
    # (TRAIN) transform every row of this column slice
    # The components are given the accumulator's own index so the join below
    # pairs rows by position, whatever index train.pkl was saved with.
    components = pd.DataFrame(
        _transform_in_row_batches(reduce,counts_train,low_col,up_col,train_row_step),
        index=reduced_train.index,columns=slice_names)
    reduced_train = reduced_train.join(components)
    print('Aggregate cv:',model_selection.cross_val_score(
        cv=4,estimator=linear_model.LinearRegression(),
        X=reduced_train,y=train.deal_probability))
    ###########################################
    # (TEST) transform every row of this column slice
    components = pd.DataFrame(
        _transform_in_row_batches(reduce,counts_test,low_col,up_col,test_row_step),
        index=reduced_test.index,columns=slice_names)
    reduced_test = reduced_test.join(components)
    #####################################
    # Prepare for next column range
    low_col = up_col
    #####################################
    # Decompose aggregate every n steps
    if up_col%(col_step*10) == 0:
        print('Decomposing Aggregate...')
        reduced_train, reduced_test = _decompose_aggregate(reduced_train,reduced_test,up_col)
        print('Aggregate cv after decomposition:',model_selection.cross_val_score(
            cv=4,estimator=linear_model.LinearRegression(),
            X=reduced_train,y=train.deal_probability))
    #####################################
    # Save progress every n steps
    if up_col%(col_step*5) == 0:
        joblib.dump(reduced_train,'train_{}_{}.sav'.format(varname,compname))
        joblib.dump(reduced_test,'test_{}_{}.sav'.format(varname,compname))
#####################################
# One final round of decomposition to n components
if reduced_train.shape[1] > n_components:
    print('Decomposing Aggregate...')
    reduced_train, reduced_test = _decompose_aggregate(reduced_train,reduced_test,up_col)
    print('Aggregate cv after decomposition:',model_selection.cross_val_score(
        cv=4,estimator=linear_model.LinearRegression(),
        X=reduced_train,y=train.deal_probability))
#########
# Last save to disk
joblib.dump(reduced_train,'train_{}_{}.sav'.format(varname,compname))
joblib.dump(reduced_test,'test_{}_{}.sav'.format(varname,compname))
#########
print('Minutes:',(time.time()-t)/60)

Columns: 0-1500
Prelim score for column range: 0.01596561611580327
Aggregate cv: [0.00961727 0.00979394 0.01005502 0.01009222]
Columns: 1500-3000
Prelim score for column range: 0.016929697326530047
Aggregate cv: [0.0194269  0.01890985 0.01950827 0.01975085]
Columns: 3000-4500
Prelim score for column range: 0.012037765672628264
Aggregate cv: [0.02341009 0.02303537 0.02318495 0.02416345]
Columns: 4500-6000
Prelim score for column range: 0.011475811444744853
Aggregate cv: [0.02598355 0.02583479 0.02544778 0.02682626]
Columns: 6000-7500
Prelim score for column range: 0.02646974533710711
Aggregate cv: [0.04077638 0.03936563 0.0406893  0.04193133]
Columns: 7500-9000
Prelim score for column range: 0.02310270429387984
Aggregate cv: [0.05109357 0.04957796 0.05046476 0.05235976]
Columns: 9000-10500
Prelim score for column range: 0.02320084077895901
Aggregate cv: [0.05817159 0.0564372  0.05811968 0.05922264]
Columns: 10500-12000
Prelim score for column range: 0.021032878066735594
Aggregate cv: [0