# Get Vocabularies for Target Ranges

- Include Count and IDF vocabularies.
- Get vocabs for titles and descriptions.

# Title Vocabs

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import nltk
from sklearn import preprocessing, model_selection, metrics, feature_selection, ensemble, linear_model, cross_decomposition, feature_extraction, decomposition
import time
from sklearn.externals import joblib
%matplotlib inline

In [2]:
train = pd.read_pickle('../../train.pkl',compression='zip')
test = pd.read_pickle('../../test.pkl',compression='zip')
# Russian stopwords
ru_stop = nltk.corpus.stopwords.words('russian')

In [3]:
# Define discrete target boundaries
i_0 = train[train.deal_probability==0].index.tolist()
i_low = train[(train.deal_probability>0)&(train.deal_probability<0.65)].index.tolist()
i_up = train[train.deal_probability>=0.65].index.tolist()

In [4]:
# Make count and idf vocabularies for target ranges on a given variable
var = 'title'
#####################################
upper_str = ' '.join(train.loc[i_up,var].astype(str).values)
lower_str = ' '.join(train.loc[i_low,var].astype(str).values)
zeroes_str = ' '.join(train.loc[i_0,var].astype(str).values)

# Get dictionaries from both count and idf vectorizers.
count = feature_extraction.text.CountVectorizer(
    stop_words=ru_stop,
    lowercase=False)
idf = feature_extraction.text.TfidfVectorizer(
    stop_words=ru_stop,
    lowercase=False)

vecs ={'count':count,'idf':idf}
#####################
for key in vecs:
    vec = vecs[key]
    vec.fit([zeroes_str,lower_str,upper_str])
    counts = vec.transform([zeroes_str,lower_str,upper_str])

    # Convert CSR into DataFrame and Transpose. Now terms are on the index
    counts = pd.DataFrame(counts.toarray()).T

    # Extract terms from vocabulary, sort by index and add to df index
    vocab = vec.vocabulary_
    terms = [f for f in vocab]
    terms = pd.DataFrame(terms)
    terms['index'] = [vocab[k] for k in vocab]
    terms.sort_values(by='index',inplace=True)
    terms = terms[0].values.tolist()
    counts.index = terms
    
    # Make an indicator of where the highest frequency is for each term
    group = []
    for i in np.arange(len(counts)):
        group.append(np.argmax(counts.iloc[i].values))      
    counts['group'] = group
    
    zero_vocab = counts[counts.group == 0].sort_values(by=0,ascending=False).index.tolist()
    lower_vocab = counts[counts.group == 1].sort_values(by=1,ascending=False).index.tolist()
    upper_vocab = counts[counts.group == 2].sort_values(by=2,ascending=False).index.tolist()

    vocabs = [zero_vocab,lower_vocab,upper_vocab]
    vocabs = pd.DataFrame(vocabs)
    vocabs = vocabs.T
    vocabs.columns=['zero_voc','low_voc','up_voc']
    
    vocabs.to_pickle('{}_vocabs_{}.pkl'.format(var[:5],key))

---
> RESTART KERNEL
---

# Description Vocabs

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import nltk
from sklearn import preprocessing, model_selection, metrics, feature_selection, ensemble, linear_model, cross_decomposition, feature_extraction, decomposition
import time
from sklearn.externals import joblib
%matplotlib inline

In [8]:
train = pd.read_pickle('../../train.pkl',compression='zip')
test = pd.read_pickle('../../test.pkl',compression='zip')
# Russian stopwords
ru_stop = nltk.corpus.stopwords.words('russian')

In [9]:
# Define discrete target boundaries
i_0 = train[train.deal_probability==0].index.tolist()
i_low = train[(train.deal_probability>0)&(train.deal_probability<0.65)].index.tolist()
i_up = train[train.deal_probability>=0.65].index.tolist()

In [11]:
# Make count and idf vocabularies for target ranges on a given variable
var = 'description'
#####################################
upper_str = ' '.join(train.loc[i_up,var].astype(str).values)
lower_str = ' '.join(train.loc[i_low,var].astype(str).values)
zeroes_str = ' '.join(train.loc[i_0,var].astype(str).values)

# Get dictionaries from both count and idf vectorizers.
count = feature_extraction.text.CountVectorizer(
    stop_words=ru_stop,
    lowercase=False)
idf = feature_extraction.text.TfidfVectorizer(
    stop_words=ru_stop,
    lowercase=False)

vecs ={'count':count,'idf':idf}
#####################
for key in vecs:
    vec = vecs[key]
    vec.fit([zeroes_str,lower_str,upper_str])
    counts = vec.transform([zeroes_str,lower_str,upper_str])

    # Convert CSR into DataFrame and Transpose. Now terms are on the index
    counts = pd.DataFrame(counts.toarray()).T

    # Extract terms from vocabulary, sort by index and add to df index
    vocab = vec.vocabulary_
    terms = [f for f in vocab]
    terms = pd.DataFrame(terms)
    terms['index'] = [vocab[k] for k in vocab]
    terms.sort_values(by='index',inplace=True)
    terms = terms[0].values.tolist()
    counts.index = terms
    
    # Make an indicator of where the highest frequency is for each term
    group = []
    for i in np.arange(len(counts)):
        group.append(np.argmax(counts.iloc[i].values))      
    counts['group'] = group
    
    zero_vocab = counts[counts.group == 0].sort_values(by=0,ascending=False).index.tolist()
    lower_vocab = counts[counts.group == 1].sort_values(by=1,ascending=False).index.tolist()
    upper_vocab = counts[counts.group == 2].sort_values(by=2,ascending=False).index.tolist()

    vocabs = [zero_vocab,lower_vocab,upper_vocab]
    vocabs = pd.DataFrame(vocabs)
    vocabs = vocabs.T
    vocabs.columns=['zero_voc','low_voc','up_voc']
    
    vocabs.to_pickle('{}_vocabs_{}.pkl'.format(var[:5],key))

---
> RESTART THE KERNEL
---

# Cross-Decomposition per Vocab

I can only decompose one vocabulary at a time, for memory limitation reasons. So every time define:
- the variable
- the vocabulary kind
- the target range it represents.

## Title var- Count voc- Zero Range

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import nltk
from sklearn import preprocessing, model_selection, metrics, feature_selection, ensemble, linear_model, cross_decomposition, feature_extraction, decomposition
import time
from sklearn.externals import joblib
%matplotlib inline

In [2]:
train = pd.read_pickle('../../../train.pkl',compression='zip')
test = pd.read_pickle('../../../test.pkl',compression='zip')
# Russian stopwords
ru_stop = nltk.corpus.stopwords.words('russian')

In [3]:
# Define discrete target boundaries
i_0 = train[train.deal_probability==0].index.tolist()
i_low = train[(train.deal_probability>0)&(train.deal_probability<0.65)].index.tolist()
i_up = train[train.deal_probability>=0.65].index.tolist()

In [4]:
# Define variable, index-range, discrete-range, voc-origin, and component-name
var = 'title' # title or description
irange = i_0 # pick from above cell
rnge = 'zero' # zero, low or up
voc_kind = 'count' # count or idf
compname = 'zerocnt' # custom name for component

In [5]:
# Read the desired vocabulary onto a list of limited length
voc = pd.read_pickle('{}_vocabs_{}.pkl'.format(var[:5],voc_kind))['{}_voc'.format(rnge)].dropna()[:67000]

In [6]:
vec = feature_extraction.text.TfidfVectorizer(
    stop_words=ru_stop,
    lowercase=False,
    vocabulary=voc)
# Fitting on train and test as merged lists
vec.fit(train[var].astype(str).tolist() + test[var].astype(str).tolist())
print('N tokens:',len(vec.get_feature_names()))

N tokens: 67000


In [7]:
#### vectors for train and test
counts_train = vec.transform(train[var].astype(str).tolist())
counts_test = vec.transform(test[var].astype(str).tolist())

In [8]:
#### To start from zero...
reduced_train = pd.DataFrame(index=train.index)
reduced_test = pd.DataFrame(index=test.index)

In [10]:
# Reduce all CSR values in batches
t = time.time()
start_col = 0
varname = var[:5]
##########################################
n_cols = counts_train.shape[1]
col_step = 1500
col_end = n_cols + col_step
##########################################
# Start iteration with columns
low_col = start_col
corrupted = False
for col in np.arange(0,col_end,col_step):
    # Limiting the edge case of the last values
    if col > n_cols:
        col = n_cols
    up_col = col

    if up_col > low_col:
        ###########################################
        # Train PLSR on a large sample of train vectors
        print('Columns: {}-{}'.format(low_col,up_col))
        index = np.random.choice(len(train),size=int(4e5))
        sample = counts_train[index,low_col:up_col].toarray()
        reduce = cross_decomposition.PLSRegression(n_components=2)
        reduce.fit(sample,train.iloc[index].deal_probability)
        print('Prelim score for column range:',reduce.score(sample,train.iloc[index].deal_probability))
        ##########################################
        # (TRAIN) Nested indexes iteration
        # Initial values:
        n_rows = len(train)
        row_step = int(2.5e5)
        row_end = n_rows + row_step
        components = pd.DataFrame()
        low_idx = 0
        ###########
        for idx in np.arange(0,row_end,row_step):
            # Limiting the edge case of the last values
            if idx > n_rows:
                idx = n_rows
            up_idx = idx

            if up_idx > low_idx:
                sample = counts_train[low_idx:up_idx,low_col:up_col].toarray()
                sample = reduce.transform(sample)
                components = components.append(pd.DataFrame(sample))
                low_idx = idx
        components.reset_index(drop=True,inplace=True)
        components.columns = ['col_{}-{}_{}'.format(low_col,up_col,i) for i in range(0,2)]

        # Cross-validate and check for corruptions before joining
        cv = model_selection.cross_val_score(
            cv=4,estimator=linear_model.LinearRegression(),
            X=reduced_train.join(components),y=train.deal_probability)

        print('Aggregate cv:',cv)
        print('Aggregate score for {} range:'.format(rnge),model_selection.cross_val_score(
            cv=4,estimator=linear_model.LinearRegression(),
            X=reduced_train.join(components).iloc[irange],y=train.iloc[irange].deal_probability))
        
        
        if sum([score < 0 for score in cv]) > 0:
            print('Reached corruption.\n Final decomposition without joining...')
            print(reduced_train.shape,reduced_test.shape)
            reduce = cross_decomposition.PLSRegression(n_components=2)
            reduce.fit(reduced_train,train.deal_probability)
            reduced_train = pd.DataFrame(
                reduce.transform(reduced_train),
                columns=['{}_{}_{}_{}'.format(varname,compname,i,up_col) for i in range(0,2)])
            reduced_test = pd.DataFrame(
                reduce.transform(reduced_test),
                columns=['{}_{}_{}_{}'.format(varname,compname,i,up_col) for i in range(0,2)])
            print('Aggregate cv after decomposition:',model_selection.cross_val_score(
                cv=4,estimator=linear_model.LinearRegression(),
                X=reduced_train,y=train.deal_probability))
            print('Minutes:',(time.time()-t)/60)
            break
        
        # Join if it wasn't corrupted..    
        reduced_train = reduced_train.join(components)
        ###########################################
        # (TEST) Nested indexes iteration
        # Initial values:
        n_rows = len(test)
        row_step = int(2e5)
        row_end = n_rows + row_step
        components = pd.DataFrame()
        low_idx = 0
        ###########
        for idx in np.arange(0,row_end,row_step):
            if idx > n_rows:
                idx = n_rows
            up_idx = idx

            if up_idx > low_idx:
                sample = counts_test[low_idx:up_idx,low_col:up_col].toarray()
                sample = reduce.transform(sample)
                components = components.append(pd.DataFrame(sample))
                low_idx = idx
        components.reset_index(drop=True,inplace=True)
        components.columns = ['col_{}-{}_{}'.format(low_col,up_col,i) for i in range(0,2)]
        reduced_test = reduced_test.join(components)
        print(reduced_train.shape,reduced_test.shape)
        #####################################
        # Prepare for next column range
        low_col = col     
        #####################################    
        # Interval decompositions
        if up_col%(col_step*10) == 0:
            print('Interval decomposition...')
            print(reduced_train.shape,reduced_test.shape)
            reduce = cross_decomposition.PLSRegression(n_components=2)
            reduce.fit(reduced_train,train.deal_probability)
            reduced_train = pd.DataFrame(
                reduce.transform(reduced_train),
                columns=['{}_{}_{}_{}'.format(varname,compname,i,up_col) for i in range(0,2)])
            reduced_test = pd.DataFrame(
                reduce.transform(reduced_test),
                columns=['{}_{}_{}_{}'.format(varname,compname,i,up_col) for i in range(0,2)])
            print('Aggregate cv after decomposition:',model_selection.cross_val_score(
                cv=4,estimator=linear_model.LinearRegression(),
                X=reduced_train,y=train.deal_probability))
        #####################################    
        # Save progress every n steps
        if up_col%(col_step*5) == 0:
            print('Interval save to disk...')
            joblib.dump(reduced_train,'train_{}_{}.sav'.format(varname,compname))
            joblib.dump(reduced_test,'test_{}_{}.sav'.format(varname,compname))   
#####################################
# Final round of decomposition
print('Final decomposition...')
reduce = cross_decomposition.PLSRegression(n_components=2)
reduce.fit(reduced_train,train.deal_probability)
reduced_train = pd.DataFrame(
    reduce.transform(reduced_train),
    columns=['{}_{}_{}_{}'.format(varname,compname,i,up_col) for i in range(0,2)])
reduced_test = pd.DataFrame(
    reduce.transform(reduced_test),
    columns=['{}_{}_{}_{}'.format(varname,compname,i,up_col) for i in range(0,2)])
print('Aggregate cv after decomposition:',model_selection.cross_val_score(
    cv=4,estimator=linear_model.LinearRegression(),
    X=reduced_train,y=train.deal_probability))
#########
# Last save to disk
joblib.dump(reduced_train,'train_{}_{}.sav'.format(varname,compname))
joblib.dump(reduced_test,'test_{}_{}.sav'.format(varname,compname))
#########
print('Minutes:',(time.time()-t)/60)

Columns: 0-1500
Prelim score for column range: 0.056647761693727094
Aggregate cv: [0.0437396  0.04204362 0.04367361 0.04482865]
Aggregate score for up range: [0.08124249 0.07623606 0.07767721 0.07850478]
(1503424, 0) (508438, 0)
Columns: 1500-3000
Prelim score for column range: 0.011968047589463904
Aggregate cv: [0.04687835 0.0450409  0.04649784 0.04759228]
Aggregate score for up range: [0.08203729 0.07668296 0.07864323 0.07990906]
(1503424, 2) (508438, 2)
Columns: 3000-4500
Prelim score for column range: 0.00641642638614326
Aggregate cv: [0.04818664 0.04643281 0.04793703 0.04907903]
Aggregate score for up range: [0.0824903  0.07709868 0.07954522 0.08001705]
(1503424, 4) (508438, 4)
Columns: 4500-6000
Prelim score for column range: 0.007745084219598074
Aggregate cv: [0.04973886 0.04806677 0.049751   0.05075626]
Aggregate score for up range: [0.08318783 0.07793074 0.08104477 0.0807392 ]
(1503424, 6) (508438, 6)
Columns: 6000-7500
Prelim score for column range: 0.0067777154659836025
Aggr

---
> RESTART KERNEL
---

## Title var- Count voc- Lower Range

Learning from the corruption in the previous iteration, I've eliminated the interval decompositions and the final decomposition, while reducing the number of components to 2 per column step. This will allow the freedom to trim corrupted components before doing a final decomposition.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import nltk
from sklearn import preprocessing, model_selection, metrics, feature_selection, ensemble, linear_model, cross_decomposition, feature_extraction, decomposition
import time
from sklearn.externals import joblib
%matplotlib inline

In [2]:
train = pd.read_pickle('../../../train.pkl',compression='zip')
test = pd.read_pickle('../../../test.pkl',compression='zip')
# Russian stopwords
ru_stop = nltk.corpus.stopwords.words('russian')

In [3]:
# Define discrete target boundaries
i_0 = train[train.deal_probability==0].index.tolist()
i_low = train[(train.deal_probability>0)&(train.deal_probability<0.65)].index.tolist()
i_up = train[train.deal_probability>=0.65].index.tolist()

In [4]:
# Define variable, index-range, discrete-range, voc-origin, and component-name
var = 'title' # title or description
irange = i_low # pick from above cell
rnge = 'low' # zero, low or up
voc_kind = 'count' # count or idf
compname = 'lowcnt' # custom name for component

In [5]:
# Read the desired vocabulary onto a list of limited length
voc = pd.read_pickle('{}_vocabs_{}.pkl'.format(var[:5],voc_kind))['{}_voc'.format(rnge)].dropna()[:67000]

In [6]:
vec = feature_extraction.text.TfidfVectorizer(
    stop_words=ru_stop,
    lowercase=False,
    vocabulary=voc)
# Fitting on train and test as merged lists
vec.fit(train[var].astype(str).tolist() + test[var].astype(str).tolist())
print('N tokens:',len(vec.get_feature_names()))

N tokens: 41218


In [7]:
#### vectors for train and test
counts_train = vec.transform(train[var].astype(str).tolist())
counts_test = vec.transform(test[var].astype(str).tolist())

In [8]:
#### To start from zero...
reduced_train = pd.DataFrame(index=train.index)
reduced_test = pd.DataFrame(index=test.index)

In [10]:
# Reduce all CSR values in batches
t = time.time()
start_col = 0
varname = var[:5]
##########################################
n_cols = counts_train.shape[1]
col_step = 1500
col_end = n_cols + col_step
##########################################
# Start iteration with columns
low_col = start_col
corrupted = False
for col in np.arange(0,col_end,col_step):
    # Limiting the edge case of the last values
    if col > n_cols:
        col = n_cols
    up_col = col

    if up_col > low_col:
        ###########################################
        # Train PLSR on a large sample of train vectors
        print('Columns: {}-{}'.format(low_col,up_col))
        index = np.random.choice(len(train),size=int(4e5))
        sample = counts_train[index,low_col:up_col].toarray()
        reduce = cross_decomposition.PLSRegression(n_components=2)
        reduce.fit(sample,train.iloc[index].deal_probability)
        print('Prelim score for column range:',reduce.score(sample,train.iloc[index].deal_probability))
        ##########################################
        # (TRAIN) Nested indexes iteration
        # Initial values:
        n_rows = len(train)
        row_step = int(2.5e5)
        row_end = n_rows + row_step
        components = pd.DataFrame()
        low_idx = 0
        ###########
        for idx in np.arange(0,row_end,row_step):
            # Limiting the edge case of the last values
            if idx > n_rows:
                idx = n_rows
            up_idx = idx

            if up_idx > low_idx:
                sample = counts_train[low_idx:up_idx,low_col:up_col].toarray()
                sample = reduce.transform(sample)
                components = components.append(pd.DataFrame(sample))
                low_idx = idx
        components.reset_index(drop=True,inplace=True)
        components.columns = ['col_{}-{}_{}'.format(low_col,up_col,i) for i in range(0,2)]

        # Cross-validate and check for corruptions before joining
        cv = model_selection.cross_val_score(
            cv=4,estimator=linear_model.LinearRegression(),
            X=reduced_train.join(components),y=train.deal_probability)

        print('Aggregate cv:',cv)
        print('Aggregate score for {} range:'.format(rnge),model_selection.cross_val_score(
            cv=4,estimator=linear_model.LinearRegression(),
            X=reduced_train.join(components).iloc[irange],y=train.iloc[irange].deal_probability))
        
        
        if sum([score < 0 for score in cv]) > 0:
            print('Reached corruption.\n Final decomposition without joining...')
            print(reduced_train.shape,reduced_test.shape)
            reduce = cross_decomposition.PLSRegression(n_components=2)
            reduce.fit(reduced_train,train.deal_probability)
            reduced_train = pd.DataFrame(
                reduce.transform(reduced_train),
                columns=['{}_{}_{}_{}'.format(varname,compname,i,up_col) for i in range(0,2)])
            reduced_test = pd.DataFrame(
                reduce.transform(reduced_test),
                columns=['{}_{}_{}_{}'.format(varname,compname,i,up_col) for i in range(0,2)])
            print('Aggregate cv after decomposition:',model_selection.cross_val_score(
                cv=4,estimator=linear_model.LinearRegression(),
                X=reduced_train,y=train.deal_probability))
            print('Minutes:',(time.time()-t)/60)
            break
        
        # Join if it wasn't corrupted..    
        reduced_train = reduced_train.join(components)
        ###########################################
        # (TEST) Nested indexes iteration
        # Initial values:
        n_rows = len(test)
        row_step = int(2e5)
        row_end = n_rows + row_step
        components = pd.DataFrame()
        low_idx = 0
        ###########
        for idx in np.arange(0,row_end,row_step):
            if idx > n_rows:
                idx = n_rows
            up_idx = idx

            if up_idx > low_idx:
                sample = counts_test[low_idx:up_idx,low_col:up_col].toarray()
                sample = reduce.transform(sample)
                components = components.append(pd.DataFrame(sample))
                low_idx = idx
        components.reset_index(drop=True,inplace=True)
        components.columns = ['col_{}-{}_{}'.format(low_col,up_col,i) for i in range(0,2)]
        reduced_test = reduced_test.join(components)
        print(reduced_train.shape,reduced_test.shape)
        #####################################
        # Prepare for next column range
        low_col = col     
        #####################################    
        # Interval decompositions
        if up_col%(col_step*10) == 0:
            print('Interval decomposition...')
            print(reduced_train.shape,reduced_test.shape)
            reduce = cross_decomposition.PLSRegression(n_components=2)
            reduce.fit(reduced_train,train.deal_probability)
            reduced_train = pd.DataFrame(
                reduce.transform(reduced_train),
                columns=['{}_{}_{}_{}'.format(varname,compname,i,up_col) for i in range(0,2)])
            reduced_test = pd.DataFrame(
                reduce.transform(reduced_test),
                columns=['{}_{}_{}_{}'.format(varname,compname,i,up_col) for i in range(0,2)])
            print('Aggregate cv after decomposition:',model_selection.cross_val_score(
                cv=4,estimator=linear_model.LinearRegression(),
                X=reduced_train,y=train.deal_probability))
        #####################################    
        # Save progress every n steps
        if up_col%(col_step*5) == 0:
            print('Interval save to disk...')
            joblib.dump(reduced_train,'train_{}_{}.sav'.format(varname,compname))
            joblib.dump(reduced_test,'test_{}_{}.sav'.format(varname,compname))   
#####################################
# Final round of decomposition
print('Final decomposition...')
reduce = cross_decomposition.PLSRegression(n_components=2)
reduce.fit(reduced_train,train.deal_probability)
reduced_train = pd.DataFrame(
    reduce.transform(reduced_train),
    columns=['{}_{}_{}_{}'.format(varname,compname,i,up_col) for i in range(0,2)])
reduced_test = pd.DataFrame(
    reduce.transform(reduced_test),
    columns=['{}_{}_{}_{}'.format(varname,compname,i,up_col) for i in range(0,2)])
print('Aggregate cv after decomposition:',model_selection.cross_val_score(
    cv=4,estimator=linear_model.LinearRegression(),
    X=reduced_train,y=train.deal_probability))
#########
# Last save to disk
joblib.dump(reduced_train,'train_{}_{}.sav'.format(varname,compname))
joblib.dump(reduced_test,'test_{}_{}.sav'.format(varname,compname))
#########
print('Minutes:',(time.time()-t)/60)

Columns: 0-1500
Prelim score for column range: 0.056647761693727094
Aggregate cv: [0.0437396  0.04204362 0.04367361 0.04482865]
Aggregate score for up range: [0.08124249 0.07623606 0.07767721 0.07850478]
(1503424, 0) (508438, 0)
Columns: 1500-3000
Prelim score for column range: 0.011968047589463904
Aggregate cv: [0.04687835 0.0450409  0.04649784 0.04759228]
Aggregate score for up range: [0.08203729 0.07668296 0.07864323 0.07990906]
(1503424, 2) (508438, 2)
Columns: 3000-4500
Prelim score for column range: 0.00641642638614326
Aggregate cv: [0.04818664 0.04643281 0.04793703 0.04907903]
Aggregate score for up range: [0.0824903  0.07709868 0.07954522 0.08001705]
(1503424, 4) (508438, 4)
Columns: 4500-6000
Prelim score for column range: 0.007745084219598074
Aggregate cv: [0.04973886 0.04806677 0.049751   0.05075626]
Aggregate score for up range: [0.08318783 0.07793074 0.08104477 0.0807392 ]
(1503424, 6) (508438, 6)
Columns: 6000-7500
Prelim score for column range: 0.0067777154659836025
Aggr

---
> RESTART THE KERNEL
---

## Title var- Count voc- Upper Range

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import nltk
from sklearn import preprocessing, model_selection, metrics, feature_selection, ensemble, linear_model, cross_decomposition, feature_extraction, decomposition
import time
from sklearn.externals import joblib
%matplotlib inline

In [3]:
train = pd.read_pickle('../../../train.pkl',compression='zip')
test = pd.read_pickle('../../../test.pkl',compression='zip')
# Russian stopwords
ru_stop = nltk.corpus.stopwords.words('russian')

In [4]:
# Define discrete target boundaries
i_0 = train[train.deal_probability==0].index.tolist()
i_low = train[(train.deal_probability>0)&(train.deal_probability<0.65)].index.tolist()
i_up = train[train.deal_probability>=0.65].index.tolist()

In [5]:
# Define variable, index-range, discrete-range, voc-origin, and component-name
var = 'title' # title or description
irange = i_up # pick from above cell
rnge = 'up' # zero, low or up
voc_kind = 'count' # count or idf
compname = 'upcnt' # custom name for component

In [6]:
# Read the desired vocabulary onto a list of limited length
voc = pd.read_pickle('{}_vocabs_{}.pkl'.format(var[:5],voc_kind))['{}_voc'.format(rnge)].dropna()[:67000]

In [7]:
vec = feature_extraction.text.TfidfVectorizer(
    stop_words=ru_stop,
    lowercase=False,
    vocabulary=voc)
# Fitting on train and test as merged lists
vec.fit(train[var].astype(str).tolist() + test[var].astype(str).tolist())
print('N tokens:',len(vec.get_feature_names()))

N tokens: 16262


In [8]:
#### vectors for train and test
counts_train = vec.transform(train[var].astype(str).tolist())
counts_test = vec.transform(test[var].astype(str).tolist())

In [9]:
#### To start from zero...
reduced_train = pd.DataFrame(index=train.index)
reduced_test = pd.DataFrame(index=test.index)

In [None]:
# Reduce all CSR values in batches
t = time.time()
start_col = 0
varname = var[:5]
##########################################
n_cols = counts_train.shape[1]
col_step = 1500
col_end = n_cols + col_step
##########################################
# Start iteration with columns
low_col = start_col
corrupted = False
for col in np.arange(0,col_end,col_step):
    # Limiting the edge case of the last values
    if col > n_cols:
        col = n_cols
    up_col = col

    if up_col > low_col:
        ###########################################
        # Train PLSR on a large sample of train vectors
        print('Columns: {}-{}'.format(low_col,up_col))
        index = np.random.choice(len(train),size=int(4e5))
        sample = counts_train[index,low_col:up_col].toarray()
        reduce = cross_decomposition.PLSRegression(n_components=2)
        reduce.fit(sample,train.iloc[index].deal_probability)
        print('Prelim score for column range:',reduce.score(sample,train.iloc[index].deal_probability))
        ##########################################
        # (TRAIN) Nested indexes iteration
        # Initial values:
        n_rows = len(train)
        row_step = int(2.5e5)
        row_end = n_rows + row_step
        components = pd.DataFrame()
        low_idx = 0
        ###########
        for idx in np.arange(0,row_end,row_step):
            # Limiting the edge case of the last values
            if idx > n_rows:
                idx = n_rows
            up_idx = idx

            if up_idx > low_idx:
                sample = counts_train[low_idx:up_idx,low_col:up_col].toarray()
                sample = reduce.transform(sample)
                components = components.append(pd.DataFrame(sample))
                low_idx = idx
        components.reset_index(drop=True,inplace=True)
        components.columns = ['col_{}-{}_{}'.format(low_col,up_col,i) for i in range(0,2)]

        # Cross-validate and check for corruptions before joining
        cv = model_selection.cross_val_score(
            cv=4,estimator=linear_model.LinearRegression(),
            X=reduced_train.join(components),y=train.deal_probability)

        print('Aggregate cv:',cv)
        print('Aggregate score for {} range:'.format(rnge),model_selection.cross_val_score(
            cv=4,estimator=linear_model.LinearRegression(),
            X=reduced_train.join(components).iloc[irange],y=train.iloc[irange].deal_probability))
        
        
        if sum([score < 0 for score in cv]) > 0:
            print('Reached corruption.\n Final decomposition without joining...')
            print(reduced_train.shape,reduced_test.shape)
            reduce = cross_decomposition.PLSRegression(n_components=2)
            reduce.fit(reduced_train,train.deal_probability)
            reduced_train = pd.DataFrame(
                reduce.transform(reduced_train),
                columns=['{}_{}_{}_{}'.format(varname,compname,i,up_col) for i in range(0,2)])
            reduced_test = pd.DataFrame(
                reduce.transform(reduced_test),
                columns=['{}_{}_{}_{}'.format(varname,compname,i,up_col) for i in range(0,2)])
            print('Aggregate cv after decomposition:',model_selection.cross_val_score(
                cv=4,estimator=linear_model.LinearRegression(),
                X=reduced_train,y=train.deal_probability))
            print('Minutes:',(time.time()-t)/60)
            break
        
        # Join if it wasn't corrupted..    
        reduced_train = reduced_train.join(components)
        ###########################################
        # (TEST) Nested indexes iteration
        # Initial values:
        n_rows = len(test)
        row_step = int(2e5)
        row_end = n_rows + row_step
        components = pd.DataFrame()
        low_idx = 0
        ###########
        for idx in np.arange(0,row_end,row_step):
            if idx > n_rows:
                idx = n_rows
            up_idx = idx

            if up_idx > low_idx:
                sample = counts_test[low_idx:up_idx,low_col:up_col].toarray()
                sample = reduce.transform(sample)
                components = components.append(pd.DataFrame(sample))
                low_idx = idx
        components.reset_index(drop=True,inplace=True)
        components.columns = ['col_{}-{}_{}'.format(low_col,up_col,i) for i in range(0,2)]
        reduced_test = reduced_test.join(components)
        print(reduced_train.shape,reduced_test.shape)
        #####################################
        # Prepare for next column range
        low_col = col     
        #####################################    
        # Interval decompositions
        if up_col%(col_step*10) == 0:
            print('Interval decomposition...')
            print(reduced_train.shape,reduced_test.shape)
            reduce = cross_decomposition.PLSRegression(n_components=2)
            reduce.fit(reduced_train,train.deal_probability)
            reduced_train = pd.DataFrame(
                reduce.transform(reduced_train),
                columns=['{}_{}_{}_{}'.format(varname,compname,i,up_col) for i in range(0,2)])
            reduced_test = pd.DataFrame(
                reduce.transform(reduced_test),
                columns=['{}_{}_{}_{}'.format(varname,compname,i,up_col) for i in range(0,2)])
            print('Aggregate cv after decomposition:',model_selection.cross_val_score(
                cv=4,estimator=linear_model.LinearRegression(),
                X=reduced_train,y=train.deal_probability))
        #####################################    
        # Save progress every n steps
        if up_col%(col_step*5) == 0:
            print('Interval save to disk...')
            joblib.dump(reduced_train,'train_{}_{}.sav'.format(varname,compname))
            joblib.dump(reduced_test,'test_{}_{}.sav'.format(varname,compname))   
#####################################
# Final round of decomposition
print('Final decomposition...')
reduce = cross_decomposition.PLSRegression(n_components=2)
reduce.fit(reduced_train,train.deal_probability)
reduced_train = pd.DataFrame(
    reduce.transform(reduced_train),
    columns=['{}_{}_{}_{}'.format(varname,compname,i,up_col) for i in range(0,2)])
reduced_test = pd.DataFrame(
    reduce.transform(reduced_test),
    columns=['{}_{}_{}_{}'.format(varname,compname,i,up_col) for i in range(0,2)])
print('Aggregate cv after decomposition:',model_selection.cross_val_score(
    cv=4,estimator=linear_model.LinearRegression(),
    X=reduced_train,y=train.deal_probability))
#########
# Last save to disk
joblib.dump(reduced_train,'train_{}_{}.sav'.format(varname,compname))
joblib.dump(reduced_test,'test_{}_{}.sav'.format(varname,compname))
#########
print('Minutes:',(time.time()-t)/60)

---
> RESTART THE KERNEL
---

## Title var- IDF voc- Zero Range

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import nltk
from sklearn import preprocessing, model_selection, metrics, feature_selection, ensemble, linear_model, cross_decomposition, feature_extraction, decomposition
import time
from sklearn.externals import joblib
%matplotlib inline

In [2]:
train = pd.read_pickle('../../../train.pkl',compression='zip')
test = pd.read_pickle('../../../test.pkl',compression='zip')
# Russian stopwords
ru_stop = nltk.corpus.stopwords.words('russian')

In [3]:
# Define discrete target boundaries
i_0 = train[train.deal_probability==0].index.tolist()
i_low = train[(train.deal_probability>0)&(train.deal_probability<0.65)].index.tolist()
i_up = train[train.deal_probability>=0.65].index.tolist()

In [4]:
# Define variable, index-range, discrete-range, voc-origin, and component-name
var = 'title' # title or description
irange = i_0 # pick from above cell
rnge = 'zero' # zero, low or up
voc_kind = 'idf' # count or idf
compname = 'zeroidf' # custom name for component

In [5]:
# Read the desired vocabulary onto a list of limited length
voc = pd.read_pickle('{}_vocabs_{}.pkl'.format(var[:5],voc_kind))['{}_voc'.format(rnge)].dropna()[:67000]

In [6]:
vec = feature_extraction.text.TfidfVectorizer(
    stop_words=ru_stop,
    lowercase=False,
    vocabulary=voc)
# Fitting on train and test as merged lists
vec.fit(train[var].astype(str).tolist() + test[var].astype(str).tolist())
print('N tokens:',len(vec.get_feature_names()))

N tokens: 67000


In [7]:
#### vectors for train and test
counts_train = vec.transform(train[var].astype(str).tolist())
counts_test = vec.transform(test[var].astype(str).tolist())

In [8]:
#### To start from zero...
reduced_train = pd.DataFrame(index=train.index)
reduced_test = pd.DataFrame(index=test.index)

In [9]:
# Reduce all CSR values in batches
t = time.time()
start_col = 0
varname = var[:5]
##########################################
n_cols = counts_train.shape[1]
col_step = 1500
col_end = n_cols + col_step
##########################################
# Start iteration with columns
low_col = start_col
corrupted = False
for col in np.arange(0,col_end,col_step):
    # Limiting the edge case of the last values
    if col > n_cols:
        col = n_cols
    up_col = col

    if up_col > low_col:
        ###########################################
        # Train PLSR on a large sample of train vectors
        print('Columns: {}-{}'.format(low_col,up_col))
        index = np.random.choice(len(train),size=int(4e5))
        sample = counts_train[index,low_col:up_col].toarray()
        reduce = cross_decomposition.PLSRegression(n_components=2)
        reduce.fit(sample,train.iloc[index].deal_probability)
        print('Prelim score for column range:',reduce.score(sample,train.iloc[index].deal_probability))
        ##########################################
        # (TRAIN) Nested indexes iteration
        # Initial values:
        n_rows = len(train)
        row_step = int(2.5e5)
        row_end = n_rows + row_step
        components = pd.DataFrame()
        low_idx = 0
        ###########
        for idx in np.arange(0,row_end,row_step):
            # Limiting the edge case of the last values
            if idx > n_rows:
                idx = n_rows
            up_idx = idx

            if up_idx > low_idx:
                sample = counts_train[low_idx:up_idx,low_col:up_col].toarray()
                sample = reduce.transform(sample)
                components = components.append(pd.DataFrame(sample))
                low_idx = idx
        components.reset_index(drop=True,inplace=True)
        components.columns = ['col_{}-{}_{}'.format(low_col,up_col,i) for i in range(0,2)]

        # Cross-validate and check for corruptions before joining
        cv = model_selection.cross_val_score(
            cv=4,estimator=linear_model.LinearRegression(),
            X=reduced_train.join(components),y=train.deal_probability)

        print('Aggregate cv:',cv)
        print('Aggregate score for {} range:'.format(rnge),model_selection.cross_val_score(
            cv=4,estimator=linear_model.LinearRegression(),
            X=reduced_train.join(components).iloc[irange],y=train.iloc[irange].deal_probability))
        
        
        if sum([score < 0 for score in cv]) > 0:
            print('Reached corruption.\n Final decomposition without joining...')
            print(reduced_train.shape,reduced_test.shape)
            reduce = cross_decomposition.PLSRegression(n_components=2)
            reduce.fit(reduced_train,train.deal_probability)
            reduced_train = pd.DataFrame(
                reduce.transform(reduced_train),
                columns=['{}_{}_{}_{}'.format(varname,compname,i,up_col) for i in range(0,2)])
            reduced_test = pd.DataFrame(
                reduce.transform(reduced_test),
                columns=['{}_{}_{}_{}'.format(varname,compname,i,up_col) for i in range(0,2)])
            print('Aggregate cv after decomposition:',model_selection.cross_val_score(
                cv=4,estimator=linear_model.LinearRegression(),
                X=reduced_train,y=train.deal_probability))
            print('Minutes:',(time.time()-t)/60)
            break
        
        # Join if it wasn't corrupted..    
        reduced_train = reduced_train.join(components)
        ###########################################
        # (TEST) Nested indexes iteration
        # Initial values:
        n_rows = len(test)
        row_step = int(2e5)
        row_end = n_rows + row_step
        components = pd.DataFrame()
        low_idx = 0
        ###########
        for idx in np.arange(0,row_end,row_step):
            if idx > n_rows:
                idx = n_rows
            up_idx = idx

            if up_idx > low_idx:
                sample = counts_test[low_idx:up_idx,low_col:up_col].toarray()
                sample = reduce.transform(sample)
                components = components.append(pd.DataFrame(sample))
                low_idx = idx
        components.reset_index(drop=True,inplace=True)
        components.columns = ['col_{}-{}_{}'.format(low_col,up_col,i) for i in range(0,2)]
        reduced_test = reduced_test.join(components)
        print(reduced_train.shape,reduced_test.shape)
        #####################################
        # Prepare for next column range
        low_col = col     
        #####################################    
        # Interval decompositions
        if up_col%(col_step*10) == 0:
            print('Interval decomposition...')
            print(reduced_train.shape,reduced_test.shape)
            reduce = cross_decomposition.PLSRegression(n_components=2)
            reduce.fit(reduced_train,train.deal_probability)
            reduced_train = pd.DataFrame(
                reduce.transform(reduced_train),
                columns=['{}_{}_{}_{}'.format(varname,compname,i,up_col) for i in range(0,2)])
            reduced_test = pd.DataFrame(
                reduce.transform(reduced_test),
                columns=['{}_{}_{}_{}'.format(varname,compname,i,up_col) for i in range(0,2)])
            print('Aggregate cv after decomposition:',model_selection.cross_val_score(
                cv=4,estimator=linear_model.LinearRegression(),
                X=reduced_train,y=train.deal_probability))
        #####################################    
        # Save progress every n steps
        if up_col%(col_step*5) == 0:
            print('Interval save to disk...')
            joblib.dump(reduced_train,'train_{}_{}.sav'.format(varname,compname))
            joblib.dump(reduced_test,'test_{}_{}.sav'.format(varname,compname))   
#####################################
# Final round of decomposition
print('Final decomposition...')
reduce = cross_decomposition.PLSRegression(n_components=2)
reduce.fit(reduced_train,train.deal_probability)
reduced_train = pd.DataFrame(
    reduce.transform(reduced_train),
    columns=['{}_{}_{}_{}'.format(varname,compname,i,up_col) for i in range(0,2)])
reduced_test = pd.DataFrame(
    reduce.transform(reduced_test),
    columns=['{}_{}_{}_{}'.format(varname,compname,i,up_col) for i in range(0,2)])
print('Aggregate cv after decomposition:',model_selection.cross_val_score(
    cv=4,estimator=linear_model.LinearRegression(),
    X=reduced_train,y=train.deal_probability))
#########
# Last save to disk
joblib.dump(reduced_train,'train_{}_{}.sav'.format(varname,compname))
joblib.dump(reduced_test,'test_{}_{}.sav'.format(varname,compname))
#########
print('Minutes:',(time.time()-t)/60)

Columns: 0-1500
Prelim score for column range: 0.0767375171127994
Aggregate cv: [0.07237674 0.06944764 0.07174497 0.07169012]
Aggregate score for zero range: [1. 1. 1. 1.]
(1503424, 2) (508438, 2)
Columns: 1500-3000
Prelim score for column range: 0.009651776421301548
Aggregate cv: [0.07873665 0.07578788 0.07831682 0.07825956]
Aggregate score for zero range: [1. 1. 1. 1.]
(1503424, 4) (508438, 4)
Columns: 3000-4500
Prelim score for column range: 0.00703418889581886
Aggregate cv: [0.08269613 0.0796202  0.08203387 0.08189988]
Aggregate score for zero range: [1. 1. 1. 1.]
(1503424, 6) (508438, 6)
Columns: 4500-6000
Prelim score for column range: 0.005639163197510787
Aggregate cv: [0.08480908 0.08165625 0.08423592 0.08424105]
Aggregate score for zero range: [1. 1. 1. 1.]
(1503424, 8) (508438, 8)
Columns: 6000-7500
Prelim score for column range: 0.004841451820319587
Aggregate cv: [0.08648068 0.08319056 0.08568377 0.08581158]
Aggregate score for zero range: [1. 1. 1. 1.]
(1503424, 10) (508438

Prelim score for column range: 0.0005509756879108485
Aggregate cv: [0.10360386 0.10016776 0.10291351 0.10263695]
Aggregate score for zero range: [1. 1. 1. 1.]
(1503424, 18) (508438, 18)
Columns: 57000-58500
Prelim score for column range: 0.0005654461256056065
Aggregate cv: [0.10388842 0.10042094 0.10320837 0.10291276]
Aggregate score for zero range: [1. 1. 1. 1.]
(1503424, 20) (508438, 20)
Columns: 58500-60000
Prelim score for column range: 0.0006925368200940696
Aggregate cv: [0.10397295 0.1005211  0.10331266 0.10298287]
Aggregate score for zero range: [1. 1. 1. 1.]
(1503424, 22) (508438, 22)
Interval decomposition...
(1503424, 22) (508438, 22)
Aggregate cv after decomposition: [0.10397968 0.10052284 0.10331499 0.10298807]
Interval save to disk...
Columns: 60000-61500
Prelim score for column range: 0.0006882795972371626
Aggregate cv: [0.10406794 0.10061817 0.10338086 0.10305505]
Aggregate score for zero range: [1. 1. 1. 1.]
(1503424, 4) (508438, 4)
Columns: 61500-63000
Prelim score for

---
> RESTART THE KERNEL
---

## Title var- IDF voc- Lower Range

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import nltk
from sklearn import preprocessing, model_selection, metrics, feature_selection, ensemble, linear_model, cross_decomposition, feature_extraction, decomposition
import time
from sklearn.externals import joblib
%matplotlib inline

In [2]:
train = pd.read_pickle('../../../train.pkl',compression='zip')
test = pd.read_pickle('../../../test.pkl',compression='zip')
# Russian stopwords
ru_stop = nltk.corpus.stopwords.words('russian')

In [3]:
# Define discrete target boundaries
i_0 = train[train.deal_probability==0].index.tolist()
i_low = train[(train.deal_probability>0)&(train.deal_probability<0.65)].index.tolist()
i_up = train[train.deal_probability>=0.65].index.tolist()

In [4]:
# Define variable, index-range, discrete-range, voc-origin, and component-name
var = 'title' # title or description
irange = i_low # pick from above cell
rnge = 'low' # zero, low or up
voc_kind = 'idf' # count or idf
compname = 'lowidf' # custom name for component

In [5]:
# Read the desired vocabulary onto a list of limited length
voc = pd.read_pickle('{}_vocabs_{}.pkl'.format(var[:5],voc_kind))['{}_voc'.format(rnge)].dropna()[:67000]

In [6]:
vec = feature_extraction.text.TfidfVectorizer(
    stop_words=ru_stop,
    lowercase=False,
    vocabulary=voc)
# Fitting on train and test as merged lists
vec.fit(train[var].astype(str).tolist() + test[var].astype(str).tolist())
print('N tokens:',len(vec.get_feature_names()))

N tokens: 35363


In [7]:
#### vectors for train and test
counts_train = vec.transform(train[var].astype(str).tolist())
counts_test = vec.transform(test[var].astype(str).tolist())

In [8]:
#### To start from zero...
reduced_train = pd.DataFrame(index=train.index)
reduced_test = pd.DataFrame(index=test.index)

In [9]:
# Reduce all CSR values in batches
t = time.time()
start_col = 0
varname = var[:5]
##########################################
n_cols = counts_train.shape[1]
col_step = 1500
col_end = n_cols + col_step
##########################################
# Start iteration with columns
low_col = start_col
corrupted = False
for col in np.arange(0,col_end,col_step):
    # Limiting the edge case of the last values
    if col > n_cols:
        col = n_cols
    up_col = col

    if up_col > low_col:
        ###########################################
        # Train PLSR on a large sample of train vectors
        print('Columns: {}-{}'.format(low_col,up_col))
        index = np.random.choice(len(train),size=int(4e5))
        sample = counts_train[index,low_col:up_col].toarray()
        reduce = cross_decomposition.PLSRegression(n_components=2)
        reduce.fit(sample,train.iloc[index].deal_probability)
        print('Prelim score for column range:',reduce.score(sample,train.iloc[index].deal_probability))
        ##########################################
        # (TRAIN) Nested indexes iteration
        # Initial values:
        n_rows = len(train)
        row_step = int(2.5e5)
        row_end = n_rows + row_step
        components = pd.DataFrame()
        low_idx = 0
        ###########
        for idx in np.arange(0,row_end,row_step):
            # Limiting the edge case of the last values
            if idx > n_rows:
                idx = n_rows
            up_idx = idx

            if up_idx > low_idx:
                sample = counts_train[low_idx:up_idx,low_col:up_col].toarray()
                sample = reduce.transform(sample)
                components = components.append(pd.DataFrame(sample))
                low_idx = idx
        components.reset_index(drop=True,inplace=True)
        components.columns = ['col_{}-{}_{}'.format(low_col,up_col,i) for i in range(0,2)]

        # Cross-validate and check for corruptions before joining
        cv = model_selection.cross_val_score(
            cv=4,estimator=linear_model.LinearRegression(),
            X=reduced_train.join(components),y=train.deal_probability)

        print('Aggregate cv:',cv)
        print('Aggregate score for {} range:'.format(rnge),model_selection.cross_val_score(
            cv=4,estimator=linear_model.LinearRegression(),
            X=reduced_train.join(components).iloc[irange],y=train.iloc[irange].deal_probability))
        
        
        if sum([score < 0 for score in cv]) > 0:
            print('Reached corruption.\n Final decomposition without joining...')
            print(reduced_train.shape,reduced_test.shape)
            reduce = cross_decomposition.PLSRegression(n_components=2)
            reduce.fit(reduced_train,train.deal_probability)
            reduced_train = pd.DataFrame(
                reduce.transform(reduced_train),
                columns=['{}_{}_{}_{}'.format(varname,compname,i,up_col) for i in range(0,2)])
            reduced_test = pd.DataFrame(
                reduce.transform(reduced_test),
                columns=['{}_{}_{}_{}'.format(varname,compname,i,up_col) for i in range(0,2)])
            print('Aggregate cv after decomposition:',model_selection.cross_val_score(
                cv=4,estimator=linear_model.LinearRegression(),
                X=reduced_train,y=train.deal_probability))
            print('Minutes:',(time.time()-t)/60)
            break
        
        # Join if it wasn't corrupted..    
        reduced_train = reduced_train.join(components)
        ###########################################
        # (TEST) Nested indexes iteration
        # Initial values:
        n_rows = len(test)
        row_step = int(2e5)
        row_end = n_rows + row_step
        components = pd.DataFrame()
        low_idx = 0
        ###########
        for idx in np.arange(0,row_end,row_step):
            if idx > n_rows:
                idx = n_rows
            up_idx = idx

            if up_idx > low_idx:
                sample = counts_test[low_idx:up_idx,low_col:up_col].toarray()
                sample = reduce.transform(sample)
                components = components.append(pd.DataFrame(sample))
                low_idx = idx
        components.reset_index(drop=True,inplace=True)
        components.columns = ['col_{}-{}_{}'.format(low_col,up_col,i) for i in range(0,2)]
        reduced_test = reduced_test.join(components)
        print(reduced_train.shape,reduced_test.shape)
        #####################################
        # Prepare for next column range
        low_col = col     
        #####################################    
        # Interval decompositions
        if up_col%(col_step*10) == 0:
            print('Interval decomposition...')
            print(reduced_train.shape,reduced_test.shape)
            reduce = cross_decomposition.PLSRegression(n_components=2)
            reduce.fit(reduced_train,train.deal_probability)
            reduced_train = pd.DataFrame(
                reduce.transform(reduced_train),
                columns=['{}_{}_{}_{}'.format(varname,compname,i,up_col) for i in range(0,2)])
            reduced_test = pd.DataFrame(
                reduce.transform(reduced_test),
                columns=['{}_{}_{}_{}'.format(varname,compname,i,up_col) for i in range(0,2)])
            print('Aggregate cv after decomposition:',model_selection.cross_val_score(
                cv=4,estimator=linear_model.LinearRegression(),
                X=reduced_train,y=train.deal_probability))
        #####################################    
        # Save progress every n steps
        if up_col%(col_step*5) == 0:
            print('Interval save to disk...')
            joblib.dump(reduced_train,'train_{}_{}.sav'.format(varname,compname))
            joblib.dump(reduced_test,'test_{}_{}.sav'.format(varname,compname))   
#####################################
# Final round of decomposition
print('Final decomposition...')
reduce = cross_decomposition.PLSRegression(n_components=2)
reduce.fit(reduced_train,train.deal_probability)
reduced_train = pd.DataFrame(
    reduce.transform(reduced_train),
    columns=['{}_{}_{}_{}'.format(varname,compname,i,up_col) for i in range(0,2)])
reduced_test = pd.DataFrame(
    reduce.transform(reduced_test),
    columns=['{}_{}_{}_{}'.format(varname,compname,i,up_col) for i in range(0,2)])
print('Aggregate cv after decomposition:',model_selection.cross_val_score(
    cv=4,estimator=linear_model.LinearRegression(),
    X=reduced_train,y=train.deal_probability))
#########
# Last save to disk
joblib.dump(reduced_train,'train_{}_{}.sav'.format(varname,compname))
joblib.dump(reduced_test,'test_{}_{}.sav'.format(varname,compname))
#########
print('Minutes:',(time.time()-t)/60)

Columns: 0-1500
Prelim score for column range: 0.005390740582564391
Aggregate cv: [0.0026094  0.00277017 0.00251395 0.00236   ]
Aggregate score for low range: [0.00591272 0.00544688 0.00489253 0.00374499]
(1503424, 2) (508438, 2)
Columns: 1500-3000
Prelim score for column range: 0.0010543684435517786
Aggregate cv: [0.00275112 0.00280599 0.00263428 0.00246088]
Aggregate score for low range: [0.00766209 0.00633327 0.00639631 0.00487442]
(1503424, 4) (508438, 4)
Columns: 3000-4500
Prelim score for column range: 0.000679906829489374
Aggregate cv: [0.00294102 0.00296647 0.00279493 0.0026183 ]
Aggregate score for low range: [0.00917883 0.00750882 0.00759257 0.00604975]
(1503424, 6) (508438, 6)
Columns: 4500-6000
Prelim score for column range: 0.0007005460428868293
Aggregate cv: [0.00294864 0.00301088 0.0028186  0.00265559]
Aggregate score for low range: [0.00940135 0.00839502 0.00830197 0.0068365 ]
(1503424, 8) (508438, 8)
Columns: 6000-7500
Prelim score for column range: 0.00036082500646450

---
> RESTART THE KERNEL
---

## Title var- IDF voc- Upper Range

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import nltk
from sklearn import preprocessing, model_selection, metrics, feature_selection, ensemble, linear_model, cross_decomposition, feature_extraction, decomposition
import time
from sklearn.externals import joblib
%matplotlib inline

In [2]:
train = pd.read_pickle('../../../train.pkl',compression='zip')
test = pd.read_pickle('../../../test.pkl',compression='zip')
# Russian stopwords
ru_stop = nltk.corpus.stopwords.words('russian')

In [3]:
# Define discrete target boundaries
i_0 = train[train.deal_probability==0].index.tolist()
i_low = train[(train.deal_probability>0)&(train.deal_probability<0.65)].index.tolist()
i_up = train[train.deal_probability>=0.65].index.tolist()

In [5]:
# Define variable, index-range, discrete-range, voc-origin, and component-name
var = 'title' # title or description
irange = i_low # pick from above cell
rnge = 'up' # zero, low or up
voc_kind = 'idf' # count or idf
compname = 'upidf' # custom name for component

In [6]:
# Read the desired vocabulary onto a list of limited length
voc = pd.read_pickle('{}_vocabs_{}.pkl'.format(var[:5],voc_kind))['{}_voc'.format(rnge)].dropna()[:67000]

In [7]:
vec = feature_extraction.text.TfidfVectorizer(
    stop_words=ru_stop,
    lowercase=False,
    vocabulary=voc)
# Fitting on train and test as merged lists
vec.fit(train[var].astype(str).tolist() + test[var].astype(str).tolist())
print('N tokens:',len(vec.get_feature_names()))

N tokens: 44922


In [8]:
#### vectors for train and test
counts_train = vec.transform(train[var].astype(str).tolist())
counts_test = vec.transform(test[var].astype(str).tolist())

In [9]:
#### To start from zero...
reduced_train = pd.DataFrame(index=train.index)
reduced_test = pd.DataFrame(index=test.index)

In [10]:
# Reduce all CSR values in batches
t = time.time()
start_col = 0
varname = var[:5]
##########################################
n_cols = counts_train.shape[1]
col_step = 1500
col_end = n_cols + col_step
##########################################
# Start iteration with columns
low_col = start_col
corrupted = False
for col in np.arange(0,col_end,col_step):
    # Limiting the edge case of the last values
    if col > n_cols:
        col = n_cols
    up_col = col

    if up_col > low_col:
        ###########################################
        # Train PLSR on a large sample of train vectors
        print('Columns: {}-{}'.format(low_col,up_col))
        index = np.random.choice(len(train),size=int(4e5))
        sample = counts_train[index,low_col:up_col].toarray()
        reduce = cross_decomposition.PLSRegression(n_components=2)
        reduce.fit(sample,train.iloc[index].deal_probability)
        print('Prelim score for column range:',reduce.score(sample,train.iloc[index].deal_probability))
        ##########################################
        # (TRAIN) Nested indexes iteration
        # Initial values:
        n_rows = len(train)
        row_step = int(2.5e5)
        row_end = n_rows + row_step
        components = pd.DataFrame()
        low_idx = 0
        ###########
        for idx in np.arange(0,row_end,row_step):
            # Limiting the edge case of the last values
            if idx > n_rows:
                idx = n_rows
            up_idx = idx

            if up_idx > low_idx:
                sample = counts_train[low_idx:up_idx,low_col:up_col].toarray()
                sample = reduce.transform(sample)
                components = components.append(pd.DataFrame(sample))
                low_idx = idx
        components.reset_index(drop=True,inplace=True)
        components.columns = ['col_{}-{}_{}'.format(low_col,up_col,i) for i in range(0,2)]

        # Cross-validate and check for corruptions before joining
        cv = model_selection.cross_val_score(
            cv=4,estimator=linear_model.LinearRegression(),
            X=reduced_train.join(components),y=train.deal_probability)

        print('Aggregate cv:',cv)
        print('Aggregate score for {} range:'.format(rnge),model_selection.cross_val_score(
            cv=4,estimator=linear_model.LinearRegression(),
            X=reduced_train.join(components).iloc[irange],y=train.iloc[irange].deal_probability))
        
        
        if sum([score < 0 for score in cv]) > 0:
            print('Reached corruption.\n Final decomposition without joining...')
            print(reduced_train.shape,reduced_test.shape)
            reduce = cross_decomposition.PLSRegression(n_components=2)
            reduce.fit(reduced_train,train.deal_probability)
            reduced_train = pd.DataFrame(
                reduce.transform(reduced_train),
                columns=['{}_{}_{}_{}'.format(varname,compname,i,up_col) for i in range(0,2)])
            reduced_test = pd.DataFrame(
                reduce.transform(reduced_test),
                columns=['{}_{}_{}_{}'.format(varname,compname,i,up_col) for i in range(0,2)])
            print('Aggregate cv after decomposition:',model_selection.cross_val_score(
                cv=4,estimator=linear_model.LinearRegression(),
                X=reduced_train,y=train.deal_probability))
            print('Minutes:',(time.time()-t)/60)
            break
        
        # Join if it wasn't corrupted..    
        reduced_train = reduced_train.join(components)
        ###########################################
        # (TEST) Nested indexes iteration
        # Initial values:
        n_rows = len(test)
        row_step = int(2e5)
        row_end = n_rows + row_step
        components = pd.DataFrame()
        low_idx = 0
        ###########
        for idx in np.arange(0,row_end,row_step):
            if idx > n_rows:
                idx = n_rows
            up_idx = idx

            if up_idx > low_idx:
                sample = counts_test[low_idx:up_idx,low_col:up_col].toarray()
                sample = reduce.transform(sample)
                components = components.append(pd.DataFrame(sample))
                low_idx = idx
        components.reset_index(drop=True,inplace=True)
        components.columns = ['col_{}-{}_{}'.format(low_col,up_col,i) for i in range(0,2)]
        reduced_test = reduced_test.join(components)
        print(reduced_train.shape,reduced_test.shape)
        #####################################
        # Prepare for next column range
        low_col = col     
        #####################################    
        # Interval decompositions
        if up_col%(col_step*10) == 0:
            print('Interval decomposition...')
            print(reduced_train.shape,reduced_test.shape)
            reduce = cross_decomposition.PLSRegression(n_components=2)
            reduce.fit(reduced_train,train.deal_probability)
            reduced_train = pd.DataFrame(
                reduce.transform(reduced_train),
                columns=['{}_{}_{}_{}'.format(varname,compname,i,up_col) for i in range(0,2)])
            reduced_test = pd.DataFrame(
                reduce.transform(reduced_test),
                columns=['{}_{}_{}_{}'.format(varname,compname,i,up_col) for i in range(0,2)])
            print('Aggregate cv after decomposition:',model_selection.cross_val_score(
                cv=4,estimator=linear_model.LinearRegression(),
                X=reduced_train,y=train.deal_probability))
        #####################################    
        # Save progress every n steps
        if up_col%(col_step*5) == 0:
            print('Interval save to disk...')
            joblib.dump(reduced_train,'train_{}_{}.sav'.format(varname,compname))
            joblib.dump(reduced_test,'test_{}_{}.sav'.format(varname,compname))   
#####################################
# Final round of decomposition
print('Final decomposition...')
reduce = cross_decomposition.PLSRegression(n_components=2)
reduce.fit(reduced_train,train.deal_probability)
reduced_train = pd.DataFrame(
    reduce.transform(reduced_train),
    columns=['{}_{}_{}_{}'.format(varname,compname,i,up_col) for i in range(0,2)])
reduced_test = pd.DataFrame(
    reduce.transform(reduced_test),
    columns=['{}_{}_{}_{}'.format(varname,compname,i,up_col) for i in range(0,2)])
print('Aggregate cv after decomposition:',model_selection.cross_val_score(
    cv=4,estimator=linear_model.LinearRegression(),
    X=reduced_train,y=train.deal_probability))
#########
# Last save to disk
joblib.dump(reduced_train,'train_{}_{}.sav'.format(varname,compname))
joblib.dump(reduced_test,'test_{}_{}.sav'.format(varname,compname))
#########
print('Minutes:',(time.time()-t)/60)

Columns: 0-1500
Prelim score for column range: 0.11858489496328795
Aggregate cv: [0.10933183 0.10539125 0.10812998 0.109473  ]
Aggregate score for up range: [0.05540884 0.05498032 0.05727032 0.05326966]
(1503424, 2) (508438, 2)
Columns: 1500-3000
Prelim score for column range: 0.02710887262936812
Aggregate cv: [0.1210894  0.11648385 0.11984331 0.12060689]
Aggregate score for up range: [0.06668404 0.06648953 0.06898359 0.06483578]
(1503424, 4) (508438, 4)
Columns: 3000-4500
Prelim score for column range: 0.01726824714411901
Aggregate cv: [0.1264256  0.12211052 0.12501362 0.12600558]
Aggregate score for up range: [0.07270501 0.07218492 0.07495111 0.07031932]
(1503424, 6) (508438, 6)
Columns: 4500-6000
Prelim score for column range: 0.013728873916658002
Aggregate cv: [0.1291012  0.12516947 0.12816621 0.12895467]
Aggregate score for up range: [0.07515278 0.07436282 0.07774894 0.07348382]
(1503424, 8) (508438, 8)
Columns: 6000-7500
Prelim score for column range: 0.012114271209859973
Aggrega

---
> RESTART THE KERNEL
---