# Get Vocabularies for Target Ranges

- Include Count and IDF vocabularies.
- Get vocabs for titles and descriptions.

# Title Vocabs

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import nltk
from sklearn import preprocessing, model_selection, metrics, feature_selection, ensemble, linear_model, cross_decomposition, feature_extraction, decomposition
import time
from sklearn.externals import joblib
%matplotlib inline

In [2]:
# Load the zip-compressed train and test pickles (two directories up)
train, test = (
    pd.read_pickle(path, compression='zip')
    for path in ('../../train.pkl', '../../test.pkl'))
# Russian stopword list from the NLTK corpus
ru_stop = nltk.corpus.stopwords.words('russian')

In [3]:
# Split the train index labels into three discrete target ranges:
# exactly zero, strictly between 0 and 0.65, and 0.65 or above.
i_0 = train.index[train.deal_probability.eq(0)].tolist()
i_low = train.index[
    train.deal_probability.gt(0) & train.deal_probability.lt(0.65)].tolist()
i_up = train.index[train.deal_probability.ge(0.65)].tolist()

In [4]:
# Make count and idf vocabularies for target ranges on a given variable
var = 'title'
#####################################
# One long "document" per target range: all texts of that range joined by spaces.
upper_str = ' '.join(train.loc[i_up,var].astype(str).values)
lower_str = ' '.join(train.loc[i_low,var].astype(str).values)
zeroes_str = ' '.join(train.loc[i_0,var].astype(str).values)
# Document order fixes the column meaning below: 0 = zero, 1 = low, 2 = up.
range_docs = [zeroes_str,lower_str,upper_str]

# Get dictionaries from both count and idf vectorizers.
count = feature_extraction.text.CountVectorizer(
    stop_words=ru_stop,
    lowercase=False)
idf = feature_extraction.text.TfidfVectorizer(
    stop_words=ru_stop,
    lowercase=False)

vecs ={'count':count,'idf':idf}
#####################
for key, vec in vecs.items():
    # fit_transform tokenises the three documents once instead of twice
    weights = vec.fit_transform(range_docs)

    # vocabulary_ maps term -> column position; sorting the terms by that
    # position lines them up with the columns of `weights`.
    terms = sorted(vec.vocabulary_, key=vec.vocabulary_.get)

    # Dense (n_terms x 3) table: terms on the index, one column per target range
    counts = pd.DataFrame(weights.toarray().T, index=terms)

    # Indicator of the range where each term has its highest value.
    # Vectorised argmax; ties resolve to the first (lowest) range, exactly as a
    # per-row np.argmax would.
    counts['group'] = counts.values.argmax(axis=1)

    # Terms of each group, ordered by their value in that group's own column
    zero_vocab, lower_vocab, upper_vocab = (
        counts[counts.group == g].sort_values(by=g,ascending=False).index.tolist()
        for g in range(3))

    # Lists have different lengths; the shorter columns end up padded with
    # missing values after the transpose.
    vocabs = pd.DataFrame([zero_vocab,lower_vocab,upper_vocab]).T
    vocabs.columns=['zero_voc','low_voc','up_voc']

    vocabs.to_pickle('{}_vocabs_{}.pkl'.format(var[:5],key))

---
> RESTART KERNEL
---

# Description Vocabs

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import nltk
from sklearn import preprocessing, model_selection, metrics, feature_selection, ensemble, linear_model, cross_decomposition, feature_extraction, decomposition
import time
from sklearn.externals import joblib
%matplotlib inline

In [8]:
# Load the zip-compressed train and test pickles (two directories up)
train, test = (
    pd.read_pickle(path, compression='zip')
    for path in ('../../train.pkl', '../../test.pkl'))
# Russian stopword list from the NLTK corpus
ru_stop = nltk.corpus.stopwords.words('russian')

In [9]:
# Split the train index labels into three discrete target ranges:
# exactly zero, strictly between 0 and 0.65, and 0.65 or above.
i_0 = train.index[train.deal_probability.eq(0)].tolist()
i_low = train.index[
    train.deal_probability.gt(0) & train.deal_probability.lt(0.65)].tolist()
i_up = train.index[train.deal_probability.ge(0.65)].tolist()

In [11]:
# Make count and idf vocabularies for target ranges on a given variable
var = 'description'
#####################################
# One long "document" per target range: all texts of that range joined by spaces.
upper_str = ' '.join(train.loc[i_up,var].astype(str).values)
lower_str = ' '.join(train.loc[i_low,var].astype(str).values)
zeroes_str = ' '.join(train.loc[i_0,var].astype(str).values)
# Document order fixes the column meaning below: 0 = zero, 1 = low, 2 = up.
range_docs = [zeroes_str,lower_str,upper_str]

# Get dictionaries from both count and idf vectorizers.
count = feature_extraction.text.CountVectorizer(
    stop_words=ru_stop,
    lowercase=False)
idf = feature_extraction.text.TfidfVectorizer(
    stop_words=ru_stop,
    lowercase=False)

vecs ={'count':count,'idf':idf}
#####################
for key, vec in vecs.items():
    # fit_transform tokenises the three documents once instead of twice
    weights = vec.fit_transform(range_docs)

    # vocabulary_ maps term -> column position; sorting the terms by that
    # position lines them up with the columns of `weights`.
    terms = sorted(vec.vocabulary_, key=vec.vocabulary_.get)

    # Dense (n_terms x 3) table: terms on the index, one column per target range
    counts = pd.DataFrame(weights.toarray().T, index=terms)

    # Indicator of the range where each term has its highest value.
    # Vectorised argmax; ties resolve to the first (lowest) range, exactly as a
    # per-row np.argmax would.
    counts['group'] = counts.values.argmax(axis=1)

    # Terms of each group, ordered by their value in that group's own column
    zero_vocab, lower_vocab, upper_vocab = (
        counts[counts.group == g].sort_values(by=g,ascending=False).index.tolist()
        for g in range(3))

    # Lists have different lengths; the shorter columns end up padded with
    # missing values after the transpose.
    vocabs = pd.DataFrame([zero_vocab,lower_vocab,upper_vocab]).T
    vocabs.columns=['zero_voc','low_voc','up_voc']

    vocabs.to_pickle('{}_vocabs_{}.pkl'.format(var[:5],key))

---
> RESTART THE KERNEL
---

# Cross-Decomposition per Vocab

Because of memory limitations, only one vocabulary can be decomposed at a time. Before each run, define:
- the variable
- the vocabulary kind
- the target range it represents.

## Title var- Count voc- Zero Range

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import nltk
from sklearn import preprocessing, model_selection, metrics, feature_selection, ensemble, linear_model, cross_decomposition, feature_extraction, decomposition
import time
from sklearn.externals import joblib
%matplotlib inline

In [18]:
# Load the zip-compressed train and test pickles (two directories up)
train, test = (
    pd.read_pickle(path, compression='zip')
    for path in ('../../train.pkl', '../../test.pkl'))
# Russian stopword list from the NLTK corpus
ru_stop = nltk.corpus.stopwords.words('russian')

In [18]:
# Split the train index labels into three discrete target ranges:
# exactly zero, strictly between 0 and 0.65, and 0.65 or above.
i_0 = train.index[train.deal_probability.eq(0)].tolist()
i_low = train.index[
    train.deal_probability.gt(0) & train.deal_probability.lt(0.65)].tolist()
i_up = train.index[train.deal_probability.ge(0.65)].tolist()

In [43]:
# Define variable, discrete-range, voc-origin, and component-name
var = 'title' # title or description
rnge = 'zero' # zero, low or up
voc_kind = 'count' # count or idf
compname = 'zero_cnt_voc' # custom name for component
# Index labels of the chosen target range. Looked up from `rnge` so the range
# name and its index list cannot drift apart; an unknown name raises KeyError.
irange = {'zero': i_0, 'low': i_low, 'up': i_up}[rnge]

In [58]:
# Read the vocabulary table saved for this variable / vectorizer kind, keep the
# column of the chosen target range, drop missing entries and cap the result
# at the first 67000 terms (a pandas Series).
voc_path = '{}_vocabs_{}.pkl'.format(var[:5],voc_kind)
voc_column = '{}_voc'.format(rnge)
voc = pd.read_pickle(voc_path)[voc_column].dropna()[:67000]

In [59]:
# TF-IDF vectorizer restricted to the vocabulary selected above
vec = feature_extraction.text.TfidfVectorizer(
    stop_words=ru_stop,
    lowercase=False,
    vocabulary=voc)
# Fitting on train and test as merged lists
vec.fit(train[var].astype(str).tolist() + test[var].astype(str).tolist())
# vocabulary_ holds one entry per feature, so its length is the token count.
# Used instead of get_feature_names(), which newer scikit-learn releases removed.
print('N tokens:',len(vec.vocabulary_))

N tokens: 67000


In [60]:
#### Sparse TF-IDF matrices for train and test, built with the fitted vectorizer
counts_train, counts_test = (
    vec.transform(frame[var].astype(str).tolist())
    for frame in (train, test))

In [61]:
#### Start from column-less frames that only carry the train / test indexes
reduced_train, reduced_test = (
    pd.DataFrame(index=frame.index) for frame in (train, test))

In [None]:
# Reduce all CSR values in batches
def _reduce_rows_in_batches(reducer, matrix, low_col, up_col, row_step, index):
    """Project matrix[:, low_col:up_col] through a fitted reducer, batch by batch.

    Only `row_step` rows are densified at a time. The per-batch scores are
    stacked once at the end (DataFrame.append in a loop is quadratic and no
    longer exists in pandas 2.0).

    Returns a DataFrame indexed by `index` with one 'col_{low}-{up}_{i}'
    column per component.
    """
    n_rows = matrix.shape[0]
    parts = [
        reducer.transform(
            matrix[lo:min(lo + row_step, n_rows), low_col:up_col].toarray())
        for lo in range(0, n_rows, row_step)]
    scores = np.vstack(parts)
    return pd.DataFrame(
        scores, index=index,
        columns=['col_{}-{}_{}'.format(low_col,up_col,i)
                 for i in range(scores.shape[1])])


def _decompose_aggregate(agg_train, agg_test, target, varname, compname, up_col, n_comp):
    """Compress the accumulated component columns back down with a fresh PLSR.

    The reducer is fitted on the train aggregate against `target` and applied
    to both aggregates. Both returned frames keep their original index so
    later joins still align. Prints the 4-fold CV score of the new train frame.
    """
    print('Decomposing Aggregate...')
    k = min(n_comp, agg_train.shape[1])
    reducer = cross_decomposition.PLSRegression(n_components=k)
    reducer.fit(agg_train,target)
    names = ['{}_{}_{}_{}'.format(varname,compname,i,up_col) for i in range(0,k)]
    new_train = pd.DataFrame(
        reducer.transform(agg_train), index=agg_train.index, columns=names)
    new_test = pd.DataFrame(
        reducer.transform(agg_test), index=agg_test.index, columns=names)
    print('Aggregate cv after decomposition:',model_selection.cross_val_score(
        cv=4,estimator=linear_model.LinearRegression(),
        X=new_train,y=target))
    return new_train, new_test


t = time.time()
start_col = 0
varname = var[:5]
##########################################
n_cols = counts_train.shape[1]
col_step = 1500            # vocabulary columns reduced per PLSR fit
n_comp = 5                 # components kept per column range and per aggregate
fit_sample_size = int(4e5) # rows drawn (with replacement) to fit each PLSR
train_row_step = int(2.5e5)
test_row_step = int(2e5)
rng = np.random.RandomState(0)  # seeded so the row samples are reproducible
target = train.deal_probability
# Rows of train that belong to the configured target range (`irange` labels)
range_mask = train.index.isin(irange)
##########################################
# Start iteration with columns. Boundaries sit on multiples of col_step, with
# the last one clamped to n_cols.
low_col = start_col
for col in range(col_step, n_cols + col_step, col_step):
    up_col = min(col, n_cols)
    if up_col <= low_col:
        continue
    ###########################################
    # Train PLSR on a large sample of train vectors
    print('Columns: {}-{}'.format(low_col,up_col))
    index = rng.choice(len(train),size=fit_sample_size)
    sample = counts_train[index,low_col:up_col].toarray()
    y_sample = target.iloc[index]
    # PLSR cannot extract more components than there are columns in the range
    reduce = cross_decomposition.PLSRegression(
        n_components=min(n_comp, up_col - low_col))
    reduce.fit(sample,y_sample)
    print('Prelim score for column range:',reduce.score(sample,y_sample))
    ##########################################
    # (TRAIN) project every row, in row batches
    components = _reduce_rows_in_batches(
        reduce, counts_train, low_col, up_col, train_row_step, reduced_train.index)
    reduced_train = reduced_train.join(components)
    print('Aggregate cv:',model_selection.cross_val_score(
        cv=4,estimator=linear_model.LinearRegression(),
        X=reduced_train,y=target))
    ###########################################
    # (TEST) project every row, in row batches
    components = _reduce_rows_in_batches(
        reduce, counts_test, low_col, up_col, test_row_step, reduced_test.index)
    reduced_test = reduced_test.join(components)
    #####################################
    # Prepare for next column range
    low_col = up_col
    #####################################
    # Decompose aggregate every n steps
    if up_col%(col_step*10) == 0:
        reduced_train, reduced_test = _decompose_aggregate(
            reduced_train, reduced_test, target, varname, compname, up_col, n_comp)
        # Score on the rows of the configured target range only
        print('Aggregate score for target range:',model_selection.cross_val_score(
            cv=4,estimator=linear_model.LinearRegression(),
            X=reduced_train.loc[range_mask],y=target[range_mask]))
    #####################################
    # Save progress every n steps
    if up_col%(col_step*5) == 0:
        joblib.dump(reduced_train,'train_{}_{}.sav'.format(varname,compname))
        joblib.dump(reduced_test,'test_{}_{}.sav'.format(varname,compname))
#####################################
# One final round of decomposition to n components
if reduced_train.shape[1] > n_comp:
    reduced_train, reduced_test = _decompose_aggregate(
        reduced_train, reduced_test, target, varname, compname, up_col, n_comp)
#########
# Last save to disk
joblib.dump(reduced_train,'train_{}_{}.sav'.format(varname,compname))
joblib.dump(reduced_test,'test_{}_{}.sav'.format(varname,compname))
#########
print('Minutes:',(time.time()-t)/60)