In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

import nltk
import cyrtranslit
from sklearn import preprocessing, model_selection, metrics, feature_selection, ensemble, linear_model, cross_decomposition, feature_extraction, decomposition
from sklearn.pipeline import Pipeline
from scipy import stats

import lightgbm as lgb
import time
# Default seaborn colour palette.
color = sns.color_palette()

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
# Load the train and test frames from zip-compressed pickles two directories up.
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
train = pd.read_pickle('../../train.pkl',compression='zip')

test = pd.read_pickle('../../test.pkl',compression='zip')

In [3]:
# Russian stop-word list from NLTK; passed as `stop_words` to every vectorizer below.
ru_stop = nltk.corpus.stopwords.words('russian')

In [4]:
# Index labels of train rows split by target range:
# exactly zero, strictly between 0 and 0.65, and 0.65 or above.
i_0 = train.index[train['deal_probability'].eq(0)].tolist()
i_low = train.index[
    train['deal_probability'].gt(0) & train['deal_probability'].lt(0.65)
].tolist()
i_up = train.index[train['deal_probability'].ge(0.65)].tolist()

---

In [35]:
# Empty frame; the cells below add one column per vocabulary (upvoc, lowvoc, zerovoc).
# NOTE(review): this cell carries execution count 35 although it sits before In [5] —
# it was run out of order; re-check with Restart & Run All.
sumsdf = pd.DataFrame()

In [5]:
# Vocabulary table; its up_voc / low_voc / zero_voc columns are used (after dropna)
# as fixed vocabularies for the vectorizers below.
vocabs = pd.read_pickle('vocabs_count.pkl')

# TF-IDF Sums over the Upper Vocabulary

In [6]:
# TF-IDF vectorizer restricted to the fixed `up_voc` vocabulary (NaN padding dropped).
# Fitted on train + test titles together.
vec = feature_extraction.text.TfidfVectorizer(
    stop_words=ru_stop,
    lowercase=False,
    vocabulary=vocabs.up_voc.dropna()
)
vec.fit(train['title'].astype(str).tolist()+test['title'].astype(str).tolist())
# get_feature_names() was removed in scikit-learn 1.2; the fitted vocabulary_
# mapping has the same number of entries and exists in every release.
print(len(vec.vocabulary_))

16262


In [7]:
# Per-title sum of the TF-IDF weights over the upper vocabulary (train rows only).
counts = vec.transform(train['title'].astype(str).tolist())

# Row sums come back as an (n, 1) matrix; flatten it to one scalar per title.
sums = counts.sum(axis=1).tolist()

sumsdf['upvoc'] = [total for (total,) in sums]

# TF-IDF Sums over the Lower Vocabulary

In [42]:
# TF-IDF vectorizer restricted to the fixed `low_voc` vocabulary (NaN padding dropped).
# Fitted on train + test titles together.
vec = feature_extraction.text.TfidfVectorizer(
    stop_words=ru_stop,
    lowercase=False,
    vocabulary=vocabs.low_voc.dropna()
)
vec.fit(train['title'].astype(str).tolist()+test['title'].astype(str).tolist())
# get_feature_names() was removed in scikit-learn 1.2; the fitted vocabulary_
# mapping has the same number of entries and exists in every release.
print(len(vec.vocabulary_))

41218


In [43]:
# Per-title sum of the TF-IDF weights over the lower vocabulary (train rows only).
counts = vec.transform(train['title'].astype(str).tolist())

# Row sums come back as an (n, 1) matrix; flatten it to one scalar per title.
sums = counts.sum(axis=1).tolist()

sumsdf['lowvoc'] = [total for (total,) in sums]

# TF-IDF Sums over the Zero Vocabulary

In [44]:
# TF-IDF vectorizer restricted to the fixed `zero_voc` vocabulary (NaN padding dropped).
# Fitted on train + test titles together.
vec = feature_extraction.text.TfidfVectorizer(
    stop_words=ru_stop,
    lowercase=False,
    vocabulary=vocabs.zero_voc.dropna()
)
vec.fit(train['title'].astype(str).tolist()+test['title'].astype(str).tolist())
# get_feature_names() was removed in scikit-learn 1.2; the fitted vocabulary_
# mapping has the same number of entries and exists in every release.
print(len(vec.vocabulary_))

197377


In [45]:
# Per-title sum of the TF-IDF weights over the zero vocabulary (train rows only).
counts = vec.transform(train['title'].astype(str).tolist())

# Row sums come back as an (n, 1) matrix; flatten it to one scalar per title.
sums = counts.sum(axis=1).tolist()

sumsdf['zerovoc'] = [total for (total,) in sums]

In [46]:
# Preview only the first rows instead of rendering the whole frame.
sumsdf.head(10)

Unnamed: 0,upvoc,lowvoc,zerovoc
0,0.0,0.000000,1.713905
1,0.0,0.000000,1.399713
2,0.0,1.000000,1.000000
3,0.0,0.000000,1.000000
4,0.0,1.718149,0.000000
5,0.0,0.000000,1.414151
6,0.0,0.000000,2.209076
7,0.0,0.000000,1.414052
8,0.0,0.000000,1.000000
9,0.0,0.000000,2.212945


In [68]:
# Persist the TF-IDF sums (count-derived vocabularies) as a zip-compressed pickle;
# reloaded near the end of the notebook as `sums_idf`.
sumsdf.to_pickle('sums_title_idf.pkl',compression='zip')

In [47]:
# 4-fold CV of a plain linear regression on the three sum columns alone,
# scored with the estimator's default metric.
print(
    'Aggregate score:',
    model_selection.cross_val_score(
        linear_model.LinearRegression(),
        sumsdf,
        y=train.deal_probability,
        cv=4,
    ),
)

Aggregate score: [0.12570183 0.12225722 0.12447048 0.12691567]


In [48]:
# Same 4-fold CV on the sum columns, reported as per-fold mean squared error.
print(
    'Aggregate score:',
    model_selection.cross_val_score(
        linear_model.LinearRegression(),
        sumsdf,
        y=train.deal_probability,
        scoring=metrics.make_scorer(metrics.mean_squared_error),
        cv=4,
    ),
)

Aggregate score: [0.05907549 0.05921495 0.05917784 0.05931771]


---





---

## Aggregate with Components of Other Ranges

In [55]:
# Base feature frame produced by the sibling "3.NLP StopWords NGrams" step.
features = pd.read_pickle('../3.NLP StopWords NGrams/train_nlp_features7.pkl',compression='zip')

In [50]:
# Precomputed zero-range title features (zip-compressed pickle).
fzero_counts = pd.read_pickle('zero_titles_counts.pkl',compression='zip')

In [51]:
# Precomputed lower-range title features (note the file name is singular: "..._count.pkl").
flow_counts = pd.read_pickle('lower_titles_count.pkl',compression='zip')

In [53]:
# Precomputed upper-range title features (zip-compressed pickle).
fup_counts = pd.read_pickle('upper_titles_counts.pkl',compression='zip')

In [52]:
# Precomputed categorical features (zip-compressed pickle).
categorical = pd.read_pickle('categorical.pkl',compression='zip')

In [56]:
# Index-aligned join of the title-range and categorical blocks onto the base features.
features = (
    features
    .join(fzero_counts)
    .join(flow_counts)
    .join(fup_counts)
    .join(categorical)
)

In [58]:
# 4-fold CV of a plain linear regression on the current feature frame,
# scored with the estimator's default metric.
print(
    'Aggregate score:',
    model_selection.cross_val_score(
        linear_model.LinearRegression(),
        features,
        y=train.deal_probability,
        cv=4,
    ),
)

Aggregate score: [0.2323735  0.22674674 0.23056228 0.23010993]


In [59]:
# Same 4-fold CV on the feature frame, reported as per-fold mean squared error.
print(
    'Aggregate score:',
    model_selection.cross_val_score(
        linear_model.LinearRegression(),
        features,
        y=train.deal_probability,
        scoring=metrics.make_scorer(metrics.mean_squared_error),
        cv=4,
    ),
)

Aggregate score: [0.05186779 0.0521658  0.052007   0.05230665]


In [60]:
# RMSE summary: square root of the mean of the four per-fold MSE values.
cv = model_selection.cross_val_score(
    linear_model.LinearRegression(),
    features,
    y=train.deal_probability,
    scoring=metrics.make_scorer(metrics.mean_squared_error),
    cv=4,
)
print(cv.mean() ** .5)

0.22822534533027566


In [61]:
# Append the three vocabulary-sum columns; join aligns on the row index.
# NOTE(review): sumsdf carries a default 0..n-1 index — confirm `features` is indexed the same way.
# Not idempotent: re-running on the already-joined frame fails on overlapping column names.
features = features.join(sumsdf)

In [62]:
# 4-fold CV of a plain linear regression on the feature frame with the sum columns added,
# scored with the estimator's default metric.
print(
    'Aggregate score:',
    model_selection.cross_val_score(
        linear_model.LinearRegression(),
        features,
        y=train.deal_probability,
        cv=4,
    ),
)

Aggregate score: [0.25905851 0.25332965 0.25733987 0.25902115]


In [63]:
# Same 4-fold CV on the extended feature frame, reported as per-fold mean squared error.
print(
    'Aggregate score:',
    model_selection.cross_val_score(
        linear_model.LinearRegression(),
        features,
        y=train.deal_probability,
        scoring=metrics.make_scorer(metrics.mean_squared_error),
        cv=4,
    ),
)

Aggregate score: [0.05006471 0.05037244 0.05019708 0.05034241]


In [64]:
# RMSE summary: square root of the mean of the four per-fold MSE values.
cv = model_selection.cross_val_score(
    linear_model.LinearRegression(),
    features,
    y=train.deal_probability,
    scoring=metrics.make_scorer(metrics.mean_squared_error),
    cv=4,
)
print(cv.mean() ** .5)

0.22415208804343606


In [65]:
# Sanity check: list the column names after all joins.
features.columns

Index(['title_desc_0', 'title_desc_1', 'title_desc_2', 'title_desc_3',
       'title_desc_4', 'title_desc_5', 'title_desc_6', 'title_desc_7',
       'title_desc_8', 'title_desc_9', 'zero_titles0', 'zero_titles1',
       'zero_titles2', 'zero_titles3', 'zero_titles4', 'zero_titles5',
       'zero_titles6', 'zero_titles7', 'zero_titles8', 'zero_titles9',
       'lower_titles0', 'lower_titles1', 'lower_titles2', 'lower_titles3',
       'lower_titles4', 'upper_titles0', 'upper_titles1', 'upper_titles2',
       'upper_titles3', 'upper_titles4', 'cat_0', 'cat_1', 'cat_2', 'cat_3',
       'cat_4', 'cat_5', 'cat_6', 'cat_7', 'cat_8', 'cat_9', 'upvoc', 'lowvoc',
       'zerovoc'],
      dtype='object')

In [66]:
# Persist the combined frame. NOTE: 'allfeatures.pkl' is overwritten again further down
# with additional columns, so its contents depend on how far the notebook was run.
features.to_pickle('allfeatures.pkl',compression='zip')

---

# Compare with Counts instead of IDF

In [69]:
# Start a fresh frame for the raw-count variant; this discards the TF-IDF sums held in memory.
sumsdf = pd.DataFrame()

In [70]:
# Reload the same vocabulary table as in the TF-IDF section above.
vocabs = pd.read_pickle('vocabs_count.pkl')

# Counts of Upper Vocabulary

In [71]:
# Raw-count vectorizer restricted to the fixed `up_voc` vocabulary (NaN padding dropped).
vec = feature_extraction.text.CountVectorizer(
    stop_words=ru_stop,
    lowercase=False,
    vocabulary=vocabs.up_voc.dropna()
)
vec.fit(train['title'].astype(str).tolist()+test['title'].astype(str).tolist())
# get_feature_names() was removed in scikit-learn 1.2; the fitted vocabulary_
# mapping has the same number of entries and exists in every release.
print(len(vec.vocabulary_))

16262


In [72]:
# Per-title number of token occurrences that fall in the upper vocabulary (train rows only).
counts = vec.transform(train['title'].astype(str).tolist())

# Row sums come back as an (n, 1) matrix; flatten it to one scalar per title.
sums = counts.sum(axis=1).tolist()

sumsdf['upvoc'] = [total for (total,) in sums]

# Counts of Lower Vocabulary

In [73]:
# Raw-count vectorizer restricted to the fixed `low_voc` vocabulary (NaN padding dropped).
vec = feature_extraction.text.CountVectorizer(
    stop_words=ru_stop,
    lowercase=False,
    vocabulary=vocabs.low_voc.dropna()
)
vec.fit(train['title'].astype(str).tolist()+test['title'].astype(str).tolist())
# get_feature_names() was removed in scikit-learn 1.2; the fitted vocabulary_
# mapping has the same number of entries and exists in every release.
print(len(vec.vocabulary_))

41218


In [74]:
# Per-title number of token occurrences that fall in the lower vocabulary (train rows only).
counts = vec.transform(train['title'].astype(str).tolist())

# Row sums come back as an (n, 1) matrix; flatten it to one scalar per title.
sums = counts.sum(axis=1).tolist()

sumsdf['lowvoc'] = [total for (total,) in sums]

# Counts of Zero Vocabulary

In [78]:
# Raw-count vectorizer restricted to the fixed `zero_voc` vocabulary (NaN padding dropped).
vec = feature_extraction.text.CountVectorizer(
    stop_words=ru_stop,
    lowercase=False,
    vocabulary=vocabs.zero_voc.dropna()
)
vec.fit(train['title'].astype(str).tolist()+test['title'].astype(str).tolist())
# get_feature_names() was removed in scikit-learn 1.2; the fitted vocabulary_
# mapping has the same number of entries and exists in every release.
print(len(vec.vocabulary_))

197377


In [79]:
# Per-title number of token occurrences that fall in the zero vocabulary (train rows only).
counts = vec.transform(train['title'].astype(str).tolist())

# Row sums come back as an (n, 1) matrix; flatten it to one scalar per title.
sums = counts.sum(axis=1).tolist()

sumsdf['zerovoc'] = [total for (total,) in sums]

In [80]:
# Preview only the first rows instead of rendering the whole frame.
sumsdf.head(10)

Unnamed: 0,upvoc,lowvoc,zerovoc
0,0,0,3
1,0,0,2
2,0,1,1
3,0,0,1
4,0,3,0
5,0,0,2
6,0,0,5
7,0,0,2
8,0,0,1
9,0,0,5


In [81]:
# Persist the raw-count sums (count-derived vocabularies) as a zip-compressed pickle;
# reloaded near the end of the notebook as `sums_count`.
sumsdf.to_pickle('sums_title_counts.pkl',compression='zip')

In [82]:
# 4-fold CV of a plain linear regression on the three count columns alone,
# scored with the estimator's default metric.
print(
    'Aggregate score:',
    model_selection.cross_val_score(
        linear_model.LinearRegression(),
        sumsdf,
        y=train.deal_probability,
        cv=4,
    ),
)

Aggregate score: [0.11156719 0.10877588 0.1110338  0.11331196]


In [83]:
# Same 4-fold CV on the count columns, reported as per-fold mean squared error.
print(
    'Aggregate score:',
    model_selection.cross_val_score(
        linear_model.LinearRegression(),
        sumsdf,
        y=train.deal_probability,
        scoring=metrics.make_scorer(metrics.mean_squared_error),
        cv=4,
    ),
)

Aggregate score: [0.06003055 0.06012444 0.06008604 0.06024195]


---





---

## Aggregate with Components of Other Ranges

In [84]:
# Reload the base feature frame from disk, discarding the in-memory joined version.
features = pd.read_pickle('../3.NLP StopWords NGrams/train_nlp_features7.pkl',compression='zip')

In [85]:
# Precomputed zero-range title features (zip-compressed pickle).
fzero_counts = pd.read_pickle('zero_titles_counts.pkl',compression='zip')

In [86]:
# Precomputed lower-range title features (note the file name is singular: "..._count.pkl").
flow_counts = pd.read_pickle('lower_titles_count.pkl',compression='zip')

In [87]:
# Precomputed upper-range title features (zip-compressed pickle).
fup_counts = pd.read_pickle('upper_titles_counts.pkl',compression='zip')

In [88]:
# Precomputed categorical features (zip-compressed pickle).
categorical = pd.read_pickle('categorical.pkl',compression='zip')

In [89]:
# Index-aligned join of the title-range and categorical blocks onto the base features.
features = (
    features
    .join(fzero_counts)
    .join(flow_counts)
    .join(fup_counts)
    .join(categorical)
)

In [90]:
# 4-fold CV of a plain linear regression on the current feature frame,
# scored with the estimator's default metric.
print(
    'Aggregate score:',
    model_selection.cross_val_score(
        linear_model.LinearRegression(),
        features,
        y=train.deal_probability,
        cv=4,
    ),
)

Aggregate score: [0.2323735  0.22674674 0.23056228 0.23010993]


In [91]:
# Same 4-fold CV on the feature frame, reported as per-fold mean squared error.
print(
    'Aggregate score:',
    model_selection.cross_val_score(
        linear_model.LinearRegression(),
        features,
        y=train.deal_probability,
        scoring=metrics.make_scorer(metrics.mean_squared_error),
        cv=4,
    ),
)

Aggregate score: [0.05186779 0.0521658  0.052007   0.05230665]


In [92]:
# RMSE summary: square root of the mean of the four per-fold MSE values.
cv = model_selection.cross_val_score(
    linear_model.LinearRegression(),
    features,
    y=train.deal_probability,
    scoring=metrics.make_scorer(metrics.mean_squared_error),
    cv=4,
)
print(cv.mean() ** .5)

0.22822534533027566


In [93]:
# Append the three raw-count columns; join aligns on the row index.
# NOTE(review): sumsdf carries a default 0..n-1 index — confirm `features` is indexed the same way.
# Not idempotent: re-running on the already-joined frame fails on overlapping column names.
features = features.join(sumsdf)

In [94]:
# 4-fold CV of a plain linear regression on the feature frame with the count columns added,
# scored with the estimator's default metric.
print(
    'Aggregate score:',
    model_selection.cross_val_score(
        linear_model.LinearRegression(),
        features,
        y=train.deal_probability,
        cv=4,
    ),
)

Aggregate score: [0.25620338 0.25057115 0.25474513 0.25611915]


In [95]:
# Same 4-fold CV on the extended feature frame, reported as per-fold mean squared error.
print(
    'Aggregate score:',
    model_selection.cross_val_score(
        linear_model.LinearRegression(),
        features,
        y=train.deal_probability,
        scoring=metrics.make_scorer(metrics.mean_squared_error),
        cv=4,
    ),
)

Aggregate score: [0.05025762 0.05055854 0.05037246 0.05053957]


In [96]:
# RMSE summary: square root of the mean of the four per-fold MSE values.
cv = model_selection.cross_val_score(
    linear_model.LinearRegression(),
    features,
    y=train.deal_probability,
    scoring=metrics.make_scorer(metrics.mean_squared_error),
    cv=4,
)
print(cv.mean() ** .5)

0.22457080899323661


- The IDF sums scored slightly better than the raw counts here (RMSE ≈ 0.2242 vs. ≈ 0.2246).

In [97]:
# Reload the frame saved in the TF-IDF section (base features + TF-IDF sum columns).
features = pd.read_pickle('allfeatures.pkl',compression='zip')

In [99]:
# Add the raw-count columns next to the TF-IDF ones; the clashing names from sumsdf
# get the suffix 'counts' (upvoccounts, lowvoccounts, zerovoccounts).
# NOTE(review): join aligns on index — confirm both frames share the same row index.
features = features.join(sumsdf,rsuffix='counts')

In [100]:
# 4-fold CV of a plain linear regression with both TF-IDF and count sums present,
# scored with the estimator's default metric.
print(
    'Aggregate score:',
    model_selection.cross_val_score(
        linear_model.LinearRegression(),
        features,
        y=train.deal_probability,
        cv=4,
    ),
)

Aggregate score: [0.25908394 0.25339694 0.2574272  0.25910756]


In [101]:
# Same 4-fold CV on the combined frame, reported as per-fold mean squared error.
print(
    'Aggregate score:',
    model_selection.cross_val_score(
        linear_model.LinearRegression(),
        features,
        y=train.deal_probability,
        scoring=metrics.make_scorer(metrics.mean_squared_error),
        cv=4,
    ),
)

Aggregate score: [0.05006299 0.0503679  0.05019117 0.05033654]


In [102]:
# RMSE summary: square root of the mean of the four per-fold MSE values.
cv = model_selection.cross_val_score(
    linear_model.LinearRegression(),
    features,
    y=train.deal_probability,
    scoring=metrics.make_scorer(metrics.mean_squared_error),
    cv=4,
)
print(cv.mean() ** .5)

0.22414203312793293


- So using both is better, though only marginally (RMSE ≈ 0.22414 vs. ≈ 0.22415 with the IDF sums alone). These features are easy to make.

In [103]:
# Sanity check: list the column names, including the suffixed count columns.
features.columns

Index(['title_desc_0', 'title_desc_1', 'title_desc_2', 'title_desc_3',
       'title_desc_4', 'title_desc_5', 'title_desc_6', 'title_desc_7',
       'title_desc_8', 'title_desc_9', 'zero_titles0', 'zero_titles1',
       'zero_titles2', 'zero_titles3', 'zero_titles4', 'zero_titles5',
       'zero_titles6', 'zero_titles7', 'zero_titles8', 'zero_titles9',
       'lower_titles0', 'lower_titles1', 'lower_titles2', 'lower_titles3',
       'lower_titles4', 'upper_titles0', 'upper_titles1', 'upper_titles2',
       'upper_titles3', 'upper_titles4', 'cat_0', 'cat_1', 'cat_2', 'cat_3',
       'cat_4', 'cat_5', 'cat_6', 'cat_7', 'cat_8', 'cat_9', 'upvoc', 'lowvoc',
       'zerovoc', 'upvoccounts', 'lowvoccounts', 'zerovoccounts'],
      dtype='object')

# Overwrites the 'allfeatures.pkl' saved earlier, now including the *counts columns.
# NOTE(review): this cell has no execution count in the export — confirm it was actually run.
features.to_pickle('allfeatures.pkl',compression='zip')

# Use IDF Vocabs

In [105]:
# Start a fresh frame for the TF-IDF sums over the IDF-derived vocabularies.
sumsdf = pd.DataFrame()

In [106]:
# Switch to the other vocabulary table ('vocabs.pkl' rather than 'vocabs_count.pkl');
# the same up_voc / low_voc / zero_voc columns are read from it below.
vocabs = pd.read_pickle('vocabs.pkl')

# IDF of Upper-(IDF)-Vocabulary

In [109]:
# TF-IDF vectorizer restricted to the fixed `up_voc` vocabulary (NaN padding dropped).
# Fitted on train + test titles together.
vec = feature_extraction.text.TfidfVectorizer(
    stop_words=ru_stop,
    lowercase=False,
    vocabulary=vocabs.up_voc.dropna()
)
vec.fit(train['title'].astype(str).tolist()+test['title'].astype(str).tolist())
# get_feature_names() was removed in scikit-learn 1.2; the fitted vocabulary_
# mapping has the same number of entries and exists in every release.
print(len(vec.vocabulary_))

46459


In [110]:
# Per-title sum of the TF-IDF weights over the upper vocabulary (train rows only).
counts = vec.transform(train['title'].astype(str).tolist())

# Row sums come back as an (n, 1) matrix; flatten it to one scalar per title.
sums = counts.sum(axis=1).tolist()

sumsdf['upvoc'] = [total for (total,) in sums]

# IDF of Lower-(IDF)-Vocabulary

In [111]:
# TF-IDF vectorizer restricted to the fixed `low_voc` vocabulary (NaN padding dropped).
# Fitted on train + test titles together.
vec = feature_extraction.text.TfidfVectorizer(
    stop_words=ru_stop,
    lowercase=False,
    vocabulary=vocabs.low_voc.dropna()
)
vec.fit(train['title'].astype(str).tolist()+test['title'].astype(str).tolist())
# get_feature_names() was removed in scikit-learn 1.2; the fitted vocabulary_
# mapping has the same number of entries and exists in every release.
print(len(vec.vocabulary_))

41434


In [112]:
# Per-title sum of the TF-IDF weights over the lower vocabulary (train rows only).
counts = vec.transform(train['title'].astype(str).tolist())

# Row sums come back as an (n, 1) matrix; flatten it to one scalar per title.
sums = counts.sum(axis=1).tolist()

sumsdf['lowvoc'] = [total for (total,) in sums]

# IDF of Zero-(IDF)-Vocabulary

In [113]:
# TF-IDF vectorizer restricted to the fixed `zero_voc` vocabulary (NaN padding dropped).
# Fitted on train + test titles together.
vec = feature_extraction.text.TfidfVectorizer(
    stop_words=ru_stop,
    lowercase=False,
    vocabulary=vocabs.zero_voc.dropna()
)
vec.fit(train['title'].astype(str).tolist()+test['title'].astype(str).tolist())
# get_feature_names() was removed in scikit-learn 1.2; the fitted vocabulary_
# mapping has the same number of entries and exists in every release.
print(len(vec.vocabulary_))

166964


In [114]:
# Per-title sum of the TF-IDF weights over the zero vocabulary (train rows only).
counts = vec.transform(train['title'].astype(str).tolist())

# Row sums come back as an (n, 1) matrix; flatten it to one scalar per title.
sums = counts.sum(axis=1).tolist()

sumsdf['zerovoc'] = [total for (total,) in sums]

In [115]:
# Preview only the first rows instead of rendering the whole frame.
sumsdf.head(10)

Unnamed: 0,upvoc,lowvoc,zerovoc
0,0.000000,1.000000,1.413954
1,0.000000,0.000000,1.399713
2,1.000000,1.000000,0.000000
3,1.000000,0.000000,0.000000
4,1.718149,0.000000,0.000000
5,1.414151,0.000000,0.000000
6,1.000000,0.000000,1.973227
7,0.000000,0.000000,1.414052
8,0.000000,0.000000,1.000000
9,0.000000,0.000000,2.212945


# Persist the TF-IDF sums over the IDF-derived vocabularies; reloaded later as `sums_idf_idfvoc`.
# NOTE(review): this cell has no execution count in the export — confirm it was actually run.
sumsdf.to_pickle('sums_title_idf_idfvoc.pkl',compression='zip')

In [117]:
# 4-fold CV of a plain linear regression on the three sum columns alone,
# scored with the estimator's default metric.
print(
    'Aggregate score:',
    model_selection.cross_val_score(
        linear_model.LinearRegression(),
        sumsdf,
        y=train.deal_probability,
        cv=4,
    ),
)

Aggregate score: [0.16157764 0.15594306 0.16065807 0.16078198]


In [118]:
# Same 4-fold CV on the sum columns, reported as per-fold mean squared error.
print(
    'Aggregate score:',
    model_selection.cross_val_score(
        linear_model.LinearRegression(),
        sumsdf,
        y=train.deal_probability,
        scoring=metrics.make_scorer(metrics.mean_squared_error),
        cv=4,
    ),
)

Aggregate score: [0.0566514  0.05694241 0.05673189 0.05701682]


- This is better than using the count-derived vocabularies (R² ≈ 0.16 vs. ≈ 0.125 for the same TF-IDF sums).

---





---

## Aggregate with Components of Other Ranges

In [119]:
# Reload the most recently saved combined frame (TF-IDF + count sums on count-derived vocabularies).
features = pd.read_pickle('allfeatures.pkl',compression='zip')

In [120]:
# 4-fold CV of a plain linear regression on the reloaded feature frame,
# scored with the estimator's default metric.
print(
    'Aggregate score:',
    model_selection.cross_val_score(
        linear_model.LinearRegression(),
        features,
        y=train.deal_probability,
        cv=4,
    ),
)

Aggregate score: [0.25908394 0.25339694 0.2574272  0.25910756]


In [121]:
# Same 4-fold CV on the reloaded frame, reported as per-fold mean squared error.
print(
    'Aggregate score:',
    model_selection.cross_val_score(
        linear_model.LinearRegression(),
        features,
        y=train.deal_probability,
        scoring=metrics.make_scorer(metrics.mean_squared_error),
        cv=4,
    ),
)

Aggregate score: [0.05006299 0.0503679  0.05019117 0.05033654]


In [122]:
# RMSE summary: square root of the mean of the four per-fold MSE values.
cv = model_selection.cross_val_score(
    linear_model.LinearRegression(),
    features,
    y=train.deal_probability,
    scoring=metrics.make_scorer(metrics.mean_squared_error),
    cv=4,
)
print(cv.mean() ** .5)

0.22414203312793293


In [124]:
# Add the IDF-vocabulary TF-IDF sums; clashing column names from sumsdf get the '_idfvoc' suffix.
# NOTE(review): join aligns on index — confirm both frames share the same row index.
features = features.join(sumsdf,rsuffix='_idfvoc')

In [125]:
# 4-fold CV of a plain linear regression with the IDF-vocabulary sums added,
# scored with the estimator's default metric.
print(
    'Aggregate score:',
    model_selection.cross_val_score(
        linear_model.LinearRegression(),
        features,
        y=train.deal_probability,
        cv=4,
    ),
)

Aggregate score: [0.26521013 0.25889282 0.26402115 0.26526325]


In [126]:
# Same 4-fold CV on the extended frame, reported as per-fold mean squared error.
print(
    'Aggregate score:',
    model_selection.cross_val_score(
        linear_model.LinearRegression(),
        features,
        y=train.deal_probability,
        scoring=metrics.make_scorer(metrics.mean_squared_error),
        cv=4,
    ),
)

Aggregate score: [0.04964905 0.04999714 0.04974548 0.04991832]


In [127]:
# RMSE summary: square root of the mean of the four per-fold MSE values.
cv = model_selection.cross_val_score(
    linear_model.LinearRegression(),
    features,
    y=train.deal_probability,
    scoring=metrics.make_scorer(metrics.mean_squared_error),
    cv=4,
)
print(cv.mean() ** .5)

0.2232207338415103


# Reset `features` to the saved frame, dropping the '_idfvoc' columns joined in memory above.
# NOTE(review): this cell has no execution count in the export — confirm it was actually run.
features = pd.read_pickle('allfeatures.pkl',compression='zip')

# Counts of IDF Vocabs

In [129]:
# Start a fresh frame for the raw counts over the IDF-derived vocabularies.
sumsdf = pd.DataFrame()

In [130]:
# Reload the 'vocabs.pkl' vocabulary table used in the previous section.
vocabs = pd.read_pickle('vocabs.pkl')

# Counts of Upper-(IDF)-Vocabulary

In [131]:
# Raw-count vectorizer restricted to the fixed `up_voc` vocabulary (NaN padding dropped).
vec = feature_extraction.text.CountVectorizer(
    stop_words=ru_stop,
    lowercase=False,
    vocabulary=vocabs.up_voc.dropna()
)
vec.fit(train['title'].astype(str).tolist()+test['title'].astype(str).tolist())
# get_feature_names() was removed in scikit-learn 1.2; the fitted vocabulary_
# mapping has the same number of entries and exists in every release.
print(len(vec.vocabulary_))

46459


In [132]:
# Per-title number of token occurrences that fall in the upper vocabulary (train rows only).
counts = vec.transform(train['title'].astype(str).tolist())

# Row sums come back as an (n, 1) matrix; flatten it to one scalar per title.
sums = counts.sum(axis=1).tolist()

sumsdf['upvoc'] = [total for (total,) in sums]

# Counts of Lower-(IDF)-Vocabulary

In [133]:
# Raw-count vectorizer restricted to the fixed `low_voc` vocabulary (NaN padding dropped).
vec = feature_extraction.text.CountVectorizer(
    stop_words=ru_stop,
    lowercase=False,
    vocabulary=vocabs.low_voc.dropna()
)
vec.fit(train['title'].astype(str).tolist()+test['title'].astype(str).tolist())
# get_feature_names() was removed in scikit-learn 1.2; the fitted vocabulary_
# mapping has the same number of entries and exists in every release.
print(len(vec.vocabulary_))

41434


In [134]:
# Per-title number of token occurrences that fall in the lower vocabulary (train rows only).
counts = vec.transform(train['title'].astype(str).tolist())

# Row sums come back as an (n, 1) matrix; flatten it to one scalar per title.
sums = counts.sum(axis=1).tolist()

sumsdf['lowvoc'] = [total for (total,) in sums]

# Counts of Zero-(IDF)-Vocabulary

In [138]:
# Raw-count vectorizer restricted to the fixed `zero_voc` vocabulary (NaN padding dropped).
vec = feature_extraction.text.CountVectorizer(
    stop_words=ru_stop,
    lowercase=False,
    vocabulary=vocabs.zero_voc.dropna()
)
vec.fit(train['title'].astype(str).tolist()+test['title'].astype(str).tolist())
# get_feature_names() was removed in scikit-learn 1.2; the fitted vocabulary_
# mapping has the same number of entries and exists in every release.
print(len(vec.vocabulary_))

166964


In [139]:
# Per-title number of token occurrences that fall in the zero vocabulary (train rows only).
counts = vec.transform(train['title'].astype(str).tolist())

# Row sums come back as an (n, 1) matrix; flatten it to one scalar per title.
sums = counts.sum(axis=1).tolist()

sumsdf['zerovoc'] = [total for (total,) in sums]

In [140]:
# Preview only the first rows instead of rendering the whole frame.
sumsdf.head(10)

Unnamed: 0,upvoc,lowvoc,zerovoc
0,0,1,2
1,0,0,2
2,1,1,0
3,1,0,0
4,3,0,0
5,2,0,0
6,1,0,4
7,0,0,2
8,0,0,1
9,0,0,5


In [141]:
# Persist the raw counts over the IDF-derived vocabularies; reloaded later as `sums_count_idfvoc`.
sumsdf.to_pickle('sums_title_counts_idfvoc.pkl',compression='zip')

In [142]:
# 4-fold CV of a plain linear regression on the three count columns alone,
# scored with the estimator's default metric.
print(
    'Aggregate score:',
    model_selection.cross_val_score(
        linear_model.LinearRegression(),
        sumsdf,
        y=train.deal_probability,
        cv=4,
    ),
)

Aggregate score: [0.15167771 0.14698102 0.15032688 0.15130829]


In [143]:
# Same 4-fold CV on the count columns, reported as per-fold mean squared error.
print(
    'Aggregate score:',
    model_selection.cross_val_score(
        linear_model.LinearRegression(),
        sumsdf,
        y=train.deal_probability,
        scoring=metrics.make_scorer(metrics.mean_squared_error),
        cv=4,
    ),
)

Aggregate score: [0.05732032 0.05754702 0.05743018 0.05766046]


---





---

## Aggregate with Components of Other Ranges

In [156]:
# Rebuild the combined frame from scratch, starting from the base NLP features.
features = pd.read_pickle('../3.NLP StopWords NGrams/train_nlp_features7.pkl',compression='zip')

In [157]:
# Precomputed zero-range title features (zip-compressed pickle).
fzero_counts = pd.read_pickle('zero_titles_counts.pkl',compression='zip')

In [158]:
# Precomputed lower-range title features (note the file name is singular: "..._count.pkl").
flow_counts = pd.read_pickle('lower_titles_count.pkl',compression='zip')

In [159]:
# Precomputed upper-range title features (zip-compressed pickle).
fup_counts = pd.read_pickle('upper_titles_counts.pkl',compression='zip')

In [160]:
# Precomputed categorical features (zip-compressed pickle).
categorical = pd.read_pickle('categorical.pkl',compression='zip')

In [161]:
# Index-aligned join of the title-range and categorical blocks onto the base features.
features = (
    features
    .join(fzero_counts)
    .join(flow_counts)
    .join(fup_counts)
    .join(categorical)
)

In [162]:
# TF-IDF sums over the count-derived vocabularies, saved earlier in this notebook.
sums_idf = pd.read_pickle('sums_title_idf.pkl',compression='zip')

In [163]:
# Raw-count sums over the count-derived vocabularies, saved earlier in this notebook.
sums_count = pd.read_pickle('sums_title_counts.pkl',compression='zip')

In [164]:
# TF-IDF sums over the IDF-derived vocabularies, saved earlier in this notebook.
sums_idf_idfvoc = pd.read_pickle('sums_title_idf_idfvoc.pkl',compression='zip')

In [165]:
# Raw-count sums over the IDF-derived vocabularies, saved earlier in this notebook.
sums_count_idfvoc = pd.read_pickle('sums_title_counts_idfvoc.pkl',compression='zip')

In [171]:
# Attach all four sum frames. The first keeps the bare names (upvoc, lowvoc, zerovoc);
# each later frame's clashing names are disambiguated by its own suffix.
features = (
    features
    .join(sums_count)
    .join(sums_idf, rsuffix='_idf')
    .join(sums_idf_idfvoc, rsuffix='_idf_idfvoc')
    .join(sums_count_idfvoc, rsuffix='_count_idfvoc')
)

In [172]:
# Sanity check: list the column names, including all suffixed sum columns.
features.columns

Index(['title_desc_0', 'title_desc_1', 'title_desc_2', 'title_desc_3',
       'title_desc_4', 'title_desc_5', 'title_desc_6', 'title_desc_7',
       'title_desc_8', 'title_desc_9', 'zero_titles0', 'zero_titles1',
       'zero_titles2', 'zero_titles3', 'zero_titles4', 'zero_titles5',
       'zero_titles6', 'zero_titles7', 'zero_titles8', 'zero_titles9',
       'lower_titles0', 'lower_titles1', 'lower_titles2', 'lower_titles3',
       'lower_titles4', 'upper_titles0', 'upper_titles1', 'upper_titles2',
       'upper_titles3', 'upper_titles4', 'cat_0', 'cat_1', 'cat_2', 'cat_3',
       'cat_4', 'cat_5', 'cat_6', 'cat_7', 'cat_8', 'cat_9', 'upvoc', 'lowvoc',
       'zerovoc', 'upvoc_idf', 'lowvoc_idf', 'zerovoc_idf', 'upvoc_idf_idfvoc',
       'lowvoc_idf_idfvoc', 'zerovoc_idf_idfvoc', 'upvoc_count_idfvoc',
       'lowvoc_count_idfvoc', 'zerovoc_count_idfvoc'],
      dtype='object')

# Overwrite 'allfeatures.pkl' with the final frame. Note the column naming differs from
# the earlier saves: bare upvoc/lowvoc/zerovoc now hold the raw counts, not the TF-IDF sums.
features.to_pickle('allfeatures.pkl',compression='zip')

# Read the file straight back, so the scores below are computed on the persisted frame.
# NOTE(review): neither statement has an execution count in the export — confirm they were run.
features = pd.read_pickle('allfeatures.pkl',compression='zip')

In [175]:
# 4-fold CV of a plain linear regression on the final feature frame,
# scored with the estimator's default metric.
print(
    'Aggregate score:',
    model_selection.cross_val_score(
        linear_model.LinearRegression(),
        features,
        y=train.deal_probability,
        cv=4,
    ),
)

Aggregate score: [0.2652255  0.25892289 0.26406247 0.26528173]


In [176]:
# Same 4-fold CV on the final frame, reported as per-fold mean squared error.
print(
    'Aggregate score:',
    model_selection.cross_val_score(
        linear_model.LinearRegression(),
        features,
        y=train.deal_probability,
        scoring=metrics.make_scorer(metrics.mean_squared_error),
        cv=4,
    ),
)

Aggregate score: [0.04964801 0.04999511 0.04974269 0.04991706]


In [177]:
# RMSE summary: square root of the mean of the four per-fold MSE values.
cv = model_selection.cross_val_score(
    linear_model.LinearRegression(),
    features,
    y=train.deal_probability,
    scoring=metrics.make_scorer(metrics.mean_squared_error),
    cv=4,
)
print(cv.mean() ** .5)

0.2232167489114207
