In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.externals import joblib
from sklearn import preprocessing, model_selection, metrics, feature_selection, ensemble, linear_model, impute
%matplotlib inline

In [2]:
# Load the zipped train / test CSVs; `activation_date` is parsed into
# datetimes so the `.dt` accessor can be used on it later.
train = pd.read_csv(
    '../../../train.csv.zip',
    compression='zip',
    parse_dates=['activation_date'],
)
test = pd.read_csv(
    '../../../test.csv.zip',
    compression='zip',
    parse_dates=['activation_date'],
)

In [79]:
# Start from column-less frames that share the row index of train / test;
# the cells below join engineered feature columns onto them.
train_othfeat, test_othfeat = (
    pd.DataFrame(index=source.index) for source in (train, test)
)

# Day of Week Dummies

In [80]:
# Add day-of-week indicator columns to train.
# `Series.dt.weekday` encodes Monday as 0 and Sunday as 6, so the labels
# have to start at Monday to line up with the dummy columns.
weekday_names = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']
new_features = pd.get_dummies(train["activation_date"].dt.weekday)
# Force all seven columns, in weekday order, even when a weekday never
# occurs in the data; otherwise the rename below would hit a width mismatch.
new_features = new_features.reindex(columns=range(7), fill_value=0)
new_features.columns = weekday_names
train_othfeat = train_othfeat.join(new_features)

In [81]:
# Add day-of-week indicator columns to test.
# `Series.dt.weekday` encodes Monday as 0 and Sunday as 6, so the labels
# have to start at Monday to line up with the dummy columns.
weekday_names = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']
new_features = pd.get_dummies(test["activation_date"].dt.weekday)
# Force all seven columns, in weekday order, even when a weekday never
# occurs in the data; otherwise the rename below would hit a width mismatch.
new_features = new_features.reindex(columns=range(7), fill_value=0)
new_features.columns = weekday_names
test_othfeat = test_othfeat.join(new_features)

# Missing Values Indicators

In [84]:
# Indicator configured to treat the value True as the "missing" marker
# (it is fitted on a boolean `isna()` mask further down).
# `features='missing-only'` is spelled out to make the default explicit;
# `error_on_new=False` keeps transform from raising when a column shows
# missing values that were not present at fit time.
indicator = impute.MissingIndicator(
    missing_values=True,
    features='missing-only',
    error_on_new=False,
)

In [87]:
# Boolean missingness mask over train and test stacked together, without the
# target column `deal_probability`.
# `pd.concat` is used because `DataFrame.append` no longer exists in
# pandas 2.x. `sort=False` keeps train's column order: the fitted indicator
# identifies columns by position, so the mask must not be re-sorted
# relative to the frames that are transformed later.
both = pd.concat(
    [train.isna(), test.isna()],
    ignore_index=True,
    sort=False,
).drop('deal_probability', axis=1)

In [88]:
# Fit on the combined train + test missingness mask so the indicator
# records which columns contain at least one missing value.
indicator.fit(X=both)

MissingIndicator(error_on_new=False, features='missing-only',
         missing_values=True, sparse='auto')

In [89]:
# Names of the columns the fitted indicator flagged as containing missing
# values; `features_` holds positional indices into `both`.
columns = both.columns[indicator.features_]

# train
# Select the inputs by name in `both`'s column order: because `features_`
# is positional, transform must see the same column layout that fit saw.
# Passing `index=` makes the join below align on train's own row labels.
new_features = pd.DataFrame(
    indicator.transform(train[both.columns].isna()),
    index=train.index,
    columns=columns,
)
new_features = new_features.add_prefix('miss_').astype(int)
train_othfeat = train_othfeat.join(new_features)

# test
new_features = pd.DataFrame(
    indicator.transform(test[both.columns].isna()),
    index=test.index,
    columns=columns,
)
new_features = new_features.add_prefix('miss_').astype(int)
test_othfeat = test_othfeat.join(new_features)

# Word Counts

In [91]:
# Number of whitespace-separated tokens in each train description;
# a missing description is treated as empty text and counts as 0 words.
desc_word_counts = train['description'].fillna('').map(lambda text: len(text.split()))
new_features = desc_word_counts.to_frame(name='desc_words')
train_othfeat = train_othfeat.join(new_features)

In [92]:
# Number of whitespace-separated tokens in each test description;
# a missing description is treated as empty text and counts as 0 words.
desc_word_counts = test['description'].fillna('').map(lambda text: len(text.split()))
new_features = desc_word_counts.to_frame(name='desc_words')
test_othfeat = test_othfeat.join(new_features)

In [94]:
# Number of whitespace-separated tokens in each train title;
# a missing title is treated as empty text and counts as 0 words.
title_word_counts = train['title'].fillna('').map(lambda text: len(text.split()))
new_features = title_word_counts.to_frame(name='title_words')
train_othfeat = train_othfeat.join(new_features)

In [95]:
# Number of whitespace-separated tokens in each test title;
# a missing title is treated as empty text and counts as 0 words.
title_word_counts = test['title'].fillna('').map(lambda text: len(text.split()))
new_features = title_word_counts.to_frame(name='title_words')
test_othfeat = test_othfeat.join(new_features)

# Imputations

In [96]:
# Median-impute `price` and `image_top_1`. The medians are learned from
# train and test stacked together, then applied to train here.
# `pd.concat` is used because `DataFrame.append` no longer exists in
# pandas 2.x.
impute_cols = ['price', 'image_top_1']
imputer = impute.SimpleImputer(strategy='median')
imputer.fit(pd.concat([train[impute_cols], test[impute_cols]]))
# `transform` returns a bare array; re-attach train's index so the join
# below aligns on row labels instead of on an implicit 0..n-1 index.
new_features = pd.DataFrame(
    imputer.transform(train[impute_cols]),
    index=train.index,
    columns=['price_imp', 'image_top_1_imp'],
)
train_othfeat = train_othfeat.join(new_features)

In [97]:
# Apply the already-fitted median imputer to test.
# `transform` returns a bare array; re-attach test's index so the join
# below aligns on row labels instead of on an implicit 0..n-1 index.
new_features = pd.DataFrame(
    imputer.transform(test[['price', 'image_top_1']]),
    index=test.index,
    columns=['price_imp', 'image_top_1_imp'],
)
test_othfeat = test_othfeat.join(new_features)

In [98]:
# Carry `item_seq_number` over unchanged under the shorter name `seq_n`.
train_othfeat = train_othfeat.assign(seq_n=train['item_seq_number'])
test_othfeat = test_othfeat.assign(seq_n=test['item_seq_number'])

In [106]:
# Persist both engineered feature frames next to the notebook.
joblib.dump(value=train_othfeat, filename='train_othfeat.sav')
joblib.dump(value=test_othfeat, filename='test_othfeat.sav')

['test_othfeat.sav']

---
>
---