In [32]:
import pandas as pd
from scipy.sparse import hstack
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn import linear_model
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics import mean_squared_error, roc_auc_score
from utils.make_submission import write_to_file

from matplotlib import pyplot
import nltk
import string
import xgboost

In [33]:
df = pd.read_csv("data/training/data_train_normalize.csv",encoding='utf-8')
rmse = lambda y_pred, y_true: np.sqrt(mean_squared_error(y_pred,y_val))
scaler = MinMaxScaler()

In [3]:
df_train_origin = pd.read_csv("data/training/dat_norm_with_labels.csv",encoding='utf-8')
df_val_origin = pd.read_csv("data/validation/data_valid_strip_html.csv",encoding='utf-8')

# Old feature

In [4]:
tfIdfVecTitle = TfidfVectorizer(norm='l2',use_idf=False)
tfIdfVecDesc = TfidfVectorizer(norm='l2',use_idf=False)
# ngram = (1,2) for conciseness

In [5]:
title2vec = tfIdfVecTitle.fit_transform(df['title'])
desc2vec = tfIdfVecDesc.fit_transform(df['short_desc_strip'].fillna(' '))

In [40]:
df_onehot = pd.get_dummies(df[['country','category_lvl_1','category_lvl_2','category_lvl_3','product_type']]
                           ,columns=['country','category_lvl_1','category_lvl_2','category_lvl_3','product_type'])

df_real = scaler.fit_transform(df[['price','len_title','len_desc']])

In [41]:
total_df = hstack([title2vec,desc2vec,df_onehot,df_real])

# additional features: lexical, anatomical 

In [8]:
# count characters
chars_count = lambda sens: len(sens)
# ARI = 4.71 * (chars/words) + 0.5 * (words/sen) - 21.43
ARI = lambda sens:np.ceil(4.71 * (len(sens)/len(nltk.word_tokenize(sens))) + 0.5 * (len(nltk.word_tokenize(sens))/len(nltk.sent_tokenize(sens))) - 21.43)
# Number of sentences
sen_count = lambda sens: len(set(nltk.sent_tokenize(sens)))
# Number of punctuation
punc_count = lambda sens: len([word for word in nltk.wordpunct_tokenize(sens) 
                               if word in string.punctuation])
uppercase_count = lambda sens: len(set([word for word in nltk.wordpunct_tokenize(sens) 
                               if word.isupper()]))
title_count = lambda sens: len([word for word in nltk.wordpunct_tokenize(sens) 
                               if word.istitle()])

# jaccard_unigram = lambda title,desc: 1.0  * len(title.intersection(desc)) / (len(title.union(desc)))
def jaccard_unigram(title,desc):
    a = set(nltk.word_tokenize(title))
    b = set(nltk.word_tokenize(desc))
    return 1.0  * len(a.intersection(b)) / (len(a.union(b)))
# punc_count = lambda sens: 
def log2df(df):
    return np.log(df)
def sqrt2df(df):
    return np.sqrt(df)
def square2df(df):
    return np.square(df)

In [42]:
def make_lexical_features(df,df_origin):
    df_real_addtional = df[['title','short_desc_strip']].copy()
    df_real_addtional.fillna('aa',inplace=True)
    df_origin.fillna("aa",inplace=True)
    
    df_real_addtional['title_upper_count'] = df_origin['title'].map(uppercase_count)
    df_real_addtional['title_count'] = df_origin['title'].map(title_count)
    df_real_addtional['punc_count'] = df_origin['title'].map(punc_count)
    
    df_real_addtional['desc_sens'] = df_origin['short_desc_strip'].map(sen_count)
    df_real_addtional['desc_upper'] = df_origin['short_desc_strip'].map(uppercase_count)
    df_real_addtional['desc_title'] = df_origin['short_desc_strip'].map(title_count)
    df_real_addtional['desc_punc'] = df_origin['short_desc_strip'].map(punc_count)
    
    df_real_addtional['title_char_count'] = df_real_addtional['title'].map(chars_count)
    df_real_addtional['title_ari'] = df_real_addtional['title'].map(ARI)
    df_real_addtional['desc_char_count'] = df_real_addtional['short_desc_strip'].fillna('aa').map(chars_count)
    df_real_addtional['desc_ari'] = df_real_addtional['short_desc_strip'].fillna('aa').map(ARI)

    jaccard_title_desc = df_real_addtional[['title','short_desc_strip']].apply(
        lambda row: jaccard_unigram(row[0],row[1]),axis=1)

    df_real_addtional['jaccard'] = jaccard_title_desc
    df_real_addtional.drop(['title','short_desc_strip'],axis=1,inplace=True)
    return pd.DataFrame(scaler.fit_transform(df_real_addtional))
#     return df_real_addtional

In [43]:
def make_numeric_transform(df_real_addtional):
    df_real_log = df_real_addtional.apply(lambda d: np.log(d+100))
    df_real_sqrt = df_real_addtional.apply(lambda d: np.sqrt(np.abs(d)))
    df_real_square = df_real_addtional.apply(lambda d: np.square(d))
    return np.hstack([df_real_log,df_real_sqrt,df_real_square])

In [44]:
df_real_addtional = make_lexical_features(df,df_train_origin)

In [45]:
df_real_transform = make_numeric_transform(df_real_addtional)

In [46]:
#  With text features
total_df = hstack([title2vec,desc2vec,df_onehot,df_real,df_real_addtional,df_real_transform])
# Without text features
# total_df = hstack([desc2vec,df_onehot,df_real,df_real_addtional,df_real_transform])

# Submission to vec

In [66]:
test_df = pd.read_csv("data/validation/data_valid_normalize.csv")

In [67]:
title2vec_test = tfIdfVecTitle.transform(test_df['title'])
desc2vec_test = tfIdfVecDesc.transform(test_df['short_desc_strip'].fillna(' '))
df_onehot_test = pd.get_dummies(test_df[['country','category_lvl_1','category_lvl_2','category_lvl_3','product_type']]
                           ,columns=['country','category_lvl_1','category_lvl_2','category_lvl_3','product_type'])

df_real_test = test_df[['price','len_title','len_desc']].copy()

df_real_addtional_test = test_df[['title','short_desc_strip']].copy()
df_real_addtional_test.fillna('aa',inplace=True)

df_real_addtional_test = make_lexical_features(test_df,df_val_origin)
df_numeric_transform_test = make_numeric_transform(df_real_addtional_test)
# df_real_addtional_test['title_char_count'] = df_real_addtional_test['title'].map(chars_count)
# df_real_addtional_test['title_ari'] = df_real_addtional_test['title'].map(ARI)
# df_real_addtional_test['desc_char_count'] = df_real_addtional_test['short_desc_strip'].fillna('aa').map(chars_count)
# df_real_addtional_test['desc_ari'] = df_real_addtional_test['short_desc_strip'].fillna('aa').map(ARI)

# jaccard_title_desc = df_real_addtional_test[['title','short_desc_strip']].apply(
#     lambda row: jaccard_unigram(row[0],row[1]),axis=1)

# df_real_addtional_test['jaccard'] = jaccard_title_desc
# df_real_addtional_test.drop(['title','short_desc_strip'],axis=1,inplace=True)


In [68]:
test_df = hstack([title2vec_test,desc2vec_test,df_onehot_test,df_real_test,
                  df_real_addtional_test,df_numeric_transform_test])

# Clarity prediction

In [69]:
X_train, X_val, y_train, y_val = train_test_split(total_df,df['clarity'],stratify=df['clarity'],random_state=41)

## Linear model

In [70]:
lrEstimator = LogisticRegression()

lrEstimator.fit(X_train,y_train)

y_pred = lrEstimator.predict_proba(X_val)[:,1]

In [71]:
print np.sqrt(mean_squared_error(y_pred,y_val))

0.214078929494


In [72]:
y_sub = lrEstimator.predict_proba(test_df)[:,1]

In [15]:
write_to_file(y_sub,"submissions/linear_model/clarity_valid.predict")

Write successfully 11838 rows to submissions/linear_model/clarity_valid.predict


## SVM Online

In [121]:
clf = linear_model.SGDClassifier(loss='log',penalty='elasticnet',
                                 n_jobs=-1,learning_rate='optimal',verbose=1,n_iter=600)

In [122]:
clf.fit(X_train,y_train)

-- Epoch 1
Norm: 97.77, NNZs: 3233, Bias: 0.058715, T: 27212, Avg. loss: 13.945240
Total training time: 0.10 seconds.
-- Epoch 2
Norm: 64.26, NNZs: 1838, Bias: 0.056851, T: 54424, Avg. loss: 8.385625
Total training time: 0.14 seconds.
-- Epoch 3
Norm: 51.73, NNZs: 1671, Bias: 0.058308, T: 81636, Avg. loss: 6.125288
Total training time: 0.18 seconds.
-- Epoch 4
Norm: 43.85, NNZs: 1597, Bias: 0.058163, T: 108848, Avg. loss: 4.884767
Total training time: 0.23 seconds.
-- Epoch 5
Norm: 39.20, NNZs: 1547, Bias: 0.058688, T: 136060, Avg. loss: 4.083092
Total training time: 0.27 seconds.
-- Epoch 6
Norm: 35.90, NNZs: 1508, Bias: 0.058262, T: 163272, Avg. loss: 3.520754
Total training time: 0.31 seconds.
-- Epoch 7
Norm: 33.25, NNZs: 1474, Bias: 0.057986, T: 190484, Avg. loss: 3.099925
Total training time: 0.35 seconds.
-- Epoch 8
Norm: 31.21, NNZs: 1461, Bias: 0.058072, T: 217696, Avg. loss: 2.774762
Total training time: 0.39 seconds.
-- Epoch 9
Norm: 29.67, NNZs: 1448, Bias: 0.058161, T: 244

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', n_iter=600, n_jobs=-1,
       penalty='elasticnet', power_t=0.5, random_state=None, shuffle=True,
       verbose=1, warm_start=False)

In [124]:
y_pred = clf.predict_proba(X_val)[:,1]
print np.sqrt(mean_squared_error(y_pred,y_val))
# 0.213881556625
# 0.213531188137
# 0.213353558436

0.213661072141


In [105]:
y_sub = clf.predict_proba(test_df)[:,1]

In [106]:
write_to_file(y_sub,"submissions/linear_model/clarity_valid.predict")

Write successfully 11838 rows to submissions/linear_model/clarity_valid.predict


## XGBoost

In [115]:
model = xgboost.XGBClassifier(n_estimators=600,max_depth=12,subsample=0.85,max_delta_step=10)
model.fit(X_train,y_train,eval_metric=['auc','error','logloss'],early_stopping_rounds=10,
          eval_set=[(X_train,y_train),(X_val,y_val)],verbose=1)

[0]	validation_0-auc:0.856838	validation_0-error:0.045531	validation_0-logloss:0.612798	validation_1-auc:0.820846	validation_1-error:0.065153	validation_1-logloss:0.615185
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 10 rounds.
[1]	validation_0-auc:0.876823	validation_0-error:0.041893	validation_0-logloss:0.546215	validation_1-auc:0.829407	validation_1-error:0.06383	validation_1-logloss:0.551863
[2]	validation_0-auc:0.882743	validation_0-error:0.04182	validation_0-logloss:0.491025	validation_1-auc:0.83117	validation_1-error:0.061074	validation_1-logloss:0.499097
[3]	validation_0-auc:0.891293	validation_0-error:0.042151	validation_0-logloss:0.44455	validation_1-auc:0.8344	validation_1-error:0.060523	validation_1-logloss:0.454615
[4]	validation_0-auc:0.900096	validation_0-error:0.042261	validation_0-logloss:0.404454	validation_1-auc:0.832923	validation_1-error:0.058759	validation_

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=10, max_depth=12,
       min_child_weight=1, missing=None, n_estimators=600, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.85)

In [116]:
y_pred = model.predict_proba(X_val)[:,1]

print rmse(y_pred,y_val)
# 0.21444359428240292
# 0.222556260039
# 0.236795091056
# 0.215118146634

0.215956627602


In [114]:
%matplotlib
results = model.evals_result()
epochs = len(results['validation_0']['error'])
x_axis = range(0, epochs)
# plot log loss
fig, ax = pyplot.subplots()
ax.plot(x_axis, results['validation_0']['logloss'], label='Train')
ax.plot(x_axis, results['validation_1']['logloss'], label='Test')
ax.legend()
pyplot.ylabel('Log Loss')
pyplot.title('XGBoost Log Loss')
pyplot.show()

Using matplotlib backend: Qt4Agg


In [117]:
# plot classification error
%matplotlib
fig, ax = pyplot.subplots()
ax.plot(x_axis, results['validation_0']['error'], label='Train')
ax.plot(x_axis, results['validation_1']['error'], label='Test')
ax.legend()
pyplot.ylabel('Classification Error')
pyplot.title('XGBoost Classification Error')
pyplot.show()

Using matplotlib backend: Qt4Agg


In [157]:
# Make submission
y_sub = model.predict_proba(test_df)[:,1]
write_to_file(y_sub,"submissions/xgboost/clarity_valid.predict")

Write successfully 11838 rows to submissions/xgboost/clarity_valid.predict


# Conciseness prediction

In [129]:
X_train, X_val, y_train, y_val = train_test_split(total_df,df['concise'],stratify=df['concise'],random_state=41)

## Linear model

In [130]:
lrEstimator = LogisticRegression(C=10)

lrEstimator.fit(X_train,y_train)

y_pred = lrEstimator.predict_proba(X_val)[:,1]

In [131]:
print np.sqrt(mean_squared_error(y_pred,y_val))

0.367288828485


In [161]:
# Make submission
y_sub = lrEstimator.predict_proba(test_df)[:,1]
write_to_file(y_sub,"submissions/linear_model/conciseness_valid.predict")

Write successfully 11838 rows to submissions/linear_model/conciseness_valid.predict


## Online SVM

In [137]:
clf_concise = linear_model.SGDClassifier(loss='log',penalty='elasticnet',
                                 n_jobs=-1,learning_rate='optimal',verbose=1,n_iter=1000)

In [138]:
clf_concise.fit(X_train,y_train)

-- Epoch 1
Norm: 177.52, NNZs: 9019, Bias: 0.050454, T: 27212, Avg. loss: 51.957923
Total training time: 0.07 seconds.
-- Epoch 2
Norm: 122.59, NNZs: 4609, Bias: 0.049186, T: 54424, Avg. loss: 30.358594
Total training time: 0.12 seconds.
-- Epoch 3
Norm: 98.45, NNZs: 4316, Bias: 0.050850, T: 81636, Avg. loss: 21.878476
Total training time: 0.17 seconds.
-- Epoch 4
Norm: 84.73, NNZs: 4147, Bias: 0.050003, T: 108848, Avg. loss: 17.252883
Total training time: 0.21 seconds.
-- Epoch 5
Norm: 75.59, NNZs: 4029, Bias: 0.050123, T: 136060, Avg. loss: 14.307756
Total training time: 0.26 seconds.
-- Epoch 6
Norm: 68.95, NNZs: 3940, Bias: 0.050556, T: 163272, Avg. loss: 12.261620
Total training time: 0.29 seconds.
-- Epoch 7
Norm: 64.18, NNZs: 3881, Bias: 0.050619, T: 190484, Avg. loss: 10.751700
Total training time: 0.33 seconds.
-- Epoch 8
Norm: 60.00, NNZs: 3815, Bias: 0.049761, T: 217696, Avg. loss: 9.591395
Total training time: 0.37 seconds.
-- Epoch 9
Norm: 56.85, NNZs: 3764, Bias: 0.050220

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', n_iter=1000, n_jobs=-1,
       penalty='elasticnet', power_t=0.5, random_state=None, shuffle=True,
       verbose=1, warm_start=False)

In [139]:
y_pred = clf_concise.predict_proba(X_val)[:,1]
print np.sqrt(mean_squared_error(y_pred,y_val))
# 0.35681419787

0.35681419787


## Xgboost

In [147]:
model = xgboost.XGBClassifier(n_estimators=5400,max_depth=10,subsample=0.85)

model.fit(X_train,y_train,eval_metric=['auc','error','logloss'],early_stopping_rounds=10,
          eval_set=[(X_train,y_train),(X_val,y_val)],verbose=1)

[0]	validation_0-auc:0.864875	validation_0-error:0.171285	validation_0-logloss:0.645802	validation_1-auc:0.824646	validation_1-error:0.217396	validation_1-logloss:0.6513
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 10 rounds.
[1]	validation_0-auc:0.881364	validation_0-error:0.161069	validation_0-logloss:0.606123	validation_1-auc:0.840653	validation_1-error:0.202624	validation_1-logloss:0.616129
[2]	validation_0-auc:0.890664	validation_0-error:0.1528	validation_0-logloss:0.57253	validation_1-auc:0.844941	validation_1-error:0.198765	validation_1-logloss:0.587459
[3]	validation_0-auc:0.896553	validation_0-error:0.150375	validation_0-logloss:0.543626	validation_1-auc:0.84914	validation_1-error:0.199317	validation_1-logloss:0.563415
[4]	validation_0-auc:0.899956	validation_0-error:0.147729	validation_0-logloss:0.519371	validation_1-auc:0.851532	validation_1-error:0.197112	validation_

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=5400, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.85)

In [148]:
y_pred = model.predict_proba(X_val)[:,1]

print rmse(y_pred,y_val)
# 0.372570824696
# 0.362435264187
# 0.360522252066
# 0.351260649532

0.349238692027


In [149]:
%matplotlib
results = model.evals_result()
epochs = len(results['validation_0']['error'])
x_axis = range(0, epochs)
# plot log loss
fig, ax = pyplot.subplots()
ax.plot(x_axis, results['validation_0']['logloss'], label='Train')
ax.plot(x_axis, results['validation_1']['logloss'], label='Test')
ax.legend()
pyplot.ylabel('Log Loss')
pyplot.title('XGBoost Log Loss')
pyplot.show()

Using matplotlib backend: Qt4Agg


In [97]:
# plot classification error
%matplotlib
fig, ax = pyplot.subplots()
ax.plot(x_axis, results['validation_0']['error'], label='Train')
ax.plot(x_axis, results['validation_1']['error'], label='Test')
ax.legend()
pyplot.ylabel('Classification Error')
pyplot.title('XGBoost Classification Error')
pyplot.show()

Using matplotlib backend: Qt4Agg


In [150]:
# Make submission
y_sub = model.predict_proba(test_df)[:,1]
write_to_file(y_sub,"submissions/xgboost/conciseness_valid.predict")

Write successfully 11838 rows to submissions/xgboost/conciseness_valid.predict
