In [1]:
import pandas as pd
from scipy.sparse import hstack
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn import linear_model
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics import mean_squared_error, roc_auc_score
from utils.make_submission import write_to_file

from matplotlib import pyplot
import nltk
import string
import xgboost



In [2]:
df = pd.read_csv("data/training/data_train_normalize.csv",encoding='utf-8')
rmse = lambda y_pred, y_true: np.sqrt(mean_squared_error(y_pred,y_val))
scaler = MinMaxScaler()

In [3]:
df_train_origin = pd.read_csv("data/training/dat_norm_with_labels.csv",encoding='utf-8')
df_val_origin = pd.read_csv("data/validation/data_valid_strip_html.csv",encoding='utf-8')

# Old feature

In [4]:
tfIdfVecTitle = TfidfVectorizer(norm='l2',use_idf=False)
tfIdfVecDesc = TfidfVectorizer(norm='l2',use_idf=False)
# ngram = (1,2) for conciseness

In [5]:
title2vec = tfIdfVecTitle.fit_transform(df['title'])
desc2vec = tfIdfVecDesc.fit_transform(df['short_desc_strip'].fillna(' '))

In [6]:
df_onehot = pd.get_dummies(df[['country','category_lvl_1','category_lvl_2','category_lvl_3','product_type']]
                           ,columns=['country','category_lvl_1','category_lvl_2','category_lvl_3','product_type'])

df_real = scaler.fit_transform(df[['price','len_title','len_desc']])

In [7]:
total_df = hstack([title2vec,desc2vec,df_onehot,df_real])

# additional features: lexical, anatomical 

In [8]:
# count characters
chars_count = lambda sens: len(sens)
# ARI = 4.71 * (chars/words) + 0.5 * (words/sen) - 21.43
ARI = lambda sens:np.ceil(4.71 * (len(sens)/len(nltk.word_tokenize(sens))) + 0.5 * (len(nltk.word_tokenize(sens))/len(nltk.sent_tokenize(sens))) - 21.43)
# Number of sentences
sen_count = lambda sens: len(set(nltk.sent_tokenize(sens)))
# Number of punctuation
punc_count = lambda sens: len([word for word in nltk.wordpunct_tokenize(sens) 
                               if word in string.punctuation])
uppercase_count = lambda sens: len(set([word for word in nltk.wordpunct_tokenize(sens) 
                               if word.isupper()]))
title_count = lambda sens: len([word for word in nltk.wordpunct_tokenize(sens) 
                               if word.istitle()])

# jaccard_unigram = lambda title,desc: 1.0  * len(title.intersection(desc)) / (len(title.union(desc)))
def jaccard_unigram(title,desc):
    a = set(nltk.word_tokenize(title))
    b = set(nltk.word_tokenize(desc))
    return 1.0  * len(a.intersection(b)) / (len(a.union(b)))
# punc_count = lambda sens: 
def log2df(df):
    return np.log(df)
def sqrt2df(df):
    return np.sqrt(df)
def square2df(df):
    return np.square(df)

In [9]:
def make_lexical_features(df,df_origin):
    df_real_addtional = df[['title','short_desc_strip']].copy()
    df_real_addtional.fillna('aa',inplace=True)
    df_origin.fillna("aa",inplace=True)
    
    df_real_addtional['title_upper_count'] = df_origin['title'].map(uppercase_count)
    df_real_addtional['title_count'] = df_origin['title'].map(title_count)
    df_real_addtional['punc_count'] = df_origin['title'].map(punc_count)
    
    df_real_addtional['desc_sens'] = df_origin['short_desc_strip'].map(sen_count)
    df_real_addtional['desc_upper'] = df_origin['short_desc_strip'].map(uppercase_count)
    df_real_addtional['desc_title'] = df_origin['short_desc_strip'].map(title_count)
    df_real_addtional['desc_punc'] = df_origin['short_desc_strip'].map(punc_count)
    
    df_real_addtional['title_char_count'] = df_real_addtional['title'].map(chars_count)
    df_real_addtional['title_ari'] = df_real_addtional['title'].map(ARI)
    df_real_addtional['desc_char_count'] = df_real_addtional['short_desc_strip'].fillna('aa').map(chars_count)
    df_real_addtional['desc_ari'] = df_real_addtional['short_desc_strip'].fillna('aa').map(ARI)

    jaccard_title_desc = df_real_addtional[['title','short_desc_strip']].apply(
        lambda row: jaccard_unigram(row[0],row[1]),axis=1)

    df_real_addtional['jaccard'] = jaccard_title_desc
    df_real_addtional.drop(['title','short_desc_strip'],axis=1,inplace=True)
    return pd.DataFrame(scaler.fit_transform(df_real_addtional))
#     return df_real_addtional

In [10]:
def make_numeric_transform(df_real_addtional):
    df_real_log = df_real_addtional.apply(lambda d: np.log(d+100))
    df_real_sqrt = df_real_addtional.apply(lambda d: np.sqrt(np.abs(d)))
    df_real_square = df_real_addtional.apply(lambda d: np.square(d))
    return np.hstack([df_real_log,df_real_sqrt,df_real_square])

In [11]:
df_real_addtional = make_lexical_features(df,df_train_origin)

In [12]:
df_real_transform = make_numeric_transform(df_real_addtional)

In [13]:
#  With text features
total_df = hstack([title2vec,desc2vec,df_onehot,df_real,df_real_addtional,df_real_transform])
# Without text features
# total_df = hstack([desc2vec,df_onehot,df_real,df_real_addtional,df_real_transform])

# Submission to vec

In [14]:
test_df = pd.read_csv("data/validation/data_valid_normalize.csv")

In [15]:
title2vec_test = tfIdfVecTitle.transform(test_df['title'])
desc2vec_test = tfIdfVecDesc.transform(test_df['short_desc_strip'].fillna(' '))
df_onehot_test = pd.get_dummies(test_df[['country','category_lvl_1','category_lvl_2','category_lvl_3','product_type']]
                           ,columns=['country','category_lvl_1','category_lvl_2','category_lvl_3','product_type'])

df_real_test = test_df[['price','len_title','len_desc']].copy()

df_real_addtional_test = test_df[['title','short_desc_strip']].copy()
df_real_addtional_test.fillna('aa',inplace=True)

df_real_addtional_test = make_lexical_features(test_df,df_val_origin)
df_numeric_transform_test = make_numeric_transform(df_real_addtional_test)
# df_real_addtional_test['title_char_count'] = df_real_addtional_test['title'].map(chars_count)
# df_real_addtional_test['title_ari'] = df_real_addtional_test['title'].map(ARI)
# df_real_addtional_test['desc_char_count'] = df_real_addtional_test['short_desc_strip'].fillna('aa').map(chars_count)
# df_real_addtional_test['desc_ari'] = df_real_addtional_test['short_desc_strip'].fillna('aa').map(ARI)

# jaccard_title_desc = df_real_addtional_test[['title','short_desc_strip']].apply(
#     lambda row: jaccard_unigram(row[0],row[1]),axis=1)

# df_real_addtional_test['jaccard'] = jaccard_title_desc
# df_real_addtional_test.drop(['title','short_desc_strip'],axis=1,inplace=True)


In [16]:
test_df = hstack([title2vec_test,desc2vec_test,df_onehot_test,df_real_test,
                  df_real_addtional_test,df_numeric_transform_test])

# Clarity prediction

In [17]:
X_train, X_val, y_train, y_val = train_test_split(total_df,df['clarity'],stratify=df['clarity'],random_state=41)

## Linear model

In [22]:
lrEstimator = LogisticRegression()

lrEstimator.fit(X_train,y_train)

y_pred = lrEstimator.predict_proba(X_val)[:,1]

In [23]:
print np.sqrt(mean_squared_error(y_pred,y_val))

0.213704368962


In [24]:
y_sub = lrEstimator.predict_proba(test_df)[:,1]

In [15]:
write_to_file(y_sub,"submissions/linear_model/clarity_valid.predict")

Write successfully 11838 rows to submissions/linear_model/clarity_valid.predict


## XGBoost

In [25]:
model = xgboost.XGBClassifier(n_estimators=400,max_depth=5,subsample=0.85)
model.fit(X_train,y_train,eval_metric=['auc','error','logloss'],early_stopping_rounds=10,
          eval_set=[(X_train,y_train),(X_val,y_val)],verbose=1)

[0]	validation_0-auc:0.814018	validation_0-error:0.053947	validation_0-logloss:0.61493	validation_1-auc:0.797058	validation_1-error:0.057877	validation_1-logloss:0.615355
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 10 rounds.
[1]	validation_0-auc:0.822285	validation_0-error:0.054682	validation_0-logloss:0.550978	validation_1-auc:0.799061	validation_1-error:0.057656	validation_1-logloss:0.55224
[2]	validation_0-auc:0.826969	validation_0-error:0.054131	validation_0-logloss:0.497836	validation_1-auc:0.80277	validation_1-error:0.057656	validation_1-logloss:0.499621
[3]	validation_0-auc:0.8297	validation_0-error:0.054241	validation_0-logloss:0.45301	validation_1-auc:0.806115	validation_1-error:0.056995	validation_1-logloss:0.455199
[4]	validation_0-auc:0.829828	validation_0-error:0.0538	validation_0-logloss:0.414931	validation_1-auc:0.805948	validation_1-error:0.056885	validation_1-

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=5,
       min_child_weight=1, missing=None, n_estimators=400, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.85)

In [26]:
y_pred = model.predict_proba(X_val)[:,1]

print rmse(y_pred,y_val)
# 0.21444359428240292
# 0.222556260039
# 0.236795091056
# 0.215118146634
# 0.212934842573

0.213379908604


In [27]:
%matplotlib
results = model.evals_result()
epochs = len(results['validation_0']['error'])
x_axis = range(0, epochs)
# plot log loss
fig, ax = pyplot.subplots()
ax.plot(x_axis, results['validation_0']['logloss'], label='Train')
ax.plot(x_axis, results['validation_1']['logloss'], label='Test')
ax.legend()
pyplot.ylabel('Log Loss')
pyplot.title('XGBoost Log Loss')
pyplot.show()

Using matplotlib backend: Qt4Agg


In [26]:
# plot classification error
%matplotlib
fig, ax = pyplot.subplots()
ax.plot(x_axis, results['validation_0']['error'], label='Train')
ax.plot(x_axis, results['validation_1']['error'], label='Test')
ax.legend()
pyplot.ylabel('Classification Error')
pyplot.title('XGBoost Classification Error')
pyplot.show()

Using matplotlib backend: Qt4Agg


In [25]:
# Make submission
y_sub = model.predict_proba(test_df)[:,1]
write_to_file(y_sub,"submissions/xgboost/clarity_valid.predict")

Write successfully 11838 rows to submissions/xgboost/clarity_valid.predict


# Conciseness prediction

In [38]:
X_train, X_val, y_train, y_val = train_test_split(total_df,df['concise'],stratify=df['concise'],random_state=41)

## Linear model

In [44]:
lrEstimator = LogisticRegression(C=1)

lrEstimator.fit(X_train,y_train)

y_pred = lrEstimator.predict_proba(X_val)[:,1]

In [45]:
print np.sqrt(mean_squared_error(y_pred,y_val))

0.342389399858


In [47]:
# Make submission
y_sub = lrEstimator.predict_proba(test_df)[:,1]
write_to_file(y_sub,"submissions/linear_model/conciseness_valid.predict")

Write successfully 11838 rows to submissions/linear_model/conciseness_valid.predict


## Xgboost

In [57]:
model = xgboost.XGBClassifier(n_estimators=700,max_depth=7,subsample=0.85)

model.fit(X_train,y_train,eval_metric=['auc','error','logloss'],early_stopping_rounds=10,
          eval_set=[(X_train,y_train),(X_val,y_val)],verbose=1)

[0]	validation_0-auc:0.854296	validation_0-error:0.186021	validation_0-logloss:0.649022	validation_1-auc:0.83937	validation_1-error:0.198655	validation_1-logloss:0.650559
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 10 rounds.
[1]	validation_0-auc:0.860828	validation_0-error:0.17959	validation_0-logloss:0.612687	validation_1-auc:0.844663	validation_1-error:0.193915	validation_1-logloss:0.615994
[2]	validation_0-auc:0.867657	validation_0-error:0.176613	validation_0-logloss:0.581849	validation_1-auc:0.849655	validation_1-error:0.195458	validation_1-logloss:0.587368
[3]	validation_0-auc:0.872885	validation_0-error:0.173416	validation_0-logloss:0.555504	validation_1-auc:0.854304	validation_1-error:0.193363	validation_1-logloss:0.562993
[4]	validation_0-auc:0.879672	validation_0-error:0.170146	validation_0-logloss:0.532464	validation_1-auc:0.860472	validation_1-error:0.188623	validat

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=7,
       min_child_weight=1, missing=None, n_estimators=700, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.85)

In [58]:
y_pred = model.predict_proba(X_val)[:,1]

print rmse(y_pred,y_val)
# 0.372570824696
# 0.362435264187
# 0.360522252066
# 0.351260649532
# 0.338447260392
# 0.336433629337

0.336795568045


In [59]:
%matplotlib
results = model.evals_result()
epochs = len(results['validation_0']['error'])
x_axis = range(0, epochs)
# plot log loss
fig, ax = pyplot.subplots()
ax.plot(x_axis, results['validation_0']['logloss'], label='Train')
ax.plot(x_axis, results['validation_1']['logloss'], label='Test')
ax.legend()
pyplot.ylabel('Log Loss')
pyplot.title('XGBoost Log Loss')
pyplot.show()

Using matplotlib backend: Qt4Agg


In [51]:
# plot classification error
%matplotlib
fig, ax = pyplot.subplots()
ax.plot(x_axis, results['validation_0']['error'], label='Train')
ax.plot(x_axis, results['validation_1']['error'], label='Test')
ax.legend()
pyplot.ylabel('Classification Error')
pyplot.title('XGBoost Classification Error')
pyplot.show()

Using matplotlib backend: Qt4Agg


In [52]:
# Make submission
y_sub = model.predict_proba(test_df)[:,1]
write_to_file(y_sub,"submissions/xgboost/conciseness_valid.predict")

Write successfully 11838 rows to submissions/xgboost/conciseness_valid.predict
