In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing import sequence, text
from keras.layers import Input, Embedding

from nltk import word_tokenize
from nltk.corpus import stopwords
from textblob import TextBlob

import datetime as dt
import pandas as pd
import numpy as np
import warnings
import string

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  return f(*args, **kwds)


In [1]:
# English stop words from NLTK, de-duplicated and kept as a list.
stop_words = list(set(stopwords.words('english')))
# NOTE(review): this silences every warning category for the rest of the
# session, including deprecation warnings from pandas / scikit-learn.
warnings.filterwarnings('ignore')
punctuation = string.punctuation

id_column = "id"
# Placeholder written wherever a free-text field is missing.
missing_token = " UNK "

train = pd.read_csv("Data/train.csv", parse_dates=["project_submitted_datetime"])
test = pd.read_csv("Data/test.csv", parse_dates=["project_submitted_datetime"])

# Only the free-text column gets the placeholder. A frame-wide fillna would also
# write the string token into `quantity` / `price`, turning them into object
# columns and breaking the arithmetic done on them right after loading.
rc = pd.read_csv("Data/resources.csv")
rc['description'] = rc['description'].fillna(missing_token)

# Stack train and test so every feature is engineered once for both.
# ignore_index=True gives each row a unique label; without it the labels of
# `train` and `test` collide, which makes any later label-based lookup ambiguous.
df = pd.concat([train, test], axis=0, ignore_index=True)

rc['total_price'] = rc['quantity']*rc['price']

# Group once and reuse the grouping for every aggregate below.
rc_by_id = rc.groupby('id')

# Per-project item count and summed quantity / price / total price.
agg_rc = rc_by_id.agg({'description':'count', 'quantity':'sum', 'price':'sum', 'total_price':'sum'}).rename(columns={'description':'items'})

# min / max / mean / std of the same three numeric columns, prefixed by the
# statistic name. NaN results (e.g. std of a single row) are replaced with 0.
for func in ['min', 'max', 'mean','std']:
    agg_rc_temp = (
        rc_by_id
        .agg({'quantity':func, 'price':func, 'total_price':func})
        .rename(columns={'quantity':func+'_quantity', 'price':func+'_price', 'total_price':func+'_total_price'})
        .fillna(0)
    )
    agg_rc = agg_rc.join(agg_rc_temp)

# All resource descriptions of a project joined into one space-separated string.
agg_rc = agg_rc.join(
    rc_by_id
    .agg({'description':lambda x:' '.join(x.values.astype(str))})
    .rename(columns={'description':'resource_description'})
)

df = df.join(agg_rc, on='id')
# A project with no row in the resources table comes out of the left join with
# NaN here; the later len() and TF-IDF steps need a string.
df['resource_description'] = df['resource_description'].fillna(missing_token)
#df.head(100)
#df.head(100)

# Calendar features read from the submission timestamp through the pandas
# `.dt` accessor. Each pair is (new column name, accessor attribute).
submitted_dt = df["project_submitted_datetime"].dt
datetime_features = [
    ("Year", "year"),
    ("Month", "month"),
    ("Weekday", "weekday"),
    ("Hour", "hour"),
    ("Month_Day", "day"),
    ("Year_Day", "dayofyear"),
]
for feature_name, dt_attribute in datetime_features:
    df[feature_name] = getattr(submitted_dt, dt_attribute)

#df[['Year', 'Month', 'Weekday', 'Hour', 'Month_Day', 'Year_Day']].head(10)

# Replace missing values in essays 3 and 4 with the placeholder token.
for essay_column in ['project_essay_3', 'project_essay_4']:
    df[essay_column] = df[essay_column].fillna(missing_token)

# Character length of each essay, the title and both resource texts.
# Each pair is (new column name, source column).
length_features = [
    ("essay1_len", 'project_essay_1'),
    ("essay2_len", 'project_essay_2'),
    ("essay3_len", 'project_essay_3'),
    ("essay4_len", 'project_essay_4'),
    ("title_len", 'project_title'),
    ('resource_summary_len', 'project_resource_summary'),
    ('resource_description_len', 'resource_description'),
]
for feature_name, source_column in length_features:
    df[feature_name] = df[source_column].apply(len)


def _space_token_count(value):
    """Number of pieces obtained by splitting str(value) on single spaces."""
    return len(str(value).split(' '))


# Word counts for the same fields (split on a single space character).
word_count_features = [
    ('resource_description_wc', 'resource_description'),
    ('title_wc', 'project_title'),
    ('essay1_wc', 'project_essay_1'),
    ('essay2_wc', 'project_essay_2'),
    ('essay3_wc', 'project_essay_3'),
    ('essay4_wc', 'project_essay_4'),
    ('resource_summary_wc', 'project_resource_summary'),
]
for feature_name, source_column in word_count_features:
    df[feature_name] = df[source_column].apply(_space_token_count)

#df[['essay_1_wc', 'essay_2_wc', 'essay_3_wc', 'essay_4_wc', 'title_wc','resource_summary_wc']].head(10)

# Combine the four project essays into one space-separated text per row.
# Each column is stringified on its own and the rows are zipped together,
# which avoids building a Series object per row as DataFrame.apply(axis=1) does.
essay_columns = ['project_essay_1', 'project_essay_2', 'project_essay_3', 'project_essay_4']
df['text'] = [
    ' '.join(essay_parts)
    for essay_parts in zip(*(df[column].map(str) for column in essay_columns))
]

# Sets give constant-time membership tests; `stop_words` is a list, so testing
# every word of every essay against it directly is a linear scan each time.
punctuation_chars = set(punctuation)
stop_word_set = set(stop_words)

# Surface statistics of the combined text.
df['char_count'] = df['text'].apply(len)
df['word_count'] = df['text'].apply(lambda x: len(x.split()))
# +1 keeps the ratio finite for a text with no words.
df['word_density'] = df['char_count'] / (df['word_count']+1)
df['punctuation_count'] = df['text'].apply(lambda x: sum(1 for ch in x if ch in punctuation_chars))
df['title_word_count'] = df['text'].apply(lambda x: sum(1 for wrd in x.split() if wrd.istitle()))
df['upper_case_word_count'] = df['text'].apply(lambda x: sum(1 for wrd in x.split() if wrd.isupper()))
df['stopword_count'] = df['text'].apply(lambda x: sum(1 for wrd in x.split() if wrd.lower() in stop_word_set))

#df[['char_count', 'word_count', 'word_density', 'punctuation_count', 'title_word_count', 'upper_case_word_count', 'stopword_count']].head(10)

# Functions to get the polarity and subjectivity of a text using the textblob module.
def get_polarity(text):
    """Return the TextBlob sentiment polarity of ``text``.

    Best-effort: if TextBlob raises an ordinary exception for this input
    (for example because ``text`` is not a string), 0.0 is returned instead.
    Only ``Exception`` subclasses are caught, so KeyboardInterrupt and
    SystemExit still stop a long-running ``apply``.

    Args:
        text: The text to score.

    Returns:
        The polarity reported by ``TextBlob(text).sentiment``, or 0.0 on failure.
    """
    try:
        return TextBlob(text).sentiment.polarity
    except Exception:
        return 0.0

def get_subjectivity(text):
    """Return the TextBlob sentiment subjectivity of ``text``.

    Best-effort: if TextBlob raises an ordinary exception for this input
    (for example because ``text`` is not a string), 0.0 is returned instead.
    Only ``Exception`` subclasses are caught, so KeyboardInterrupt and
    SystemExit still stop a long-running ``apply``.

    Args:
        text: The text to score.

    Returns:
        The subjectivity reported by ``TextBlob(text).sentiment``, or 0.0 on failure.
    """
    try:
        return TextBlob(text).sentiment.subjectivity
    except Exception:
        return 0.0


# Sentiment columns for the combined essay text, computed on the full dataframe.
# NOTE(review): the progress messages say "popularity" although the columns
# written are polarity and subjectivity.
print('start evaluating popularity')
sentiment_scorers = (
    ('polarity', get_polarity),
    ('subjectivity', get_subjectivity),
)
for sentiment_column, scorer in sentiment_scorers:
    df[sentiment_column] = df['text'].apply(scorer)
print('end evaluating popularity')
#df[['polarity', 'subjectivity']].head(10)

# Part-of-speech tag groups: group name -> tags counted as that group.
pos_dic = {
    'noun' : ['NN','NNS','NNP','NNPS'],
    'pron' : ['PRP','PRP$','WP','WP$'],
    'verb' : ['VB','VBD','VBG','VBN','VBP','VBZ'],
    'adj' :  ['JJ','JJR','JJS'],
    'adv' : ['RB','RBR','RBS','WRB']
}

# Count how many words of a text carry a part-of-speech tag from a given group.
def pos_check(x, flag):
    """Count the tagged words of ``x`` whose tag belongs to group ``flag``.

    Best-effort: an ordinary exception while tagging (or an unknown ``flag``)
    leaves the count at whatever had been reached, which is 0 when the failure
    happens before any tag is inspected. Only ``Exception`` subclasses are
    caught, so KeyboardInterrupt and SystemExit propagate.

    Args:
        x: Text to tag with TextBlob.
        flag: A key of ``pos_dic`` ('noun', 'pron', 'verb', 'adj' or 'adv').

    Returns:
        Number of (word, tag) pairs in ``TextBlob(x).tags`` whose tag is listed
        under ``pos_dic[flag]``.
    """
    cnt = 0
    try:
        wanted_tags = pos_dic[flag]
        for tagged_word in TextBlob(x).tags:
            # Each entry is a (word, tag) pair; only the tag matters here.
            if tagged_word[1] in wanted_tags:
                cnt += 1
    except Exception:
        # Deliberate best-effort: a text that cannot be tagged contributes
        # the count accumulated so far instead of aborting the whole run.
        pass
    return cnt

# Part-of-speech count columns for the combined essay text, computed on the
# full dataframe. Each text is tagged once and all five groups are counted
# from that single tagging, instead of re-tagging the text for every column.
print('start evaluating pos_dic')

# (new column name, key into pos_dic), in the order the columns are added.
pos_feature_flags = [
    ('noun_count', 'noun'),
    ('verb_count', 'verb'),
    ('adj_count', 'adj'),
    ('adv_count', 'adv'),
    ('pron_count', 'pron'),
]


def _pos_group_counts(text):
    """Return one count per entry of pos_feature_flags; all zeros if tagging fails."""
    counts = [0] * len(pos_feature_flags)
    try:
        tags = [tagged_word[1] for tagged_word in TextBlob(text).tags]
    except Exception:
        # Same best-effort behaviour as pos_check: an untaggable text counts as 0.
        return counts
    for position, (_column, flag) in enumerate(pos_feature_flags):
        wanted_tags = pos_dic[flag]
        counts[position] = sum(1 for tag in tags if tag in wanted_tags)
    return counts


# Rows follow df's row order; reshape keeps the 2-D shape even for an empty frame.
pos_counts = np.array(
    [_pos_group_counts(text) for text in df['text']],
    dtype=np.int64,
).reshape(len(df), len(pos_feature_flags))

# Assign plain arrays so the columns are written by position, regardless of
# whether df's row labels are unique.
for position, (column, _flag) in enumerate(pos_feature_flags):
    df[column] = pos_counts[:, position]

# Checkpoint of all engineered features, reloaded by a later cell.
df.to_csv('Data/df_NLPed.csv',index = False)

import gc
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
print('Label Encoder...')
# Categorical columns that are replaced in place by integer codes.
cols = [
    'teacher_id',
    'teacher_prefix',
    'school_state',
    'project_grade_category',
    'project_subject_categories',
    'project_subject_subcategories',
]

for c in tqdm(cols):
    # Values are stringified first, so missing values become an ordinary
    # category instead of making the encoder fail.
    column_as_text = df[c].astype(str)
    le = LabelEncoder()
    df[c] = le.fit_transform(column_as_text)
    # Sanity output: prints whether the encoded column contains any NaN.
    print(df[c].isna().any())
del le
gc.collect()
print('Done.')

print('Preprocessing text...')
# Text columns to vectorise and, position by position, the vocabulary cap
# (max_features) used for each of them.
cols = [
    'project_title', 
    'text', 
    'project_resource_summary',
    'resource_description'
]
    
n_features = [
    100, 
    1000, 
    100,
    100
]

tfidf_frames = []
for c, max_terms in tqdm(zip(cols, n_features), total=len(cols)):
    # TfidfVectorizer needs strings; a NaN or other non-string cell would
    # make it raise, so the column is stringified first (df itself is untouched).
    documents = df[c].astype(str)
    tfidf = TfidfVectorizer(
        max_features=max_terms,
        norm='l2',
        )
    # float16 keeps the dense matrix small.
    tfidf_df = tfidf.fit_transform(documents).toarray().astype(np.float16)

    # Name columns from the real matrix width: the vectoriser returns fewer
    # than max_features columns when the vocabulary is smaller than the cap.
    column_names = [c + '_tfidf_' + str(i) for i in range(tfidf_df.shape[1])]
    tfidf_frames.append(pd.DataFrame(tfidf_df, columns=column_names))

# One concat instead of inserting columns one at a time into an empty frame.
df_idf = pd.concat(tfidf_frames, axis=1)
# Shape of the matrix built for the last column in `cols`.
print(tfidf_df.shape)
#del df_all
import gc
gc.collect()
df_idf.to_csv('Data/df_idf.csv',index = False)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  return f(*args, **kwds)


start evaluating popularity
end evaluating popularity
start evaluating pos_dic
1
2
3
4
5


  0%|          | 0/6 [00:00<?, ?it/s]

Label Encoder...


 17%|█▋        | 1/6 [00:02<00:12,  2.40s/it]

False


 33%|███▎      | 2/6 [00:02<00:05,  1.46s/it]

False


 50%|█████     | 3/6 [00:03<00:03,  1.17s/it]

False


 67%|██████▋   | 4/6 [00:04<00:02,  1.02s/it]

False


 83%|████████▎ | 5/6 [00:04<00:00,  1.06it/s]

False


100%|██████████| 6/6 [00:05<00:00,  1.11it/s]

False





NameError: name 'gc' is not defined

In [4]:
print('Preprocessing text...')
# Text columns to vectorise and, position by position, the vocabulary cap
# (max_features) used for each of them.
cols = [
    'project_title', 
    'text', 
    'project_resource_summary',
    'resource_description'
]
    
n_features = [
    100, 
    1000, 
    100,
    100
]

tfidf_frames = []
for c, max_terms in tqdm(zip(cols, n_features), total=len(cols)):
    # TfidfVectorizer needs strings; the column is converted in place.
    df[c] = df[c].astype(str)
    tfidf = TfidfVectorizer(
        max_features=max_terms,
        norm='l2',
        )
    # float16 keeps the dense matrix small.
    tfidf_df = tfidf.fit_transform(df[c]).toarray().astype(np.float16)

    # Name columns from the real matrix width: the vectoriser returns fewer
    # than max_features columns when the vocabulary is smaller than the cap.
    column_names = [c + '_tfidf_' + str(i) for i in range(tfidf_df.shape[1])]
    tfidf_frames.append(pd.DataFrame(tfidf_df, columns=column_names))

# One concat instead of inserting columns one at a time into an empty frame.
df_idf = pd.concat(tfidf_frames, axis=1)
# Shape of the matrix built for the last column in `cols`.
print(tfidf_df.shape)
#del df_all
import gc
gc.collect()
df_idf.to_csv('Data/df_idf_2.csv',index = False)

0it [00:00, ?it/s]

Preprocessing text...


4it [02:45, 41.29s/it]


(317920, 100)


In [83]:
# Reload the engineered-feature table and the TF-IDF table written earlier,
# each with a fresh 0..n-1 row index so the two can be lined up by position.
df = pd.read_csv('Data/df_NLPed.csv').reset_index(drop=True)
df_idf = pd.read_csv('Data/df_idf_2.csv').reset_index(drop=True)
import gc
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder


In [86]:
print('Label Encoder...')
# Categorical columns that are replaced in place by integer codes.
cols = [
    'teacher_id',
    'teacher_prefix',
    'school_state',
    'project_grade_category',
    'project_subject_categories',
    'project_subject_subcategories',
]

for c in tqdm(cols):
    # Values are stringified first, so missing values become an ordinary
    # category instead of making the encoder fail.
    column_as_text = df[c].astype(str)
    le = LabelEncoder()
    df[c] = le.fit_transform(column_as_text)
    # Sanity output: prints whether the encoded column contains any NaN.
    print(df[c].isna().any())
del le
gc.collect()
print('Done.')

# The two tables are glued side by side, so they must describe the same rows.
# A length mismatch would otherwise be padded with NaN rows without any error.
assert len(df) == len(df_idf), "df and df_idf must have the same number of rows"
data = pd.concat((df,df_idf),axis=1)

# Rows without a label are the test rows; keep one row per project id.
final_test = data[data.project_is_approved.isnull()]
final_test = final_test.drop_duplicates('id')
# Left-merge onto the ids of `test` so final_test has one row per row of `test`,
# in the same order. Only the id column is selected here and `test` itself is
# left unchanged, so this cell can be re-run.
final_test = pd.merge(test[['id']],final_test,on = ['id'], how = 'left')
train = data[data.project_is_approved.notnull()]

#del data
#gc.collect()

# Identifier, raw text, timestamp and target columns that are not model features.
to_drop = ['id','project_essay_1','project_essay_2','project_essay_3','project_essay_4',
           'project_resource_summary','project_submitted_datetime','project_title','resource_description',
          'text','project_is_approved']

final_test = final_test.drop(columns=to_drop)
x = train.drop(columns=to_drop)
y = train.project_is_approved

  0%|          | 0/6 [00:00<?, ?it/s]

Label Encoder...


 17%|█▋        | 1/6 [00:03<00:19,  3.82s/it]

False


 33%|███▎      | 2/6 [00:04<00:09,  2.45s/it]

False


 50%|█████     | 3/6 [00:06<00:06,  2.03s/it]

False


 67%|██████▋   | 4/6 [00:07<00:03,  1.78s/it]

False


 83%|████████▎ | 5/6 [00:08<00:01,  1.67s/it]

False


100%|██████████| 6/6 [00:09<00:00,  1.60s/it]

False





Done.


In [143]:
# Feature matrix and target taken from the labelled rows.
y = train['project_is_approved']
x = train.drop(columns=to_drop)

from sklearn.model_selection import train_test_split
# 80/20 hold-out split with a fixed seed.
holdout_parts = train_test_split(x, y, test_size=0.2, random_state=42)
x_train, x_test, y_train, y_test = holdout_parts

In [147]:
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
# LightGBM settings shared by every fold.
params = {
        'boosting_type': 'dart',
        'objective': 'binary',
        'metric': 'auc',
        'max_depth': 10,
        'learning_rate': 0.05,
        'feature_fraction': 0.25,
        'bagging_fraction': 0.85,
        'seed': 0,
        'verbose': 0,
        }
# NOTE(review): the splitter does not shuffle. The recorded run of this cell
# shows very uneven fold AUCs, so row order may matter; consider
# shuffle=True with a random_state and confirm.
skf = StratifiedKFold(n_splits=5)
# Running SUM of the per-fold predictions for final_test; divide by the number
# of folds to get the average. Kept in float64: accumulating in float16 rounds
# the probabilities to about three significant digits.
p_buf = np.zeros(len(final_test), dtype=np.float64)
for train_index, val_index in tqdm(skf.split(x, y), total=skf.get_n_splits()):
    # split() yields positional indices, so both x and y are indexed with
    # .iloc; y[train_index] would look the numbers up as row labels instead.
    x_train, x_test = x.iloc[train_index], x.iloc[val_index]
    y_train, y_test = y.iloc[train_index], y.iloc[val_index]
    model = lgb.train(
                params,
                lgb.Dataset(x_train, y_train),
                num_boost_round=10000,
                valid_sets=[lgb.Dataset(x_test, y_test)],
                early_stopping_rounds=200,
                verbose_eval=200)
    p = model.predict(final_test, num_iteration=model.best_iteration)
    p_buf += np.asarray(p, dtype=np.float64)


0it [00:00, ?it/s][A
Exception in thread Thread-15:
Traceback (most recent call last):
  File "/usr/local/Cellar/python/3.6.4_4/Frameworks/Python.framework/Versions/3.6/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/Users/healer/src/tqdm/tqdm/_tqdm.py", line 148, in run
    for instance in self.tqdm_cls._instances:
  File "/usr/local/Cellar/python/3.6.4_4/Frameworks/Python.framework/Versions/3.6/lib/python3.6/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration



Training until validation scores don't improve for 200 rounds.
[200]	valid_0's auc: 0.744844
[400]	valid_0's auc: 0.758327
[600]	valid_0's auc: 0.765869
[800]	valid_0's auc: 0.770957
[1000]	valid_0's auc: 0.775192
[1200]	valid_0's auc: 0.777246
[1400]	valid_0's auc: 0.778957
[1600]	valid_0's auc: 0.779077
[1800]	valid_0's auc: 0.779658
[2000]	valid_0's auc: 0.780452
[2200]	valid_0's auc: 0.781094
[2400]	valid_0's auc: 0.780824
Early stopping, best iteration is:
[2208]	valid_0's auc: 0.781152


1it [29:03, 1743.23s/it]

Training until validation scores don't improve for 200 rounds.
[200]	valid_0's auc: 0.744155
[400]	valid_0's auc: 0.757485
[600]	valid_0's auc: 0.765087
[800]	valid_0's auc: 0.771507
[1000]	valid_0's auc: 0.775316
[1200]	valid_0's auc: 0.777158
[1400]	valid_0's auc: 0.778707
[1600]	valid_0's auc: 0.779001
[1800]	valid_0's auc: 0.779758
[2000]	valid_0's auc: 0.780181
[2200]	valid_0's auc: 0.780692
Early stopping, best iteration is:
[2187]	valid_0's auc: 0.780708


2it [57:04, 1712.35s/it]

Training until validation scores don't improve for 200 rounds.
[200]	valid_0's auc: 0.684961
[400]	valid_0's auc: 0.69658
[600]	valid_0's auc: 0.721621
[800]	valid_0's auc: 0.725249
Early stopping, best iteration is:
[713]	valid_0's auc: 0.728929


3it [1:06:12, 1324.05s/it]

Training until validation scores don't improve for 200 rounds.
[200]	valid_0's auc: 0.979123
Early stopping, best iteration is:
[28]	valid_0's auc: 0.980704


4it [1:08:32, 1028.05s/it]

Training until validation scores don't improve for 200 rounds.
[200]	valid_0's auc: 0.638229
Early stopping, best iteration is:
[1]	valid_0's auc: 0.741756


5it [1:10:46, 849.25s/it] 


In [124]:
# Pair every feature name of `model` with its importance and rank the pairs
# from most to least important.
importance = model.feature_importance()
model_fnames = model.feature_name()
ranked_pairs = sorted(zip(model_fnames, importance), key=lambda pair: pair[1])
# Reverse the ascending sort in place to get descending importance.
ranked_pairs.reverse()
# Two-column array: column 0 is the feature name, column 1 the importance.
tuples = np.array(ranked_pairs)

In [131]:
# Feature names (column 0 of `tuples`) of every row after the first 145.
# `tuples` is sorted by descending importance, so this is the low-importance tail.
drop = tuples[145:,0]

In [49]:
from sklearn.metrics import roc_curve
# Scores for whatever `x_test` currently holds, from whatever `model` currently
# holds in the kernel. No num_iteration is passed here, unlike the prediction
# made inside the cross-validation loop.
pred_lgb = model.predict(x_test)
# Per-feature importance values of the same model.
fi = model.feature_importance()

In [61]:
# Inspect which columns the raw test table currently has.
test.columns

Index(['id', 'teacher_id', 'teacher_prefix', 'school_state',
       'project_submitted_datetime', 'project_grade_category',
       'project_subject_categories', 'project_subject_subcategories',
       'project_title', 'project_essay_1', 'project_essay_2',
       'project_essay_3', 'project_essay_4', 'project_resource_summary',
       'teacher_number_of_previously_posted_projects'],
      dtype='object')

In [88]:
# (rows, columns) of the test feature matrix passed to model.predict.
final_test.shape

(78035, 1357)

In [152]:
# Smallest fold-averaged test prediction. p_buf holds the sum over folds, and
# 5 matches StratifiedKFold(n_splits=5) used in the training cell.
(p_buf/5).min()

0.3232

In [153]:
# Prediction of the model currently held in `model`; not used for the
# submission written below.
s = model.predict(final_test, num_iteration=model.best_iteration)
# Submission columns: project id and the prediction averaged over the 5 folds
# (p_buf holds the per-fold sum).
d = {
    'id': test['id'],
    'project_is_approved': p_buf / 5,
}
submission = pd.DataFrame(d)
submission.to_csv('0419.csv', index=False)

In [145]:
# Show the first 145 (feature name, importance) rows, i.e. the most important
# features, since `tuples` is sorted by descending importance.
tuples[0:145]

array([['teacher_number_of_previously_posted_projects', '165'],
       ['items', '161'],
       ['std_price', '134'],
       ['noun_count', '103'],
       ['essay2_wc', '99'],
       ['text_tfidf_118', '88'],
       ['text_tfidf_551', '61'],
       ['project_title_tfidf_10', '59'],
       ['text_tfidf_981', '56'],
       ['price', '49'],
       ['resource_description_tfidf_85', '45'],
       ['project_resource_summary_tfidf_8', '43'],
       ['resource_description_tfidf_7', '42'],
       ['text_tfidf_87', '42'],
       ['text_tfidf_652', '35'],
       ['resource_description_wc', '32'],
       ['resource_description_tfidf_14', '31'],
       ['text_tfidf_939', '30'],
       ['text_tfidf_976', '28'],
       ['text_tfidf_661', '24'],
       ['essay2_len', '21'],
       ['resource_description_len', '20'],
       ['essay3_len', '19'],
       ['resource_description_tfidf_60', '18'],
       ['resource_description_tfidf_19', '18'],
       ['project_resource_summary_tfidf_51', '18'],
       ['te

In [5]:
from sklearn.metrics import roc_curve
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
skf = StratifiedKFold(n_splits=2)
# split() is a generator that yields one (train_indices, validation_indices)
# pair per fold. Unpacking the generator itself would bind each name to a whole
# fold pair, so the first fold is taken explicitly with next().
train_index, val_index = next(skf.split(x, y))



In [None]:
from sklearn.metrics import roc_curve
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
skf = StratifiedKFold(n_splits=2)
# One roc_curve result per fold, in fold order, kept so it can be inspected
# or plotted afterwards instead of being thrown away.
roc_curves = []
for train_index, val_index in tqdm(skf.split(x, y), total=skf.get_n_splits()):
    # split() yields positional indices, so both x and y are indexed with .iloc.
    x_train, x_val = x.iloc[train_index], x.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]
    # NOTE(review): `gbm` is not defined anywhere in the visible notebook. From
    # this call it is expected to return (feature_importance, validation
    # predictions); confirm against its definition.
    feature_importance, pred_lgb = gbm(x_train, y_train, x_val, y_val)
    roc_curves.append(roc_curve(y_val, pred_lgb))
    