In [None]:
import numpy as np, pandas as pd, os, gc, tensorflow as tf, random
# Seed every random number generator used by the notebook and pin the
# TensorFlow session configuration.
np.random.seed(32)
# The variable CPython recognises is PYTHONHASHSEED (the previous spelling,
# "PYTHONASHSEED", was ignored by everything).
# NOTE: the interpreter reads this variable at start-up, so setting it here
# only reaches processes launched afterwards; the hash() salt of the running
# kernel is already fixed.
os.environ["PYTHONHASHSEED"] = "64"
random.seed(128)
session_conf = tf.ConfigProto(intra_op_parallelism_threads = 6, inter_op_parallelism_threads = 5)
from keras import backend as K
tf.set_random_seed(256)
# Make Keras use a session built on the default graph with the thread settings above.
K.set_session(tf.Session(graph = tf.get_default_graph(), config = session_conf))

In [None]:
# Pre-trained word vectors and their dimensionality.
# EMBEDDING_FILE = "../input/fasttext-crawl-300d-2m/crawl-300d-2M.vec"
EMBEDDING_FILE = "../input/glove840b300dtxt/glove.840B.300d.txt"
embed_size = 300

# Competition tables; the training rows are ordered by submission time.
train = pd.read_csv(
    "../input/donorschoose-application-screening/train.csv"
).sort_values(by = "project_submitted_datetime")
test = pd.read_csv("../input/donorschoose-application-screening/test.csv")
resources = pd.read_csv("../input/donorschoose-application-screening/resources.csv")

# Target column, taken after sorting so it stays row-aligned with `train`.
y_train = train["project_is_approved"]

# Submission skeleton: test ids now, predictions are added at the end.
submission = pd.DataFrame()
submission["id"] = test[["id"]]

In [None]:
# Distinct teacher ids seen in each split.
train_teachers = list(set(train["teacher_id"].values))
test_teachers = list(set(test["teacher_id"].values))
# Teachers that appear in both the training and the test data.
inter = set(train_teachers) & set(test_teachers)

In [None]:
# Free-text columns that are concatenated into the single "text" field.
# "project_subject_subcategories" was previously missing because
# "project_subject_categories" was listed twice.
char_cols = ["project_subject_categories", "project_subject_subcategories", "project_title",
            "project_essay_1", "project_essay_2", "project_essay_3", "project_essay_4",
            "project_resource_summary"]
# Categorical columns that are hashed into integer buckets for the embedding layer.
cat_features = ["teacher_prefix", "school_state", "year", "month", 
                "project_grade_category", "project_subject_categories", "project_subject_subcategories"]
# Numeric columns fed to the scaler. The three total_price* names are produced by
# the successive merges in create_features: total_price_x is the per-project mean,
# total_price_y the sum and total_price the row count of the resource costs.
num_features = ["teacher_number_of_previously_posted_projects", "total_price_x", "total_price_y", "total_price"]

In [None]:
# Cost of each resource line (quantity times unit price).
resources["total_price"] = resources.quantity*resources.price

# Per-project mean, sum and count of the line costs.
# reset_index() turns the group key into an ordinary "id" column and leaves a
# plain RangeIndex. Keeping "id" as both the index name and a column makes
# pd.merge(..., on = "id") ambiguous (an error in current pandas), and each
# frame now takes its ids from its own aggregation rather than from
# mean_total_price.
cost_by_project = resources.groupby("id").total_price
mean_total_price = cost_by_project.mean().reset_index()
sum_total_price = cost_by_project.sum().reset_index()
count_total_price = cost_by_project.count().reset_index()

In [None]:
def create_features(df):
    """Attach resource-cost aggregates, date parts and a combined text column.

    Merges the module-level mean/sum/count cost tables onto ``df`` by "id"
    (default inner joins), derives "year" and "month" from the first two
    "-"-separated pieces of ``project_submitted_datetime``, replaces missing
    values in every column of ``char_cols`` with a single space, and joins those
    columns with spaces into a new "text" column.

    :param df: project table with an "id" column, ``project_submitted_datetime``
        and the columns named in ``char_cols``.
    :return: the merged frame with the added columns.
    """
    # Merge order matters: the second merge collides on "total_price" and yields
    # total_price_x (mean) / total_price_y (sum); the third adds total_price (count).
    for price_table in (mean_total_price, sum_total_price, count_total_price):
        df = pd.merge(df, price_table, on = "id")

    date_parts = df.project_submitted_datetime.apply(lambda stamp: stamp.split("-"))
    df["year"] = date_parts.apply(lambda parts: parts[0])
    df["month"] = date_parts.apply(lambda parts: parts[1])

    for name in char_cols:
        df[name] = df[name].fillna(" ")

    def _joined_text(row):
        # Concatenate the text columns in the order given by char_cols.
        return " ".join([row[name] for name in char_cols])

    df["text"] = df.apply(_joined_text, axis=1)
    return df

train, test = create_features(train), create_features(test)

In [None]:
import hashlib

# Names of the hashed counterparts of the categorical columns.
cat_features_hash = [col + "_hash" for col in cat_features]

# Number of hash buckets; also the vocabulary size of the categorical embedding.
max_size = 15000

def _stable_bucket(value, max_size):
    """Map ``value`` to a bucket in ``[0, max_size)`` that is identical in every run."""
    # md5 is used purely as a deterministic hash here, not for security.
    digest = hashlib.md5(str(value).encode("utf-8")).hexdigest()
    return int(digest, 16) % max_size

def feature_hash(df, max_size = max_size):
    """Add a ``<col>_hash`` integer column for every column in ``cat_features``.

    The built-in ``hash()`` is salted per interpreter for strings, so its buckets
    change between kernel sessions; a digest of the value's string form is used
    instead so the same category always lands in the same bucket. Missing values
    are hashed through their string form as well, so they share one bucket.

    :param df: frame containing every column listed in ``cat_features``.
    :param max_size: number of buckets.
    :return: ``df`` with the hash columns added (the frame is modified in place).
    """
    for col in cat_features:
        df[col+"_hash"] = df[col].apply(lambda x: _stable_bucket(x, max_size))
    return df

train = feature_hash(train)
test = feature_hash(test)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the training rows for validation (fixed seed).
X_train, X_valid, Y_train, Y_valid = train_test_split(
    train,
    y_train,
    test_size = 0.1,
    random_state = 32,
)

# The unsplit frame and labels are no longer needed; release them.
del train, y_train
gc.collect()

In [None]:
from sklearn.preprocessing import StandardScaler
#from sklearn.feature_extraction.text import TfidfVectorizer
from keras.preprocessing import text, sequence
import re

# Tokenizer vocabulary cap and padded sequence length.
max_features = 100000#50000
maxlen = 300

# Standardise the numeric features; statistics are fitted on the training split only.
scaler = StandardScaler()
X_train_num = scaler.fit_transform(X_train[num_features])
X_valid_num = scaler.transform(X_valid[num_features])
X_test_num = scaler.transform(test[num_features])

# Hashed categorical ids as integer arrays.
# `np.int` was only an alias of the builtin `int` and has been removed from
# NumPy, so the builtin is used directly (same resulting dtype).
X_train_cat = np.array(X_train[cat_features_hash], dtype = int)
X_valid_cat = np.array(X_valid[cat_features_hash], dtype = int)
X_test_cat = np.array(test[cat_features_hash], dtype = int)
tokenizer = text.Tokenizer(num_words = max_features)

In [None]:
def preprocess1(string):
    '''
    Clean one piece of raw text before tokenisation.

    Every double quote, carriage return, newline, backslash, tab, colon,
    underscore, plus sign and equals sign is replaced by a single space; the
    result is then stripped of surrounding whitespace and lower-cased.

    :param string: raw text.
    :return: the cleaned, lower-cased text.
    '''
    # A single character-class pass replaces each listed character with one space.
    cleaned = re.sub(r'["\r\n\\\t:_+=]', ' ', string)
    return cleaned.strip().lower()

# Clean the combined text of every split.
for frame in (X_train, X_valid, test):
    frame["text"] = frame["text"].apply(preprocess1)

# The vocabulary is learned from the training split only.
tokenizer.fit_on_texts(X_train["text"].tolist())

def _encode_and_pad(frame):
    """Turn the "text" column of ``frame`` into padded id sequences of length ``maxlen``."""
    token_ids = tokenizer.texts_to_sequences(frame["text"].tolist())
    return sequence.pad_sequences(token_ids, maxlen = maxlen)

X_train_words = _encode_and_pad(X_train)
X_valid_words = _encode_and_pad(X_valid)
X_test_words = _encode_and_pad(test)

# Drop the test frame (and the loop variable still pointing at it) to free memory.
del frame, test
gc.collect()

In [None]:
def get_coefs(word, *arr):
    """Split one parsed embedding line into ``(word, float32 vector)``.

    :param word: the token (first field of the line).
    :param arr: the remaining fields, i.e. the vector components.
    :return: tuple of the token and a float32 NumPy array of its components.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector
# Read the pre-trained vectors into a {word: vector} dictionary.
# The file is opened in a `with` block so the handle is closed once parsing
# finishes, and the encoding is given explicitly so decoding does not depend
# on the platform default.
with open(EMBEDDING_FILE, encoding = "utf-8") as embedding_file:
    embeddings_index = dict(get_coefs(*line.strip().split(" ")) for line in embedding_file)

word_index = tokenizer.word_index
# Row 0 is never assigned by the loop below (the smallest index in word_index
# is expected to be 1), hence the +1.
nb_words = min(max_features, len(word_index)+1)
# embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
# Words without a pre-trained vector keep an all-zero row.
embedding_matrix = np.zeros((nb_words, embed_size))
for word, i in word_index.items():
    # Indices at or beyond the vocabulary cap have no row in the matrix.
    if i >= max_features: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

In [None]:
from keras.layers import Input, Dense, Embedding, Flatten, concatenate, Dropout, Conv1D, Lambda
from keras.layers import GlobalMaxPooling1D, GlobalAveragePooling1D, SpatialDropout1D, Bidirectional, GRU, LSTM
from keras.models import Model, load_model
from keras.optimizers import Adam, RMSprop

def build_model(units = 64, cnn_filters = 64, dr = 0.3, dense = 128, top_k = 5):
    """Build and compile the three-input (categorical, numeric, text) classifier.

    Reads the module-level names ``cat_features_hash``, ``num_features``,
    ``maxlen``, ``max_size``, ``nb_words``, ``embed_size`` and
    ``embedding_matrix``.

    :param units: hidden size of the LSTM and GRU wrapped in ``Bidirectional``.
    :param cnn_filters: only referenced by the commented-out Conv1D line; currently unused.
    :param dr: rate of the ``SpatialDropout1D`` applied after both embeddings.
    :param dense: width of the Dense layers on the categorical and numeric branches.
    :param top_k: number of largest activations per channel kept by the k-max pooling.
    :return: a Keras ``Model`` compiled with binary cross-entropy, Adam (lr 1e-3)
        and the accuracy metric.
    """
    
    def _top_k(x):
        """K-max pooling: keep the ``top_k`` largest values of each channel over the sequence."""
        # assumes x is (batch, steps, 2 * units), as produced by the bidirectional GRU below
        # Swap the last two axes because tf.nn.top_k selects along the last dimension.
        x = tf.transpose(x, [0, 2, 1])
        k_max = tf.nn.top_k(x, k=top_k)
        # k_max[0] holds the selected values; flatten them to one vector per sample.
        return tf.reshape(k_max[0], (-1, 2 * units * top_k))
    
    cat_input = Input(shape = (len(cat_features_hash), ))
    num_input = Input(shape = (len(num_features), ))
    word_input = Input(shape = (maxlen, ))
    
    # Categorical branch: one shared 10-d embedding over the hash buckets, then flattened.
    cat_emb = Embedding(max_size, 10)(cat_input)
    x_cat = SpatialDropout1D(dr)(cat_emb)
    x_cat = Flatten()(x_cat)
    
    # Text branch: frozen pre-trained embeddings followed by stacked bidirectional LSTM and GRU.
    word_emb = Embedding(nb_words, embed_size, weights = [embedding_matrix],
                        input_length = maxlen, trainable = False)(word_input)
    x_word = SpatialDropout1D(dr)(word_emb)
    x_word = Bidirectional(LSTM(units, return_sequences = True))(x_word)
    x_word = Bidirectional(GRU(units, return_sequences = True))(x_word)
#     x_word = Conv1D(cnn_filters, kernel_size = 3, activation = "relu")(x_word)
#     x_word = GlobalMaxPooling1D()(x_word)
    
    # Summarise the sequence by k-max pooling and average pooling, then concatenate both.
    k_max = Lambda(_top_k)(x_word)
    avg_pool = GlobalAveragePooling1D()(x_word)
    x_word = concatenate([k_max, avg_pool])
    
    x_cat = Dense(dense, activation = "relu")(x_cat)
    x_num = Dense(dense, activation = "relu")(num_input)
    
    # Merge the three branches and map them to a single sigmoid output.
    out_put = concatenate([x_cat, x_num, x_word])
#     out_put = Dropout(dr)(Dense(dense//2, activation = "relu")(out_put))
    out_put = Dense(1, activation = "sigmoid")(out_put)
    model = Model(inputs = [cat_input, num_input, word_input], outputs = out_put)
    model.compile(loss = "binary_crossentropy", optimizer = Adam(lr = 1e-3), metrics = ["accuracy"])
    return model

In [None]:
from keras.callbacks import ModelCheckpoint, EarlyStopping
model = build_model()
file_path = "best_model.hdf5"
# Keep only the weights of the epoch with the lowest validation loss.
check_point = ModelCheckpoint(file_path, monitor = "val_loss", mode = "min",
                              save_best_only = True, verbose = 1)
# Stop once the validation loss has not improved for 3 consecutive epochs.
early_stop = EarlyStopping(monitor = "val_loss", mode = "min", patience = 3, verbose = 1)

# The `epochs` argument had no value, which made the cell a syntax error.
# This is an upper bound only: early stopping ends training sooner when the
# validation loss stalls. TODO(review): confirm the intended epoch budget.
max_epochs = 10

model.fit([X_train_cat, X_train_num, X_train_words], Y_train, epochs = max_epochs, batch_size = 256,
          validation_data = ([X_valid_cat, X_valid_num, X_valid_words], Y_valid),
          verbose = 1, callbacks = [check_point, early_stop])

# Reload the best checkpoint; the Lambda layer's function refers to `tf`, so it
# is passed through custom_objects.
model = load_model(file_path, custom_objects = {"tf": tf})
prediction = model.predict([X_test_cat, X_test_num, X_test_words], batch_size = 2048, verbose = 1)

In [None]:
# Attach the model's predictions to the test ids and write the submission file
# without the index column.
submission["project_is_approved"] = prediction
submission.to_csv("submission.csv", index = False)

In [None]:
# Display the first rows of the submission as a quick sanity check.
submission.head()