In [1]:
import pandas as pd
import numpy as np

# Combined frame used for all feature engineering in this notebook.
df = pd.read_csv('Data/Ultimate.csv')
# The raw competition files are read the same way: submission timestamp parsed as datetime.
train, test = (
    pd.read_csv(csv_path, parse_dates=["project_submitted_datetime"])
    for csv_path in ("Data/train.csv", "Data/test.csv")
)

# Kept for reference: the call that would write the combined frame back to disk.
#df.to_csv('Data/Ultimate.csv',index = False)

# Keyword arguments shared by every TfidfVectorizer built below.
# ngram_range is intentionally not part of this dict: each vectorizer passes its own
# ngram_range next to **tfidf_para, and a duplicate keyword would raise a TypeError.
tfidf_para = dict(
    sublinear_tf=True,
    strip_accents='unicode',
    stop_words="english",
    analyzer='word',
    token_pattern=r'\w{1,}',
    dtype=np.float32,
    norm='l2',
    min_df=5,
    max_df=.9,
    smooth_idf=False,
)

# Thanks To
# https://www.kaggle.com/lopuhin/eli5-for-mercari
# https://www.kaggle.com/jagangupta/understanding-approval-donorschoose-eda-fe-eli5/notebook
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
def get_col(col_name):
    """Return a one-argument callable that looks up ``col_name`` in whatever it is given.

    The returned function evaluates ``record[col_name]``, so it works with any object
    that supports indexing by that key (for example a dict describing one row).
    """
    def pick_field(record):
        return record[col_name]

    return pick_field

# Second copy of the title column so the same text can feed two vectorizers
# (a TF-IDF one and a raw-count one) under different record keys.
df["project_title_count"] = df["project_title"].copy()
textcols = [
    "text",
    "project_resource_summary",
    "project_title",
    "project_title_count",
    "resource_description",
]


def _tfidf_on(column, vocab_cap):
    """TF-IDF vectorizer over uni/bi-grams of one record field, capped at ``vocab_cap`` terms.

    The preprocessor pulls ``column`` out of each record, so the vectorizer can be fed
    whole row dicts instead of a single text series.
    """
    return TfidfVectorizer(
        ngram_range=(1, 2),
        max_features=vocab_cap,
        preprocessor=get_col(column),
        **tfidf_para)


# The order of the entries fixes the column order of the combined feature matrix.
vectorizer = FeatureUnion([
    ('text', _tfidf_on('text', 20000)),
    ('project_resource_summary', _tfidf_on('project_resource_summary', 2000)),
    ('project_title', _tfidf_on('project_title', 1500)),
    ('project_title_count', CountVectorizer(
        ngram_range=(1, 2),
        max_features=1500,
        preprocessor=get_col('project_title_count'))),
    ('resource_description', _tfidf_on('resource_description', 2400)),
#         ('Non_text',DictVectorizer())
])

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import time
# The vectorizers expect plain strings, so cast every text column up front.
for c in textcols:
    df[c] = df[c].astype(str)
start_vect=time.time()
# Each row is passed as a dict; the per-vectorizer preprocessors select their own field.
ready_df = vectorizer.fit_transform(df.to_dict('records'))
# get_feature_names() was removed from scikit-learn in 1.2; use its replacement when the
# installed version provides it and fall back to the old method otherwise.
if hasattr(vectorizer, "get_feature_names_out"):
    tfvocab = list(vectorizer.get_feature_names_out())
else:
    tfvocab = vectorizer.get_feature_names()
print("Vectorization Runtime: %0.2f Minutes"%((time.time() - start_vect)/60))

from sklearn.preprocessing import LabelEncoder
import gc
print('Label Encoder...')
# Categorical columns that are replaced in place by integer codes.
cols = [
    'teacher_id',
    'teacher_prefix',
    'school_state',
    'project_grade_category',
    'project_subject_categories',
    'project_subject_subcategories',
]
from tqdm import tqdm
for c in tqdm(cols):
    # Values are cast to str first so missing entries get a code of their own.
    df[c] = LabelEncoder().fit_transform(df[c].astype(str))
    # Sanity check: prints whether any NaN is left in the encoded column.
    print(df[c].isna().any())
gc.collect()

# Rows without a label form the submission set; all other rows are training data.
# Both the dense columns and the rows of ready_df are selected with the same positional
# indices taken in df row order. The previous version ordered training rows by iterating
# a set and cut ready_df at a hard-coded row count, which only lines up if every
# training row precedes every test row and the set happens to iterate in sorted order.
is_test = df.project_is_approved.isnull().values
test_rows = np.flatnonzero(is_test)
train_rows = np.flatnonzero(~is_test)

# Label-based index lists, kept because later cells select with df.loc[traindex, ...].
traindex = df.index[train_rows].tolist()
testdex = df.index[test_rows].tolist()
to_drop = ['id','project_essay_1','project_essay_2','project_essay_3','project_essay_4',
           'project_resource_summary','project_submitted_datetime','project_title','resource_description',
          'text','project_is_approved',"project_title_count"]

final_test = df.iloc[test_rows].drop(columns=to_drop)
x = df.iloc[train_rows].drop(columns=to_drop)
y = df.iloc[train_rows]['project_is_approved']

from scipy.sparse import hstack, csr_matrix
# CSR supports row selection by integer array; tocsr() is a no-op if already CSR.
text_features = ready_df.tocsr()
x = hstack([csr_matrix(x.values), text_features[train_rows]]).tocsr()
final_test = hstack([csr_matrix(final_test.values), text_features[test_rows]]).tocsr()

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

  interactivity=interactivity, compiler=compiler, result=result)
  0%|                                                                                            | 0/6 [00:00<?, ?it/s]

Vectorization Runtime: 12.27 Minutes
Label Encoder...
False


 17%|██████████████                                                                      | 1/6 [00:03<00:15,  3.08s/it]

False


 33%|████████████████████████████                                                        | 2/6 [00:03<00:07,  1.83s/it]

False


 50%|██████████████████████████████████████████                                          | 3/6 [00:04<00:04,  1.43s/it]

False


 67%|████████████████████████████████████████████████████████                            | 4/6 [00:04<00:02,  1.22s/it]

False


 83%|██████████████████████████████████████████████████████████████████████              | 5/6 [00:05<00:01,  1.11s/it]

False


100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:06<00:00,  1.04s/it]


In [21]:
# Rebuild the dense (DataFrame) versions of the splits; this overwrites any x / final_test
# already in the namespace. traindex and to_drop must exist from an earlier cell.
final_test = df[df.project_is_approved.isnull()].drop(columns=to_drop)
train_rows = df.loc[traindex, :]
x = train_rows.drop(columns=to_drop)
y = train_rows['project_is_approved']
del train_rows
#x = x.fillna(0)
#final_test = final_test.fillna(0)

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [20]:
from keras.layers import Input, Dense, Flatten, concatenate, Dropout, Embedding, SpatialDropout1D
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from keras.models import Model
from keras import optimizers
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

def breakInput(X1, widths=None):
    """Split a 2-D feature matrix into consecutive column blocks.

    Parameters
    ----------
    X1 : any object supporting ``X1[:, a:b]`` slicing (e.g. a NumPy array or CSR matrix).
    widths : iterable of int, optional
        Number of columns in each block, in order. When omitted, the widths are read
        from the notebook globals ``n_es1, n_es2, n_prs, n_rd, n_pt``,
        ``X_cat.shape[1]`` and ``len(numFeatures)``, exactly as before; all of those
        names must then be defined or a NameError is raised.

    Returns
    -------
    list
        One slice of ``X1`` per width, covering columns left to right.
    """
    if widths is None:
        widths = [n_es1, n_es2, n_prs, n_rd, n_pt, X_cat.shape[1], len(numFeatures)]
    X2 = []
    start = 0
    for n in widths:
        X2.append(X1[:, start:start + n])
        start += n
    return X2

def getModel(HLs, Drop=0.25, OP=None):
    """Build and compile the multi-input binary classifier.

    The model has one input per text block (widths taken from the globals
    ``n_es1, n_es2, n_prs, n_rd, n_pt``), one categorical input of width
    ``X_cat.shape[1]`` and one numeric input of width ``len(numFeatures)``; all of
    these globals must be defined before calling.

    Parameters
    ----------
    HLs : iterable of int
        Sizes of the ReLU hidden layers applied after the three branches are merged.
    Drop : float
        Dropout rate used throughout the network.
    OP : keras optimizer, optional
        Optimizer passed to ``compile``. When omitted, a new ``optimizers.Adam()`` is
        created for this model.

    Returns
    -------
    keras Model compiled with binary cross-entropy loss and a binary-accuracy metric.
    """
    # A default written as `OP=optimizers.Adam()` is evaluated once, when the function is
    # defined, so every model built without an explicit optimizer would share that
    # single instance. Create a fresh optimizer per model instead.
    if OP is None:
        OP = optimizers.Adam()

    temp = []
    inputs_txt = []
    # Text branch: each block is reduced by a linear layer to 1/100 of its width.
    for n in [n_es1, n_es2, n_prs, n_rd, n_pt]:
        input_txt = Input((n, ))
        X_feat = Dropout(Drop)(input_txt)
        X_feat = Dense(int(n/100), activation="linear")(X_feat)
        X_feat = Dropout(Drop)(X_feat)
        temp.append(X_feat)
        inputs_txt.append(input_txt)

    x_1 = concatenate(temp)
    x_1 = Dense(20, activation="relu")(x_1)
    x_1 = Dropout(Drop)(x_1)

    # Categorical branch: every input column is looked up in a 2-row embedding table,
    # so values are presumably 0/1 indicators -- TODO confirm against X_cat.
    input_cat = Input((X_cat.shape[1], ))
#     x_2 = Dropout(Drop)(input_cat)
    x_2 = Embedding(2, 10, input_length=X_cat.shape[1])(input_cat)
    x_2 = SpatialDropout1D(Drop)(x_2)
    x_2 = Flatten()(x_2)

    # Numeric branch: dropout only, no learned projection.
    input_num = Input((len(numFeatures), ))
    x_3 = Dropout(Drop)(input_num)

    x = concatenate([x_1, x_2, x_3])

    for HL in HLs:
        x = Dense(HL, activation="relu")(x)
        x = Dropout(Drop)(x)

    output = Dense(1, activation="sigmoid")(x)

    # Input order must match the block order produced by breakInput.
    model = Model(inputs=inputs_txt+[input_cat, input_num], outputs=output)
    model.compile(
            optimizer=OP,
            loss='binary_crossentropy',
            metrics=['binary_accuracy'])
    return model

def trainNN(X_train, X_val, Tar_train, Tar_val, HL=[50], Drop=0.5, OP=None):
    """Train the network from ``getModel`` and return it with its best weights loaded.

    Parameters
    ----------
    X_train, X_val : feature matrices accepted by ``breakInput``.
    Tar_train, Tar_val : targets for the training and validation matrices.
    HL : list of int
        Hidden-layer sizes forwarded to ``getModel`` (only read, never modified here).
    Drop : float
        Dropout rate forwarded to ``getModel``.
    OP : keras optimizer, optional
        Optimizer to compile with. When omitted, a new ``optimizers.Adam()`` is created
        for this run.

    Returns
    -------
    The fitted keras Model, reloaded from the checkpoint with the lowest ``val_loss``.

    Side effects: writes the best weights to ``NN.h5`` in the working directory.
    """
    # A default of `OP=optimizers.Adam()` would be built once at definition time and
    # reused by every call; build a fresh optimizer for each training run instead.
    if OP is None:
        OP = optimizers.Adam()

    file_path='NN.h5'
    # Keep only the weights of the epoch with the lowest validation loss.
    checkpoint = ModelCheckpoint(file_path, monitor='val_loss', verbose=2, save_best_only=True, save_weights_only=True, mode='min')
    early = EarlyStopping(monitor="val_loss", mode="min", patience=6)
    # NOTE(review): newer Keras releases name this threshold `min_delta` instead of
    # `epsilon` -- confirm against the installed version.
    lr_reduced = ReduceLROnPlateau(monitor='val_loss',
                                   factor=0.5,
                                   patience=2,
                                   verbose=1,
                                   epsilon=3e-4,
                                   mode='min')

    model = getModel(HL, Drop, OP)
    model.fit(breakInput(X_train), Tar_train, validation_data=(breakInput(X_val), Tar_val),
                        verbose=2, epochs=50, batch_size=1000, callbacks=[early, lr_reduced, checkpoint])
    # The model in memory holds the last epoch's weights; restore the checkpointed best.
    model.load_weights(file_path)
    return model

Using TensorFlow backend.


In [27]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
def getCatFeatures(T, Col, Encoder='OneHot'):
    """Encode one categorical column of ``T``.

    Missing values are replaced by ``''`` and the column is label-encoded to integer
    codes. With ``Encoder == 'OneHot'`` those codes are then one-hot encoded and the
    OneHotEncoder output is returned; for any other value the integer codes themselves
    are returned.
    """
    codes = LabelEncoder().fit_transform(T[Col].fillna(''))
    if Encoder != 'OneHot':
        return codes
    # OneHotEncoder expects a 2-D input: one column holding the integer codes.
    return OneHotEncoder().fit_transform(codes.reshape((-1, 1)))

# Encode each categorical column separately, then join them column-wise into X_cat.
Encoder = 'OneHot'
X_tp = getCatFeatures(df, 'teacher_prefix', Encoder)
X_sc = getCatFeatures(df, 'school_state', Encoder)
X_pgc = getCatFeatures(df, 'project_grade_category', Encoder)
X_psc = getCatFeatures(df, 'project_subject_categories', Encoder)
X_pssc = getCatFeatures(df, 'project_subject_subcategories', Encoder)


if Encoder=='OneHot':
    # One-hot outputs are sparse, so they are stacked with scipy's sparse hstack.
    X_cat = hstack((X_tp, X_sc, X_pgc, X_psc, X_pssc))
else:
    # Label-encoded outputs are 1-D code arrays: stack them and transpose so that each
    # column of X_cat is one feature. The alias `pl` used here before is not imported
    # anywhere in this notebook, so NumPy (imported as np) is used directly.
    X_cat = np.array((X_tp, X_sc, X_pgc, X_psc, X_pssc)).T

# Free the per-column intermediates; only the combined matrix is needed from here on.
del X_tp, X_sc, X_pgc, X_psc, X_pssc

In [30]:
# Show the repr of the combined categorical matrix (shape, dtype and sparse format).
X_cat

<260115x528 sparse matrix of type '<class 'numpy.float64'>'
	with 1300575 stored elements in COOrdinate format>

In [28]:
# Column widths of the five text blocks, read as globals by breakInput and getModel.
# NOTE(review): presumably these must add up to the text columns actually present in
# the matrices passed below -- confirm against the vectorizer's max_features settings.
n_es1, n_es2, n_prs, n_rd, n_pt = 3000, 8000, 2000, 3000, 1000
# NOTE(review): this call fails in the recorded run: breakInput and getModel also read a
# global `numFeatures`, which no visible cell defines (see the NameError output below).
model = trainNN(x_train, x_test, y_train, y_test, HL=[50], Drop=0.5, OP=optimizers.Adam())
# NOTE(review): X_val and Xts are not assigned in any visible cell -- presumably the
# validation and submission matrices (x_test / final_test?); confirm before running.
Yvl3 = model.predict(breakInput(X_val)).squeeze()
Yts3 = model.predict(breakInput(Xts)).squeeze()

NameError: name 'numFeatures' is not defined