# <center> Predicting Donors Choose</center>

--------

# Introduction

## Imports

https://lightgbm.readthedocs.io/en/latest/Installation-Guide.html

In [200]:
import gc
import numpy as np
import pandas as pd
import os

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold, RepeatedKFold
from sklearn.preprocessing import LabelEncoder

from tqdm import tqdm
import tqdm

In [201]:
import lightgbm as lgb

In [202]:
from sklearn.preprocessing import StandardScaler

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import re

In [203]:
desk = True

In [204]:
train = pd.read_csv('./Input/train.csv', low_memory=False, index_col='id')
if desk: test = pd.read_csv('./Input/test.csv', low_memory=False, index_col='id')

res = pd.read_csv('./Input/resources.csv', low_memory=False, index_col='id')

In [205]:
train = train[:500]
test = test[:500]

## Preprocessing

### 1.1 Resource Intergration
Here we evaluate how much each project/proposal will cost and/or how big they are

In [206]:
res['cost'] = res['quantity'] * res['price']
res_agg = res.groupby('id').agg({'description': ['nunique'], 'quantity': ['sum'], 'cost': ['mean', 'sum']})
res_agg.columns = ['unique_items', 'total_quantity', 'mean_cost', 'total_cost']
res_agg.reset_index(inplace=True)

#description was dropped because the description of the project should not have an effect on its liklihood of success

train = train.merge(res_agg, left_index=True, right_on='id')
if desk: test =  test.merge(res_agg, left_index=True, right_on='id')

del res_agg
del res

### 1.3 Preprocessing of features

#### Cat Preprocessing
- Improve states
    - States are 51 because of 50 + DC

In [207]:
train.isnull().sum()

teacher_id                                        0
teacher_prefix                                    0
school_state                                      0
project_submitted_datetime                        0
project_grade_category                            0
project_subject_categories                        0
project_subject_subcategories                     0
project_title                                     0
project_essay_1                                   0
project_essay_2                                   0
project_essay_3                                 482
project_essay_4                                 482
project_resource_summary                          0
teacher_number_of_previously_posted_projects      0
project_is_approved                               0
id                                                0
unique_items                                      0
total_quantity                                    0
mean_cost                                         0
total_cost  

In [208]:
train.teacher_prefix[train.teacher_prefix.isnull()] = 'Teacher'
if desk: test.teacher_prefix[test.teacher_prefix.isnull()] = 'Teacher'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


fill na did not work

In [209]:
def date_prep(train):
    train['project_submitted_datetime'] = pd.to_datetime(train['project_submitted_datetime'])
    train['datetime_year'] = train['project_submitted_datetime'].dt.year
    train['datetime_month'] = train['project_submitted_datetime'].dt.month

    del train['project_submitted_datetime']
    del train['project_subject_subcategories']
    return(train)

date_prep(train)
if desk: date_prep(test)

In [210]:
def gender_features(train):
    conditions = [(train.teacher_prefix == 'Mr.'), 
                  (train.teacher_prefix == 'Mrs.') | (train.teacher_prefix == 'Ms.')]
    choices = ['Male', 'Female']
    train['gender'] = np.select(conditions, choices, default='Unk')
    return(train)

In [211]:
train = gender_features(train)
if desk: test = gender_features(test)
del gender_features

In [212]:
#train.groupby(['datetime_month','gender'])['project_is_approved'].mean().reset_index()

In [213]:
#gen_mon.to_csv('gb_gender_month.csv')

In [214]:
#sns.lmplot(x="datetime_month", y="project_is_approved", hue="gender", data=gen_mon[gen_mon['gender']!='Unk'], order=4, ci=None, truncate=True)

In [215]:
gc.collect()

31

##### Encoding labels

In [216]:
# Because of memory issues, it made more sense to encode everything as a string rather than dummies
cols  = [
    'teacher_id', 
    'gender',
#    'datetime_year', already encoded
#    'datetime_month', already encoded
    'teacher_prefix', 
    'school_state', 
    'project_grade_category',
    'project_subject_categories']

for c in tqdm.tqdm_notebook(cols):
    encod = LabelEncoder()
    encod.fit(train[c].astype(str))
    train[c] = encod.transform(train[c].astype(str))

if desk: 
    for c in tqdm.tqdm_notebook(cols):
        encod = LabelEncoder()
        encod.fit(test[c].astype(str))
        test[c] = encod.transform(test[c].astype(str))

A Jupyter Widget




A Jupyter Widget




In [217]:
del cols
del encod
del LabelEncoder
gc.collect()

0

#### Num Preprocessing

In [218]:
num_features  = ['teacher_number_of_previously_posted_projects','total_quantity', 'mean_cost', 'total_cost',]

In [219]:
SS = StandardScaler()
train[num_features] = SS.fit_transform(train[num_features])
if desk: test[num_features] = SS.fit_transform(test[num_features])

In [220]:
del num_features
del StandardScaler
del SS

In [221]:
gc.collect()

14

#### Text Preprocessing

  
#### Before May 17th, 2016:

- project_essay_1: "Introduce us to your classroom"
- project_essay_2: "Tell us more about your students"
- project_essay_3: "Describe how your students will use the materials you're requesting"
- project_essay_4: "Close by sharing why your project will make a difference"

#### May 17th, 2016 and beyond:

- project_essay_1: "Describe your students: What makes your students special? Specific details about their background, your neighborhood, and your school are all helpful."
- project_essay_2: "About your project: How will these materials make a difference in your students' learning and improve their school lives?"

#### Plan
- Combine essay_1 and essay_2 before May 17th to make "student_description" and use essay_1 after May 17th directly
- Combine essay_3 and essay_4 before May 17th to make "project_description" and use essay_2 after May 17th directly

In [222]:
def essay_convert(train):
    # Making the First essay : student_description
    train['student_description']=train['project_essay_1']
    train.loc[train.project_essay_3.notnull(),'student_description']=train.loc[train.project_essay_3.notnull(),'project_essay_1']+train.loc[train.project_essay_3.notnull(),'project_essay_2']

    # Making the second essay : project_description
    train['project_description']=train['project_essay_2']
    train.loc[train.project_essay_3.notnull(),'project_description']=train.loc[train.project_essay_3.notnull(),'project_essay_3']+train.loc[train.project_essay_3.notnull(),'project_essay_4']

    # Removing original essays
    del train['project_essay_1']
    del train['project_essay_2']
    del train['project_essay_3']
    del train['project_essay_4']
    return(train)

In [223]:
essay_convert(train)
if desk: essay_convert(test)
gc.collect()
del essay_convert

### Lem & Tokenizer

In [224]:
text_features = ['project_title', 'project_resource_summary',
                'project_description', 'student_description']

In [225]:
other_stopwords = [x for x in 'abcdefghijklmnopqrstuvwxyz']
for j in ['student','students','education',]:
    other_stopwords.append(j)

In [226]:
#import gensim
from nltk.corpus import stopwords
def scrub(text):
    text = text.strip().lower()
    text = re.sub('\W+',' ', text)
    text = re.sub(r'(\")', ' ', text)
    text = re.sub(r'(\r)', ' ', text)
    text = re.sub(r'(\n)', ' ', text)
    text = re.sub(r'(\r\n)', ' ', text)
    text = re.sub(r'(\\)', ' ', text)
    text = re.sub(r'\t', ' ', text)
    text = re.sub(r'\:', ' ', text)
    text = re.sub(r'\"\"\"\"', ' ', text)
    text = re.sub(r'_', ' ', text)
    text = re.sub(r'\+', ' ', text)
    text = re.sub(r'\=', ' ', text)
    text = re.sub(' i m ',' i\'m ', text)
    text = re.sub('n t ','n\'t ', text)
    text = re.sub(' re ',' are ', text)
    text = ' '.join([word for word in text.split() if word not in (other_stopwords + stopwords.words("english"))])
    return(text)

In [227]:
gc.collect()

0

In [228]:
for j in tqdm.tqdm_notebook(text_features):
    n_col = 'processed_'+j
    train[n_col] = train[j].apply(lambda x: scrub(x))
    if desk: test[n_col] = test[j].apply(lambda x: scrub(x))

gc.collect()
for i in text_features:
    del train[i]
    if desk: del test[i]

del stopwords
del other_stopwords
del text_features

A Jupyter Widget




## Modeling

### Non_NN

#### Tfidf & X,y Assignment

In [229]:
gc.collect()

56

In [230]:
cols = [
    'processed_project_title',
    'processed_project_resource_summary', 
    'processed_project_description',
    'processed_student_description']

In [231]:
for i in cols:
    print("Average length in {} is {} words".format(i,str(round(train[i].str.len().mean()))))

Average length in processed_project_title is 24 words
Average length in processed_project_resource_summary is 80 words
Average length in processed_project_description is 513 words
Average length in processed_student_description is 402 words


In [232]:
n_features = [
    200, 
    400, 
    2500,
    2500]

In [233]:
gc.collect()

122

In [234]:
for c_i, c in tqdm.tqdm_notebook(enumerate(cols)):
    tfidf = TfidfVectorizer(
        max_features=n_features[c_i])

    tfidf.fit(train[c])

#    tfidf_train2 = tfidf.transform(train[c])

    tfidf_train = np.array(tfidf.transform(train[c].values).toarray(), dtype=np.float16)
    for i in range(n_features[c_i]):
        train[c + '_tfidf_' + str(i)] = tfidf_train[:, i]
    if desk:
        tfidf_test = np.array(tfidf.transform(test[c].values).toarray(), dtype=np.float16)
        for i in range(n_features[c_i]):
            test[c + '_tfidf_' + str(i)] = tfidf_test[:, i]

for i in cols:
    del train[i]
    if desk: del test[i]

A Jupyter Widget




In [258]:
train = train.reset_index()
del train['index']

In [251]:
drop_cols = ['project_is_approved','unique_items','id','teacher_id']

X = train.drop(drop_cols, axis=1)
y = train['project_is_approved']

#del train

In [261]:
X_test = test.drop(drop_cols, axis=1, errors='ignore')
id_test = test['id'].values
feature_names = list(X.columns)

#### Model Generation

#### Grid search over Gradient Boosting

In [239]:
#from sklearn.ensemble import GradientBoostingClassifier
#from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold

In [240]:
#gb_tuned_parameters = {
#                        "n_estimators": [x for x in range(100, 400, 2)],
#                        'min_samples_split' : [3],
#                        'max_depth': [20]}

In [241]:
#rs = RandomizedSearchCV(GradientBoostingClassifier(),
#                        gb_tuned_parameters,
#                        n_iter=30, 
#                        scoring='roc_auc', 
#                        cv=StratifiedKFold(), 
#                        verbose=10,)

In [242]:
#rs.fit(X, y)

Fitting 3 folds for each of 30 candidates, totalling 90 fits
[CV] n_estimators=386, min_samples_split=3, max_depth=20 .............
[CV]  n_estimators=386, min_samples_split=3, max_depth=20, score=0.49051094890510955, total=   7.9s
[CV] n_estimators=386, min_samples_split=3, max_depth=20 .............


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    7.9s remaining:    0.0s


[CV]  n_estimators=386, min_samples_split=3, max_depth=20, score=0.6020681265206813, total=   9.9s
[CV] n_estimators=386, min_samples_split=3, max_depth=20 .............


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   17.8s remaining:    0.0s


KeyboardInterrupt: 

#### Grid Search over LGBM

In [243]:
model = lgb.LGBMClassifier( 
    boosting_type="gbdt",
    is_unbalance=True, 
    random_state=10,
    bagging_freq=5, 
    learning_rate=0.025,
    min_child_samples=3,
    verbose=1)

In [244]:
params_opt = {'n_estimators':  range(50, 100, 5),
             'max_depth': range(15, 30),
             'feature_fraction': [x / 1000.0 for x in range(825,900,25)],
             'bagging_fraction': [x / 1000.0 for x in range(825,900,25)],
             'num_leaves':range(20,50,5)}

In [246]:
rs = RandomizedSearchCV(
    model, 
    params_opt, 
    n_iter=10,
    scoring='roc_auc',
    verbose=1,
    cv=3)
rs.fit(X,y)
rs.best_score_

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   21.5s finished


0.59484470173672288

### Best LGBM

In [263]:
# Build the model
cnt = 0
n_splits = 5
n_repeats = 1
kf = RepeatedKFold(
    n_splits=n_splits, 
    n_repeats=n_repeats, 
    random_state=42)

for train_index, valid_index in kf.split(X):
    print('Fold {}/{}'.format(cnt + 1, n_splits))
    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'max_depth': 20,
        'num_leaves': 31,
        'learning_rate': 0.025,
        'feature_fraction': 0.85,
        'bagging_fraction': 0.85,
        'bagging_freq': 5,
        'verbose': 0,
        'num_threads': 1,
        'lambda_l2': 1.0,
        'min_gain_to_split': 3,
    }  

    lgb_train = lgb.Dataset(
        X.loc[train_index], 
        y.loc[train_index], 
        feature_name=feature_names)

    lgb_valid = lgb.Dataset(
        X.loc[valid_index], 
        y.loc[valid_index])

    model = lgb.train(
        params,
        lgb_train,
        num_boost_round=10000,
        valid_sets=[lgb_train, lgb_valid],
        early_stopping_rounds=100,
        verbose_eval=10,)

    cnt = cnt+1

Fold 1/5
Training until validation scores don't improve for 100 rounds.
[10]	training's auc: 0.767922	valid_1's auc: 0.605263
[20]	training's auc: 0.795065	valid_1's auc: 0.595192
[30]	training's auc: 0.805931	valid_1's auc: 0.596816
[40]	training's auc: 0.819221	valid_1's auc: 0.611761
[50]	training's auc: 0.850108	valid_1's auc: 0.658869
[60]	training's auc: 0.864264	valid_1's auc: 0.646849
[70]	training's auc: 0.874156	valid_1's auc: 0.674464
[80]	training's auc: 0.893074	valid_1's auc: 0.658869
[90]	training's auc: 0.902511	valid_1's auc: 0.677713
[100]	training's auc: 0.918788	valid_1's auc: 0.688759
[110]	training's auc: 0.928268	valid_1's auc: 0.687459
[120]	training's auc: 0.936017	valid_1's auc: 0.688759
[130]	training's auc: 0.943117	valid_1's auc: 0.701105
[140]	training's auc: 0.945368	valid_1's auc: 0.712151
[150]	training's auc: 0.952814	valid_1's auc: 0.717349
[160]	training's auc: 0.959177	valid_1's auc: 0.717349
[170]	training's auc: 0.964459	valid_1's auc: 0.720598
[1

In [264]:
p = model.predict(X_test, num_iteration=model.best_iteration)

In [270]:
submit = pd.DataFrame({'id':id_test,'pred':p})