# <center> Predicting Donors Choose</center>

--------

# Introduction

## Imports

https://lightgbm.readthedocs.io/en/latest/Installation-Guide.html

In [133]:
import gc
import numpy as np
import pandas as pd
import os

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold, RepeatedKFold
from sklearn.preprocessing import LabelEncoder

from tqdm import tqdm
import tqdm

In [134]:
import lightgbm as lgb

In [135]:
from sklearn.preprocessing import StandardScaler

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import re

In [136]:
#I would rather not run the while dataset on my laptop
# so kaggle determines if I am also evaluating the kaggle test set
kaggle = False

#sim refers to if data simulating variable situations is being generated & tested on. 
sim = True

#Kaggle & Sim use 'test' so dont use both

In [137]:
train = pd.read_csv('./Input/train.csv', low_memory=False, index_col='id')
if kaggle: 
    if sim == False:
        test = pd.read_csv('./Input/test.csv', low_memory=False, index_col='id')

res = pd.read_csv('./Input/resources.csv', low_memory=False, index_col='id')
train['project_submitted_datetime'] = pd.to_datetime(train['project_submitted_datetime'])

In [138]:
def make_sim(n='p039565'):
    """
    Makes a semi-brute forced dataset based off the entry n, with variable:
    Months (jan-dec), 
    Pronouns (not teacher or dr), 
    Previous Entries(0:30),
    to see what combination may have a higher success rate
    """
    test = pd.DataFrame(train.loc[n]).transpose()
    del test['project_is_approved']
    
    change_dict = {"prefix":['Mrs.','Ms.','Mr.'],
        "date":['2017-01-26','2017-02-26','2017-03-26','2017-04-26','2017-05-26','2017-06-26',
        '2017-07-26','2017-08-26','2017-09-26','2017-10-26','2017-11-26','2017-12-26',],
        "prev" : [x for x in range(0,10)]}

    for i in range(100):
        test.loc[n+str(i)] = test.loc[n].transpose()
        test.set_value(n+str(i),'teacher_prefix',np.random.choice(change_dict['prefix']))
        test.set_value(n+str(i),'project_submitted_datetime',np.random.choice(change_dict['date']))
        test.set_value(n+str(i),'teacher_number_of_previously_posted_projects',np.random.choice(change_dict['prev']))
        
    test.index.rename = 'id'
    return(test)

In [139]:
#making a dataset which has variable situations that 'p039654' may be under
if sim:
    test = make_sim('p039565')
    train = train.drop('p039565',axis=0)
    del make_sim



## Preprocessing

### 1.1 Resource Intergration
Here we evaluate how much each project/proposal will cost and/or how big they are

In [140]:
res['cost'] = res['quantity'] * res['price']
res_agg = res.groupby('id').agg({'description': ['nunique'], 'quantity': ['sum'], 'cost': ['mean', 'sum']})
res_agg.columns = ['unique_items', 'total_quantity', 'mean_cost', 'total_cost']
res_agg.reset_index(inplace=True)

#description was dropped because the description of the project should not have an effect on its liklihood of success

In [141]:
train = train.merge(res_agg, left_index=True, right_on='id')
if kaggle: 
    if sim : 
        for i in res_agg.columns:
            if i != 'id':
                test[i] = res_agg[res_agg['id']=='p039565'].drop('id',axis=1)[i].values[0]
    else:
        test =  test.merge(res_agg, left_index=True, right_on='id')

del res_agg
del res

### 1.2 Preprocessing of features

#### EDA

In [None]:
#gen_mon = train.groupby(['datetime_month','gender'])['project_is_approved'].mean().reset_index()
#prev_mon = train.groupby('teacher_number_of_previously_posted_projects')['project_is_approved'].mean().reset_index()

#bins = pd.cut(train['teacher_number_of_previously_posted_projects'], [0, 25, 50, 75, 100, 125, 150])
#pros = train.groupby(bins)['project_is_approved'].agg('mean')

In [148]:
#sns.barplot(pros.index,pros)
#plt.title('Frequent Applicants')
#plt.xlabel("Teacher's Previous Postings")
#plt.ylabel("Mean Approvals")

In [149]:
#train.project_is_approved.value_counts()

In [150]:
#sns.lmplot(x="datetime_month", y="project_is_approved", hue="gender", 
#           data=gen_mon[gen_mon['gender']!='Unk'], 
#           order=3,ci=80)
#plt.title('Approval rate, by month, by gender (ci:90)')

In [153]:
#del gen_mon
#del prev_mon

In [154]:
gc.collect()

1440

#### Cat Preprocessing

There are 51 States, because of 50 + DC

In [155]:
train.isnull().sum()[train.isnull().sum()>0]

teacher_prefix          4
project_essay_3    175705
project_essay_4    175705
dtype: int64

In [156]:
train.teacher_prefix[train.teacher_prefix.isnull()] = 'Teacher'
try: 
    test.teacher_prefix[test.teacher_prefix.isnull()] = 'Teacher'
except:
    pass

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


fill na did not work

In [157]:
def date_prep(train):
    train['datetime_year'] = train['project_submitted_datetime'].dt.year
    train['datetime_month'] = train['project_submitted_datetime'].dt.month

    del train['project_submitted_datetime']
    del train['project_subject_subcategories']
    return(train)

In [158]:
def gender_features(train):
    conditions = [(train.teacher_prefix == 'Mr.'), 
                  (train.teacher_prefix == 'Mrs.') | (train.teacher_prefix == 'Ms.')]
    choices = ['Male', 'Female']
    train['gender'] = np.select(conditions, choices, default='Unk')
    return(train)

In [159]:
date_prep(train)
if kaggle: date_prep(test)

train = gender_features(train)
if kaggle: test = gender_features(test)

del gender_features
del date_prep

In [160]:
gc.collect()

338

##### Encoding labels

In [161]:
# Because of memory issues, it made more sense to encode everything as a string rather than dummies
cols  = [
    'teacher_id', 
    'gender',
    'teacher_prefix', 
    'school_state', 
    'project_grade_category',
    'project_subject_categories']

for c in tqdm.tqdm_notebook(cols):
    encod = LabelEncoder()
    encod.fit(train[c].astype(str))
    train[c] = encod.transform(train[c].astype(str))

if kaggle: 
    for c in tqdm.tqdm_notebook(cols):
        encod = LabelEncoder()
        encod.fit(test[c].astype(str))
        test[c] = encod.transform(test[c].astype(str))




In [162]:
del cols
del encod
del LabelEncoder
gc.collect()

0

#### Num Preprocessing

In [163]:
num_features  = ['teacher_number_of_previously_posted_projects','total_quantity', 'mean_cost', 'total_cost',]

In [164]:
SS = StandardScaler()
train[num_features] = SS.fit_transform(train[num_features])
if kaggle: test[num_features] = SS.fit_transform(test[num_features])

In [165]:
del num_features
del StandardScaler
del SS

In [166]:
gc.collect()

7

#### Text Preprocessing

  
#### Before May 17th, 2016:

- project_essay_1: "Introduce us to your classroom"
- project_essay_2: "Tell us more about your students"
- project_essay_3: "Describe how your students will use the materials you're requesting"
- project_essay_4: "Close by sharing why your project will make a difference"

#### May 17th, 2016 and beyond:

- project_essay_1: "Describe your students: What makes your students special? Specific details about their background, your neighborhood, and your school are all helpful."
- project_essay_2: "About your project: How will these materials make a difference in your students' learning and improve their school lives?"

#### Plan
- Combine essay_1 and essay_2 before May 17th to make "student_description" and use essay_1 after May 17th directly
- Combine essay_3 and essay_4 before May 17th to make "project_description" and use essay_2 after May 17th directly

In [167]:
def essay_convert(train):
    # Making the First essay : student_description
    train['student_description']=train['project_essay_1']
    train.loc[train.project_essay_3.notnull(),'student_description']=train.loc[train.project_essay_3.notnull(),'project_essay_1']+train.loc[train.project_essay_3.notnull(),'project_essay_2']

    # Making the second essay : project_description
    train['project_description']=train['project_essay_2']
    train.loc[train.project_essay_3.notnull(),'project_description']=train.loc[train.project_essay_3.notnull(),'project_essay_3']+train.loc[train.project_essay_3.notnull(),'project_essay_4']

    # Removing original essays
    del train['project_essay_1']
    del train['project_essay_2']
    del train['project_essay_3']
    del train['project_essay_4']
    return(train)

In [168]:
essay_convert(train)
if kaggle: essay_convert(test)
gc.collect()

del essay_convert

#### Lem & Tokenizer

In [None]:
for j in tqdm.tqdm_notebook(text_features):
    n_col = 'processed_'+j
    train[n_col] = train[j].apply(lambda x: scrub(x))
    if kaggle: test[n_col] = test[j].apply(lambda x: scrub(x))

gc.collect()
for i in text_features:
    del train[i]
    if kaggle: del test[i]

del stopwords
del other_stopwords
del text_features

In [None]:
text_features = ['project_title', 'project_resource_summary',
                'project_description', 'student_description']

In [None]:
other_stopwords = [x for x in 'abcdefghijklmnopqrstuvwxyz']
for j in ['student','students','education',]:
    other_stopwords.append(j)

In [None]:
#import gensim
from nltk.corpus import stopwords
def scrub(text):
    text = text.strip().lower()
    text = re.sub('\W+',' ', text)    
    text = re.sub(r'_', ' ', text)
    text = re.sub(r'\t', ' ', text)
    text = re.sub(r'\:', ' ', text)
    text = re.sub(r'\+', ' ', text)
    text = re.sub(r'\=', ' ', text)
    text = re.sub(r'(\")', ' ', text)
    text = re.sub(r'(\r)', ' ', text)
    text = re.sub(r'(\n)', ' ', text)
    text = re.sub(r'(\\)', ' ', text)
    text = re.sub('n t ','n\'t ', text)
    text = re.sub(' re ',' are ', text)
    text = re.sub(r'(\r\n)', ' ', text)
    text = re.sub(r'\"\"\"\"', ' ', text)
    text = re.sub(' i m ',' i\'m ', text)
    text = ' '.join([word for word in text.split() if word not in (other_stopwords + stopwords.words("english"))])
    return(text)

In [None]:
gc.collect()

## Modeling

### Tfidf & X,y Assignment

In [None]:
gc.collect()

In [None]:
cols = [
    'processed_project_title',
    'processed_project_resource_summary', 
    'processed_project_description',
    'processed_student_description']

In [None]:
for i in cols:
    print("Average length in {} is {} words".format(i,str(round(train[i].str.len().mean()))))

In [None]:
n_features = [
    200, 
    400, 
    2500,
    2500]

In [None]:
gc.collect()

In [None]:
for c_i, c in tqdm.tqdm_notebook(enumerate(cols)):
    tfidf = TfidfVectorizer(
#        ngram_range=(1,3)
        max_features=n_features[c_i])

    tfidf.fit(train[c])

#    tfidf_train2 = tfidf.transform(train[c])

    tfidf_train = np.array(tfidf.transform(train[c].values).toarray(), dtype=np.float16)
    for i in range(n_features[c_i]):
        train[c + '_tfidf_' + str(i)] = tfidf_train[:, i]
    if kaggle:
        tfidf_test = np.array(tfidf.transform(test[c].values).toarray(), dtype=np.float16)
        for i in range(n_features[c_i]):
            test[c + '_tfidf_' + str(i)] = tfidf_test[:, i]

for i in cols:
    del train[i]
    if kaggle: del test[i]

In [None]:
train = train.reset_index()
del train['index']

In [None]:
drop_cols = ['project_is_approved','unique_items','id','teacher_id']

X = train.drop(drop_cols, axis=1)
y = train['project_is_approved']

#del train

In [544]:
#if running demo, uncomment this and change test['id'].vaues -> test['index'].values
#test = test.reset_index()

In [545]:
X_test = test.drop(drop_cols, axis=1, errors='ignore')
id_test = test['id'].values
feature_names = list(X.columns)

### Grid Search over LGBM for best params without training

In [None]:
model = lgb.LGBMClassifier( 
    boosting_type="gbdt",
    is_unbalance=True, 
    random_state=10,
    bagging_freq=5, 
    learning_rate=0.025,
    min_child_samples=3,
    verbose=1)

In [None]:
params_opt = {'n_estimators':  range(50, 100, 5),
             'max_depth': range(15, 30),
             'feature_fraction': [x / 1000.0 for x in range(825,900,25)],
             'bagging_fraction': [x / 1000.0 for x in range(825,900,25)],
             'num_leaves':range(20,50,5)}

In [None]:
rs = RandomizedSearchCV(
    model, 
    params_opt, 
    n_iter=10,
    scoring='roc_auc',
    verbose=1,
    cv=3)
rs.fit(X,y)
rs.best_score_

In [None]:
train.head()

### Training Best LGBM

In [576]:
# Build the model
cnt = 0
n_splits = 5
n_repeats = 1
kf = RepeatedKFold(
    n_splits=n_splits, 
    n_repeats=n_repeats, 
    random_state=42)

for train_index, valid_index in kf.split(X):
    print('Fold {}/{}'.format(cnt + 1, n_splits))
    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'max_depth': 20,
        'num_leaves': 31,
        'learning_rate': 0.025,
        'feature_fraction': 0.85,
        'bagging_fraction': 0.85,
        'bagging_freq': 5,
        'verbose': 0,
        'min_gain_to_split': 3,}  

    lgb_train = lgb.Dataset(
        X.loc[train_index], 
        y.loc[train_index], 
        feature_name=feature_names)

    lgb_valid = lgb.Dataset(
        X.loc[valid_index], 
        y.loc[valid_index])

    model = lgb.train(
        params, lgb_train,
        num_boost_round=10000,
        valid_sets=[lgb_train, lgb_valid],
        early_stopping_rounds=100,
        verbose_eval=10,)

    if cnt == 0:
        importance = model.feature_importance()
        model_fnames = model.feature_name()
        tuples = sorted(zip(model_fnames, importance), key=lambda x: x[1])[::-1]
        tuples = [x for x in tuples if x[1] > 0]
        print('Important features:')
        for i in range(60):
            if i < len(tuples):
                print(tuples[i])
            else:
                break
    
    cnt = cnt+1

Fold 1/5
Training until validation scores don't improve for 100 rounds.
[10]	training's auc: 0.849842	valid_1's auc: 0.581566
[20]	training's auc: 0.870429	valid_1's auc: 0.6117
[30]	training's auc: 0.891184	valid_1's auc: 0.598214
[40]	training's auc: 0.909963	valid_1's auc: 0.58324
[50]	training's auc: 0.912539	valid_1's auc: 0.59375
[60]	training's auc: 0.926975	valid_1's auc: 0.600539
[70]	training's auc: 0.939209	valid_1's auc: 0.613653
[80]	training's auc: 0.945251	valid_1's auc: 0.608445
[90]	training's auc: 0.953835	valid_1's auc: 0.600446
[100]	training's auc: 0.966959	valid_1's auc: 0.593564
[110]	training's auc: 0.970933	valid_1's auc: 0.594866
[120]	training's auc: 0.975635	valid_1's auc: 0.595982
[130]	training's auc: 0.979078	valid_1's auc: 0.591704
[140]	training's auc: 0.982636	valid_1's auc: 0.588914
[150]	training's auc: 0.983641	valid_1's auc: 0.590402
[160]	training's auc: 0.986657	valid_1's auc: 0.579427
Early stopping, best iteration is:
[65]	training's auc: 0.935

In [547]:
#p = model.predict(X_test, num_iteration=model.best_iteration)

In [580]:
idf = tfidf.idf_
idf_map = dict(zip(tfidf.get_feature_names(), idf))

In [587]:
tfidf.get_feature_names()[75]

'accelerated'

In [588]:
tfidf.idf_[75]

6.52246041819533

In [581]:
#idf_map

{'00': 6.810142490647111,
 '000': 6.116995310087166,
 '10': 5.13616605707544,
 '100': 3.9769291465908947,
 '10th': 6.116995310087166,
 '11': 5.962844630259907,
 '110': 6.52246041819533,
 '12': 4.864232341591798,
 '125': 6.52246041819533,
 '12th': 6.29931686688112,
 '13': 5.829313237635384,
 '14': 5.829313237635384,
 '15': 6.116995310087166,
 '150': 6.29931686688112,
 '16': 6.29931686688112,
 '17': 6.116995310087166,
 '18': 5.4238481295272205,
 '19': 5.962844630259907,
 '1st': 5.13616605707544,
 '20': 4.91302250576123,
 '200': 6.52246041819533,
 '2016': 5.6061696863211745,
 '2017': 6.29931686688112,
 '21': 5.962844630259907,
 '21st': 4.650658241293739,
 '22': 5.711530201979001,
 '23': 5.6061696863211745,
 '24': 5.13616605707544,
 '25': 5.200704578213011,
 '250': 6.52246041819533,
 '26': 6.52246041819533,
 '28': 6.116995310087166,
 '2nd': 4.864232341591798,
 '30': 5.075541435259004,
 '300': 6.52246041819533,
 '35': 6.52246041819533,
 '3rd': 4.96431580014878,
 '40': 5.269697449699962,
 '4

In [548]:
submit = pd.DataFrame({'id':id_test,'pred':p})

In [567]:
#submit.sort_values('pred',axis=0,)

Unnamed: 0,id,pred
0,p039565,0.711640
28,p03956527,0.711640
37,p03956536,0.711640
26,p03956525,0.711640
44,p03956543,0.711640
49,p03956548,0.711640
52,p03956551,0.711640
22,p03956521,0.711640
53,p03956552,0.711640
55,p03956554,0.711640
