In [None]:
import pandas as pd
import numpy as np

# Combined feature table used for all feature engineering below.
# NOTE(review): presumably written by an earlier step via
# `df.to_csv('Data/Ultimate.csv', index=False)` — confirm where it is produced.
df = pd.read_csv('Data/Ultimate.csv')

# Raw competition files, with the submission timestamp column parsed as dates.
# Only `test` is referenced in the cells visible here (for the submission ids).
train, test = (
    pd.read_csv(csv_path, parse_dates=["project_submitted_datetime"])
    for csv_path in ("Data/train.csv", "Data/test.csv")
)

# Keyword arguments shared by the TfidfVectorizer instances built below.
# `ngram_range` is intentionally not part of this dict: each vectorizer passes its own.
tfidf_para = dict(
    sublinear_tf=True,         # use 1 + log(tf) instead of the raw term frequency
    strip_accents='unicode',
    stop_words="english",
    analyzer='word',
    token_pattern=r'\w{1,}',   # tokens are runs of one or more word characters
    dtype=np.float32,          # halves memory versus the float64 default
    norm='l2',
    min_df=5,                  # ignore terms found in fewer than 5 documents
    max_df=.9,                 # ignore terms found in more than 90% of documents
    smooth_idf=False,
)

# Thanks To
# https://www.kaggle.com/lopuhin/eli5-for-mercari
# https://www.kaggle.com/jagangupta/understanding-approval-donorschoose-eda-fe-eli5/notebook
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
def get_col(col_name):
    """Build a getter that extracts the `col_name` entry from a record.

    The returned callable takes one argument and returns `record[col_name]`.
    It is passed as the `preprocessor` of each vectorizer so that a vectorizer
    fed whole records only sees the field it is responsible for.
    """
    def pick_field(record):
        return record[col_name]

    return pick_field

# Duplicate of the title column; the CountVectorizer branch below reads it under its own key.
df["project_title_count"] = df["project_title"].copy()
textcols = [
    "text",
    "project_resource_summary",
    "project_title",
    "project_title_count",
    "resource_description",
]


def _tfidf_on(column, vocab_size):
    """Return a uni+bigram TfidfVectorizer over `column`, capped at `vocab_size` features.

    All remaining options come from the shared `tfidf_para` dict.
    """
    return TfidfVectorizer(
        ngram_range=(1, 2),
        max_features=vocab_size,
        preprocessor=get_col(column),
        **tfidf_para)


# One vectorizer per text field; FeatureUnion concatenates their outputs
# column-wise in the order listed here.
# NOTE(review): per the scikit-learn docs a callable `preprocessor` replaces the
# built-in strip_accents / lowercase stage, so `strip_accents='unicode'` and the
# default lowercasing have no effect on these vectorizers — confirm this is intended.
vectorizer = FeatureUnion([
    ('text', _tfidf_on('text', 20000)),
    ('project_resource_summary', _tfidf_on('project_resource_summary', 2000)),
    ('project_title', _tfidf_on('project_title', 1500)),
    # Raw counts of title n-grams, alongside the TF-IDF weighted version above.
    ('project_title_count', CountVectorizer(
        ngram_range=(1, 2),
        max_features=1500,
        preprocessor=get_col('project_title_count'))),
    ('resource_description', _tfidf_on('resource_description', 2400)),
])

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import time
# The vectorizers expect strings; casting also turns missing values into the text "nan".
for c in textcols:
    df[c] = df[c].astype(str)

start_vect=time.time()
# Each row is handed over as a dict so every branch of the FeatureUnion can pick
# its own field through its `preprocessor`.
ready_df = vectorizer.fit_transform(df.to_dict('records'))

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer the replacement when it exists and fall back on older installations.
# `.tolist()` keeps `tfvocab` a plain list of str, as the old method returned.
if hasattr(vectorizer, "get_feature_names_out"):
    tfvocab = vectorizer.get_feature_names_out().tolist()
else:
    tfvocab = vectorizer.get_feature_names()
print("Vectorization Runtime: %0.2f Minutes"%((time.time() - start_vect)/60))

from sklearn.preprocessing import LabelEncoder
import gc
print('Label Encoder...')
# Categorical columns that are replaced in place by integer codes.
cols = [
    'teacher_id',
    'teacher_prefix',
    'school_state',
    'project_grade_category',
    'project_subject_categories',
    'project_subject_subcategories',
]
from tqdm import tqdm
for c in tqdm(cols):
    # A fresh encoder per column; values are cast to str first, so missing
    # entries are encoded as the category "nan" rather than raising.
    le = LabelEncoder()
    df[c] = le.fit_transform(df[c].astype(str))
    # Sanity check: prints whether the encoded column contains any NaN.
    print(df[c].isna().any())
# Drop the last encoder and reclaim memory before building the model matrices.
del le
gc.collect()

# Rows with a missing label are the competition test rows; all others are training rows.
# Both the dense frame and the sparse text matrix `ready_df` are sliced with the
# SAME positional indices, so their rows stay aligned regardless of how many
# training rows there are or where they sit in `df`. (The previous version took
# row order from a set difference and cut `ready_df` at a hard-coded 182080.)
assert ready_df.shape[0] == len(df), "ready_df must have exactly one row per row of df"
is_test_row = df.project_is_approved.isnull().values
train_pos = np.flatnonzero(~is_test_row)
test_pos = np.flatnonzero(is_test_row)

final_test = df.iloc[test_pos]
# Index labels of each part, in the order the rows appear in `df`.
traindex = df.index[train_pos].tolist()
testdex = df.index[test_pos].tolist()

# Identifier, raw text (already vectorized into `ready_df`) and the target itself.
to_drop = ['id','project_essay_1','project_essay_2','project_essay_3','project_essay_4',
           'project_resource_summary','project_submitted_datetime','project_title','resource_description',
          'text','project_is_approved',"project_title_count"]

final_test = final_test.drop(columns=to_drop)
x = df.iloc[train_pos].drop(columns=to_drop)
y = df['project_is_approved'].iloc[train_pos]

from scipy.sparse import hstack, csr_matrix
# Dense engineered features first, text features after, for both parts.
x = hstack([csr_matrix(x.values), ready_df[train_pos]])
final_test = hstack([csr_matrix(final_test.values), ready_df[test_pos]])

# hstack returns COO; convert to CSR for row slicing in the split and training steps.
x = x.tocsr()
final_test = final_test.tocsr()

from sklearn.model_selection import train_test_split
# Hold out 20% of the labelled rows; the fixed random_state keeps the split the
# same on every run. The held-out part (x_test, y_test) is the validation set
# given to lgb.train for early stopping.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
import lightgbm as lgb
params = {
    # Task definition
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    # Tree shape and step size
    'max_depth': 14,
    'learning_rate': 0.05,
    # Subsampling: 25% of the features per tree, 85% of the rows re-drawn every 5 iterations
    'feature_fraction': 0.25,
    'bagging_fraction': 0.85,
    'bagging_freq': 5,
    'verbose': 0,
    'num_threads': 1,
    # Regularisation
    'lambda_l2': 1,
    'min_gain_to_split': 0,
}

lgb_train_set = lgb.Dataset(x_train, y_train)
lgb_valid_set = lgb.Dataset(x_test, y_test)

# Train for up to 10000 rounds, stopping once validation AUC has not improved for
# 200 rounds; the metric is logged every 100 rounds.
# NOTE(review): `early_stopping_rounds` / `verbose_eval` are keyword arguments of
# older LightGBM releases; newer releases reportedly expect callbacks instead —
# verify against the installed version before upgrading.
model = lgb.train(
    params,
    lgb_train_set,
    num_boost_round=10000,
    valid_sets=[lgb_valid_set],
    early_stopping_rounds=200,
    verbose_eval=100,
)

Training until validation scores don't improve for 200 rounds.
[100]	valid_0's auc: 0.781051
[200]	valid_0's auc: 0.794002
[300]	valid_0's auc: 0.797341
[400]	valid_0's auc: 0.798377
[500]	valid_0's auc: 0.799002
[600]	valid_0's auc: 0.799463
[700]	valid_0's auc: 0.799453
[800]	valid_0's auc: 0.799205
Early stopping, best iteration is:
[663]	valid_0's auc: 0.79963


In [None]:
# Score the unlabelled rows with the best iteration found by early stopping.
s = model.predict(final_test, num_iteration=model.best_iteration)

# NOTE(review): this pairs predictions with ids purely by position, so it assumes
# the rows of `final_test` are in the same order as Data/test.csv — verify.
d = dict(id=test.id, project_is_approved=s)
submission = pd.DataFrame(d)

submission.to_csv('0420_LGB.csv', index=False)