In [None]:
import warnings
warnings.filterwarnings('ignore')

import os
import pickle
import re
from collections import Counter

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from chart_studio import plotly
import plotly.offline as offline
import plotly.graph_objs as go
from pathlib2 import Path
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import confusion_matrix, roc_curve, auc, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler, Normalizer
from tqdm import tqdm

offline.init_notebook_mode()


In [None]:
%matplotlib inline

In [None]:
# Leftover breakpoint removed: an unconditional pdb.set_trace() halts "Restart Kernel -> Run All".
# For post-mortem debugging, run the %debug magic after an exception instead.
import pdb

## Loading Data

In [None]:
# Folder the notebook reads its input files from; the cell output lists its entries.
path = Path('data')
[entry for entry in path.iterdir()]

In [None]:
# Read the preprocessed dataset from the data folder, report its size, and preview the first rows.
csv_file = path / 'preprocessed_data.csv'
df = pd.read_csv(csv_file)
print(df.shape)
df.head()

In [None]:
# Separate the label column from the remaining columns:
#   X -> every column except 'project_is_approved'
#   y -> the 'project_is_approved' values as a NumPy array
X = df.drop(columns=['project_is_approved'])
y = df['project_is_approved'].values
X.head()

### Splitting data into train, cross-validation, and test sets: stratified sampling

In [None]:
# Stratified 64% / 16% / 20% train / CV / test partition.
# A fixed seed keeps the partition (and every metric derived from it) reproducible
# across kernel restarts; without it each run scores a different split.
SPLIT_SEED = 42

# First hold out 20% of the rows as the test set ...
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=SPLIT_SEED
)
# ... then carve 20% of the remainder off as the cross-validation set.
X_train, X_cv, y_train, y_cv = train_test_split(
    X_trainval, y_trainval, test_size=0.2, stratify=y_trainval, random_state=SPLIT_SEED
)

# Separate print statements: joining them with commas builds a tuple of None values
# that the notebook would echo as "(None, None, None)".
print(X_train.shape, y_train.shape)
print(X_cv.shape, y_cv.shape)
print(X_test.shape, y_test.shape)



### Make Data Model Ready: encoding essay and project_title

In [None]:
# Bag-of-words over the essay text: 1- to 4-grams that occur in at least 10 training
# essays, capped at 5000 features. The vocabulary is learned from the training split
# only and then applied unchanged to the CV and test splits.
vectorizer = CountVectorizer(min_df=10, ngram_range=(1, 4), max_features=5000)
vectorizer.fit(X_train['essay'].values)
X_train_essay_bow, X_cv_essay_bow, X_test_essay_bow = (
    vectorizer.transform(split['essay'].values) for split in (X_train, X_cv, X_test)
)

# Sanity check: each feature matrix must have as many rows as its label vector.
for bow_split, y_split in (
    (X_train_essay_bow, y_train),
    (X_cv_essay_bow, y_cv),
    (X_test_essay_bow, y_test),
):
    print(bow_split.shape, y_split.shape)
print("="*100)


In [None]:
# Count-encode school_state (one column per state token seen in the training split).
# The vocabulary is fitted on the training data only and reused for CV and test.
vectorizer = CountVectorizer()
vectorizer.fit(X_train['school_state'].values)

X_train_state_ohe = vectorizer.transform(X_train['school_state'].values)
X_cv_state_ohe = vectorizer.transform(X_cv['school_state'].values)
X_test_state_ohe = vectorizer.transform(X_test['school_state'].values)

print(X_train_state_ohe.shape, y_train.shape)
print(X_cv_state_ohe.shape, y_cv.shape)
print(X_test_state_ohe.shape, y_test.shape)
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# when it exists and fall back to the old accessor on older installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    state_feature_names = vectorizer.get_feature_names_out().tolist()
else:
    state_feature_names = vectorizer.get_feature_names()
print(state_feature_names)
print("="*100)

### encoding categorical features: teacher_prefix

In [None]:
# Count-encode teacher_prefix (one column per prefix token seen in the training split).
# The vocabulary is fitted on the training data only and reused for CV and test.
vectorizer = CountVectorizer()
vectorizer.fit(X_train['teacher_prefix'].values)

X_train_teacher_ohe = vectorizer.transform(X_train['teacher_prefix'].values)
X_cv_teacher_ohe = vectorizer.transform(X_cv['teacher_prefix'].values)
X_test_teacher_ohe = vectorizer.transform(X_test['teacher_prefix'].values)

print(X_train_teacher_ohe.shape, y_train.shape)
print(X_cv_teacher_ohe.shape, y_cv.shape)
print(X_test_teacher_ohe.shape, y_test.shape)
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# when it exists and fall back to the old accessor on older installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    teacher_feature_names = vectorizer.get_feature_names_out().tolist()
else:
    teacher_feature_names = vectorizer.get_feature_names()
print(teacher_feature_names)
print("="*100)

### encoding categorical features: project_grade_category

In [None]:
# Count-encode project_grade_category (one column per token seen in the training split).
# The vocabulary is fitted on the training data only and reused for CV and test.
vectorizer = CountVectorizer()
vectorizer.fit(X_train['project_grade_category'].values)

X_train_project_ohe = vectorizer.transform(X_train['project_grade_category'].values)
X_cv_project_ohe = vectorizer.transform(X_cv['project_grade_category'].values)
X_test_project_ohe = vectorizer.transform(X_test['project_grade_category'].values)

print(X_train_project_ohe.shape, y_train.shape)
print(X_cv_project_ohe.shape, y_cv.shape)
print(X_test_project_ohe.shape, y_test.shape)
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# when it exists and fall back to the old accessor on older installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    grade_feature_names = vectorizer.get_feature_names_out().tolist()
else:
    grade_feature_names = vectorizer.get_feature_names()
print(grade_feature_names)
print("="*100)

### Normalize price

In [None]:
# Scale price column-wise using the min/max learned from the training split only.
#
# Normalizer rescales each *row* to unit norm. Applied to an (n, 1) column, every row
# holds a single value, so each non-zero price became +/-1 and the feature carried no
# information. MinMaxScaler works per column instead: training prices map to [0, 1]
# and CV/test prices use the same mapping (they may land slightly outside [0, 1] if
# they exceed the training range).
normalizer = MinMaxScaler()

normalizer.fit(X_train['price'].values.reshape(-1,1))

X_train_price_norm = normalizer.transform(X_train['price'].values.reshape(-1,1))
X_cv_price_norm = normalizer.transform(X_cv['price'].values.reshape(-1,1))
X_test_price_norm = normalizer.transform(X_test['price'].values.reshape(-1,1))

print(X_train_price_norm.shape, y_train.shape)
print(X_cv_price_norm.shape, y_cv.shape)
print(X_test_price_norm.shape, y_test.shape)
print("="*100)

### Normalize teacher_number_of_previously_posted_projects

In [None]:
# Scale teacher_number_of_previously_posted_projects column-wise using the min/max
# learned from the training split only.
#
# Normalizer rescales each *row* to unit norm. Applied to an (n, 1) column, every row
# holds a single value, so zero stayed 0 and every other count collapsed to 1, throwing
# away the magnitude. MinMaxScaler works per column instead: training values map to
# [0, 1] and CV/test values use the same mapping (they may land slightly outside
# [0, 1] if they exceed the training range).
normalizer = MinMaxScaler()

normalizer.fit(X_train['teacher_number_of_previously_posted_projects'].values.reshape(-1,1))

X_train_prev_proj_norm = normalizer.transform(X_train['teacher_number_of_previously_posted_projects'].values.reshape(-1,1))
X_cv_prev_proj_norm = normalizer.transform(X_cv['teacher_number_of_previously_posted_projects'].values.reshape(-1,1))
X_test_prev_proj_norm = normalizer.transform(X_test['teacher_number_of_previously_posted_projects'].values.reshape(-1,1))

print(X_train_prev_proj_norm.shape, y_train.shape)
print(X_cv_prev_proj_norm.shape, y_cv.shape)
print(X_test_prev_proj_norm.shape, y_test.shape)
print("="*100)

### Encoding categories and sub categories

In [None]:
# Bag-of-words over clean_categories: 1- to 4-grams present in at least 10 training
# rows, capped at 5000 features. Vocabulary comes from the training split only.
vectorizer = CountVectorizer(min_df=10, ngram_range=(1, 4), max_features=5000)
vectorizer.fit(X_train['clean_categories'].values)
X_train_categories_bow, X_cv_categories_bow, X_test_categories_bow = (
    vectorizer.transform(split['clean_categories'].values)
    for split in (X_train, X_cv, X_test)
)

# Sanity check: each feature matrix must have as many rows as its label vector.
for bow_split, y_split in (
    (X_train_categories_bow, y_train),
    (X_cv_categories_bow, y_cv),
    (X_test_categories_bow, y_test),
):
    print(bow_split.shape, y_split.shape)
print("="*100)


In [None]:
# Bag-of-words over clean_subcategories: 1- to 4-grams present in at least 10 training
# rows, capped at 5000 features. Vocabulary comes from the training split only.
vectorizer = CountVectorizer(min_df=10, ngram_range=(1, 4), max_features=5000)
vectorizer.fit(X_train['clean_subcategories'].values)
X_train_subcategories_bow, X_cv_subcategories_bow, X_test_subcategories_bow = (
    vectorizer.transform(split['clean_subcategories'].values)
    for split in (X_train, X_cv, X_test)
)

# Sanity check: each feature matrix must have as many rows as its label vector.
for bow_split, y_split in (
    (X_train_subcategories_bow, y_train),
    (X_cv_subcategories_bow, y_cv),
    (X_test_subcategories_bow, y_test),
):
    print(bow_split.shape, y_split.shape)
print("="*100)

In [None]:
# Concatenate all encoded feature blocks column-wise into one matrix per split.
# The block order must be identical for train, CV and test so columns line up.
#
# .tocsr() is required: stacking a mix of sparse and dense blocks yields a COO matrix,
# and COO matrices cannot be row-sliced (data[i:j] raises TypeError), which the
# batched prediction step relies on.
X_tr = hstack((X_train_essay_bow, X_train_categories_bow, X_train_subcategories_bow, X_train_state_ohe, X_train_teacher_ohe, X_train_project_ohe, X_train_price_norm, X_train_prev_proj_norm)).tocsr()
X_cr = hstack((X_cv_essay_bow, X_cv_categories_bow, X_cv_subcategories_bow, X_cv_state_ohe, X_cv_teacher_ohe, X_cv_project_ohe, X_cv_price_norm, X_cv_prev_proj_norm)).tocsr()
X_te = hstack((X_test_essay_bow, X_test_categories_bow, X_test_subcategories_bow, X_test_state_ohe, X_test_teacher_ohe, X_test_project_ohe, X_test_price_norm, X_test_prev_proj_norm)).tocsr()

print("Final Data matrix")
print(X_tr.shape, y_train.shape)
print(X_cr.shape, y_cv.shape)
print(X_te.shape, y_test.shape)
print("="*100)


### Applying KNN on the processed data

In [None]:
import pdb
def batch_predict(clf, data, batch_size=1000):
    """Return column 1 of ``clf.predict_proba`` for every row of ``data``, computed in batches.

    Predicting in fixed-size row batches keeps the memory used by a single
    ``predict_proba`` call bounded when ``data`` has many rows.

    Parameters
    ----------
    clf : fitted estimator exposing ``predict_proba(X)`` that returns a 2-D array
        with at least two columns.
    data : row-sliceable 2-D array or SciPy sparse matrix, shape (n_samples, n_features).
        Sparse input is converted to CSR first, because COO matrices do not
        support ``data[i:j]`` slicing.
    batch_size : int, default 1000
        Number of rows passed to ``predict_proba`` per call.

    Returns
    -------
    list
        One score per input row, in input order: the value in column index 1 of
        ``predict_proba`` (the positive class when labels are 0/1). Empty when
        ``data`` has no rows.

    Raises
    ------
    ValueError
        If ``batch_size`` is not a positive integer.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    # Sparse matrices expose tocsr(); it is a no-op for data already in CSR format.
    if hasattr(data, 'tocsr'):
        data = data.tocsr()

    y_data_pred = []
    # range() with a step already yields the final partial batch, so no separate
    # remainder branch is needed.
    for start in range(0, data.shape[0], batch_size):
        y_data_pred.extend(clf.predict_proba(data[start:start + batch_size])[:, 1])

    return y_data_pred

In [None]:
# Hyperparameter sweep: for each candidate K, fit a KNN classifier on the training
# matrix and record the ROC AUC of its batched probability scores on the training
# and cross-validation splits. The two curves are then plotted against K.
K = [3, 15, 25, 51, 101]
train_auc, cv_auc = [], []

for k in tqdm(K):
    neigh = KNeighborsClassifier(n_neighbors=k, n_jobs=4)
    neigh.fit(X_tr, y_train)

    # Scores come from batch_predict so prediction runs in row batches.
    y_train_pred = batch_predict(neigh, X_tr)
    y_cv_pred = batch_predict(neigh, X_cr)

    train_auc.append(roc_auc_score(y_train, y_train_pred))
    cv_auc.append(roc_auc_score(y_cv, y_cv_pred))

auc_curves = (('Train AUC', train_auc), ('CV AUC', cv_auc))

# Lines first, then markers at the evaluated K values.
for curve_label, curve_scores in auc_curves:
    plt.plot(K, curve_scores, label=curve_label)

for curve_label, curve_scores in auc_curves:
    plt.scatter(K, curve_scores, label=curve_label + ' points')

plt.legend()
plt.xlabel("K: hyperparameter")
plt.ylabel("AUC")
plt.title("ERROR PLOTS")
plt.grid()
plt.show()

