In [4]:
from pymongo import MongoClient
import pandas as pd

from sklearn import metrics
from sklearn.model_selection import GridSearchCV, train_test_split, learning_curve
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_curve, auc, classification_report
from sklearn.dummy import DummyClassifier
from sklearn.utils import check_X_y
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
import pickle

import warnings
# Silences every warning category for the rest of the session, including
# deprecation notices raised by the libraries used in later cells.
warnings.filterwarnings('ignore')

  from numpy.core.umath_tests import inner1d


#### Loading the Data

In [None]:
# MongoClient() is called with no arguments, so the driver's default connection settings apply.
client = MongoClient()
# Shown as the cell output: the database names visible through this connection.
client.list_database_names()

In [None]:
# Handle to the `w2v` collection inside the `resumes` database.
resumes = client.resumes.w2v
# Total number of documents in the collection. Collection.count() is deprecated
# and no longer exists in PyMongo 4; count_documents with an empty filter
# returns the same total.
resumes.count_documents({})

In [None]:
# Load every document that has a 'Start Dates' field into a DataFrame,
# leaving out Mongo's own '_id' field via the projection.
df = pd.DataFrame(
    list(
        resumes.find(
            {'Start Dates': {'$exists': 1}},
            {'_id': 0},
        )
    )
)
df.head()

#### Categorizing "Analyst": 1, "Scientist": 2, "Engineer": 3

In [None]:
# Labels produced by the most recent categorize() call. Kept at module level
# because the original cell exposed this list under the same name.
roles = []
def categorize(df):
    """Replace role words in ``df['y']`` with numeric codes (kept as strings).

    Every occurrence of "analyst" becomes "1", "scientist" becomes "2" and
    "engineer" becomes "3". The replacement is a plain substring replacement,
    so any other text in a label is left as it is.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column ``'y'``. The column is overwritten in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    coded = [
        label.replace("analyst", "1").replace("scientist", "2").replace("engineer", "3")
        for label in df['y']
    ]
    # Overwrite instead of appending: the previous version kept growing the
    # module-level list, so a second call assigned a list of the wrong length.
    roles[:] = coded
    df['y'] = coded
    return df

# Recode the 'y' column; categorize() returns the frame it was given.
df = categorize(df)

### W2V Model

In [35]:
# clean data
df = pickle.load(open('W2V_data.pkl', 'rb'))

df = df.convert_objects(convert_numeric=True)

X = df.drop(columns=['Scientist','Engineer','Analyst','y'])
y = df['y']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [38]:
# Preview the first rows of the training features.
X_train.head()

Unnamed: 0,Bachelors,Masters,PhD,Start Dates,0,1,2,3,4,5,...,90,91,92,93,94,95,96,97,98,99
14787,1,0,0,27.0,-0.055711,-0.403599,0.313543,0.181767,-0.42731,0.19701,...,0.361759,-0.215971,-0.188807,-0.902728,-0.100053,-0.391527,-0.509233,0.955836,0.252353,0.172097
6167,1,3,0,8.0,0.892006,-0.525411,-0.033214,0.167723,0.203309,0.32998,...,-0.020274,0.148613,-0.292893,-0.172215,-0.177787,-0.664932,-0.511434,0.314636,-0.214305,-0.368414
18099,1,1,0,4.0,0.749063,-0.458793,0.102064,0.098578,-0.584221,0.024375,...,0.441998,-0.241637,0.167389,-0.461004,-0.152522,-0.448428,-0.430003,0.229196,-0.140442,0.091828
19120,0,0,0,3.0,0.279101,-0.465726,0.302332,0.444643,-0.115447,-0.026083,...,0.305812,-0.049171,-0.656288,-0.586896,-0.115128,-0.537179,-0.434514,0.364723,-0.061841,-0.068892
5639,0,0,0,8.0,0.435678,-0.542815,0.316228,0.282474,-0.431641,0.236479,...,0.439138,-0.182045,0.006532,-1.011326,-0.103727,-0.4482,-0.361613,0.632105,0.138093,0.524672


#### Impute missing `Start Dates` with the training-set median for each education level.

In [39]:
# Median 'Start Dates' of the training rows for each (Bachelors, PhD, Masters)
# combination, with the group keys turned back into ordinary columns.
exp_fill_series = (
    X_train
    .groupby(['Bachelors','PhD','Masters'])['Start Dates']
    .median()
    .reset_index()
)

def impute_exp(X, medians=None):
    """Fill missing 'Start Dates' with the median for the row's education level.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame with 'Bachelors', 'PhD', 'Masters' and 'Start Dates'
        columns. It is not modified.
    medians : pandas.DataFrame, optional
        Lookup table with the three education columns plus a 'Start Dates'
        column holding the value to fill with. Defaults to the module-level
        ``exp_fill_series``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and row order as ``X`` and a fresh
        0..n-1 index (it is the result of a merge). Rows whose education
        combination is absent from ``medians`` keep their NaN.
    """
    if medians is None:
        medians = exp_fill_series

    education_keys = ['Bachelors', 'PhD', 'Masters']
    merged_frame = pd.merge(X, medians, how='left', on=education_keys)
    merged_frame = merged_frame.rename(
        columns={'Start Dates_x': 'Start Dates', 'Start Dates_y': 'Median Years'}
    )
    # only fill the na values in Start Dates with the Median Years.
    # Assign the result back: calling fillna(..., inplace=True) on a column
    # selection does not update the frame when pandas copy-on-write is active.
    merged_frame['Start Dates'] = merged_frame['Start Dates'].fillna(merged_frame['Median Years'])
    return merged_frame.drop(columns=['Median Years'])

X_train_imputed = impute_exp(X_train)
# The test split is filled from the medians computed on X_train (exp_fill_series),
# so both splits go through the same imputation rule.
X_test_imputed = impute_exp(X_test)

X_train_imputed.head()

Unnamed: 0,Bachelors,Masters,PhD,Start Dates,0,1,2,3,4,5,...,90,91,92,93,94,95,96,97,98,99
0,1,0,0,27.0,-0.055711,-0.403599,0.313543,0.181767,-0.42731,0.19701,...,0.361759,-0.215971,-0.188807,-0.902728,-0.100053,-0.391527,-0.509233,0.955836,0.252353,0.172097
1,1,3,0,8.0,0.892006,-0.525411,-0.033214,0.167723,0.203309,0.32998,...,-0.020274,0.148613,-0.292893,-0.172215,-0.177787,-0.664932,-0.511434,0.314636,-0.214305,-0.368414
2,1,1,0,4.0,0.749063,-0.458793,0.102064,0.098578,-0.584221,0.024375,...,0.441998,-0.241637,0.167389,-0.461004,-0.152522,-0.448428,-0.430003,0.229196,-0.140442,0.091828
3,0,0,0,3.0,0.279101,-0.465726,0.302332,0.444643,-0.115447,-0.026083,...,0.305812,-0.049171,-0.656288,-0.586896,-0.115128,-0.537179,-0.434514,0.364723,-0.061841,-0.068892
4,0,0,0,8.0,0.435678,-0.542815,0.316228,0.282474,-0.431641,0.236479,...,0.439138,-0.182045,0.006532,-1.011326,-0.103727,-0.4482,-0.361613,0.632105,0.138093,0.524672


#### Standard Scale

In [10]:
# Fit the scaler on the imputed training features only, then apply that same
# fitted scaler to both the training and the test features.
ssX = StandardScaler().fit(X_train_imputed)
X_train_imputed_scaled = ssX.transform(X_train_imputed)
X_test_imputed_scaled = ssX.transform(X_test_imputed)

#### Oversampling

In [11]:
from imblearn.over_sampling import ADASYN

# Oversample the scaled training data only; the test split is left untouched.
adasyn = ADASYN(random_state=42)
# imbalanced-learn renamed fit_sample to fit_resample and current releases no
# longer provide the old name; use whichever one the installed version has.
resample = getattr(adasyn, 'fit_resample', None) or adasyn.fit_sample
X_tisa, y_ta = resample(X_train_imputed_scaled, y_train)

#### Baseline: a chance-level classifier should score about 1/(number of classes).

In [12]:
# Baseline: DummyClassifier with its default settings, fitted on the oversampled
# training data and scored on the scaled test split.
dummy = DummyClassifier().fit(X_tisa, y_ta)
dummy.score(X_test_imputed_scaled,y_test)

0.32750427675346894

#### Save the fitted scaler.

In [13]:
# Write the fitted StandardScaler (ssX) to ssX_W2V.pkl.
with open('ssX_W2V.pkl','wb') as file:
    pickle.dump(ssX,file)

#### One of the models I ran on AWS: Support Vector Classifier.

In [14]:
# Candidate hyper-parameters: 3 values of C x 3 values of gamma, with
# probability estimates switched on for every candidate.
params = dict(
    C=[1.5, 2.0, 2.5],
    gamma=[0.005, 0.01, 0.015],
    probability=[True],
)

SVC_model = SVC()

# 5-fold grid search scored on accuracy.
# NOTE(review): X_tisa / y_ta are the ADASYN-resampled training data, so the
# validation folds contain synthetic samples; compare the CV score with the
# held-out test score before trusting it.
gridSVC = GridSearchCV(
    estimator=SVC_model,
    param_grid=params,
    scoring='accuracy',
    cv=5,
    verbose=20,
)
gridSVC.fit(X_tisa, y_ta)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV] C=1.5, gamma=0.005, probability=True ............................
[CV]  C=1.5, gamma=0.005, probability=True, score=0.8303142901377922, total= 5.4min
[CV] C=1.5, gamma=0.005, probability=True ............................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  6.1min remaining:    0.0s


[CV]  C=1.5, gamma=0.005, probability=True, score=0.8089191700216786, total= 5.3min
[CV] C=1.5, gamma=0.005, probability=True ............................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 12.0min remaining:    0.0s


[CV]  C=1.5, gamma=0.005, probability=True, score=0.8025704552493031, total= 5.1min
[CV] C=1.5, gamma=0.005, probability=True ............................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 17.8min remaining:    0.0s


[CV]  C=1.5, gamma=0.005, probability=True, score=0.7895307418305715, total= 5.3min
[CV] C=1.5, gamma=0.005, probability=True ............................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed: 23.7min remaining:    0.0s


[CV]  C=1.5, gamma=0.005, probability=True, score=0.7912343193433483, total= 5.3min
[CV] C=1.5, gamma=0.01, probability=True .............................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 29.6min remaining:    0.0s


[CV]  C=1.5, gamma=0.01, probability=True, score=0.8718067812354854, total= 5.2min
[CV] C=1.5, gamma=0.01, probability=True .............................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed: 35.3min remaining:    0.0s


[CV]  C=1.5, gamma=0.01, probability=True, score=0.8558377206565501, total= 5.1min
[CV] C=1.5, gamma=0.01, probability=True .............................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed: 41.1min remaining:    0.0s


[CV]  C=1.5, gamma=0.01, probability=True, score=0.838649736760607, total= 5.1min
[CV] C=1.5, gamma=0.01, probability=True .............................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed: 46.8min remaining:    0.0s


[CV]  C=1.5, gamma=0.01, probability=True, score=0.8347529812606473, total= 5.0min
[CV] C=1.5, gamma=0.01, probability=True .............................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed: 52.4min remaining:    0.0s


[CV]  C=1.5, gamma=0.01, probability=True, score=0.8350627226266067, total= 5.0min
[CV] C=1.5, gamma=0.015, probability=True ............................


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed: 58.0min remaining:    0.0s


[CV]  C=1.5, gamma=0.015, probability=True, score=0.9007586313670847, total= 5.0min
[CV] C=1.5, gamma=0.015, probability=True ............................


[Parallel(n_jobs=1)]: Done  11 out of  11 | elapsed: 63.6min remaining:    0.0s


[CV]  C=1.5, gamma=0.015, probability=True, score=0.8849489005884175, total= 5.2min
[CV] C=1.5, gamma=0.015, probability=True ............................


[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed: 69.4min remaining:    0.0s


[CV]  C=1.5, gamma=0.015, probability=True, score=0.8710126974295448, total= 5.2min
[CV] C=1.5, gamma=0.015, probability=True ............................


[Parallel(n_jobs=1)]: Done  13 out of  13 | elapsed: 75.3min remaining:    0.0s


[CV]  C=1.5, gamma=0.015, probability=True, score=0.8640235403438129, total= 5.1min
[CV] C=1.5, gamma=0.015, probability=True ............................


[Parallel(n_jobs=1)]: Done  14 out of  14 | elapsed: 80.9min remaining:    0.0s


[CV]  C=1.5, gamma=0.015, probability=True, score=0.8661917299055288, total= 5.2min
[CV] C=2.0, gamma=0.005, probability=True ............................


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed: 86.7min remaining:    0.0s


[CV]  C=2.0, gamma=0.005, probability=True, score=0.8388295401764979, total= 5.0min
[CV] C=2.0, gamma=0.005, probability=True ............................


[Parallel(n_jobs=1)]: Done  16 out of  16 | elapsed: 92.4min remaining:    0.0s


[CV]  C=2.0, gamma=0.005, probability=True, score=0.8203778259523072, total= 5.1min
[CV] C=2.0, gamma=0.005, probability=True ............................


[Parallel(n_jobs=1)]: Done  17 out of  17 | elapsed: 98.1min remaining:    0.0s


[CV]  C=2.0, gamma=0.005, probability=True, score=0.8086094766181481, total= 5.4min
[CV] C=2.0, gamma=0.005, probability=True ............................


[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed: 104.1min remaining:    0.0s


[CV]  C=2.0, gamma=0.005, probability=True, score=0.7997522069072325, total= 4.8min
[CV] C=2.0, gamma=0.005, probability=True ............................


[Parallel(n_jobs=1)]: Done  19 out of  19 | elapsed: 109.5min remaining:    0.0s


[CV]  C=2.0, gamma=0.005, probability=True, score=0.8045531980796036, total= 5.3min
[CV] C=2.0, gamma=0.01, probability=True .............................
[CV]  C=2.0, gamma=0.01, probability=True, score=0.880631676730144, total= 5.2min
[CV] C=2.0, gamma=0.01, probability=True .............................
[CV]  C=2.0, gamma=0.01, probability=True, score=0.8663672963765872, total= 5.0min
[CV] C=2.0, gamma=0.01, probability=True .............................
[CV]  C=2.0, gamma=0.01, probability=True, score=0.8491793124806442, total= 5.4min
[CV] C=2.0, gamma=0.01, probability=True .............................
[CV]  C=2.0, gamma=0.01, probability=True, score=0.8431159981415518, total= 5.2min
[CV] C=2.0, gamma=0.01, probability=True .............................
[CV]  C=2.0, gamma=0.01, probability=True, score=0.8482267306798823, total= 5.1min
[CV] C=2.0, gamma=0.015, probability=True ............................
[CV]  C=2.0, gamma=0.015, probability=True, score=0.9088094132218609, total=

[Parallel(n_jobs=1)]: Done  45 out of  45 | elapsed: 256.6min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [1.5, 2.0, 2.5], 'gamma': [0.005, 0.01, 0.015], 'probability': [True]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=20)

In [15]:
# Parameter combination selected by the grid search.
gridSVC.best_params_

{'C': 2.5, 'gamma': 0.015, 'probability': True}

In [16]:
# Mean cross-validated accuracy of the selected combination.
gridSVC.best_score_

0.9003066059648797

#### Plot a learning curve to assess whether the model is overfitting:

In [None]:
# Learning curve for the tuned SVC on the oversampled training data.
# - The curve is computed on X_tisa / y_ta, the data the grid search was fitted
#   on; the names used here before (sampled_train_X / sampled_train_y) are not
#   defined anywhere in this notebook.
# - best_estimator_ is passed instead of the GridSearchCV object so that each
#   point refits one SVC rather than re-running the whole 45-fit search.
train_sizes, train_scores, test_scores = learning_curve(
    gridSVC.best_estimator_, X_tisa, y_ta, cv=5, scoring='accuracy'
)
# Average over the CV folds (axis 1) to get one score per training-set size.
ave_train_scores = train_scores.mean(axis=1)
ave_test_scores = test_scores.mean(axis=1)

learn_df = pd.DataFrame({
    'train_size': train_sizes,
    'train_score': ave_train_scores,
    'test_score': ave_test_scores
})

plt.plot(learn_df['train_size'], learn_df['train_score'], 'r--o', label='train scores')
# Legend label fixed: this series holds the validation scores, not sizes.
plt.plot(learn_df['train_size'], learn_df['test_score'], 'b--x', label='test scores')
plt.xlabel('training set size')
plt.ylabel('accuracy')
plt.legend(loc='lower right')
plt.ylim(0.5,1)

plt.show()

#### Save the fitted scaler and the tuned model.

In [13]:
# Write the fitted StandardScaler (ssX) to ssX_W2V.pkl.
with open('ssX_W2V.pkl','wb') as file:
    pickle.dump(ssX,file)
# Write the fitted GridSearchCV object (gridSVC) to gridSVC_w2v.pkl.
with open('gridSVC_w2v.pkl','wb') as file:
    pickle.dump(gridSVC,file)