### LOAD DATA

In [1]:
import json

import numpy as np
import pandas as pd

# Load the training postings and keep only job titles with enough examples
# to learn from. Prints (row count, distinct titles) before and after filtering.

# Minimum number of postings a job title needs in order to be kept.
MIN_POSTINGS_PER_JOB = 10

df_raw = pd.read_csv("../data/train.csv")
print(len(df_raw), len(df_raw['job_title'].unique()))

# Postings per title; titles below the threshold are treated as too rare.
job_count = df_raw.groupby('job_title').size()
frequent_jobs = job_count[job_count >= MIN_POSTINGS_PER_JOB].index

# Filter on the job_title column only. A frame-wide replace() would also blank
# any other cell whose value happens to equal a rare title and then drop
# that row. Rows with a missing value in any column are still discarded.
df = (
    df_raw[df_raw['job_title'].isin(frequent_jobs)]
    .dropna()
    .reset_index(drop=True)
)
print(len(df), len(df['job_title'].unique()))

15154 377
14574 239


### TEXT PREPROCESSING

In [2]:
from sklearn.preprocessing import LabelEncoder

# Features are the raw description strings; targets are job titles encoded
# as integer ids.
X = df['description'].values

le = LabelEncoder().fit(df['job_title'])
y = le.transform(df['job_title'])
# Lookup used later to turn a predicted integer id back into its job title.
y_class = le.classes_

In [3]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# GridSearchCV has lived in sklearn.model_selection since scikit-learn 0.18;
# the old sklearn.grid_search module was removed in 0.20. Fall back to the
# legacy location only on installations that predate model_selection.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV



### GRID SEARCH

In [91]:
# Broad search over n-gram range, IDF weighting and Naive Bayes smoothing.
# Best setting found: stopwords: english, ngram: (1, 2), alpha: 0.0002, Tfidf use

clf_pipe = Pipeline(steps=[
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

# Keys follow the pipeline's "<step>__<parameter>" naming.
grid_params = dict(
    vect__ngram_range=[(1, 1), (1, 2)],
    tfidf__use_idf=(True, False),
    clf__alpha=np.logspace(-1, -4, 10),
)

# fit() returns the fitted search object itself, so it can be chained.
clf = GridSearchCV(clf_pipe, grid_params).fit(X, y)

print(clf.best_params_, clf.best_score_)

{'vect__ngram_range': (1, 2), 'clf__alpha': 0.00021544346900318845, 'tfidf__use_idf': True} 0.6047070124879923


In [4]:
# Refinement run: unigrams, English stop words and IDF weighting are fixed;
# only the Naive Bayes smoothing alpha is tuned, on a grid from 1e-4 to 1e-3.
# NOTE(review): the recorded output for this cell reports a best alpha of
# 0.001, the upper edge of this grid -- consider widening the range.

clf2_pipe = Pipeline(steps=[
    ('vect', CountVectorizer(stop_words='english', ngram_range=(1, 1))),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', MultinomialNB()),
])

grid_params = dict(clf__alpha=np.logspace(-4, -3, 10))

# fit() returns the fitted search object itself, so it can be chained.
clf2 = GridSearchCV(clf2_pipe, grid_params).fit(X, y)

print(clf2.best_params_, clf2.best_score_)

{'clf__alpha': 0.001} 0.6031288596130094


In [17]:
# Smoke test: predict one free-text query and decode the label id to a title.
sample_docs = ['hello python, machine learning']
y_class[clf2.predict(sample_docs)]

array(['Data Scientist'], dtype=object)

### MODEL DUMP

In [7]:
# Persist the fitted grid-search object (presumably loaded by the Flask API
# that owns this models/ directory -- confirm against the loader).
#
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23. Prefer the vendored copy where it still exists so older environments
# behave exactly as before, and fall back to the standalone joblib package.
try:
    from sklearn.externals import joblib
except ImportError:
    import joblib

joblib.dump(clf2, '../flask_api/models/pred-model.pkl')

['../flask_api/models/pred-model.pkl']

In [30]:
# Save the label lookup (integer id -> job title) alongside the model file.
y_class_path = '../flask_api/models/y-class.pkl'
joblib.dump(y_class, y_class_path)

['../flask_api/models/y-class.pkl']

### PREDICT RESULT FORMAT

In [65]:
# Class probabilities for a single query; [0] picks the only row of the batch.
query_docs = ['hello python machine learning']
pred_prob = clf2.predict_proba(query_docs)[0]

##### FORMAT 0 (best)

In [147]:
# Pair each job title with its probability and rank from most to least likely.
sorted_list = list(zip(y_class, pred_prob))
sorted_list.sort(key=lambda pair: pair[1], reverse=True)
sorted_list[:5]

[('Data Scientist', 0.28147724748799013),
 ('Data Analyst', 0.2120212331540459),
 ('Business Analyst', 0.1201267330399222),
 ('Solution Architect', 0.04030680067200857),
 ('Quantitative Developer', 0.02836807540103587)]

In [144]:
# One record per job title, in ranked order; rank is 1-based.
responses = {
    'items': [
        {'rank': rank, 'job': job, 'prob': prob}
        for rank, (job, prob) in enumerate(sorted_list, start=1)
    ],
}

In [146]:
# Serialise the ranked records to a JSON string and display it.
responses_json = json.dumps(responses)
responses_json

'{"items": [{"prob": 0.28147724748799013, "job": "Data Scientist", "rank": 1}, {"prob": 0.2120212331540459, "job": "Data Analyst", "rank": 2}, {"prob": 0.1201267330399222, "job": "Business Analyst", "rank": 3}, {"prob": 0.04030680067200857, "job": "Solution Architect", "rank": 4}, {"prob": 0.02836807540103587, "job": "Quantitative Developer", "rank": 5}, {"prob": 0.021804659303114295, "job": "Automation Engineer", "rank": 6}, {"prob": 0.021480847359805072, "job": "Plumber", "rank": 7}, {"prob": 0.020384632447473395, "job": "Miner ", "rank": 8}, {"prob": 0.017279699251549076, "job": "RF Engineer", "rank": 9}, {"prob": 0.016702440175178947, "job": "Research Analyst", "rank": 10}, {"prob": 0.015660066611118604, "job": "Director Of Engineering", "rank": 11}, {"prob": 0.015519215358830697, "job": "Research Scientist", "rank": 12}, {"prob": 0.012777082435627954, "job": "QA Engineer", "rank": 13}, {"prob": 0.010673366194983982, "job": "Instructional Designer", "rank": 14}, {"prob": 0.00987852

##### FORMAT 1

In [123]:
# Map each job title to its predicted probability.
result_dict = dict(zip(y_class, pred_prob))

In [131]:
# (title, probability) pairs ordered from most to least likely.
sorted_result = list(result_dict.items())
sorted_result.sort(key=lambda item: item[1], reverse=True)

In [132]:
# Serialise the sorted (title, probability) pairs to a JSON string and display it.
sorted_result_json = json.dumps(sorted_result)
sorted_result_json

'[["Data Scientist", 0.28147724748799013], ["Data Analyst", 0.2120212331540459], ["Business Analyst", 0.1201267330399222], ["Solution Architect", 0.04030680067200857], ["Quantitative Developer", 0.02836807540103587], ["Automation Engineer", 0.021804659303114295], ["Plumber", 0.021480847359805072], ["Miner ", 0.020384632447473395], ["RF Engineer", 0.017279699251549076], ["Research Analyst", 0.016702440175178947], ["Director Of Engineering", 0.015660066611118604], ["Research Scientist", 0.015519215358830697], ["QA Engineer", 0.012777082435627954], ["Instructional Designer", 0.010673366194983982], ["Computer Programmer", 0.009878521337361418], ["IT Business Analyst", 0.008343710146618081], ["Technical Writer", 0.008006727126042876], ["Statistician", 0.0077637370560115875], ["Restaurant Manager", 0.007677189854270094], ["Verification Engineer", 0.007180632230819411], ["Electrical Engineer", 0.006865446957564361], ["Web Developer", 0.006180433905121656], ["Quantitative Analyst", 0.005639881

##### FORMAT 2

In [126]:
# Split the (title, probability) pairs into two parallel lists, in class order.
# The name pred_prob_lsit (sic) is kept as is because the next cell reads it.
title_prob_pairs = list(zip(y_class, pred_prob))
pred_job_list = [title for title, _ in title_prob_pairs]
pred_prob_lsit = [prob for _, prob in title_prob_pairs]

In [127]:
# Column-oriented payload: one list of titles, one list of probabilities.
pred_result = {
    'job': pred_job_list,
    'prob': pred_prob_lsit,
}

In [129]:
# Serialise the column-oriented payload to a JSON string and display it.
pred_result_json = json.dumps(pred_result)
pred_result_json

'{"job": ["Accountant", "Accounts Payable Manager", "Aerospace Engineer ", "Android Developer", "Applications Manager", "Attorney", "Auto Mechanic", "Automation Engineer", "Automotive Technician", "Bank Reconciliation ", "Bank Teller", "Barista", "Billing Specialist", "Biologist", "Bookkeeper", "Brand Ambassador", "Bus Driver", "Business Analyst", "Business Development Manager", "CFO ", "CNC Programmer", "CPA", "Chef", "Civil Engineer", "Computer Programmer", "Computer Technician", "Criminal Investigator", "Data Analyst", "Data Entry Clerk", "Data Scientist", "Dental Hygienist", "Dentist", "Dietitian", "Digital Marketing Manager", "Director Of Construction", "Director Of Engineering", "Director Of Operations", "Dj", "Early Childhood Teacher", "Editor", "Electrical Engineer", "Electrician", "Electronics Engineer", "Electronics Technician", "End Web Developer", "Environmental Scientist", "Epidemiologist", "Esthetician", "Event Coordinator", "Event Planner", "Executive Chef", "Facilities 