# Job Titles
Given a job title such as "junior data analyst", categorize it as one of "finance", "sales", or "technology".

In [1]:
# Notebook setup: matplotlib backend selection, library imports, and seaborn's
# default plot theme.
# NOTE(review): two matplotlib backends are requested back to back; the later
# `%matplotlib notebook` presumably overrides `%matplotlib inline` — confirm
# which one is intended and keep only that one.
%matplotlib inline
%matplotlib notebook
import numpy as np
import seaborn as sns
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
# Apply seaborn's default styling to matplotlib figures.
sns.set()

### Loading and inspecting the dataset

In [2]:
# Combine both CSV files into one frame (test rows first, then train rows).
# `DataFrame.append` was deprecated in pandas 1.4 and removed in pandas 2.0,
# so the frames are stacked with `pd.concat` instead. Each file keeps its own
# row labels, so the combined index may contain duplicates.
df = pd.concat([pd.read_csv('data/test.csv'), pd.read_csv('data/train.csv')])
df.head()

Unnamed: 0,job_title,job_category
0,data analyst manager,technology
1,junior sales manager,sales
0,underwriter manager,finance
1,mortgage data analyst,finance
2,junior underwriter,finance


In [3]:
# Give every distinct category an integer code, numbered in order of first
# appearance. `enumerate` sizes the mapping from the data; the previous
# `zip(..., range(3))` silently dropped any category beyond the third.
job_map = {job: i for i, job in enumerate(df['job_category'].unique())}

# Encode the column with the mapping. `map` performs a direct lookup and
# yields an integer column without relying on `replace`'s implicit
# object-to-int downcasting.
# NOTE: this overwrites the column in place, so re-running the cell without
# reloading `df` would rebuild `job_map` from the integer codes instead of
# the category names.
df['job_category'] = df['job_category'].map(job_map)
job_map

{'technology': 0, 'sales': 1, 'finance': 2}

### Preprocessing

In [4]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer

# Features: word-count vectors of the job titles (English stop words
# removed), densified into a plain array.
text_vectorizer = CountVectorizer(stop_words='english')
title_counts = text_vectorizer.fit_transform(df['job_title'].values)
X = title_counts.toarray()

# Targets: the integer category codes as a single column, one-hot encoded and
# densified.
class_encoder = OneHotEncoder(categories='auto')
category_column = df['job_category'].values.reshape(-1, 1)
y = class_encoder.fit_transform(category_column).toarray()

# Show the first encoded sample and its target.
X[0], y[0]

(array([1, 0, 1, 0, 1, 0, 0, 0, 0, 0], dtype=int64), array([1., 0., 0.]))

### Building & testing the model

In [7]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Hold out 20% of the samples for evaluation; the seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=10)

# Depth-limited decision tree. scikit-learn permutes the candidate features
# at every split, so without a fixed `random_state` two runs can grow
# different trees when several splits are equally good. Pin the seed so the
# fitted model and the scores below are reproducible.
dt = DecisionTreeClassifier(max_depth=5, random_state=10)
dt.fit(X_train, y_train)

# `y` is the one-hot target matrix, so a sample only counts as correct when
# its whole predicted row matches.
print('Accuracy on training set:', dt.score(X_train, y_train))
print('Accuracy on test set:', dt.score(X_test, y_test))

Accuracy on training set: 1.0
Accuracy on test set: 1.0


### Testing

In [35]:
# Reverse lookup: integer code -> category name.
job_map_inverse = {code: name for name, code in job_map.items()}

def predict(s):
    """Return the predicted job category name for one job title.

    The title ``s`` is vectorized with the module-level ``text_vectorizer``,
    classified by ``dt``, decoded back to an integer code with
    ``class_encoder`` and finally translated to its name via
    ``job_map_inverse``.
    """
    # The vectorizer expects a collection of documents, so wrap the title.
    features = text_vectorizer.transform([s]).toarray()
    one_hot = dt.predict(features)
    # One sample went in, so the decoded result has a single row holding a
    # single value: the integer category code.
    decoded = class_encoder.inverse_transform(one_hot)
    category_code = decoded[0][0]
    return job_map_inverse[category_code]

# Smoke test: run a few hand-written titles through predict() and print each
# title next to its predicted category.
to_test = ['data analyst', 'sales manager', 'underwriter']

for title in to_test:
    print('"{}" -> "{}"'.format(title, predict(title)))

"data analyst" -> "technology"
"sales manager" -> "sales"
"underwriter" -> "finance"
