In [47]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, f1_score

Read Data

In [None]:
# Load the labelled dataset; later cells read its 'job title' and 'industry' columns.
# The path is relative, so the CSV must be in the notebook's working directory.
DATA_PATH = 'Job titles and industries.csv'
data = pd.read_csv(DATA_PATH)

Count number of samples in each class

In [3]:
# Split the job titles into one Series per industry so the size of each class
# can be inspected.
job_titles = data['job title']
industry_of = data['industry']

IT = job_titles[industry_of == 'IT']
Marketing = job_titles[industry_of == 'Marketing']
Education = job_titles[industry_of == 'Education']
Accountancy = job_titles[industry_of == 'Accountancy']

In [5]:
# Report the number of job titles per industry, in the order
# IT, Marketing, Education, Accountancy.
for class_titles in (IT, Marketing, Education, Accountancy):
    print(class_titles.shape)

(4746,)
(2031,)
(1435,)
(374,)


Encode the industry labels as integers and vectorize the job titles into TF-IDF features

In [75]:
# Targets: map each industry name to an integer code. `le` is kept so that
# predicted codes can be mapped back to names with `le.inverse_transform`.
industry_labels = data['industry']
le = LabelEncoder()
y = le.fit_transform(industry_labels)

# Features: TF-IDF weights over the words of each job title (default settings).
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split, so its vocabulary and IDF weights also reflect the held-out titles.
title_text = data['job title']
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(title_text)

# Sanity check: features and targets must have the same number of rows.
for arr in (X, y):
    print(arr.shape)

(8586, 2045)
(8586,)


Train/test split, then apply SMOTE to the training data only to deal with class imbalance

In [81]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Oversample the training portion only, so the test set is left as it was split.
# `fit_resample` is the supported name for this call; the older `fit_sample`
# alias was deprecated and later removed from imbalanced-learn.
# `y_train` is already one-dimensional, so no `.ravel()` is needed.
sm = SMOTE(random_state=2)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

# After resampling, both shapes should agree on the number of rows.
print(X_train_res.shape)
print(y_train_res.shape)

(15216, 2045)
(15216,)


Testing on Naive Bayes

In [82]:
# Baseline: fit Multinomial Naive Bayes on the oversampled training set and
# measure accuracy on the test split.
clf = MultinomialNB().fit(X_train_res, y_train_res)
y_pred = clf.predict(X_test)
nb_accuracy = accuracy_score(y_test, y_pred)
print(nb_accuracy)

0.909778812572759


In [100]:
# Linear-kernel SVM trained on the same oversampled training set and scored
# on the same test split as the Naive Bayes baseline.
svm = SVC(kernel='linear').fit(X_train_res, y_train_res)
svm_y = svm.predict(X_test)
svm_accuracy = accuracy_score(y_test, svm_y)
print(svm_accuracy)

0.9348079161816065


In [98]:
# Macro-averaged F1 of the SVM predictions: every class contributes equally,
# regardless of how many test rows it has.
f1_score(y_true=y_test, y_pred=svm_y, average='macro')

0.9070411109401024

In [99]:
# Confusion matrix of the SVM predictions. By scikit-learn's convention rows are
# true classes and columns are predicted classes, ordered by the integer codes
# produced by `le`.
confusion_matrix(y_true=y_test, y_pred=svm_y)

array([[ 59,   2,   5,   3],
       [  2, 277,  11,  19],
       [  4,  11, 905,  22],
       [  4,  13,  16, 365]], dtype=int64)

In [94]:
# Persist the trained SVM to disk.
# `sklearn.externals.joblib` was deprecated and then removed from scikit-learn;
# the standalone `joblib` package (a scikit-learn dependency) replaces it.
# NOTE(review): only the classifier is saved. Scoring new titles also needs the
# fitted `vectorizer` and `le`, which are not persisted here.
import joblib

joblib.dump(svm, 'saved_model.pkl')



['saved_model.pkl']

In [97]:
# Smoke test: classify one unseen job title end to end.
sample_titles = ['data scientist']

# Reuse the fitted vectorizer so the features line up with the training columns.
job = vectorizer.transform(sample_titles)
predicted_codes = svm.predict(job)

# Map the integer class code back to its industry name.
industry = le.inverse_transform(predicted_codes)
print(industry[0])

IT
