In [24]:
import numpy as np
import pandas as pd
from time import time
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.metrics import classification_report

import warnings
warnings.filterwarnings('ignore')

In [26]:
jobs = pd.read_csv('jobs.csv')

In [27]:
jobs.head()

Unnamed: 0,Title,Company,Location,Description,total_pay,Simple_Title,Job_Level
0,lead data architect,me,melbourne,"div data-automation=""mobiletemplate"" class=""_2...",159750.0,Other,Middle
1,data wrangling expert/asset performance analyst,peak services,melbourne,"div data-automation=""mobiletemplate"" class=""_2...",201500.0,Analyst,Middle
2,data scientist / data analyst,randstad - technologies,melbourne,"div data-automation=""mobiletemplate"" class=""_2...",201500.0,Analyst,Middle
3,data analyst,nab,melbourne,"div data-automation=""mobiletemplate"" class=""_2...",201500.0,Analyst,Middle
4,analyst chapter lead (cloud and big data,anz,melbourne,"div data-automation=""mobiletemplate"" class=""_2...",201500.0,Analyst,Middle


### Find the baseline

In [28]:
baseline_title = jobs.Simple_Title.value_counts().max()/len(jobs.Simple_Title) * 100
baseline_title

54.22626788036411

### Predicting the Job Category from the description

In [29]:
X = jobs.Description.values

In [30]:
y = jobs.Simple_Title.values

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=30)

### Set up the train and test sets for both Count Vectorizer and TF-TDF

In [32]:
cvec = CountVectorizer(stop_words='english', ngram_range=(1,3))
cvec.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [33]:
X_train_cv = cvec.transform(X_train)
X_test_cv = cvec.transform(X_test)

In [34]:
tfidf_vec = TfidfVectorizer(stop_words='english', ngram_range=(1,3))
tfidf_vec.fit(X_train)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [35]:
X_train_tfidf = tfidf_vec.transform(X_train)
X_test_tfidf = tfidf_vec.transform(X_test)

### Create a function to try different models

In [36]:
def train_model(classifier, X_train, y_train, X_test):
    classifier.fit(X_train, y_train)
    
    predictions = classifier.predict(X_test)
      
    return accuracy_score(predictions, y_test)

### MULTINOMIAL NAIVE BAYES

In [37]:
accuracy = train_model(MultinomialNB(), X_train_cv, y_train, X_test_cv)
print ("NB, Count Vectors: ", accuracy)

accuracy = train_model(MultinomialNB(), X_train_tfidf, y_train, X_test_tfidf)
print ("NB, TF-IDF: ", accuracy)

NB, Count Vectors:  0.6010362694300518
NB, TF-IDF:  0.5284974093264249


### LOGISTIC REGRESSION

In [38]:
accuracy = train_model(LogisticRegression(random_state=30), X_train_cv, y_train, X_test_cv)
print ("LR, Count Vectors: ", accuracy)

accuracy = train_model(LogisticRegression(random_state=30), X_train_tfidf, y_train, X_test_tfidf)
print ("LR, TF-IDF: ", accuracy)

LR, Count Vectors:  0.7357512953367875
LR, TF-IDF:  0.5647668393782384


### SUPPORT VECTOR CLASSIFIER

In [39]:
accuracy = train_model(SVC(random_state=30), X_train_tfidf, y_train, X_test_tfidf)
print ("SVC, TF-IDF: ", accuracy)

SVC, TF-IDF:  0.5284974093264249


### RANDOM FOREST CLASSIFIER

In [40]:
accuracy = train_model(RandomForestClassifier(random_state=30), X_train_cv, y_train, X_test_cv)
print ("RF, Count Vectors: ", accuracy)

accuracy = train_model(RandomForestClassifier(random_state=30), X_train_tfidf, y_train, X_test_tfidf)
print ("RF, TF-IDF: ", accuracy)

RF, Count Vectors:  0.5854922279792746
RF, TF-IDF:  0.616580310880829


In [21]:
n_samples = 2000
n_features = 1000
n_components = 10
n_top_words = 20


def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()


print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                   max_features=n_features,
                                   stop_words='english')
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(X_train)
print("done in %0.3fs." % (time() - t0))

# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
cv_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=n_features,
                                stop_words='english')
t0 = time()
cv = cv_vectorizer.fit_transform(X_train)
print("done in %0.3fs." % (time() - t0))
print()

# Fit the NMF model
print("Fitting the NMF model (Frobenius norm) with tf-idf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_components, random_state=1,
          alpha=.1, l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model (Frobenius norm):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

# Fit the NMF model
print("Fitting the NMF model (generalized Kullback-Leibler divergence) with "
      "tf-idf features, n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_components, random_state=1,
          beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1,
          l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model (generalized Kullback-Leibler divergence):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

print("Fitting LDA models with tf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
lda = LatentDirichletAllocation(n_components=n_components, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
t0 = time()
lda.fit(cv)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in LDA model:")
cv_feature_names = cv_vectorizer.get_feature_names()
print_top_words(lda, cv_feature_names, n_top_words)

Extracting tf-idf features for NMF...
done in 0.354s.
Extracting tf features for LDA...
done in 0.281s.

Fitting the NMF model (Frobenius norm) with tf-idf features, n_samples=2000 and n_features=1000...
done in 0.418s.

Topics in NMF model (Frobenius norm):
Topic #0: li ul strong experience business skills management team work working requirements analysis support role ability xa0 systems reporting amp stakeholders
Topic #1: br strong experience xa0 project ba master role work big skills analyst contract business company reporting client services working management
Topic #2: xa0 experience sales regarded skills entry management br ms working energy support outstanding industry global growth com successfully customers join
Topic #3: strong role research australia skills government work company experience www apply people position _blank month contract opportunity level recruitment target
Topic #4: analytics insights marketing customer digital business science team learning development 