In [82]:
import numpy as np
import pandas as pd

import re

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load the job-listing table from a CSV file located relative to the working directory.
jobs = pd.read_csv('jobs.csv')

In [4]:
jobs.head()

Unnamed: 0,Title,Company,Location,Description,total_pay,Simple_Title,Job_Level
0,lead data architect,me,melbourne,"div data-automation=""mobiletemplate"" class=""_2...",159750.0,Other,Middle
1,data wrangling expert/asset performance analyst,peak services,melbourne,"div data-automation=""mobiletemplate"" class=""_2...",201500.0,Analyst,Middle
2,data scientist / data analyst,randstad - technologies,melbourne,"div data-automation=""mobiletemplate"" class=""_2...",201500.0,Analyst,Middle
3,data analyst,nab,melbourne,"div data-automation=""mobiletemplate"" class=""_2...",201500.0,Analyst,Middle
4,analyst chapter lead (cloud and big data,anz,melbourne,"div data-automation=""mobiletemplate"" class=""_2...",201500.0,Analyst,Middle


### Sort the salary into three bins


In [5]:
# Map each salary to an ordinal bin: 0 below 180000, 1 from 180000 up to
# (but excluding) 205000, and 2 for everything else.
bin = [
    0 if pay < 180000 else (1 if pay < 205000 else 2)
    for pay in jobs.total_pay
]

In [6]:
# Attach the per-row salary bin as a new column; this mutates `jobs` in place.
jobs['pay_bin'] = bin

In [7]:
jobs.head()

Unnamed: 0,Title,Company,Location,Description,total_pay,Simple_Title,Job_Level,pay_bin
0,lead data architect,me,melbourne,"div data-automation=""mobiletemplate"" class=""_2...",159750.0,Other,Middle,0
1,data wrangling expert/asset performance analyst,peak services,melbourne,"div data-automation=""mobiletemplate"" class=""_2...",201500.0,Analyst,Middle,1
2,data scientist / data analyst,randstad - technologies,melbourne,"div data-automation=""mobiletemplate"" class=""_2...",201500.0,Analyst,Middle,1
3,data analyst,nab,melbourne,"div data-automation=""mobiletemplate"" class=""_2...",201500.0,Analyst,Middle,1
4,analyst chapter lead (cloud and big data,anz,melbourne,"div data-automation=""mobiletemplate"" class=""_2...",201500.0,Analyst,Middle,1


### Find the baseline

In [8]:
# Baseline accuracy (in percent): the share of rows in the most frequent
# salary bin, i.e. what always predicting the majority class would score.
bin_counts = jobs.pay_bin.value_counts()
baseline = bin_counts.max()/len(jobs.pay_bin) * 100
print(baseline)

51.36540962288687


## PREDICT SALARY FROM DESCRIPTION

In [31]:
# Select features and target by column label rather than by position, so the
# cell keeps picking the intended columns even if `jobs` gains or reorders
# columns (position 3 is 'Description' and the last column is 'pay_bin' in the
# frame displayed above).
X = jobs['Description']
y = jobs['pay_bin']

In [32]:
# Train/test split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=30,
)

### Set up the train and test sets for both Count Vectorizer and TF-IDF

In [33]:
# Count vectorizer over 1- to 3-word n-grams with English stop words removed.
# The vocabulary is learned from the training split only.
cvec = CountVectorizer(
    stop_words='english',
    ngram_range=(1, 3),
)
cvec.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [34]:
# Encode both splits with the already-fitted count vectorizer (train first, then test).
X_train_cv, X_test_cv = (cvec.transform(split) for split in (X_train, X_test))

In [35]:
# TF-IDF vectorizer with the same tokenisation settings as the count vectorizer;
# fitted on the training split only.
tfidf_vec = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 3),
)
tfidf_vec.fit(X_train)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [36]:
# Encode both splits with the already-fitted TF-IDF vectorizer (train first, then test).
X_train_tfidf, X_test_tfidf = (tfidf_vec.transform(split) for split in (X_train, X_test))

### Create a function to try different models

In [37]:
def train_model(classifier, X_train, y_train, X_test, y_true=None):
    """Fit ``classifier`` and return its accuracy on the held-out features.

    Parameters
    ----------
    classifier : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, y_train : features and labels passed to ``fit``.
    X_test : features to predict on.
    y_true : labels that correspond to ``X_test``. When omitted, the
        notebook-level ``y_test`` is used so that existing four-argument
        calls keep working.

    Returns
    -------
    The result of ``accuracy_score(y_true, predictions)``.
    """
    if y_true is None:
        # Backward-compatible fallback to the global split; pass ``y_true``
        # explicitly to avoid scoring against labels from a different split.
        y_true = y_test

    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)

    # accuracy_score takes the ground truth first, then the predictions.
    return accuracy_score(y_true, predictions)

### MULTINOMIAL NAIVE BAYES

In [38]:
# Multinomial Naive Bayes on each text representation; a fresh estimator is built per run.
for caption, train_feats, test_feats in (
    ("NB, Count Vectors: ", X_train_cv, X_test_cv),
    ("NB, TF-IDF: ", X_train_tfidf, X_test_tfidf),
):
    accuracy = train_model(MultinomialNB(), train_feats, y_train, test_feats)
    print(caption, accuracy)

NB, Count Vectors:  0.5699481865284974
NB, TF-IDF:  0.5025906735751295


### LOGISTIC REGRESSION

In [39]:
# Logistic regression on each text representation; a fresh estimator is built per run.
for caption, train_feats, test_feats in (
    ("LR, Count Vectors: ", X_train_cv, X_test_cv),
    ("LR, TF-IDF: ", X_train_tfidf, X_test_tfidf),
):
    accuracy = train_model(LogisticRegression(random_state=30), train_feats, y_train, test_feats)
    print(caption, accuracy)

LR, Count Vectors:  0.6217616580310881
LR, TF-IDF:  0.5284974093264249


### SUPPORT VECTOR CLASSIFIER

In [40]:
# Support vector classifier, evaluated on the TF-IDF features only.
svc_model = SVC(random_state=30)
accuracy = train_model(svc_model, X_train_tfidf, y_train, X_test_tfidf)
print("SVC, TF-IDF: ", accuracy)

SVC, TF-IDF:  0.5025906735751295


### RANDOM FOREST CLASSIFIER

In [41]:
# Random forest on each text representation; a fresh, seeded estimator is built per run.
for caption, train_feats, test_feats in (
    ("RF, Count Vectors: ", X_train_cv, X_test_cv),
    ("RF, TF-IDF: ", X_train_tfidf, X_test_tfidf),
):
    accuracy = train_model(RandomForestClassifier(random_state=30), train_feats, y_train, test_feats)
    print(caption, accuracy)

RF, Count Vectors:  0.5958549222797928
RF, TF-IDF:  0.5181347150259067


## PREDICT SALARY FROM TITLE (Simple_Title column)

In [65]:
# Select features and target by column label rather than by position.
# Position 5 in the frame displayed above is 'Simple_Title' (the simplified
# title category), not the raw 'Title' column; the last column is 'pay_bin'.
X = jobs['Simple_Title']
y = jobs['pay_bin']

In [66]:
# Train/test split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=30,
)

In [67]:
# Count vectorizer over 1- to 3-word n-grams with English stop words removed.
# The vocabulary is learned from the training split only.
cvec = CountVectorizer(
    stop_words='english',
    ngram_range=(1, 3),
)
cvec.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [68]:
# Encode both splits with the already-fitted count vectorizer (train first, then test).
X_train_cv, X_test_cv = (cvec.transform(split) for split in (X_train, X_test))

In [69]:
# TF-IDF vectorizer with the same tokenisation settings as the count vectorizer;
# fitted on the training split only.
tfidf_vec = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 3),
)
tfidf_vec.fit(X_train)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [70]:
# Encode both splits with the already-fitted TF-IDF vectorizer (train first, then test).
X_train_tfidf, X_test_tfidf = (tfidf_vec.transform(split) for split in (X_train, X_test))

In [71]:
def train_model(classifier, X_train, y_train, X_test, y_true=None):
    """Fit ``classifier`` and return its accuracy on the held-out features.

    NOTE: this notebook defines ``train_model`` in more than one cell; the
    most recently executed definition is the one in effect.

    Parameters
    ----------
    classifier : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, y_train : features and labels passed to ``fit``.
    X_test : features to predict on.
    y_true : labels that correspond to ``X_test``. When omitted, the
        notebook-level ``y_test`` is used so that existing four-argument
        calls keep working.

    Returns
    -------
    The result of ``accuracy_score(y_true, predictions)``.
    """
    if y_true is None:
        # Backward-compatible fallback to the global split; pass ``y_true``
        # explicitly to avoid scoring against labels from a different split.
        y_true = y_test

    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)

    # accuracy_score takes the ground truth first, then the predictions.
    return accuracy_score(y_true, predictions)

In [72]:
# Multinomial Naive Bayes on each text representation; a fresh estimator is built per run.
for caption, train_feats, test_feats in (
    ("NB, Count Vectors: ", X_train_cv, X_test_cv),
    ("NB, TF-IDF: ", X_train_tfidf, X_test_tfidf),
):
    accuracy = train_model(MultinomialNB(), train_feats, y_train, test_feats)
    print(caption, accuracy)

NB, Count Vectors:  0.5544041450777202
NB, TF-IDF:  0.5544041450777202


In [73]:
# Logistic regression on each text representation; a fresh estimator is built per run.
for caption, train_feats, test_feats in (
    ("LR, Count Vectors: ", X_train_cv, X_test_cv),
    ("LR, TF-IDF: ", X_train_tfidf, X_test_tfidf),
):
    accuracy = train_model(LogisticRegression(random_state=30), train_feats, y_train, test_feats)
    print(caption, accuracy)

LR, Count Vectors:  0.8290155440414507
LR, TF-IDF:  0.8290155440414507


In [74]:
# Support vector classifier, evaluated on the TF-IDF features only.
svc_model = SVC(random_state=30)
accuracy = train_model(svc_model, X_train_tfidf, y_train, X_test_tfidf)
print("SVC, TF-IDF: ", accuracy)

SVC, TF-IDF:  0.8290155440414507


In [75]:
# Random forest on each text representation; a fresh, seeded estimator is built per run.
for caption, train_feats, test_feats in (
    ("RF, Count Vectors: ", X_train_cv, X_test_cv),
    ("RF, TF-IDF: ", X_train_tfidf, X_test_tfidf),
):
    accuracy = train_model(RandomForestClassifier(random_state=30), train_feats, y_train, test_feats)
    print(caption, accuracy)

RF, Count Vectors:  0.8290155440414507
RF, TF-IDF:  0.8290155440414507


## PREDICT SALARY FROM LOCATION

In [77]:
# Select features and target by column label rather than by position, so the
# cell keeps picking the intended columns even if `jobs` gains or reorders
# columns (position 2 is 'Location' and the last column is 'pay_bin' in the
# frame displayed above).
X = jobs['Location']
y = jobs['pay_bin']

In [78]:
# Train/test split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=30,
)

In [79]:
# Count vectorizer over 1- to 3-word n-grams with English stop words removed.
# The vocabulary is learned from the training split only.
cvec = CountVectorizer(
    stop_words='english',
    ngram_range=(1, 3),
)
cvec.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [80]:
# Encode both splits with the already-fitted count vectorizer (train first, then test).
X_train_cv, X_test_cv = (cvec.transform(split) for split in (X_train, X_test))

In [81]:
# TF-IDF vectorizer with the same tokenisation settings as the count vectorizer;
# fitted on the training split only.
tfidf_vec = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 3),
)
tfidf_vec.fit(X_train)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [82]:
# Encode both splits with the already-fitted TF-IDF vectorizer (train first, then test).
X_train_tfidf, X_test_tfidf = (tfidf_vec.transform(split) for split in (X_train, X_test))

In [83]:
def train_model(classifier, X_train, y_train, X_test, y_true=None):
    """Fit ``classifier`` and return its accuracy on the held-out features.

    NOTE: this notebook defines ``train_model`` in more than one cell; the
    most recently executed definition is the one in effect.

    Parameters
    ----------
    classifier : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, y_train : features and labels passed to ``fit``.
    X_test : features to predict on.
    y_true : labels that correspond to ``X_test``. When omitted, the
        notebook-level ``y_test`` is used so that existing four-argument
        calls keep working.

    Returns
    -------
    The result of ``accuracy_score(y_true, predictions)``.
    """
    if y_true is None:
        # Backward-compatible fallback to the global split; pass ``y_true``
        # explicitly to avoid scoring against labels from a different split.
        y_true = y_test

    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)

    # accuracy_score takes the ground truth first, then the predictions.
    return accuracy_score(y_true, predictions)

In [84]:
# Multinomial Naive Bayes on each text representation; a fresh estimator is built per run.
for caption, train_feats, test_feats in (
    ("NB, Count Vectors: ", X_train_cv, X_test_cv),
    ("NB, TF-IDF: ", X_train_tfidf, X_test_tfidf),
):
    accuracy = train_model(MultinomialNB(), train_feats, y_train, test_feats)
    print(caption, accuracy)

NB, Count Vectors:  0.5025906735751295
NB, TF-IDF:  0.5025906735751295


In [85]:
# Logistic regression on each text representation; a fresh estimator is built per run.
for caption, train_feats, test_feats in (
    ("LR, Count Vectors: ", X_train_cv, X_test_cv),
    ("LR, TF-IDF: ", X_train_tfidf, X_test_tfidf),
):
    accuracy = train_model(LogisticRegression(random_state=30), train_feats, y_train, test_feats)
    print(caption, accuracy)

LR, Count Vectors:  0.5025906735751295
LR, TF-IDF:  0.5025906735751295


In [86]:
# Support vector classifier, evaluated on the TF-IDF features only.
svc_model = SVC(random_state=30)
accuracy = train_model(svc_model, X_train_tfidf, y_train, X_test_tfidf)
print("SVC, TF-IDF: ", accuracy)

SVC, TF-IDF:  0.5025906735751295


In [87]:
# Random forest on each text representation; a fresh, seeded estimator is built per run.
for caption, train_feats, test_feats in (
    ("RF, Count Vectors: ", X_train_cv, X_test_cv),
    ("RF, TF-IDF: ", X_train_tfidf, X_test_tfidf),
):
    accuracy = train_model(RandomForestClassifier(random_state=30), train_feats, y_train, test_feats)
    print(caption, accuracy)

RF, Count Vectors:  0.5025906735751295
RF, TF-IDF:  0.5025906735751295


## PREDICT SALARY FROM SUB GROUPS

In [43]:
# Keep only the categorical predictors plus the binned-salary target.
subgroup_columns = ['Location', 'Simple_Title', 'Job_Level', 'pay_bin']
salary = jobs[subgroup_columns]

In [55]:
# One-hot encode the categorical columns. In the frame displayed below, the
# numeric `pay_bin` column is kept as-is and appears first, followed by the
# generated indicator columns.
salary = pd.get_dummies(salary)

In [56]:
salary.head()

Unnamed: 0,pay_bin,Location_melbourne,Location_sydney,Simple_Title_Analyst,Simple_Title_Consultant,Simple_Title_Engineer,Simple_Title_Other,Simple_Title_Scientist,Job_Level_Junior,Job_Level_Middle,Job_Level_Senior
0,0,1,0,0,0,0,1,0,0,1,0
1,1,1,0,1,0,0,0,0,0,1,0
2,1,1,0,1,0,0,0,0,0,1,0
3,1,1,0,1,0,0,0,0,0,1,0
4,1,1,0,1,0,0,0,0,0,1,0


In [57]:
# Separate the target from the one-hot features by column label rather than by
# position, so the split does not depend on `pay_bin` being the first column.
X = salary.drop(columns='pay_bin')
y = salary['pay_bin']

In [58]:
# Train/test split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=30,
)

In [63]:
def train_model(classifier, X_train, y_train, X_test, y_true=None):
    """Fit ``classifier`` and return ``(test_accuracy, train_accuracy)``.

    NOTE: this notebook defines ``train_model`` in more than one cell; the
    most recently executed definition is the one in effect. This variant
    returns a pair rather than a single score.

    Parameters
    ----------
    classifier : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, y_train : features and labels passed to ``fit``; also used to
        compute the training accuracy.
    X_test : held-out features to predict on.
    y_true : labels that correspond to ``X_test``. When omitted, the
        notebook-level ``y_test`` is used so that existing four-argument
        calls keep working.

    Returns
    -------
    A tuple ``(accuracy on X_test, accuracy on X_train)``.
    """
    if y_true is None:
        # Backward-compatible fallback to the global split; pass ``y_true``
        # explicitly to avoid scoring against labels from a different split.
        y_true = y_test

    classifier.fit(X_train, y_train)

    train_predictions = classifier.predict(X_train)
    predictions = classifier.predict(X_test)

    # accuracy_score takes the ground truth first, then the predictions.
    return accuracy_score(y_true, predictions), accuracy_score(y_train, train_predictions)

In [72]:
# Multinomial Naive Bayes on the one-hot features; `accuracy` is whatever
# train_model returns and is printed unchanged.
nb_model = MultinomialNB()
accuracy = train_model(nb_model, X_train, y_train, X_test)
print("NB test , train: ", accuracy)

NB test , train:  (0.927461139896373, 0.9340277777777778)


In [67]:
# Logistic regression on the one-hot features; `accuracy` is whatever
# train_model returns and is printed unchanged.
lr_model = LogisticRegression(random_state=30)
accuracy = train_model(lr_model, X_train, y_train, X_test)
print("LR test , train: ", accuracy)

LR test , train:  (0.927461139896373, 0.9340277777777778)


In [69]:
# Support vector classifier on the one-hot features; `accuracy` is whatever
# train_model returns and is printed unchanged.
svc_model = SVC(random_state=30)
accuracy = train_model(svc_model, X_train, y_train, X_test)
print("SVC test , train: ", accuracy)

SVC test , train:  (0.9481865284974094, 0.9496527777777778)


In [70]:
# Random forest on the one-hot features; `accuracy` is whatever
# train_model returns and is printed unchanged.
rf_model = RandomForestClassifier(random_state=30)
accuracy = train_model(rf_model, X_train, y_train, X_test)
print("RF test , train: ", accuracy)

RF test , train:  (0.9481865284974094, 0.9496527777777778)


In [None]:

# `logreg` was never defined anywhere in the notebook, so this cell raised a
# NameError on a fresh kernel. Build and fit the model here so the cell is
# self-contained, then report train/test accuracy in percent.
logreg = LogisticRegression(random_state=30)
logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_test)
print("The accuracy score of train:", logreg.score(X_train, y_train)*100)
print("The accuracy score of test:", logreg.score(X_test, y_test)*100)

In [81]:
# Fit a logistic regression on the one-hot features and tabulate its
# coefficients: one row per feature, one column per salary bin.
# NOTE(review): the LOW/MED/HIGH labels assume the coefficient rows follow
# bins 0, 1, 2 in that order — confirm against lr.classes_.
lr = LogisticRegression(random_state=30)
lr = lr.fit(X_train, y_train)  # fit returns the estimator itself
pred = lr.predict(X_test)
lr_counts = pd.DataFrame(
    lr.coef_.T,
    index=X_train.columns,
    columns=['LOW', 'MED', 'HIGH'],
)

[[-0.51652999 -0.28160783 -1.33992869 -1.03099023 -1.4192096   4.22098127
  -1.22899057  3.12128939 -0.48202994 -3.43739727]
 [-0.30785534 -0.6114627   1.44833952  1.29494407  1.82612144 -2.97185202
  -2.51687105 -1.50420879  1.91667307 -1.33178233]
 [-0.3076616  -0.04227938 -0.50914327 -0.29340036 -0.7202364  -2.45373205
   3.62657111 -1.47516085 -1.65163085  2.77685073]]


In [79]:
lr_counts

Unnamed: 0,LOW,MED,HIGH
Location_melbourne,-0.51653,-0.307855,-0.307662
Location_sydney,-0.281608,-0.611463,-0.042279
Simple_Title_Analyst,-1.339929,1.44834,-0.509143
Simple_Title_Consultant,-1.03099,1.294944,-0.2934
Simple_Title_Engineer,-1.41921,1.826121,-0.720236
Simple_Title_Other,4.220981,-2.971852,-2.453732
Simple_Title_Scientist,-1.228991,-2.516871,3.626571
Job_Level_Junior,3.121289,-1.504209,-1.475161
Job_Level_Middle,-0.48203,1.916673,-1.651631
Job_Level_Senior,-3.437397,-1.331782,2.776851


In [83]:
# Per-class precision / recall / F1 for the logistic-regression predictions.
Prec_score = classification_report(
    y_test,
    pred,
)
print(Prec_score)

              precision    recall  f1-score   support

           0       0.97      0.94      0.95        62
           1       0.92      0.95      0.93        97
           2       0.88      0.85      0.87        34

   micro avg       0.93      0.93      0.93       193
   macro avg       0.92      0.91      0.92       193
weighted avg       0.93      0.93      0.93       193

