In [1]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn import metrics
import joblib

#my own functions cleaning raw text
import preprocessing as pps
import getkeywords as gkw



In [2]:
#read in data
df = pd.read_csv("../../data/All_Data.csv", index_col=0)
df.shape

(1984, 8)

In [3]:
#write data distribution across states to a csv file
pd.DataFrame(df['state'].value_counts()).to_csv("../metrics/state_distribution.csv")

In [4]:
#filter data (get rid of the jobtitles that do not go into the model)
df = df[df["jobtitle"] != 'Machine Learning']
#df = df[df["jobtitle"] != 'Data Architect']
#df = df[df["jobtitle"] != 'Data Analyst']
#df = df[df["jobtitle"] != 'Data Engineer']
df.shape

(1960, 8)

In [5]:
#look at the jobtitles
set(df['jobtitle'])

{'Business Analyst',
 'Data Analyst',
 'Data Architect',
 'Data Engineer',
 'Data Scientist',
 'Database Administrator',
 'Product Manager'}

In [13]:
#split and preprocess data
data_train, data_test = pps.kfold_split(df, k=5)
train_feature = pps.raw_cleaning(data_train['snippet'], False)
train_labels = data_train['jobtitle']

In [14]:
#build pipeline including a transformer and a classifier
text_clf = Pipeline([('vect', TfidfVectorizer(max_df = 0.99, min_df = 0.01,ngram_range=(1,2))),
                     ('clf', RandomForestClassifier(n_estimators=200)),
])
text_clf = text_clf.fit(train_feature, train_labels)

In [15]:
#see how the pipeline performs on test data
test_feature = pps.raw_cleaning(data_test['snippet'], False)
predicted = text_clf.predict(test_feature)
test_labels = data_test['jobtitle']
#output = pd.DataFrame(predicted, test_labels)
print(np.mean(predicted == test_labels)) 
#print(output)
confusion = pd.DataFrame(confusion_matrix(predicted, test_labels, labels=list(set(test_labels))), columns = list(set(test_labels)))
confusion = confusion.div(confusion.sum(axis=0), axis=1)
f1s = metrics.classification_report(test_labels, predicted)

0.673469387755


In [16]:
print(f1s)

                        precision    recall  f1-score   support

      Business Analyst       0.57      0.79      0.66        76
          Data Analyst       0.60      0.44      0.51        57
        Data Architect       0.84      0.37      0.52        43
         Data Engineer       0.61      0.50      0.55        38
        Data Scientist       0.71      0.73      0.72        55
Database Administrator       0.67      0.82      0.73        44
       Product Manager       0.80      0.86      0.83        79

           avg / total       0.69      0.67      0.66       392



In [17]:
confusion

Unnamed: 0,Business Analyst,Database Administrator,Data Analyst,Data Architect,Product Manager,Data Engineer,Data Scientist
0,0.789474,0.090909,0.263158,0.209302,0.113924,0.078947,0.090909
1,0.013158,0.818182,0.052632,0.232558,0.012658,0.052632,0.018182
2,0.118421,0.022727,0.438596,0.023256,0.0,0.052632,0.072727
3,0.013158,0.0,0.017544,0.372093,0.0,0.026316,0.0
4,0.039474,0.022727,0.070175,0.023256,0.860759,0.157895,0.036364
5,0.0,0.022727,0.035088,0.116279,0.012658,0.5,0.054545
6,0.026316,0.022727,0.122807,0.023256,0.0,0.131579,0.727273


In [18]:
#write the confusion matrix to a csv file
#confusion.to_csv("../metrics/confusion_matrix_all.csv")
#dump the pipeline object to the target directory
joblib.dump(text_clf, "../JB_app/models/text_clf_all.pkl")

['../JB_app/models/text_clf_all.pkl']

In [20]:
#test on one entry
new_data = [data_test['snippet'].iloc[1]]
true_title = data_test['jobtitle'].iloc[1]
print(true_title)
new_feature = pps.raw_cleaning(new_data, False)
prediction = text_clf.predict_proba(new_feature)
output = pd.DataFrame()
output['jobtitle'] = text_clf.classes_
output['probability'] = prediction[0]
output = output.sort_values(by='probability', ascending=False)
output

Business Analyst


Unnamed: 0,jobtitle,probability
0,Business Analyst,0.54
1,Data Analyst,0.195
3,Data Engineer,0.0925
4,Data Scientist,0.0775
5,Database Administrator,0.04
6,Product Manager,0.04
2,Data Architect,0.015


In [21]:
#calculate a document * label table with each element being the probability of a document being a label
likelihoods = pd.DataFrame(columns = list(set(df['jobtitle'])))
for i in range(df.shape[0]):
    new = [df['snippet'].iloc[i]]
    feature = pps.raw_cleaning(new, False)
    prediction = text_clf.predict_proba(feature)[0]
    labels = text_clf.classes_
    likelihoods = likelihoods.append(pd.Series({label:ll for label, ll in zip(labels, prediction)}, name = str(i)))
likelihoods.head()

Unnamed: 0,Business Analyst,Database Administrator,Data Analyst,Data Architect,Product Manager,Data Engineer,Data Scientist
0,0.305,0.08,0.06,0.05,0.215,0.145,0.145
1,0.025,0.005,0.015,0.005,0.06,0.035,0.855
2,0.035,0.005,0.06,0.035,0.005,0.07,0.79
3,0.255,0.045,0.25,0.04,0.2,0.025,0.185
4,0.01,0.0,0.015,0.023333,0.035,0.901667,0.015


In [22]:
likelihoods.to_csv("../metrics/likelihoods_table_all.csv")

In [23]:
ll_table = pd.read_csv("../metrics/likelihoods_table_all.csv", index_col=0)
ll_table.head()

Unnamed: 0,Business Analyst,Database Administrator,Data Analyst,Data Architect,Product Manager,Data Engineer,Data Scientist
0,0.305,0.08,0.06,0.05,0.215,0.145,0.145
1,0.025,0.005,0.015,0.005,0.06,0.035,0.855
2,0.035,0.005,0.06,0.035,0.005,0.07,0.79
3,0.255,0.045,0.25,0.04,0.2,0.025,0.185
4,0.01,0.0,0.015,0.023333,0.035,0.901667,0.015


In [24]:
prototypes = gkw.get_prototypes(ll_table)
prototypes

{'Business Analyst': [935,
  1841,
  1438,
  928,
  1091,
  1090,
  1754,
  1413,
  1414,
  1322],
 'Data Analyst': [336, 722, 721, 612, 1058, 1804, 1801, 1775, 968, 969],
 'Data Architect': [1250, 113, 819, 1814, 171, 173, 531, 530, 118, 114],
 'Data Engineer': [1461, 97, 803, 805, 465, 463, 90, 956, 807, 466],
 'Data Scientist': [1854, 77, 1857, 155, 1858, 799, 1855, 1856, 200, 455],
 'Database Administrator': [1009,
  566,
  1920,
  501,
  570,
  1262,
  916,
  251,
  1006,
  853],
 'Product Manager': [631, 1489, 1603, 998, 997, 1913, 1211, 761, 31, 1819]}

In [25]:
tfs = pd.read_csv("../metrics/All_tfidf_features_99_all.csv", index_col=0)
tfs.head()

Unnamed: 0,company,jobtitle,jobtitle_orig,ability,ability develop,ability work,able,access,accounting,accredited,...,workflow,working,working knowledge,world,writing,written,year,year experience,year related,year relevant
21,0.0,Data Scientist,Data Scientist - Risk,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
36,0.0,Data Scientist,Data Scientist - Operations,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.10848,0.0,0.0,0.0
39,0.0,Data Scientist,Data Scientist,0.0,0.0,0.0,0.0,0.232897,0.0,0.0,...,0.0,0.0,0.0,0.23584,0.0,0.0,0.0,0.0,0.0,0.0
40,0.0,Data Scientist,Data Scientist,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
48,0.0,Data Engineer,Data Engineer – Information Management & Analy...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
result = {}
features = tfs.iloc[:,3:]
words = list(features.columns)
label = 'Data Scientist'
tmp = pd.DataFrame()
tmp['words'] = words
tmp['tfidf'] = list(features.iloc[prototypes[label],:].sum(axis=0))
tmp = tmp.sort_values(by = 'tfidf', ascending = False)
tmp['words'].iloc[:20]

35                analytics
676    statistical analysis
449             mathematics
27                 analysis
671               statistic
630       science statistic
266             engineering
489      operation research
19                 advanced
675             statistical
457                  method
461                 minimum
624                 science
446           master degree
140        computer science
184          data scientist
488               operation
138                computer
631               scientist
606                research
Name: words, dtype: object

In [27]:
keywords = gkw.get_keywords(tfs.iloc[:,3:], prototypes, 100)

In [28]:
keywords.iloc[:20,:]

Unnamed: 0,Business Analyst,Database Administrator,Data Engineer,Data Scientist,Data Analyst,Data Architect,Product Manager
0,business,database,data,analytics,data,service,product
1,requirement,sql,stack,statistical analysis,health,integration,sale
2,analyst,implementation,aws,mathematics,clinical,firm,strategy
3,document,maintenance,azure,analysis,program,availability,product manager
4,technical,dba,data processing,statistic,analysis,year experience,market
5,analysis,performance,degree accredited,science statistic,data analyst,implementation,manager
6,technology,monitoring,pipeline,engineering,determine,data,develop
7,experience,tuning,setting,operation research,information,year,customer
8,business analyst,upgrade,accredited college,advanced,ass,architecture,team
9,process,production,computer engineering,statistical,analysis data,project,concept


In [29]:
keywords.to_csv("../JB_app/keywords_all_100.csv")

In [30]:
keywords = pd.read_csv("../JB_app/keywords_all_100.csv", index_col=0)
keywords

Unnamed: 0,Business Analyst,Database Administrator,Data Engineer,Data Scientist,Data Analyst,Data Architect,Product Manager
0,business,database,data,analytics,data,service,product
1,requirement,sql,stack,statistical analysis,health,integration,sale
2,analyst,implementation,aws,mathematics,clinical,firm,strategy
3,document,maintenance,azure,analysis,program,availability,product manager
4,technical,dba,data processing,statistic,analysis,year experience,market
5,analysis,performance,degree accredited,science statistic,data analyst,implementation,manager
6,technology,monitoring,pipeline,engineering,determine,data,develop
7,experience,tuning,setting,operation research,information,year,customer
8,business analyst,upgrade,accredited college,advanced,ass,architecture,team
9,process,production,computer engineering,statistical,analysis data,project,concept


In [32]:
gkw.common_keywords('Data Architect','Data Scientist',keywords)

['data',
 'year',
 'experience',
 'related field',
 'computer science',
 'mathematics statistic',
 'computer',
 'related',
 'science',
 'field',
 'degree',
 'design',
 'mathematics',
 'statistic',
 'analytics',
 'engineering',
 'analysis']

In [34]:
new = [df['snippet'].iloc[3]]
cleaned_text = pps.raw_cleaning(new, False)
cleaned_grams = pps.get_grams(new, False)
gkw.contributing_words(cleaned_grams, keywords)

{'Business Analyst': 'analysis, design, experience',
 'Data Analyst': 'analysis, clinical, component, design, experience, outcome, provide, required, study',
 'Data Architect': 'analysis, design, experience, perform',
 'Data Engineer': 'experience, perform, plan, power, provide',
 'Data Scientist': 'analysis, design, experience, statistical',
 'Database Administrator': 'analysis, assist, design, experience, perform',
 'Product Manager': 'design, experience, plan'}