In [1]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn import metrics
import joblib

#my own functions cleaning raw text
import preprocessing as pps
import getkeywords as gkw



In [27]:
#read in data
df = pd.read_csv("../../data/All_Data.csv", index_col=0)
df.shape

(1984, 8)

In [3]:
#write data distribution across states to a csv file
pd.DataFrame(df['state'].value_counts()).to_csv("../metrics/state_distribution.csv")

In [28]:
#filter data (get rid of the jobtitles that do not go into the model)
df = df[df["jobtitle"] != 'Machine Learning']
#df = df[df["jobtitle"] != 'Data Architect']
#df = df[df["jobtitle"] != 'Data Analyst']
#df = df[df["jobtitle"] != 'Data Engineer']
df.shape

(1960, 8)

In [29]:
#look at the jobtitles left
set(df['jobtitle'])

{'Business Analyst',
 'Data Analyst',
 'Data Architect',
 'Data Engineer',
 'Data Scientist',
 'Database Administrator',
 'Product Manager'}

In [34]:
#split and preprocess data
data_train, data_test = pps.kfold_split(df, k=5)
train_feature = pps.raw_cleaning(data_train['snippet'], False)
train_labels = data_train['jobtitle']

In [35]:
#build pipeline including a transformer and a classifier
text_clf = Pipeline([('vect', TfidfVectorizer(max_df = 0.99, min_df = 0.01,ngram_range=(1,3))),
                     ('clf', RandomForestClassifier(n_estimators=200)),
])
text_clf = text_clf.fit(train_feature, train_labels)

In [36]:
#see how the pipeline performs on test data
test_feature = pps.raw_cleaning(data_test['snippet'], False)
predicted = text_clf.predict(test_feature)
test_labels = data_test['jobtitle']
#output = pd.DataFrame(predicted, test_labels)
print(np.mean(predicted == test_labels)) 
#print(output)
confusion = pd.DataFrame(confusion_matrix(predicted, test_labels, labels=list(set(test_labels))), columns = list(set(test_labels)))
confusion = confusion.div(confusion.sum(axis=0), axis=1)
f1s = metrics.classification_report(test_labels, predicted)

0.691326530612


In [37]:
print(f1s)

                        precision    recall  f1-score   support

      Business Analyst       0.66      0.73      0.69        86
          Data Analyst       0.45      0.55      0.50        47
        Data Architect       0.71      0.56      0.63        27
         Data Engineer       0.71      0.45      0.56        33
        Data Scientist       0.76      0.75      0.75        55
Database Administrator       0.82      0.76      0.79        70
       Product Manager       0.75      0.78      0.77        74

           avg / total       0.70      0.69      0.69       392



In [38]:
confusion

Unnamed: 0,Database Administrator,Product Manager,Data Scientist,Data Engineer,Data Analyst,Business Analyst,Data Architect
0,0.757143,0.0,0.0,0.151515,0.06383,0.023256,0.074074
1,0.014286,0.783784,0.054545,0.060606,0.042553,0.116279,0.037037
2,0.028571,0.013514,0.745455,0.121212,0.106383,0.011628,0.0
3,0.014286,0.0,0.018182,0.454545,0.021277,0.0,0.111111
4,0.057143,0.054054,0.145455,0.090909,0.553191,0.116279,0.111111
5,0.1,0.135135,0.018182,0.060606,0.212766,0.732558,0.111111
6,0.028571,0.013514,0.018182,0.060606,0.0,0.0,0.555556


In [11]:
#write the confusion matrix to a csv file
#confusion.to_csv("../metrics/confusion_matrix_all.csv")
#dump the pipeline object to the target directory
joblib.dump(text_clf, "../JB_app/models/text_clf_medium.pkl")

['../JB_app/models/text_clf_medium.pkl']

In [39]:
#test on one entry
new_data = [data_test['snippet'].iloc[1]]
new_feature = pps.raw_cleaning(new_data, False)
prediction = text_clf.predict_proba(new_feature)
output = pd.DataFrame()
output['jobtitle'] = text_clf.classes_
output['probability'] = prediction[0]
output = output.sort_values(by='probability', ascending=False)
output

Unnamed: 0,jobtitle,probability
0,Business Analyst,0.525
6,Product Manager,0.19
1,Data Analyst,0.075
5,Database Administrator,0.065
4,Data Scientist,0.06
2,Data Architect,0.05
3,Data Engineer,0.035


In [40]:
#calculate a document * label table with each element being the probability of a document being a label
likelihoods = pd.DataFrame(columns = list(set(df['jobtitle'])))
for i in range(df.shape[0]):
    new = [df['snippet'].iloc[i]]
    feature = pps.raw_cleaning(new, False)
    prediction = text_clf.predict_proba(feature)[0]
    labels = text_clf.classes_
    likelihoods = likelihoods.append(pd.Series({label:ll for label, ll in zip(labels, prediction)}, name = str(i)))
likelihoods.head()

Unnamed: 0,Database Administrator,Product Manager,Data Scientist,Data Engineer,Data Analyst,Business Analyst,Data Architect
0,0.045,0.1,0.7,0.03,0.008,0.092,0.025
1,0.01,0.045,0.885,0.01,0.025,0.02,0.005
2,0.005,0.015,0.838333,0.061667,0.06,0.01,0.01
3,0.08,0.14,0.175,0.03,0.276667,0.263333,0.035
4,0.005,0.015,0.025,0.945,0.005,0.0,0.005


In [41]:
likelihoods.to_csv("../metrics/likelihoods_table_all.csv")

In [42]:
ll_table = pd.read_csv("../metrics/likelihoods_table_all.csv", index_col=0)
ll_table.head()

Unnamed: 0,Database Administrator,Product Manager,Data Scientist,Data Engineer,Data Analyst,Business Analyst,Data Architect
0,0.045,0.1,0.7,0.03,0.008,0.092,0.025
1,0.01,0.045,0.885,0.01,0.025,0.02,0.005
2,0.005,0.015,0.838333,0.061667,0.06,0.01,0.01
3,0.08,0.14,0.175,0.03,0.276667,0.263333,0.035
4,0.005,0.015,0.025,0.945,0.005,0.0,0.005


In [43]:
prototypes = gkw.get_prototypes(ll_table)
prototypes

{'Business Analyst': [1414,
  1413,
  935,
  1551,
  1883,
  1161,
  1307,
  1559,
  1133,
  928],
 'Data Analyst': [222, 214, 437, 438, 1801, 1804, 1334, 219, 1339, 722],
 'Data Architect': [1477, 557, 555, 403, 402, 1476, 894, 1250, 223, 978],
 'Data Engineer': [1461, 803, 97, 805, 463, 465, 90, 38, 92, 466],
 'Data Scientist': [383, 81, 77, 1854, 1857, 155, 1858, 1856, 1855, 420],
 'Database Administrator': [570,
  566,
  501,
  684,
  1920,
  918,
  1752,
  1703,
  916,
  1258],
 'Product Manager': [631, 1489, 1745, 1819, 232, 1603, 756, 58, 826, 1201]}

In [44]:
tfs = pd.read_csv("../metrics/All_tfidf_features_99.csv", index_col=0)
tfs.head()

Unnamed: 0,company,jobtitle,jobtitle_orig,ability,able,access,accounting,accredited,acquisition,across,...,wide,within,without,work,workflow,working,world,writing,written,year
21,0.0,Data Scientist,Data Scientist - Risk,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
36,0.0,Data Scientist,Data Scientist - Operations,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.114881,0.0,0.0,0.0,0.0,0.0,0.123433
39,0.0,Data Scientist,Data Scientist,0.0,0.0,0.247118,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.250241,0.0,0.0,0.0
40,0.0,Data Scientist,Data Scientist,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
48,0.0,Data Engineer,Data Engineer – Information Management & Analy...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.198704,0.0,0.0,0.0,0.0,0.0,0.0


In [45]:
result = {}
features = tfs.iloc[:,3:]
words = list(features.columns)
label = 'Data Scientist'
tmp = pd.DataFrame()
tmp['words'] = words
tmp['tfidf'] = list(features.iloc[prototypes[label],:].sum(axis=0))
tmp = tmp.sort_values(by = 'tfidf', ascending = False)
tmp['words'].iloc[:20]

26        analytics
513     statistical
332     mathematics
22         analysis
512       statistic
15         advanced
185     engineering
480         science
105        computer
345           model
368    optimization
365       operation
394       personnel
462        research
165      discipline
24         analytic
338          method
335          medium
441    quantitative
356         network
Name: words, dtype: object

In [46]:
prototypes.keys()

dict_keys(['Database Administrator', 'Product Manager', 'Data Analyst', 'Business Analyst', 'Data Scientist', 'Data Architect', 'Data Engineer'])

In [47]:
keywords = gkw.get_keywords(tfs.iloc[:,3:], prototypes, 30)

In [48]:
keywords.iloc[:20,:]

Unnamed: 0,Database Administrator,Product Manager,Data Analyst,Business Analyst,Data Scientist,Data Architect,Data Engineer
0,database,product,data,business,analytics,architecture,data
1,backup,market,health,requirement,statistical,data,stack
2,server,strategy,creative,document,mathematics,strategy,aws
3,dba,manager,ad,analyst,analysis,architect,azure
4,configuration,plan,use,analysis,statistic,network,pipeline
5,certification,pricing,office,test,advanced,implementation,setting
6,storage,team,including,time,engineering,standard,technology
7,performs,responsibility,development,functional,science,ensure,variety
8,implementation,new,quality,within,computer,system,mining
9,maintaining,direct,ability,position,model,technical,computer


In [49]:
keywords.to_csv("../JB_app/keywords_all_30.csv")

In [50]:
keywords = pd.read_csv("../JB_app/keywords_all_30.csv", index_col=0)
keywords

Unnamed: 0,Database Administrator,Product Manager,Data Analyst,Business Analyst,Data Scientist,Data Architect,Data Engineer
0,database,product,data,business,analytics,architecture,data
1,backup,market,health,requirement,statistical,data,stack
2,server,strategy,creative,document,mathematics,strategy,aws
3,dba,manager,ad,analyst,analysis,architect,azure
4,configuration,plan,use,analysis,statistic,network,pipeline
5,certification,pricing,office,test,advanced,implementation,setting
6,storage,team,including,time,engineering,standard,technology
7,performs,responsibility,development,functional,science,ensure,variety
8,implementation,new,quality,within,computer,system,mining
9,maintaining,direct,ability,position,model,technical,computer


In [51]:
gkw.common_keywords('Data Architect','Data Analyst',keywords)

['data', 'development']

In [52]:
new = [df['snippet'].iloc[1]]
cleaned_text = pps.raw_cleaning(new, False).iloc[0]
cleaned_words = list(set(cleaned_text.split()))
gkw.contributing_words(cleaned_words, keywords)

{'Business Analyst': 'year',
 'Data Analyst': 'client',
 'Data Architect': 'strategy, predictive, technical, experience',
 'Data Engineer': 'computer, client, related, field, engineering, science, implement',
 'Data Scientist': 'quantitative, computer, advanced, operation, learning, analytics, research, engineering, science',
 'Database Administrator': 'implement',
 'Product Manager': 'strategy, work, develop, experience, team, research'}