In [1]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn import metrics
import joblib

#my own functions cleaning raw text
import preprocessing as pps
import getkeywords as gkw



In [2]:
#read in data
df = pd.read_csv("../../data/All_Data.csv", index_col=0)
df.shape

(1983, 8)

In [3]:
#write data distribution across states to a csv file
pd.DataFrame(df['state'].value_counts()).to_csv("../metrics/state_distribution.csv")

In [4]:
#filter data (get rid of the jobtitles that do not go into the model)
df = df[df["jobtitle"] != 'Machine Learning']
#df = df[df["jobtitle"] != 'Data Architect']
#df = df[df["jobtitle"] != 'Data Analyst']
#df = df[df["jobtitle"] != 'Data Engineer']
df.shape

(1959, 8)

In [5]:
#look at the jobtitles
set(df['jobtitle'])

{'Business Analyst',
 'Data Analyst',
 'Data Architect',
 'Data Engineer',
 'Data Scientist',
 'Database Administrator',
 'Product Manager'}

In [14]:
#split and preprocess data
data_train, data_test = pps.kfold_split(df, k=5)
train_feature = pps.raw_cleaning(data_train['snippet'], False, False)
train_labels = data_train['jobtitle']

In [15]:
#build pipeline including a transformer and a classifier
text_clf = Pipeline([('vect', TfidfVectorizer(max_df = 0.99, min_df = 0.01,ngram_range=(1,2))),
                     ('clf', RandomForestClassifier(n_estimators=200)),
])
text_clf = text_clf.fit(train_feature, train_labels)

In [16]:
#see how the pipeline performs on test data
test_feature = pps.raw_cleaning(data_test['snippet'], False)
predicted = text_clf.predict(test_feature)
test_labels = data_test['jobtitle']
#output = pd.DataFrame(predicted, test_labels)
print(np.mean(predicted == test_labels)) 
#print(output)
confusion = pd.DataFrame(confusion_matrix(predicted, test_labels, labels=list(set(test_labels))), columns = list(set(test_labels)))
#confusion = confusion.div(confusion.sum(axis=0), axis=1)
f1s = metrics.classification_report(test_labels, predicted)

0.685421994885


In [17]:
print(f1s)

                        precision    recall  f1-score   support

      Business Analyst       0.62      0.83      0.71        77
          Data Analyst       0.50      0.36      0.42        53
        Data Architect       0.76      0.61      0.68        36
         Data Engineer       0.62      0.38      0.47        34
        Data Scientist       0.64      0.71      0.67        58
Database Administrator       0.81      0.73      0.77        59
       Product Manager       0.80      0.89      0.84        74

           avg / total       0.68      0.69      0.67       391



In [18]:
confusion

Unnamed: 0,Data Analyst,Data Architect,Database Administrator,Data Engineer,Product Manager,Business Analyst,Data Scientist
0,19,3,2,3,3,4,4
1,1,22,3,2,0,1,0
2,3,0,43,1,2,3,1
3,1,2,4,13,0,0,1
4,4,3,1,1,66,3,5
5,19,4,6,1,3,64,6
6,6,2,0,13,0,2,41


In [19]:
#write the confusion matrix to a csv file
confusion.to_csv("../metrics/confusion_matrix_all.csv")
#dump the pipeline object to the target directory
joblib.dump(text_clf, "../JB_app/models/text_clf_all.pkl")

In [45]:
#test on one entry
new_data = [data_test['snippet'].iloc[2]]
true_title = data_test['jobtitle'].iloc[2]
print(true_title)
new_feature = pps.raw_cleaning(new_data, False)
prediction = text_clf.predict_proba(new_feature)
output = pd.DataFrame()
output['jobtitle'] = text_clf.classes_
output['probability'] = prediction[0]
output = output.sort_values(by='probability', ascending=False)
output

Database Administrator


Unnamed: 0,jobtitle,probability
5,Database Administrator,0.87
0,Business Analyst,0.045
1,Data Analyst,0.02
2,Data Architect,0.02
3,Data Engineer,0.02
6,Product Manager,0.02
4,Data Scientist,0.005


In [13]:
#calculate a document * label table with each element being the probability of a document being a label
likelihoods = pd.DataFrame(columns = list(set(df['jobtitle'])))
for i in range(df.shape[0]):
    new = [df['snippet'].iloc[i]]
    feature = pps.raw_cleaning(new, False)
    prediction = text_clf.predict_proba(feature)[0]
    labels = text_clf.classes_
    likelihoods = likelihoods.append(pd.Series({label:ll for label, ll in zip(labels, prediction)}, name = str(i)))
likelihoods.head()

Unnamed: 0,Product Manager,Database Administrator,Data Engineer,Data Scientist,Data Analyst,Business Analyst,Data Architect
0,0.145,0.01,0.635,0.1,0.05,0.055,0.005
1,0.17,0.015,0.055,0.06,0.63,0.055,0.015
2,0.045,0.035,0.0325,0.08625,0.73,0.035,0.03625
3,0.095,0.025,0.028333,0.1,0.675,0.075,0.001667
4,0.03,0.705,0.045,0.02,0.09,0.07,0.04


In [15]:
likelihoods.to_csv("../metrics/likelihoods_table_all_titleoff.csv")

In [46]:
ll_table = pd.read_csv("../metrics/likelihoods_table_all_titleoff.csv", index_col=0)
ll_table.head()

Unnamed: 0,Product Manager,Database Administrator,Data Engineer,Data Scientist,Data Analyst,Business Analyst,Data Architect
0,0.145,0.01,0.635,0.1,0.05,0.055,0.005
1,0.17,0.015,0.055,0.06,0.63,0.055,0.015
2,0.045,0.035,0.0325,0.08625,0.73,0.035,0.03625
3,0.095,0.025,0.028333,0.1,0.675,0.075,0.001667
4,0.03,0.705,0.045,0.02,0.09,0.07,0.04


In [47]:
prototypes = gkw.get_prototypes(ll_table, 20)
prototypes

{'Business Analyst': [1774,
  1558,
  1556,
  1763,
  1326,
  1562,
  932,
  1564,
  1568,
  1502,
  1418,
  1417,
  1567,
  1933,
  1129,
  1563,
  1165,
  1555,
  1130,
  1039],
 'Data Analyst': [1343,
  218,
  1338,
  226,
  223,
  225,
  438,
  214,
  345,
  1873,
  771,
  659,
  613,
  1253,
  524,
  301,
  520,
  1679,
  471,
  1146],
 'Data Architect': [117,
  1254,
  1818,
  823,
  175,
  177,
  535,
  534,
  118,
  1107,
  122,
  1347,
  1947,
  479,
  119,
  1484,
  1351,
  409,
  820,
  406],
 'Data Engineer': [101,
  469,
  1465,
  94,
  807,
  809,
  467,
  811,
  960,
  956,
  470,
  96,
  42,
  963,
  959,
  715,
  719,
  434,
  965,
  1462],
 'Data Scientist': [1859,
  1860,
  1862,
  387,
  803,
  459,
  580,
  1043,
  578,
  81,
  654,
  1858,
  712,
  1861,
  586,
  1452,
  422,
  424,
  1045,
  750],
 'Database Administrator': [1924,
  505,
  255,
  920,
  1013,
  1643,
  1288,
  373,
  1707,
  370,
  1550,
  743,
  1262,
  1007,
  848,
  849,
  1708,
  375,
  1010,

In [55]:
tfs = pd.read_csv("../metrics/All_tfidf_features_99_all_titleoff.csv", index_col=0)
#tfs = pd.read_csv("../metrics/All_cond_features_all.csv", index_col=0)
tfs.head()

Unnamed: 0,company,jobtitle,jobtitle_orig,ability,ability develop,ability work,able,access,accounting,accredited,...,workflow,working,working knowledge,world,writing,written,year,year experience,year related,year relevant
58,0.214311,Data Engineer,Data Engineer,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.181148,0.0,0.251677,0.0,0.0,0.0,0.0,0.0,0.0
75,0.0,Data Analyst,Data Analyst,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
78,0.0,Data Analyst,Data Analyst,0.0,0.0,0.0,0.0,0.61009,0.0,0.0,...,0.0,0.0,0.0,0.308901,0.0,0.0,0.0,0.0,0.0,0.0
79,0.0,Data Analyst,Data Analyst,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
109,0.0,Database Administrator,Database Administrator,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [56]:
result = {}
features = tfs.iloc[:,3:]
words = list(features.columns)
label = 'Product Manager'
tmp = pd.DataFrame()
tmp['words'] = words
tmp['feature'] = list(features.iloc[prototypes[label],:].sum(axis=0))
tmp = tmp.sort_values(by = 'feature', ascending = False)
tmp['words'].iloc[:20]

602                 strategy
381                   market
549                     sale
382                marketing
462                     plan
175                  develop
364                     line
480                  pricing
622                     team
347                   launch
141                 customer
416                      new
124                  content
624             team develop
361                     life
561                  service
182               developing
117                  concept
445                  partner
237    experience developing
Name: words, dtype: object

In [57]:
keywords = gkw.get_keywords(tfs.iloc[:,3:], prototypes, 100)

In [58]:
keywords.iloc[:20,:]

Unnamed: 0,Data Scientist,Data Engineer,Business Analyst,Product Manager,Data Architect,Database Administrator,Data Analyst
0,statistic,technology,requirement,strategy,design,design,reporting
1,mathematics,pipeline,functional,market,implementation,implementation,report
2,learning,processing,document,sale,service,procedure,ad
3,machine learning,variety,user,marketing,tool,backup,reporting tool
4,computer,stack,credit,plan,integration,maintenance,state
5,machine,aws,process,develop,etl,aspect,including
6,model,azure,specification,line,firm,physical,tool
7,analytics,setting,case,pricing,strategy,system,develop
8,statistical,accredited college,developing,team,availability,design implementation,federal
9,master degree,computer computer,project,launch,year experience,server,deliverable


In [59]:
keywords.to_csv("../JB_app/keywords_all_titleoff_100.csv")

In [25]:
keywords = pd.read_csv("../JB_app/keywords_all_titleoff_100.csv", index_col=0)
keywords

Unnamed: 0,Data Architect,Database Administrator,Product Manager,Data Scientist,Data Engineer,Business Analyst,Data Analyst
0,design,database,product,statistic,technology,requirement,reporting
1,implementation,design,strategy,mathematics,pipeline,functional,report
2,service,database design,market,learning,processing,document,ad
3,tool,implementation,sale,machine learning,variety,user,reporting tool
4,integration,procedure,marketing,computer,stack,credit,state
5,etl,backup,plan,machine,azure,process,including
6,firm,maintenance,develop,model,aws,specification,tool
7,availability,aspect,product strategy,analytics,setting,case,develop
8,strategy,physical,product line,statistical,accredited college,developing,federal
9,year experience,design implementation,line,master degree,college university,project,deliverable


In [26]:
gkw.common_keywords('Data Engineer','Data Architect',keywords)

['technology',
 'azure',
 'setting',
 'implement',
 'computer',
 'computer related',
 'bachelor',
 'related field',
 'degree',
 'bachelor degree',
 'related',
 'year',
 'field',
 'experience',
 'design',
 'team',
 'year experience',
 'development',
 'demonstrated',
 'hand',
 'tool',
 'platform',
 'part',
 'intelligence',
 'service',
 'degree computer',
 'solution',
 'system']

In [27]:
new = [df['snippet'].iloc[3]]
cleaned_text = pps.raw_cleaning(new, False)
cleaned_grams = pps.get_grams(new, False)
gkw.contributing_words(cleaned_grams, keywords)

{'Business Analyst': 'field, infrastructure, need, state, use',
 'Data Analyst': 'access, field, state, tool, use',
 'Data Architect': 'access, field, tool',
 'Data Engineer': 'art, expert, field, infrastructure, tool, use',
 'Data Scientist': 'field, tool',
 'Database Administrator': 'field, policy, tool',
 'Product Manager': 'expert, field, leader, need, support, tool'}