In [1]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn import metrics
import joblib

#my own functions cleaning raw text
import preprocessing as pps
import getkeywords as gkw



In [31]:
#read in data
df = pd.read_csv("../../data/All_Data.csv", index_col=0)
df.shape

(1984, 8)

In [3]:
#write data distribution across states to a csv file
pd.DataFrame(df['state'].value_counts()).to_csv("../metrics/state_distribution.csv")

In [32]:
#filter data (get rid of the jobtitles that do not go into the model)
df = df[df["jobtitle"] != 'Machine Learning']
df = df[df["jobtitle"] != 'Data Architect']
df = df[df["jobtitle"] != 'Data Analyst']
#df = df[df["jobtitle"] != 'Data Engineer']
df.shape

(1507, 8)

In [33]:
#look at the jobtitles left
set(df['jobtitle'])

{'Business Analyst',
 'Data Engineer',
 'Data Scientist',
 'Database Administrator',
 'Product Manager'}

In [34]:
#split and preprocess data
data_train, data_test = pps.kfold_split(df, k=5)
train_feature = pps.raw_cleaning(data_train['snippet'], False)
train_labels = data_train['jobtitle']

In [35]:
#build pipeline including a transformer and a classifier
text_clf = Pipeline([('vect', TfidfVectorizer(max_df = 0.99, min_df = 0.01,ngram_range=(1,2))),
                     ('clf', RandomForestClassifier(n_estimators=200)),
])
text_clf = text_clf.fit(train_feature, train_labels)

In [36]:
#see how the pipeline performs on test data
test_feature = pps.raw_cleaning(data_test['snippet'], False)
predicted = text_clf.predict(test_feature)
test_labels = data_test['jobtitle']
#output = pd.DataFrame(predicted, test_labels)
print(np.mean(predicted == test_labels)) 
#print(output)
confusion = pd.DataFrame(confusion_matrix(predicted, test_labels, labels=list(set(test_labels))), columns = list(set(test_labels)))
confusion = confusion.div(confusion.sum(axis=0), axis=1)
f1s = metrics.classification_report(test_labels, predicted)

0.75415282392


In [37]:
print(f1s)

                        precision    recall  f1-score   support

      Business Analyst       0.76      0.88      0.82        83
         Data Engineer       0.75      0.37      0.49        41
        Data Scientist       0.79      0.74      0.76        57
Database Administrator       0.71      0.73      0.72        55
       Product Manager       0.75      0.88      0.81        65

           avg / total       0.75      0.75      0.74       301



In [38]:
confusion

Unnamed: 0,Data Scientist,Database Administrator,Product Manager,Business Analyst,Data Engineer
0,0.736842,0.036364,0.0,0.036145,0.146341
1,0.035088,0.727273,0.015385,0.036145,0.243902
2,0.122807,0.054545,0.876923,0.048193,0.121951
3,0.035088,0.163636,0.107692,0.879518,0.121951
4,0.070175,0.018182,0.0,0.0,0.365854


In [39]:
#write the confusion matrix to a csv file
#confusion.to_csv("../metrics/confusion_matrix_all.csv")
#dump the pipeline object to the target directory
joblib.dump(text_clf, "../JB_app/models/text_clf_medium.pkl")

['../JB_app/models/text_clf_medium.pkl']

In [40]:
#test on one entry
new_data = [data_test['snippet'].iloc[1]]
new_feature = pps.raw_cleaning(new_data, False)
prediction = text_clf.predict_proba(new_feature)
output = pd.DataFrame()
output['jobtitle'] = text_clf.classes_
output['probability'] = prediction[0]
output = output.sort_values(by='probability', ascending=False)
output

Unnamed: 0,jobtitle,probability
2,Data Scientist,0.69125
1,Data Engineer,0.13875
0,Business Analyst,0.105
3,Database Administrator,0.035
4,Product Manager,0.03


In [41]:
#calculate a document * label table with each element being the probability of a document being a label
likelihoods = pd.DataFrame(columns = list(set(df['jobtitle'])))
for i in range(df.shape[0]):
    new = [df['snippet'].iloc[i]]
    feature = pps.raw_cleaning(new, False)
    prediction = text_clf.predict_proba(feature)[0]
    labels = text_clf.classes_
    likelihoods = likelihoods.append(pd.Series({label:ll for label, ll in zip(labels, prediction)}, name = str(i)))
likelihoods.head()

Unnamed: 0,Business Analyst,Data Scientist,Product Manager,Database Administrator,Data Engineer
0,0.085,0.665,0.165,0.035,0.05
1,0.025,0.88,0.035,0.01,0.05
2,0.06,0.81,0.03,0.015,0.085
3,0.17,0.72,0.055,0.03,0.025
4,0.0,0.005,0.025,0.01,0.96


In [42]:
likelihoods.to_csv("../metrics/likelihoods_table_medium.csv")

In [43]:
ll_table = pd.read_csv("../metrics/likelihoods_table_medium.csv", index_col=0)
ll_table.head()

Unnamed: 0,Business Analyst,Data Scientist,Product Manager,Database Administrator,Data Engineer
0,0.085,0.665,0.165,0.035,0.05
1,0.025,0.88,0.035,0.01,0.05
2,0.06,0.81,0.03,0.015,0.085
3,0.17,0.72,0.055,0.03,0.025
4,0.0,0.005,0.025,0.01,0.96


In [44]:
prototypes = gkw.get_prototypes(ll_table)
prototypes

{'Business Analyst': [1179,
  1181,
  1412,
  1343,
  1103,
  639,
  1374,
  1445,
  810,
  1503],
 'Data Engineer': [337, 336, 588, 586, 584, 73, 713, 80, 334, 709],
 'Data Scientist': [412, 414, 773, 1429, 1014, 1426, 1427, 580, 326, 138],
 'Database Administrator': [614, 613, 408, 404, 1404, 957, 961, 745, 673, 178],
 'Product Manager': [1219, 1335, 248, 24, 159, 163, 399, 1390, 521, 1467]}

In [45]:
tfs = pd.read_csv("../metrics/All_tfidf_features_99.csv", index_col=0)
tfs.head()

Unnamed: 0,company,jobtitle,jobtitle_orig,ability,ability develop,ability work,able,access,account,accounting,...,work experience,workflow,working,world,writing,written,year,year experience,year related,year relevant
21,0.0,Data Scientist,Data Scientist - Risk,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
36,0.0,Data Scientist,Data Scientist - Operations,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.109386,0.0,0.0,0.0
39,0.0,Data Scientist,Data Scientist,0.0,0.0,0.0,0.0,0.253007,0.0,0.0,...,0.0,0.0,0.0,0.232879,0.0,0.0,0.0,0.0,0.0,0.0
40,0.0,Data Scientist,Data Scientist,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
48,0.0,Data Engineer,Data Engineer – Information Management & Analy...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [46]:
result = {}
features = tfs.iloc[:,3:]
words = list(features.columns)
label = 'Data Scientist'
tmp = pd.DataFrame()
tmp['words'] = words
tmp['tfidf'] = list(features.iloc[prototypes[label],:].sum(axis=0))
tmp = tmp.sort_values(by = 'tfidf', ascending = False)
tmp['words'].iloc[:20]

660                statistic
437              mathematics
32                 analytics
482       operation research
615                  science
622        science statistic
661    statistic mathematics
448                   method
259              engineering
135         computer science
438     mathematics computer
133                 computer
109                 clinical
434            master degree
582            related field
481                operation
597                 research
579                  related
433                   master
286               experience
Name: words, dtype: object

In [47]:
keywords = gkw.get_keywords(tfs.iloc[:,3:], prototypes, 100)

In [48]:
keywords.iloc[:20,:]

Unnamed: 0,Data Scientist,Data Engineer,Product Manager,Business Analyst,Database Administrator
0,statistic,data,product,business,database
1,mathematics,university computer,strategy,analyst,sql
2,analytics,kpmg,market,requirement,implementation
3,operation research,pipeline data,sale,business analyst,procedure
4,science,azure,manager,system,dba
5,science statistic,aws,plan,business system,support
6,statistic mathematics,stack,develop,analysis,backup recovery
7,method,data processing,revenue,design,creates
8,engineering,degree accredited,team develop,experience,database administrator
9,computer science,pipeline,pricing,need,production


In [49]:
keywords.to_csv("../JB_app/keywords_medium_100.csv")

In [51]:
keywords = pd.read_csv("../JB_app/keywords_medium_100.csv", index_col=0)
keywords

Unnamed: 0,Data Scientist,Data Engineer,Product Manager,Business Analyst,Database Administrator
0,statistic,data,product,business,database
1,mathematics,university computer,strategy,analyst,sql
2,analytics,kpmg,market,requirement,implementation
3,operation research,pipeline data,sale,business analyst,procedure
4,science,azure,manager,system,dba
5,science statistic,aws,plan,business system,support
6,statistic mathematics,stack,develop,analysis,backup recovery
7,method,data processing,revenue,design,creates
8,engineering,degree accredited,team develop,experience,database administrator
9,computer science,pipeline,pricing,need,production


In [53]:
gkw.common_keywords('Data Engineer','Data Scientist',keywords)

['data',
 'university computer',
 'azure',
 'accredited college',
 'college university',
 'data mining',
 'mining',
 'computer',
 'university',
 'accredited',
 'algorithm',
 'college',
 'science',
 'engineering related',
 'engineering',
 'field minimum',
 'related field',
 'year',
 'minimum',
 'computer science',
 'related',
 'two year',
 'field',
 'two',
 'degree',
 'experience',
 'year experience',
 'power',
 'personnel',
 'phd',
 'physic',
 'ability']

In [62]:
new = [df['snippet'].iloc[3]]
cleaned_text = pps.raw_cleaning(new, False)
cleaned_grams = get_grams(new, False)
gkw.contributing_words(cleaned_grams, keywords)

{'Business Analyst': 'analysis, design, experience, perform, required',
 'Data Engineer': 'experience, perform, plan, power, provide',
 'Data Scientist': 'analysis, clinical, design, experience, power, statistical',
 'Database Administrator': 'assist, design, experience, plan',
 'Product Manager': 'experience, plan'}