In [1]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn import metrics
import joblib

#my own functions cleaning raw text
import preprocessing as pps
import getkeywords as gkw



In [2]:
#read in data
df = pd.read_csv("../../data/All_Data.csv", index_col=0)
df.shape

(1984, 8)

In [3]:
#write data distribution across states to a csv file
pd.DataFrame(df['state'].value_counts()).to_csv("../metrics/state_distribution.csv")

In [3]:
#filter data (get rid of the jobtitles that do not go into the model)
df = df[df["jobtitle"] != 'Machine Learning']
df = df[df["jobtitle"] != 'Data Architect']
df = df[df["jobtitle"] != 'Data Analyst']
#df = df[df["jobtitle"] != 'Data Engineer']
df.shape

(1507, 8)

In [4]:
#look at the jobtitles left
set(df['jobtitle'])

{'Business Analyst',
 'Data Engineer',
 'Data Scientist',
 'Database Administrator',
 'Product Manager'}

In [5]:
#split and preprocess data
data_train, data_test = pps.kfold_split(df, k=5)
train_feature = pps.raw_cleaning(data_train['snippet'], False)
train_labels = data_train['jobtitle']

In [6]:
#build pipeline including a transformer and a classifier
text_clf = Pipeline([('vect', TfidfVectorizer(max_df = 0.99, min_df = 0.01,ngram_range=(1,2))),
                     ('clf', RandomForestClassifier(n_estimators=200)),
])
text_clf = text_clf.fit(train_feature, train_labels)

In [7]:
#see how the pipeline performs on test data
test_feature = pps.raw_cleaning(data_test['snippet'], False)
predicted = text_clf.predict(test_feature)
test_labels = data_test['jobtitle']
#output = pd.DataFrame(predicted, test_labels)
print(np.mean(predicted == test_labels)) 
#print(output)
confusion = pd.DataFrame(confusion_matrix(predicted, test_labels, labels=list(set(test_labels))), columns = list(set(test_labels)))
confusion = confusion.div(confusion.sum(axis=0), axis=1)
f1s = metrics.classification_report(test_labels, predicted)

0.730897009967


In [8]:
print(f1s)

                        precision    recall  f1-score   support

      Business Analyst       0.62      0.81      0.70        78
         Data Engineer       0.72      0.53      0.61        43
        Data Scientist       0.80      0.69      0.74        54
Database Administrator       0.82      0.77      0.79        60
       Product Manager       0.77      0.77      0.77        66

           avg / total       0.74      0.73      0.73       301



In [9]:
confusion

Unnamed: 0,Business Analyst,Data Engineer,Data Scientist,Database Administrator,Product Manager
0,0.807692,0.209302,0.12963,0.166667,0.181818
1,0.0,0.534884,0.12963,0.016667,0.015152
2,0.038462,0.116279,0.685185,0.016667,0.0
3,0.051282,0.093023,0.0,0.766667,0.030303
4,0.102564,0.046512,0.055556,0.033333,0.772727


In [10]:
#write the confusion matrix to a csv file
#confusion.to_csv("../metrics/confusion_matrix_all.csv")
#dump the pipeline object to the target directory
joblib.dump(text_clf, "../JB_app/models/text_clf_medium.pkl")

['../JB_app/models/text_clf_medium.pkl']

In [11]:
#test on one entry
new_data = [data_test['snippet'].iloc[1]]
new_feature = pps.raw_cleaning(new_data, False)
prediction = text_clf.predict_proba(new_feature)
output = pd.DataFrame()
output['jobtitle'] = text_clf.classes_
output['probability'] = prediction[0]
output = output.sort_values(by='probability', ascending=False)
output

Unnamed: 0,jobtitle,probability
4,Product Manager,0.385
2,Data Scientist,0.285
0,Business Analyst,0.185
1,Data Engineer,0.1
3,Database Administrator,0.045


In [12]:
#calculate a document * label table with each element being the probability of a document being a label
likelihoods = pd.DataFrame(columns = list(set(df['jobtitle'])))
for i in range(df.shape[0]):
    new = [df['snippet'].iloc[i]]
    feature = pps.raw_cleaning(new, False)
    prediction = text_clf.predict_proba(feature)[0]
    labels = text_clf.classes_
    likelihoods = likelihoods.append(pd.Series({label:ll for label, ll in zip(labels, prediction)}, name = str(i)))
likelihoods.head()

Unnamed: 0,Business Analyst,Data Engineer,Data Scientist,Database Administrator,Product Manager
0,0.095,0.055,0.65,0.1,0.1
1,0.02,0.035,0.895,0.005,0.045
2,0.07,0.065,0.82,0.015,0.03
3,0.14,0.0,0.77,0.045,0.045
4,0.015,0.92,0.045,0.005,0.015


In [13]:
likelihoods.to_csv("../metrics/likelihoods_table_all.csv")

In [14]:
ll_table = pd.read_csv("../metrics/likelihoods_table_all.csv", index_col=0)
ll_table.head()

Unnamed: 0,Business Analyst,Data Engineer,Data Scientist,Database Administrator,Product Manager
0,0.095,0.055,0.65,0.1,0.1
1,0.02,0.035,0.895,0.005,0.045
2,0.07,0.065,0.82,0.015,0.03
3,0.14,0.0,0.77,0.045,0.045
4,0.015,0.92,0.045,0.005,0.015


In [15]:
prototypes = gkw.get_prototypes(ll_table)
prototypes

{'Business Analyst': [1084, 1083, 685, 1489, 1177, 844, 849, 1360, 1349, 567],
 'Data Engineer': [586, 80, 73, 336, 334, 584, 1126, 31, 75, 709],
 'Data Scientist': [60, 1428, 1425, 414, 580, 463, 326, 412, 705, 651],
 'Database Administrator': [1299,
  488,
  363,
  404,
  613,
  408,
  614,
  959,
  1474,
  1298],
 'Product Manager': [248, 1390, 1335, 521, 159, 399, 730, 729, 41, 1219]}

In [17]:
tfs = pd.read_csv("../metrics/All_tfidf_features_99.csv", index_col=0)
tfs.head()

Unnamed: 0,company,jobtitle,jobtitle_orig,ability,ability develop,ability work,able,access,account,accounting,...,work experience,workflow,working,world,writing,written,year,year experience,year related,year relevant
21,0.0,Data Scientist,Data Scientist - Risk,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
36,0.0,Data Scientist,Data Scientist - Operations,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.109386,0.0,0.0,0.0
39,0.0,Data Scientist,Data Scientist,0.0,0.0,0.0,0.0,0.253007,0.0,0.0,...,0.0,0.0,0.0,0.232879,0.0,0.0,0.0,0.0,0.0,0.0
40,0.0,Data Scientist,Data Scientist,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
48,0.0,Data Engineer,Data Engineer – Information Management & Analy...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
result = {}
features = tfs.iloc[:,3:]
words = list(features.columns)
label = 'Data Scientist'
tmp = pd.DataFrame()
tmp['words'] = words
tmp['tfidf'] = list(features.iloc[prototypes[label],:].sum(axis=0))
tmp = tmp.sort_values(by = 'tfidf', ascending = False)
tmp['words'].iloc[:20]

660                statistic
437              mathematics
434            master degree
433                   master
615                  science
135         computer science
25                  analysis
133                 computer
439    mathematics statistic
109                 clinical
452                  minimum
661    statistic mathematics
191                   degree
510                      phd
419         machine learning
418                  machine
641                   social
32                 analytics
401                 learning
622        science statistic
Name: words, dtype: object

In [23]:
keywords = gkw.get_keywords(tfs.iloc[:,3:], prototypes, 30)

In [24]:
keywords.iloc[:20,:]

Unnamed: 0,Data Engineer,Product Manager,Business Analyst,Database Administrator,Data Scientist
0,data,product,business,database,statistic
1,kpmg,strategy,requirement,backup,mathematics
2,pipeline data,sale,document,dba,master degree
3,azure,market,end,sql,master
4,aws,develop,enterprise,backup recovery,science
5,stack,revenue,analyst,recovery,computer science
6,data processing,plan,end user,procedure,analysis
7,pipeline,lifecycle,analysis,server,computer
8,setting,strategic,translate,creates,mathematics statistic
9,accredited college,new,process,support,clinical


In [27]:
keywords.to_csv("../JB_app/keywords_medium_30.csv")

In [28]:
keywords = pd.read_csv("../JB_app/keywords_medium_30.csv", index_col=0)
keywords

Unnamed: 0,Data Engineer,Product Manager,Business Analyst,Database Administrator,Data Scientist
0,data,product,business,database,statistic
1,kpmg,strategy,requirement,backup,mathematics
2,pipeline data,sale,document,dba,master degree
3,azure,market,end,sql,master
4,aws,develop,enterprise,backup recovery,science
5,stack,revenue,analyst,recovery,computer science
6,data processing,plan,end user,procedure,analysis
7,pipeline,lifecycle,analysis,server,computer
8,setting,strategic,translate,creates,mathematics statistic
9,accredited college,new,process,support,clinical


In [31]:
gkw.common_keywords('Data Engineer','Data Scientist',keywords)

['computer', 'science']

In [52]:
new = [df['snippet'].iloc[1]]
cleaned_text = pps.raw_cleaning(new, False).iloc[0]
cleaned_words = list(set(cleaned_text.split()))
gkw.contributing_words(cleaned_words, keywords)

{'Business Analyst': 'year',
 'Data Analyst': 'client',
 'Data Architect': 'strategy, predictive, technical, experience',
 'Data Engineer': 'computer, client, related, field, engineering, science, implement',
 'Data Scientist': 'quantitative, computer, advanced, operation, learning, analytics, research, engineering, science',
 'Database Administrator': 'implement',
 'Product Manager': 'strategy, work, develop, experience, team, research'}