In [53]:
%matplotlib inline

import pandas as pd
import warnings 
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import PorterStemmer

# Silence every warning for the rest of the session. NOTE(review): this also
# hides pandas/scikit-learn deprecation warnings raised by the cells below.
warnings.filterwarnings('ignore')

In [54]:
# Load the cleaned job titles and keep only the two columns used downstream.
# .copy() makes df an independent frame so the column assignments below do not
# write into a slice of the frame returned by read_csv.
df = pd.read_csv('../data/clean_jsm_titles.csv')[['clean_title', 'position_level_code']].copy()

# Non-numeric level codes become NaN so they are removed together with the
# other missing values.
df['position_level_code'] = pd.to_numeric(df['position_level_code'], errors='coerce')

# Drop missing values BEFORE casting titles to str: astype(str) turns NaN into
# the literal string 'nan', which dropna() would then keep as a real title.
df = df.dropna()
df['clean_title'] = df['clean_title'].astype(str)

In [55]:
ps = PorterStemmer()


def stem_and_join(title):
    """Stem each whitespace-separated word of `title` and join the stems with '_'.

    The result is a single underscore-joined string built from the stemmed words.
    """
    stems = [ps.stem(token) for token in title.split()]
    return '_'.join(stems)

# Replace every title with its stemmed, underscore-joined form.
df['clean_title'] = df['clean_title'].apply(stem_and_join)

# Keep only titles that occur more than once after stemming.
occurrences = df.groupby('clean_title')['clean_title'].transform(len)
df = df[occurrences > 1]

In [56]:
position_level_codes = range(1, 7)

# Build one space-separated "document" per position level code, in code order,
# containing every title recorded at that level.
corpus = []
for code in position_level_codes:
    titles_at_level = df.clean_title[df.position_level_code == code]
    corpus.append(' '.join(titles_at_level))

In [57]:
# Learn the vocabulary and idf weights from the per-level documents and
# produce their tf-idf matrix (one row per document) in a single step.
vector = TfidfVectorizer()
matrix = vector.fit_transform(corpus)

In [58]:
# Convert the sparse tf-idf matrix to COO form and unpack it into a long table
# with one row per stored (document row, vocabulary column, score) entry.
coo = matrix.tocoo(copy=False)
matrix = pd.DataFrame({
    'position_level': coo.row,
    'vocab_index': coo.col,
    'score': coo.data,
})

In [68]:
# Score each vocabulary term by the tf-idf-weighted average of the document
# rows it appears in. 'position_level' holds the row index of the tf-idf
# matrix, not the raw position level code.
# Selecting the two value columns keeps the grouping column out of the applied
# frame, which recent pandas versions deprecate.
scores = (
    matrix.groupby('vocab_index')[['position_level', 'score']]
    .apply(lambda group: np.average(group['position_level'], weights=group['score']))
)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back for older installations.
if hasattr(vector, 'get_feature_names_out'):
    feature_names = vector.get_feature_names_out()
else:
    feature_names = vector.get_feature_names()

# Label each score by looking up its vocab_index instead of assuming that the
# grouped result lines up positionally with the full vocabulary.
scores.index = np.asarray(feature_names, dtype=object)[scores.index.values]
scores.sort_values(ascending=False).head()

field_support_assist                5.0
laboratori_shift_leader             5.0
senior_room_attend                  5.0
temporari_crew                      5.0
charg_man_and_mainten_supervisor    5.0
dtype: float64

In [70]:
# Bounds of the raw scores. NOTE(review): assumes the scores are weighted
# averages of document row indices 0..5 - confirm if the number of levels changes.
min_score = 0
max_score = 5

# Invert and normalise so the largest raw score maps to 0 and the smallest to 1.
# The divisor is the declared range rather than a hard-coded 5.
# NOTE: this reassigns `scores`, so re-running the cell inverts the values again.
scores = (max_score - scores) / (max_score - min_score)

In [73]:
# Spot-check the normalised score of a single stemmed term.
scores['waiter']

0.17317793106486706

In [74]:
# Write the per-term scores to disk as UTF-8 CSV.
output_path = '../data/seniority_scores.csv'
scores.to_csv(output_path, encoding='utf-8')