In [3]:
import os

from abelian import db
from abelian.models import ProfilerAnswer, Profiler, User, Job

# Read the connection URL from the environment so real credentials never have
# to be written into the notebook. The fallback is the original
# local-development URL, so existing setups keep working unchanged.
DATABASE_URL = os.environ.get(
    'KALIBRR_DATABASE_URL',
    'postgresql://kalibrr:password@localhost/kalibrr',
)

# NOTE(review): the meaning of the second positional argument (False) is not
# visible in this notebook -- confirm against abelian.db.configure.
db.configure(DATABASE_URL, False)
session = db.get_session()
# Presumably clears a transaction left open or aborted by an earlier run of
# this cell -- TODO confirm.
session.rollback()

In [7]:
def extract_fields_of_study(education_profiler_data):
    """Collect every field of study found in a sequence of education answers.

    NOTE: a later cell in this notebook defines another function with the
    same name; once that cell runs, it replaces this one.

    Args:
        education_profiler_data: iterable of dicts. Each dict must have an
            'education' key whose value is a dict that may contain
            'fields_of_study'.

    Returns:
        A flat list of the fields of study, in input order. Entries whose
        'fields_of_study' is missing or empty contribute nothing.

    Raises:
        KeyError: if an entry has no 'education' key.
    """
    # bytes is `str` on Python 2; type(u'') is `unicode` on Python 2 and
    # `str` on Python 3, so this covers every string type on both.
    string_types = (bytes, type(u''))
    fields_of_study = []

    for epd in education_profiler_data:
        field_of_study = epd['education'].get('fields_of_study', '')

        if not field_of_study:
            continue

        if isinstance(field_of_study, string_types):
            # A bare string is one field; extending the list with it would
            # split it into individual characters.
            fields_of_study.append(field_of_study)
        else:
            fields_of_study.extend(field_of_study)

    return fields_of_study

# In both queries `User.is_test == False` is deliberate: it is a comparison on
# an ORM column and is presumably compiled into the query rather than evaluated
# in Python, so it must not be rewritten as `is False` / `not ...`.

# Work-history answers keyed by user id, for non-test users whose answer does
# not flag them as having no work experience.
uid_industry_data = {
    user_id: answer
    for user_id, answer in session.query(User.id, ProfilerAnswer.data).filter(
        ProfilerAnswer.user_id == User.id,
        User.is_test == False,
        ProfilerAnswer.profiler_code == Profiler.code,
        Profiler.type == 'WorkHistoryProfiler',
    ).all()
    if not answer['no_work_experience']
}

# Education answers keyed by user id, for non-test users whose answer carries
# a non-empty 'fields_of_study'.
uid_course_data = {
    user_id: answer
    for user_id, answer in session.query(User.id, ProfilerAnswer.data).filter(
        ProfilerAnswer.user_id == User.id,
        User.is_test == False,
        ProfilerAnswer.profiler_code == Profiler.code,
        Profiler.type == 'EducationProfiler',
    ).all()
    if answer['education'].get('fields_of_study', None)
}

In [5]:
# Inspect the work-history answers keyed by user id.
# NOTE(review): this renders the whole mapping, including employer names and
# free-text job descriptions entered by users -- consider showing a small
# sample and clearing the output before sharing the notebook.
uid_industry_data

{2825: {u'no_work_experience': False,
  u'work_history': [{u'company_name': u'Jollibee Food Corporation',
    u'end_date': u'2013-03-04T16:00:00.000Z',
    u'industry': None,
    u'job_desc': u'Fryman',
    u'job_title': u'Service Crew',
    u'start_date': u'2010-03-09T16:00:00.000Z',
    u'still_work_here': None,
    u'type': 100}]},
 2837: {u'no_work_experience': False,
  u'work_history': [{u'company_name': u'Pingu Tech Inc.',
    u'end_date': None,
    u'industry': None,
    u'job_desc': u'I motivate my coworkers by dressing up in an oversized penguin suit.',
    u'job_title': u'Mascot',
    u'start_date': u'2013-12-31T16:00:00.000Z',
    u'still_work_here': True,
    u'type': 100}]},
 2854: {u'no_work_experience': False,
  u'work_history': [{u'company_name': u'Kalibrr',
    u'end_date': u'2015-02-28T16:00:00.000Z',
    u'industry': u'Operations & Management',
    u'job_desc': u'Work with the design, engineering, business development, customer success, and marketing team to identify

In [18]:
def extract_fields_of_study(data):
    """Summarise each entry's fields of study as one ' / '-separated string.

    NOTE: this rebinds the name of the list-returning helper defined in an
    earlier cell of this notebook.

    Args:
        data: dict whose values are dicts holding
            value['education']['fields_of_study'], an iterable of strings.

    Returns:
        dict with the same keys as `data`, each mapped to the joined string.

    Raises:
        KeyError: if a value lacks 'education' or 'fields_of_study'.
    """
    return {
        user_id: ' / '.join(answer['education']['fields_of_study'])
        for user_id, answer in data.items()
    }

# Map each user id to a single ' / '-joined string of that user's fields of study.
course_data = extract_fields_of_study(uid_course_data)

In [25]:
def extract_work_history(data, key='job_title'):
    """Summarise one work-history field per entry as a ' / '-separated string.

    Args:
        data: dict whose values are dicts with a 'work_history' list of dicts.
        key: field to pull out of every work-history item.

    Returns:
        dict with the same keys as `data`. Each value joins, in order, the
        `key` values of the work-history items that have `key` set to a
        truthy value; items where it is missing or falsy are skipped.

    Raises:
        KeyError: if a value has no 'work_history' key.
    """
    summary = {}
    for user_id, answer in data.items():
        present_values = [
            entry[key]
            for entry in answer['work_history']
            if key in entry and entry[key]
        ]
        summary[user_id] = ' / '.join(present_values)
    return summary

# Map each user id to a ' / '-joined string of the non-empty job titles in their work history.
job_title_data = extract_work_history(uid_industry_data, 'job_title')

In [26]:
def merge_dicts(a, b):
    """Pair up the values of the keys that occur in both mappings.

    Despite the name this is an intersection, not a union: keys present in
    only one of the inputs are dropped.

    Args:
        a: first mapping.
        b: second mapping.

    Returns:
        dict mapping every shared key to the tuple (a[key], b[key]).
    """
    # One pass over `a` is enough; a second pass over `b` would only
    # recompute the same (a[key], b[key]) entries.
    return {key: (a[key], b[key]) for key in a if key in b}

# Keep only users present in both mappings; each value is a
# (fields of study, job titles) pair and the user-id keys are discarded.
course_job_title_data = merge_dicts(course_data, job_title_data).values()

In [29]:
def encode(data):
    """Reduce every string in `data` to an ASCII-only byte string.

    Args:
        data: iterable of values; text and byte strings are converted,
            anything else (e.g. None) becomes ''.

    Returns:
        list, in input order, with non-ASCII characters silently dropped
        from every string.
    """
    # `unicode` on Python 2, `str` on Python 3 -- avoids naming `unicode`,
    # which does not exist on Python 3.
    text_type = type(u'')

    encoded = []
    for s in data:
        if isinstance(s, text_type):
            encoded.append(s.encode('ascii', 'ignore'))
        elif isinstance(s, bytes):
            # Python 2 `str` lands here. Calling .encode() on it directly
            # triggers an implicit *strict* ASCII decode first and raises
            # UnicodeDecodeError on non-ASCII bytes, so decode leniently.
            encoded.append(s.decode('ascii', 'ignore').encode('ascii'))
        else:
            encoded.append('')
    return encoded

# Split the (fields of study, job titles) pairs into two parallel columns and
# pass each column through encode() to strip non-ASCII characters.
courses, job_titles = (
    encode(column) for column in zip(*course_job_title_data)
)

In [30]:
from cleaning import clean

# Run both columns through the project-local `clean` helper, selecting its
# 'course' and 'job title' styles respectively.
# NOTE(review): what each style does lives in the `cleaning` module and is not
# visible here; the displayed clean_data suggests upper-casing -- confirm.
clean_courses = clean(courses, style='course')
clean_job_titles = clean(job_titles, style='job title')

In [33]:
# Re-pair the cleaned columns position by position as (course, job titles) tuples.
clean_data = zip(clean_courses, clean_job_titles)

In [34]:
# Preview the cleaned (course, job titles) pairs.
# NOTE(review): this renders the entire list; a slice would keep the output short.
clean_data

[('COMMERCE / MANAGEMENT', 'INVENTORY OFFICER'),
 ('HOTEL RESTAURANT MANAGEMENT', 'CREW MEMBER / FOOD SERVER'),
 ('APPLIED SOCIOLOGY',
  'PROJECT MANAGER / RESEARCH ASSOCIATE / TECHNICAL STAFF & COMMUNICATIONS OFFICER / RESEARCH ASSOCIATE & CONSULTANT / RESEARCH ASSOCIATE / TECHNICAL ASSISTANT / TECHNICAL ASSISTANT'),
 ('ACCOUNTING SCHOLARSHIP', 'ACCOUNTS PAYABLE STAFF'),
 ('TOURISM GEOGRAPHY', 'BARISTA / CUSTOMER SERVICE REPRESENTATIVE'),
 ('BUSINESS ENTREPRENEURSHIP',
  'CUSTOMER SERVICE REPRESENTATIVE / TECHNICAL SUPPORT REPRESENTATIVE'),
 ('BUSINESS ADMINISTRATION',
  'SALES COORDINATOR / BILLING CLERK / CALL CENTER AGENT'),
 ('CHEMICAL ENGINEERING',
  'PROCESS DESIGN ENGINEER / CONTRACTUAL PROCESS DESIGN ENGINEER'),
 ('NURSING',
  'STAFF NURSE / ICU & SURGICAL STAFF NURSE / CUSTOMER SERVICE REPRESENTATIVE / QUALITY ASSURANCE ANALYST'),
 ('COMPUTER SCIENCES', 'HEAD INFORMATION TECHNOLOGY'),
 ('OFFICE MANAGEMENT',
  'PHARMACY ASSISTANT & CASHIER / ACCOUNTING STAFF & CASHIER'),
 ('AU

In [35]:
from clustering import stem_cluster

# Cluster the cleaned strings with the project-local `stem_cluster` helper.
# NOTE(review): mode=8 and length_at_least=4 are unexplained magic values, and
# only the job-title call passes length_at_least -- confirm the intent against
# the `clustering` module.
course_clusters = stem_cluster(clean_courses, mode=8)
job_title_clusters = stem_cluster(clean_job_titles, mode=8, length_at_least=4)

In [37]:
from analysis import LikelihoodMatrix

# Build a LikelihoodMatrix from the course clusters and the job-title clusters
# (in that argument order); it is filled in by the add_match loop below.
CourseJob = LikelihoodMatrix(course_clusters, job_title_clusters)

In [40]:
%%time
%%timeit

for course, job in zip(clean_courses, clean_job_titles):
    CourseJob.add_match(course, job)

KeyboardInterrupt: 

In [41]:
# find which jobs are prominent in the computer science course
CourseJob.find_column_matches('COMPUTER SCIENCE').nlargest(15)

REPRESENTATIVE     3.132575
REPRESENTATIVES    2.957152
SERVICE            2.188538
CUSTOMER           1.985622
SERVICES           1.871663
ASSISTANT          1.860379
TECHNICAL          1.562450
TECHNICIAN         1.376424
ASSISTANCE         1.360648
MANAGER            1.311552
DEVELOPER          1.280839
CASHIER            1.279195
OFFICER            1.176158
OPERATOR           1.173683
SPECIALIST         1.171439
dtype: float64

In [42]:
# find which jobs are prominent in the business administration course
CourseJob.find_column_matches('BUSINESS ADMINISTRATION').nlargest(15)

REPRESENTATIVE     3.747742
REPRESENTATIVES    3.489811
SERVICE            2.504137
CUSTOMER           2.481831
ASSISTANT          2.471142
SERVICES           2.005840
MANAGER            1.941704
ASSISTANCE         1.881207
OFFICER            1.594930
OFFICE             1.472586
INTERN             1.302661
SALES              1.249814
ACCOUNT            1.218282
CUSTOMS            1.215310
ACCOUNTING         1.179339
dtype: float64

In [43]:
# find which courses are prominent in nurse jobs
CourseJob.find_row_matches('NURSE').nlargest(15)

NURSING       82.375061
OBSTETRICS     1.710928
MIDWIFERY      1.674018
BIOLOGY        1.348360
BIOLOGICAL     0.820021
SOCIOLOGY      0.744801
HEALTHCARE     0.617392
ECOLOGY        0.588521
GEOLOGY        0.578818
TECHNICIAN     0.521239
THEOLOGY       0.507302
RADIOLOGIC     0.494877
EDUCATION      0.474361
PATHOLOGY      0.456814
TECHNOLOGY     0.447961
dtype: float64

In [44]:
# find which courses are prominent in sales agent jobs
CourseJob.find_row_matches('SALES AGENT').nlargest(15)

TECHNOLOGY        4.865088
BUSINESS          4.723356
COMPUTER          4.426041
ADMINISTRATION    4.173285
INFORMATION       3.756388
MANAGEMENT        3.564001
AGRIBUSINESS      3.549196
INFORMATICS       3.158802
EDUCATION         3.011539
EDUCATIONAL       2.837268
COMPUTING         2.757552
SCIENCE           2.734391
SCIENCES          2.424937
ENGINEERING       2.254137
ENGINEER          1.850891
dtype: float64