In [3]:
%%time
from algoutils import *

raw_courses = read('data/courses.txt')
raw_industries = read('data/industries.txt')
raw_job_industries = read('data/job_industries.txt')

CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 6.95 ms


In [4]:
%%time
# note: enchant module and library should be installed
from cleaning import clean

clean_courses = clean(raw_courses, 'course', 'data/course_dictionary.txt')
clean_industries = clean(raw_industries, 'industry', 'data/industry_dictionary.txt')
clean_job_industries = clean(raw_job_industries, 'industry', 'data/industry_dictionary.txt')

CPU times: user 15.9 s, sys: 128 ms, total: 16 s
Wall time: 15.8 s


In [5]:
%%time
# Requires the nltk module and the nltk wordnet corpus to be installed.
from clustering import stem_cluster
 
# Run stem_cluster over each cleaned dataset; the results are the cluster
# collections used as row/column labels by the likelihood matrices below.
course_clusters = stem_cluster(clean_courses)
industry_clusters = stem_cluster(clean_industries)
# NOTE(review): only the job-industry data passes mode=5; what the mode selects is
# defined inside clustering.stem_cluster and is not visible here -- confirm.
job_industry_clusters = stem_cluster(clean_job_industries, mode=5)

CPU times: user 3.79 s, sys: 84 ms, total: 3.87 s
Wall time: 3.84 s


In [6]:
# Preview the first ten entries of course_clusters in sorted order.
sorted(course_clusters)[:10]

['ACCOUNTANCY',
 'ACCOUNTING',
 'ADMINISTRATION',
 'ADVERTISING',
 'AGRIBUSINESS',
 'AGRICULTURAL',
 'AGRICULTURE',
 'ANIMATION',
 'APPLIED',
 'ARCHITECTURAL']

In [7]:
# Preview the first ten entries of industry_clusters in sorted order.
sorted(industry_clusters)[:10]

['ACCOUNTING',
 'ADVERTISING',
 'AGENCY',
 'ASSISTANT',
 'AUTOMOTIVE',
 'BANKING',
 'BEAUTY',
 'BPO',
 'BROKERAGE',
 'BUILDING']

In [8]:
# Preview the first ten entries of job_industry_clusters in sorted order.
sorted(job_industry_clusters)[:10]

['BUSINESS',
 'CALLCENTER',
 'COMPUTER',
 'CONSTRUCTION',
 'DEPARTMENT',
 'DESIGN',
 'DEVELOPMENT',
 'DISTRIBUTION',
 'EDUCATION',
 'ENGINE']

In [9]:
%%time

import analysis
reload(analysis)
from analysis import *

# note: pandas and sklearn modules should be installed
# initializes a likelihood data frame based on row and column labels.
# Performs a content-based filtering technique using features.
# vectorizer can be 'tfidf' or 'count'

CourseIndustry = LikelihoodMatrix(course_clusters, industry_clusters, vectorizer='tfidf', ngram_range=(3,4))

CPU times: user 244 ms, sys: 32 ms, total: 276 ms
Wall time: 281 ms


In [10]:
# Show the first rows of the underlying data frame (rows: courses, columns: industries).
CourseIndustry.dataframe.head()

Unnamed: 0,ENGINE,REPAIR,HARDWARE,PRODUCT,HOSPITALITY,LINE,CAFE,CUSTOMER,OFFICE,POWER,...,DEVELOPMENT,CITY,PLANT,PUBLISHING,SALES,FABRICATION,HEALTH,ASSISTANT,DISTRIBUTOR,BROKERAGE
LINGUISTICS,0,0,0,0,0,0.111282,0,0,0,0,...,0.0,0.0,0,0.020568,0,0,0,0.045183,0.042346,0
STUDIES,0,0,0,0,0,0.0,0,0,0,0,...,0.0,0.0,0,0.0,0,0,0,0.0,0.0,0
HARDWARE,0,0,1,0,0,0.0,0,0,0,0,...,0.0,0.0,0,0.0,0,0,0,0.0,0.0,0
HOSPITALITY,0,0,0,0,1,0.0,0,0,0,0,...,0.0,0.108406,0,0.0,0,0,0,0.0,0.0,0
ELEMENTARY,0,0,0,0,0,0.0,0,0,0,0,...,0.112116,0.0,0,0.0,0,0,0,0.0,0.0,0


In [11]:
%%time

# note: pandas and sklearn modules should be installed
# delta() derives the change in likelihood that a single (row, column) match
# would add to the matrix; the results are only stored in d1..d3 here.

# computes delta likelihood using the following steps: (non-verbatim)
# clabels = df.column_labels
# rlabels = df.row_labels
# vr = [cos_similarity(ur, label) for label in rlabels]
# vc = [cos_similarity(uc, label) for label in clabels]
# dL = vr * vc^T
# where ur / uc are the row-side and column-side inputs passed to delta().
# NOTE(review): uc is assumed to be the column-side input -- confirm against the
# delta implementation.

# NOTE(review): 'CALLCENTE' looks truncated (cf. CALLCENTER); possibly a deliberate
# inexact label to exercise the similarity matching -- confirm.
d1 = CourseIndustry.delta('CALLCENTE', 'BUSINESS')
d2 = CourseIndustry.delta('MARKETING', 'RESEARCH & DEVELOPMENT')
d3 = CourseIndustry.delta('AGRICULTURE', 'FINANCE')

CPU times: user 8 ms, sys: 0 ns, total: 8 ms
Wall time: 8.41 ms


In [12]:
%%time

# Dynamically update the likelihood matrix with observed (course, industry) pairs.
# Each add_match call accumulates a small likelihood increment, so the stored
# values express relative likelihood, not an absolute percentage.
# NOTE(review): presumably re-running this cell adds the same matches again,
# which would change every score shown below -- confirm.

users = [
    ('COMPUTER SCIENCE', 'INFORMATION TECHNOLOGY'),
    ('ACCOUNTING', 'BUSINESS'),
    ('BUSINESS ADMINSTRATION', 'FINANCE'),  # NOTE(review): 'ADMINSTRATION' spelling -- intentional inexact input? confirm
    ('MARKETING', 'FINANCE'),
    ('MATHEMATICS', 'FINANCE'),
    ('SOCIAL SCIENCE', 'CONSULATION'),  # NOTE(review): 'CONSULATION' spelling -- intentional inexact input? confirm
    ('LITERATURE', 'WRITING'),
    ('MANAGEMENT', 'BANKING'),
    ('SCIENCES', 'RESEARCH')
]

# Feed every observed pair into the matrix, in list order.
for course, industry in users:
    CourseIndustry.add_match(course, industry)

CPU times: user 28 ms, sys: 0 ns, total: 28 ms
Wall time: 26.2 ms


In [13]:
# get specific likelihood coefficient of a pair of clusters
# e.g. likelihood of courses from the BUSINESS industry

# note: 1.00 does not mean 100%
# likelihood metric is relative
# the more data, the more accurate the prediction

%time CourseIndustry.dataframe.BUSINESS.nlargest(10)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 790 µs


BUSINESS        1.000000
ACCOUNTING      1.000000
AGRIBUSINESS    0.751409
ACCOUNTANCY     0.592792
WRITING         0.164709
DRAFTING        0.140500
MARKETING       0.128460
NURSING         0.111017
ADVERTISING     0.077272
MARINE          0.072866
Name: BUSINESS, dtype: float64

In [14]:
%%time

# transpose data frame for rows
CourseIndustry.dataframe.T.SCIENCE.nlargest(6)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 988 µs


RESEARCH       0.903873
INFORMATION    0.608181
TECHNOLOGY     0.446791
CONSULTING     0.401750
CONSULTANCY    0.360313
EDUCATION      0.247390
Name: SCIENCE, dtype: float64

In [15]:
%%time

# Add the count from one specific matching pair.
# Per the author's note this is synonymous with:
#   CourseIndustry.dataframe += CourseIndustry.delta('PHYSICS', 'RESEARCH')

CourseIndustry.add_match('PHYSICS', 'RESEARCH')

CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 1.58 ms


In [16]:
%%time

# returns a vector of recommend likelihood of a course from a specific industry
# since courses are rows, we find column matches
# we can also make the method output percentages

matches = CourseIndustry.find_column_matches('PHYSICS', with_labels = True, percentage = True)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 436 µs


In [17]:
# Rank the recommendations and show the five with the highest percentage.
matches.nlargest(5)

RESEARCH       55.490816
MEDICAL         5.642393
ELECTRICAL      5.304976
ELECTRONICS     4.927573
FINANCE         4.836570
dtype: float64

In [18]:
%%time

# Score one specific (course, industry) pair directly.
# Per the author's note this is equivalent to SV1^T * LM * SV2^T,
# where SV is a similarity vector and LM is the likelihood matrix.
# NOTE(review): if SV1 and SV2 are both column vectors, the scalar form would be
# SV1^T * LM * SV2 -- confirm which orientation is meant.

CourseIndustry.recommendation_score('PHYSICS', 'SCIENTIFIC RESEARCH')

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 2.25 ms


56.11440517434135

In [19]:
%%time

# You can also get the dot product of data frames
# to get bridged likelihood

IndustryJob = LikelihoodMatrix(industry_clusters, job_industry_clusters, vectorizer='tfidf', ngram_range=(3,4))
CourseJob = CourseIndustry * IndustryJob

CPU times: user 28 ms, sys: 0 ns, total: 28 ms
Wall time: 77.1 ms


In [20]:
# Score a course against a job industry through the bridged matrix.
CourseJob.recommendation_score('PHYSICS', 'MANAGEMENT')

0.29094734615360518

In [21]:
# Score the COMPUTER SCIENCE / ARCHITECTURE pair through the bridged matrix.
CourseJob.recommendation_score('COMPUTER SCIENCE', 'ARCHITECTURE')

2.0380836694137852

In [22]:
# Score the COMPUTER SCIENCE / TECHNOLOGY pair before any matches are added to CourseJob.
CourseJob.recommendation_score('COMPUTER SCIENCE', 'TECHNOLOGY')

37.74019144084599

In [23]:
# After adding a match between a course and a job industry, the recommendation
# score for that pair is expected to go up.
CourseJob.add_match('COMPUTER SCIENCE', 'TECHNOLOGY')
CourseJob.add_match('PROGRAMMING', 'COMPUTER')

In [24]:
# Score the PROGRAMMING / COMPUTER pair after the add_match calls.
# NOTE(review): no score for this pair is shown before the add_match calls, so
# the increase cannot be read off the recorded outputs.
CourseJob.recommendation_score('PROGRAMMING', 'COMPUTER')

68.850745229040243