In [1]:
%%time
# Import only the loader this notebook uses, so the origin of `read` is
# explicit and no other names from `main` leak into the notebook namespace.
from main import read

# Read the three raw input files from the local data directory.
raw_courses = read('data/courses.txt')
raw_industries = read('data/industries.txt')
raw_job_industries = read('data/job_industries.txt')

CPU times: user 30.6 ms, sys: 3.41 ms, total: 34 ms
Wall time: 33.3 ms


In [2]:
%%time
# Requires the `enchant` Python module and the underlying Enchant library.
from clean import clean

# Each call takes the raw input, a label kind ('course' or 'industry') and the
# path of the dictionary file to use for that kind.
clean_courses = clean(
    raw_courses,
    'course',
    'data/course_dictionary.txt',
)
# Both industry inputs share the same kind and the same dictionary file.
clean_industries = clean(
    raw_industries,
    'industry',
    'data/industry_dictionary.txt',
)
clean_job_industries = clean(
    raw_job_industries,
    'industry',
    'data/industry_dictionary.txt',
)

CPU times: user 12.6 s, sys: 94.2 ms, total: 12.7 s
Wall time: 12.7 s


In [3]:
%%time
# Requires the stemming module.
from clustering import stem_cluster

# Courses and industries are clustered with stem_cluster's default mode,
# in that order.
course_clusters, industry_clusters = (
    stem_cluster(cleaned_labels)
    for cleaned_labels in (clean_courses, clean_industries)
)
# Job industries are the only input clustered with an explicit mode.
# NOTE(review): the meaning of mode=5 is defined inside clustering.stem_cluster
# and is not visible from this notebook -- document why 5 is used here.
job_industry_clusters = stem_cluster(clean_job_industries, mode=5)

CPU times: user 73.8 ms, sys: 93 µs, total: 73.9 ms
Wall time: 61 ms


In [4]:
# Preview the first ten entries of course_clusters in sorted order.
sorted(course_clusters)[:10]

['ACCOUNTING',
 'ADMINISTRATION',
 'ADVERTISING',
 'AGRIBUSINESS',
 'AGRICULTURE',
 'ANIMATION',
 'APPLIED',
 'ARCHITECTURE',
 'ART',
 'ASSOCIATES']

In [5]:
# Preview the first ten entries of industry_clusters in sorted order.
sorted(industry_clusters)[:10]

['ACCOUNTING',
 'ADVERTISING',
 'AGENCY',
 'ASSISTANT',
 'AUTOMOTIVE',
 'BANKING',
 'BEAUTY',
 'BPO',
 'BROKERAGE',
 'BUILDING']

In [6]:
# Preview the first ten entries of job_industry_clusters in sorted order.
sorted(job_industry_clusters)[:10]

['BUSINESS',
 'CALLCENTER',
 'COMPUTER',
 'CONSTRUCTION',
 'DEPARTMENT',
 'DESIGN',
 'DEVELOPMENT',
 'DISTRIBUTION',
 'EDUCATION',
 'ENGINE']

In [7]:
%%time
# Requires pandas and scikit-learn.
#
# likelihood_matrix initialises a likelihood data frame from row labels and
# column labels, using a content-based filtering technique over features
# extracted from the labels.
from likelihood import likelihood_matrix

# Rows are course clusters, columns are industry clusters.
CourseIndustry = likelihood_matrix(
    course_clusters,
    industry_clusters,
    vectorizer='tfidf',  # accepted values: 'tfidf' or 'count'
    ngram_range=(3, 4),
)
CourseIndustry.head()

CPU times: user 335 ms, sys: 36.2 ms, total: 371 ms
Wall time: 379 ms


In [8]:
%%time

# Requires pandas and scikit-learn.
#
# delta_likelihood derives the change in likelihood for a single
# (course, industry) entry, given the current likelihood frame.
from likelihood import delta_likelihood

# Outline of the computation (paraphrased, not the literal implementation):
#   clabels = df.column_labels
#   rlabels = df.row_labels
#   vr = [cos_similarity(ur, label) for label in rlabels]
#   vc = [cos_similarity(vr, label) for label in clabels]
#   dL = vr * vc^T
# NOTE(review): the `vc` line compares `vr` against the column labels; this
# looks like a slip for the entry's column label (e.g. `uc`) -- confirm
# against likelihood.delta_likelihood.

# Three example deltas, one per (course label, industry label) pair.
# NOTE(review): 'CALLCENTE' looks like a truncated 'CALLCENTER' -- confirm
# whether the misspelling is intentional before correcting it.
d1, d2, d3 = (
    delta_likelihood(CourseIndustry, course_label, industry_label)
    for course_label, industry_label in (
        ('CALLCENTE', 'BUSINESS'),
        ('MARKETING', 'RESEARCH & DEVELOPMENT'),
        ('AGRICULTURE', 'FINANCE'),
    )
)

CPU times: user 10.4 ms, sys: 4.06 ms, total: 14.5 ms
Wall time: 14.2 ms


In [9]:
%%time

# Dynamic update of the likelihood matrix: every observed (course, industry)
# pair contributes a small delta that is accumulated into CourseIndustry.
# The resulting scores are relative likelihoods, not absolute percentages.
#
# NOTE: this cell is not idempotent -- each run adds the deltas again on top
# of the current CourseIndustry, so re-running it changes every later result.

# Sample (course, industry) observations.
# NOTE(review): 'BUSINESS ADMINSTRATION' and 'CONSULATION' look misspelled
# ('ADMINISTRATION', 'CONSULTATION') -- confirm whether this is deliberate
# noisy input before correcting, because these strings are the inputs that
# delta_likelihood scores.
users = [
    ('COMPUTER SCIENCE', 'INFORMATION TECHNOLOGY'),
    ('ACCOUNTING', 'BUSINESS'),
    ('BUSINESS ADMINSTRATION', 'FINANCE'),
    ('MARKETING', 'FINANCE'),
    ('MATHEMATICS', 'FINANCE'),
    ('SOCIAL SCIENCE', 'CONSULATION'),
    ('LITERATURE', 'WRITING'),
    ('MANAGEMENT', 'BANKING'),
    ('SCIENCES', 'RESEARCH')
]

# Each delta is computed from CourseIndustry as already updated by the
# preceding pairs; whether the order of `users` affects the result depends on
# how delta_likelihood uses the frame's values -- TODO confirm.
for course, industry in users:
    CourseIndustry += delta_likelihood(CourseIndustry, course, industry)

CPU times: user 37 ms, sys: 4.04 ms, total: 41 ms
Wall time: 39 ms


In [10]:
# get specific likelihood coefficient of a pair of clusters
# e.g. likelihood of courses from the BUSINESS industry

# note: 1.00 does not mean 100%
# likelihood metric is relative
# the more data, the more accurate the prediction

%time CourseIndustry.BUSINESS.nlargest(10)

CPU times: user 984 µs, sys: 0 ns, total: 984 µs
Wall time: 829 µs


BUSINESS        1.000000
ACCOUNTING      1.000000
AGRIBUSINESS    0.747282
WRITING         0.159128
DRAFTING        0.135685
MARKETING       0.124098
NURSING         0.117439
ADVERTISING     0.082212
MARINE          0.074211
MEDICINE        0.060449
Name: BUSINESS, dtype: float64

In [11]:
%%time

# Rank the industries for the SCIENCE course row.
# The row is selected directly by label with `.loc`, which avoids transposing
# the entire frame just to read a single row.
CourseIndustry.loc['SCIENCE'].nlargest(6)

CPU times: user 1.73 ms, sys: 0 ns, total: 1.73 ms
Wall time: 1.51 ms


RESEARCH        0.945460
INFORMATION     0.627905
TECHNOLOGY      0.464770
CONSULTING      0.426784
EDUCATION       0.243047
CONSTRUCTION    0.239345
Name: SCIENCE, dtype: float64

In [21]:
%%time
# To turn a column (or row) into a percentage distribution, scale it by
# 100 / (sum of its elements) so that the values add up to 100.
#
# Series.sum() is used instead of the builtin sum(): it is the vectorised
# pandas reduction and, by default, skips missing values rather than letting a
# single NaN turn the whole distribution into NaN.
(
    CourseIndustry['TECHNOLOGY']
    .multiply(100.0 / CourseIndustry['TECHNOLOGY'].sum())
    .nlargest(20)
)

CPU times: user 1.13 ms, sys: 17 µs, total: 1.15 ms
Wall time: 1.07 ms


TECHNOLOGY           21.797430
COMPUTER             10.923265
SCIENCE              10.130794
CRIMINOLOGY           8.025756
TECHNICAL             7.731569
BIOLOGY               7.501088
TECHNICIAN            6.793365
THEOLOGY              6.596293
SOCIOLOGY             6.144819
PSYCHOLOGY            5.457806
MECHANIC              1.430608
ARCHITECTURE          1.116272
INTERDISCIPLINARY     0.930924
FINANCE               0.925809
COMMERCE              0.709077
MAINTENANCE           0.652396
COMMERCIAL            0.611325
THEATER               0.597610
COMMUNICATION         0.568613
CALLCENTER            0.471013
Name: TECHNOLOGY, dtype: float64

In [30]:
# Likelihood frames can be combined with a dot product to obtain a "bridged"
# likelihood. This builds the second frame needed for that: industry clusters
# as rows, job-industry clusters as columns.

IndustryJob = likelihood_matrix(
    industry_clusters,
    job_industry_clusters,
    vectorizer='tfidf',
    ngram_range=(3, 4),
)

CPU times: user 18.2 ms, sys: 0 ns, total: 18.2 ms
Wall time: 16.9 ms


In [33]:
# Bridge courses to job industries through the shared industry axis
# (CourseIndustry . IndustryJob), then list the 20 courses with the highest
# bridged likelihood for the RETAIL job-industry column.
(
    CourseIndustry
    .dot(IndustryJob)['RETAIL']
    .nlargest(20)
)

SECRETARIAL          0.258991
CALLCENTER           0.017504
ENVIRONMENTAL        0.015395
ELEMENTARY           0.010261
MANAGEMENT           0.010064
DEVELOPMENT          0.009229
INTERNATIONAL        0.007866
MAINTENANCE          0.006626
INTERDISCIPLINARY    0.006152
TRAINING             0.004875
ADVERTISING          0.004162
TRANSPORTATION       0.003814
THEATER              0.003257
ENTREPRENEURSHIP     0.003023
COMPUTER             0.002832
LITERATURE           0.002542
ANIMATION            0.000000
ENGINEER             0.000000
FINANCE              0.000000
ELECTRONIC           0.000000
Name: RETAIL, dtype: float64