In [2]:
%%time
from main import *

raw_courses = read('data/courses.txt')
raw_industries = read('data/industries.txt')
raw_job_industries = read('data/job_industries.txt')

ImportError: cannot import name utils

In [None]:
%%time
# note: enchant module and library should be installed
from clean import clean

clean_courses = clean(raw_courses, 'course', 'data/course_dictionary.txt')
clean_industries = clean(raw_industries, 'industry', 'data/industry_dictionary.txt')
clean_job_industries = clean(raw_job_industries, 'industry', 'data/industry_dictionary.txt')

In [None]:
%%time
# note: stemming module should be installed
from clustering import stem_cluster
 
course_clusters = stem_cluster(clean_courses)
industry_clusters = stem_cluster(clean_industries)
job_industry_clusters = stem_cluster(clean_job_industries, mode=5)

In [None]:
# Preview: first ten entries of course_clusters in sorted order.
sorted(course_clusters)[0:10]

In [None]:
# Preview: first ten entries of industry_clusters in sorted order.
sorted(industry_clusters)[0:10]

In [None]:
# Preview: first ten entries of job_industry_clusters in sorted order.
sorted(job_industry_clusters)[0:10]

In [None]:
%%time
# note: pandas and sklearn modules should be installed
# initializes a likelihood data frame based on row and column labels.
# Performs a content-based filtering technique using features.
# vectorizer can be 'tfidf' or 'count'
from likelihood import likelihood_matrix

CourseIndustry = likelihood_matrix(course_clusters, industry_clusters, vectorizer='tfidf', ngram_range=(3,4))
CourseIndustry.head()

In [None]:
%%time

# note: pandas and sklearn modules should be installed
# derives the change in likelihood if an entry

from likelihood import delta_likelihood

# computes delta likelihood using the following steps: (non-verbatim)
# clabels = df.column_labels
# rlabels = df.row_labels
# vr = [cos_similarity(ur, label) for label in rlabels]
# vc = [cos_similarity(vr, label) for label in clabels]
# dL = vr * vc^T

d1 = delta_likelihood(CourseIndustry, 'CALLCENTE', 'BUSINESS')
d2 = delta_likelihood(CourseIndustry, 'MARKETING', 'RESEARCH & DEVELOPMENT')
d3 = delta_likelihood(CourseIndustry, 'AGRICULTURE', 'FINANCE')

In [None]:
%%time

# Dynamic update of the likelihood matrix: every (course, industry) pair
# listed below adds its delta_likelihood to CourseIndustry, so small
# likelihoods accumulate. Per the author, the result reflects relative
# likelihood, not an absolute percentage.
# NOTE: the update is in place and cumulative -- re-running this cell adds
# the same deltas to CourseIndustry again.
# NOTE(review): 'BUSINESS ADMINSTRATION' and 'CONSULATION' look misspelled;
# confirm whether that is deliberate before changing them.

users = [
    ('COMPUTER SCIENCE', 'INFORMATION TECHNOLOGY'),
    ('ACCOUNTING', 'BUSINESS'),
    ('BUSINESS ADMINSTRATION', 'FINANCE'),
    ('MARKETING', 'FINANCE'),
    ('MATHEMATICS', 'FINANCE'),
    ('SOCIAL SCIENCE', 'CONSULATION'),
    ('LITERATURE', 'WRITING'),
    ('MANAGEMENT', 'BANKING'),
    ('SCIENCES', 'RESEARCH')
]

# Each delta is computed from the matrix as already updated by earlier pairs,
# so the order of `users` is part of the computation.
for course, industry in users:
    CourseIndustry += delta_likelihood(CourseIndustry, course, industry)

In [None]:
# get specific likelihood coefficient of a pair of clusters
# e.g. likelihood of courses from the BUSINESS industry

# note: 1.00 does not mean 100%
# likelihood metric is relative
# the more data, the more accurate the prediction

%time CourseIndustry.BUSINESS.nlargest(10)

In [None]:
%%time

# transpose data frame for rows
CourseIndustry.transpose().SCIENCE.nlargest(6)

In [None]:
%%time
# to get probability distribution of a column/row,
# just divide row by sum of its elements

CourseIndustry.TECHNOLOGY.multiply(100.0 /sum(CourseIndustry.TECHNOLOGY)).nlargest(20)

In [None]:
# Data frames can also be combined with a dot product to obtain a bridged
# likelihood; this cell builds the industry / job-industry frame for that.

IndustryJob = likelihood_matrix(
    industry_clusters,
    job_industry_clusters,
    vectorizer='tfidf',
    ngram_range=(3, 4),
)

In [None]:
# Bridged likelihood via the dot product of the two frames:
# 20 largest values in the RETAIL column of the result.
CourseIndustry.dot(IndustryJob)['RETAIL'].nlargest(20)