In [1792]:
import numpy as np
import pandas as pd

In [1793]:
n_data = 99

In [1794]:
df = pd.read_csv('recruit.csv', usecols=range(0,20), nrows=n_data).sample(frac=1)

In [1795]:
df.columns = ['面试日期', '姓名', '目前所在公司', '学校', '专业', '学历', '工作年限', '应聘职位', \
 '职位细分', '专业技能', '面试形式', '面试官', \
 '面试结果', '手机', '邮箱', '详细内容', '沟通记录', '渠道', '具体来源', '对接HR']

In [1796]:
from sklearn import linear_model

In [1797]:
raw_labels = df[['面试结果']]

In [1798]:
drop_list = ['面试结果','姓名','手机','面试日期','详细内容','邮箱','沟通记录']

In [1799]:
raw_features = df.drop(drop_list, 1)

In [1800]:
features = [[] for i in range(n_data)]

In [1801]:
labels = [0 for i in range(n_data)]

In [1802]:
column_names = []

In [1803]:
column_values = []

In [1804]:
delimiter = {'学校':'/', \
             '目前所在公司':'/', \
             '专业':'/', \
             '学历':'/', \
             '面试官':'，', \
             '专业技能':'，' \
            }

In [1805]:
for col in raw_features.columns:
    col_list = []
    if col in delimiter.keys():
        for val in raw_features[col]:
            if pd.isnull(val):
                col_list.append(val)
                continue
            for v in val.split(delimiter[col]):
                col_list.append(v.strip())
    else:
        col_list = list(raw_features[col])
    col_set = set(col_list)
    col_set_len = len(col_set)
    col_map = dict(zip(col_set, range(col_set_len))) #map N to feature names
    column_names.extend([col]*col_set_len)
    column_values.extend(col_set)
    for i in range(n_data):
        features[i].extend([0]*col_set_len) #pad with zeros
        features[i][-(col_set_len-col_map[col_list[i]])] = 1 #set one
        
        

In [1806]:
for i in range(len(raw_labels['面试结果'])):
    if raw_labels['面试结果'][i] in ('通过', '录用'):
        labels[i] = 1

In [1807]:
train_size = 80

In [1808]:
model = linear_model.LogisticRegression(penalty="l2", class_weight=None)

In [1809]:
model.fit(features[:train_size], labels[:train_size])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [1810]:
coef = list(zip(column_names, column_values, model.coef_.flatten()))

In [1811]:
coef.sort(key=lambda x:x[-1], reverse=True)

In [1812]:
from sklearn import metrics

In [1813]:
predict = model.predict(features[train_size:])

In [1814]:
metrics.roc_auc_score(labels[train_size:], predict)

0.46875

In [1815]:
metrics.accuracy_score(labels[train_size:], predict)

0.78947368421052633

In [1816]:
metrics.recall_score(labels[train_size:], predict)

0.0

In [1817]:
predict

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1])

In [1818]:
model.predict_proba(features[train_size:])

array([[ 0.91131748,  0.08868252],
       [ 0.93441483,  0.06558517],
       [ 0.9522944 ,  0.0477056 ],
       [ 0.89442443,  0.10557557],
       [ 0.84395883,  0.15604117],
       [ 0.94173759,  0.05826241],
       [ 0.88486393,  0.11513607],
       [ 0.89836641,  0.10163359],
       [ 0.77421442,  0.22578558],
       [ 0.90848425,  0.09151575],
       [ 0.85626642,  0.14373358],
       [ 0.97773655,  0.02226345],
       [ 0.92361037,  0.07638963],
       [ 0.72227665,  0.27772335],
       [ 0.84966573,  0.15033427],
       [ 0.9315639 ,  0.0684361 ],
       [ 0.88111304,  0.11888696],
       [ 0.88486471,  0.11513529],
       [ 0.48457353,  0.51542647]])

In [1819]:
coef

[('面试官', 'teng.ren', 0.65801863327971788),
 ('专业', '统计学', 0.64440234003574814),
 ('渠道', '自投', 0.56705861960927972),
 ('目前所在公司', '百姓网', 0.51521371526843451),
 ('学校', '中国科学与技术', 0.51521371526843451),
 ('专业', '计算机软件与理论', 0.44307170525036327),
 ('应聘职位', '算法工程师', 0.43602166757117256),
 ('学校', '西安电子科技大学', 0.43377905385753407),
 ('学校', '黄山学院', 0.42311349131158904),
 ('专业', '机器学习', 0.42311349131158904),
 ('具体来源', '拉勾网', 0.42311349131158904),
 ('目前所在公司', '腾讯', 0.42171834326579588),
 ('学校', '华威大学', 0.42171834326579588),
 ('专业技能', '数据平台开发', 0.42171834326579588),
 ('具体来源', 'Topu', 0.42171834326579588),
 ('目前所在公司', '易迅', 0.40789168064032066),
 ('具体来源', '埃摩森', 0.40789168064032066),
 ('目前所在公司', '久振网络', 0.40367566174917952),
 ('学校', '清华大学', 0.40367566174917952),
 ('专业', '经济', 0.40367566174917952),
 ('应聘职位', '数据分析师', 0.38761285089895126),
 ('职位细分', '数据挖掘', 0.38455066568902657),
 ('面试形式', nan, 0.37724579991111951),
 ('专业', '软件工程', 0.37559257922891598),
 ('目前所在公司', '京东', 0.36354849910660569),
 ('目前所在公司',

In [1820]:
from sklearn.svm import SVC

In [1821]:
svm = SVC(class_weight=None, kernel='poly')
svm.fit(features[:train_size], labels[:train_size])
svm_predict = svm.predict(features[train_size:])

In [1822]:
metrics.roc_auc_score(labels[train_size:], svm_predict)

0.5

In [1823]:
metrics.accuracy_score(labels[train_size:], svm_predict)

0.84210526315789469

In [1824]:
metrics.recall_score(labels[train_size:], svm_predict)

0.0

In [1825]:
svm_predict

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [1826]:
labels

[1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0]