In [3]:
import numpy as np
import pandas as pd
import sklearn.ensemble
import sklearn.model_selection
import sklearn.preprocessing

In [4]:
# Load the dataset from a path relative to the notebook's working directory;
# index_col=0 makes the first CSV column the DataFrame index.
df = pd.read_csv('data/成人收入数据集-筛选后.csv', index_col=0)

In [5]:
# Fit one encoder per categorical column on the full data frame. These
# module-level encoders are read by the process_* helpers defined below.
# '性别' (gender) -> integer codes.
gender_encoder = sklearn.preprocessing.LabelEncoder().fit(df['性别'])
# '学历' (education) -> one-hot vector; OneHotEncoder is fitted on a 2-D
# (n_samples, 1) array, hence the reshape.
education_encoder = sklearn.preprocessing.OneHotEncoder().fit(np.array(df['学历']).reshape(-1, 1))
# '职业' (occupation) -> integer codes; used as the classification target.
work_encoder = sklearn.preprocessing.LabelEncoder().fit(df['职业'])

In [6]:
def process_gender(gender_array):
    """Encode a sequence of gender labels with the fitted ``gender_encoder``.

    Args:
        gender_array: 1-D array-like of gender labels known to the encoder.

    Returns:
        The array of integer codes produced by ``gender_encoder.transform``.
    """
    encoded_genders = gender_encoder.transform(gender_array)
    return encoded_genders

def process_gender_once(gender):
    """Encode a single gender label with the fitted ``gender_encoder``.

    Args:
        gender: One gender label known to the encoder.

    Returns:
        The integer code for ``gender`` (a scalar, not an array).
    """
    # The encoder works on 1-D batches, so wrap the label in a one-item array
    # and unwrap the single result afterwards.
    single_item_batch = np.array([gender])
    codes = gender_encoder.transform(single_item_batch)
    return codes[0]

# Sanity check: encode the single label '男' (male); the cell displays its code.
process_gender_once('男')

1

In [7]:
def process_education(education_array):
    """One-hot encode a sequence of education labels.

    Args:
        education_array: 1-D array-like of education labels known to
            ``education_encoder``.

    Returns:
        A dense 2-D array with one row per input label.
    """
    # The encoder expects a 2-D (n_samples, 1) input.
    label_column = np.array(education_array).reshape(-1, 1)
    encoded = education_encoder.transform(label_column)
    # Densify the encoder output so it can be concatenated with plain arrays.
    return encoded.toarray()

def process_education_once(education):
    """One-hot encode a single education label.

    Args:
        education: One education label known to ``education_encoder``.

    Returns:
        The 1-D one-hot vector for ``education``.
    """
    # Build a (1, 1) column holding the label, encode it, and keep the only row.
    label_column = np.array([education]).reshape(-1, 1)
    dense_rows = education_encoder.transform(label_column).toarray()
    return dense_rows[0]

# Sanity check: one-hot encode the single label '本科' (bachelor's degree).
process_education_once('本科')

array([0., 0., 1., 0., 0.])

In [8]:
def process_work(work_array):
    """Encode a sequence of occupation labels with the fitted ``work_encoder``.

    Args:
        work_array: 1-D array-like of occupation labels known to the encoder.

    Returns:
        The array of integer codes produced by ``work_encoder.transform``.
    """
    encoded_occupations = work_encoder.transform(work_array)
    return encoded_occupations

# Sanity check: encode the whole '职业' (occupation) column.
process_work(df['职业'])

array([0, 0, 1, ..., 0, 0, 0])

In [9]:
def pre_process(height, weight, gender, age,
                education, education_time, working_hours):
    """Assemble the feature matrix for a batch of samples.

    Each argument is an array-like with one entry per sample. Numeric inputs
    become single columns, ``gender`` is integer-encoded via
    ``process_gender`` and ``education`` is one-hot encoded via
    ``process_education``.

    Returns:
        A 2-D array with one row per sample and columns in the order:
        height, weight, gender code, age, education one-hot columns,
        education_time, working_hours.
    """
    def as_column(values):
        # Turn any array-like into an (n, 1) column.
        return np.array(values).reshape(-1, 1)

    feature_blocks = [
        as_column(height),
        as_column(weight),
        process_gender(gender).reshape(-1, 1),
        as_column(age),
        process_education(education),  # already 2-D: one column per category
        as_column(education_time),
        as_column(working_hours),
    ]
    return np.concatenate(feature_blocks, axis=1)

# Build the feature matrix for the whole data frame. Columns passed, in order:
# '身高' (height), '体重' (weight), '性别' (gender), '年龄' (age),
# '学历' (education), '受教育时间(年)' (years of education),
# '每周工作时长' (weekly working hours).
pre_process(df['身高'],df['体重'],df['性别'],df['年龄'],
            df['学历'],df['受教育时间(年)'],df['每周工作时长'])

array([[156.92,  45.58,   1.  , ...,   0.  ,  13.  ,  40.  ],
       [151.45,  65.1 ,   1.  , ...,   0.  ,  13.  ,  13.  ],
       [166.05,  52.68,   1.  , ...,   1.  ,   9.  ,  40.  ],
       ...,
       [150.46,  42.72,   0.  , ...,   1.  ,   9.  ,  40.  ],
       [156.35,  74.07,   1.  , ...,   1.  ,   9.  ,  20.  ],
       [159.44,  77.18,   0.  , ...,   1.  ,   9.  ,  40.  ]])

In [10]:
def pre_process_once(height, weight, gender, age,
                     education, education_time, working_hours):
    """Assemble the feature vector for a single sample.

    Takes scalar values and raw ``gender`` / ``education`` labels, and lays
    the features out in the same column order as ``pre_process``.

    Returns:
        A 1-D array: height, weight, gender code, age, education one-hot
        entries, education_time, working_hours.
    """
    gender_code = process_gender_once(gender)
    education_flags = process_education_once(education).tolist()
    return np.array([height, weight, gender_code, age,
                     *education_flags,
                     education_time, working_hours])

# Sanity check: build the feature vector for one hand-written sample
# ('男' = male, '本科' = bachelor's degree).
pre_process_once(172,65,'男',19,'本科',12,40)

array([172.,  65.,   1.,  19.,   0.,   0.,   1.,   0.,   0.,  12.,  40.])

In [11]:
class Model:
    """Random-forest classifier that predicts the encoded occupation label.

    On construction the feature matrix and target vector are built from a
    data frame and split into train and test parts; the classifier is created
    but not fitted until ``fit`` is called.

    Attributes:
        data: Feature matrix produced by ``pre_process``.
        target: Occupation codes produced by ``process_work``.
        train_x, test_x, train_y, test_y: Result of ``train_test_split``.
        classifier: The underlying ``RandomForestClassifier``.
    """

    def __init__(self, data_frame=None, n_estimators=17, max_depth=6,
                 test_size=None, random_state=None):
        """Prepare the data split and create the (unfitted) classifier.

        Args:
            data_frame: Frame holding the columns '身高', '体重', '性别',
                '年龄', '学历', '受教育时间(年)', '每周工作时长' and '职业'.
                Defaults to the notebook-level ``df``.
            n_estimators: Number of trees in the forest.
            max_depth: Maximum depth of each tree.
            test_size: Forwarded to ``train_test_split``; ``None`` keeps
                scikit-learn's default split.
            random_state: Seed forwarded to both the split and the forest.
                ``None`` (the default) leaves both unseeded, as before; pass
                an int to make results reproducible across runs.
        """
        # Fall back to the global frame so that a bare ``Model()`` keeps working.
        source = df if data_frame is None else data_frame
        self.data = pre_process(source['身高'], source['体重'], source['性别'], source['年龄'],
                                source['学历'], source['受教育时间(年)'], source['每周工作时长'])
        self.target = process_work(source['职业'])
        (self.train_x, self.test_x,
         self.train_y, self.test_y) = sklearn.model_selection.train_test_split(
            self.data, self.target,
            test_size=test_size, random_state=random_state)
        self.classifier = sklearn.ensemble.RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            random_state=random_state)

    def fit(self):
        """Fit the classifier on the training split."""
        self.classifier.fit(self.train_x, self.train_y)

    def predict(self, height, weight, gender, age, education, education_time, working_hours):
        """Predict the occupation code for a single sample.

        Arguments are the raw scalar values and labels accepted by
        ``pre_process_once``.

        Returns:
            A one-element array holding the predicted occupation code, in the
            encoding produced by ``process_work``.
        """
        features = pre_process_once(height, weight, gender, age,
                                    education, education_time, working_hours)
        # The classifier expects a 2-D batch, so wrap the single feature vector.
        return self.classifier.predict([features])

In [12]:
# Build the model (this also performs the train/test split), then fit the
# underlying classifier directly on the training split. Calling the classifier
# rather than model.fit() makes the cell display the fitted estimator.
model = Model()
model.classifier.fit(model.train_x, model.train_y)

RandomForestClassifier(max_depth=6, n_estimators=17)

In [17]:
# Count how many test samples were assigned to each predicted occupation code.
np.bincount(model.classifier.predict(model.test_x))

array([6323, 1808,   10], dtype=int64)

In [25]:
# Score the fitted classifier on the training split.
model.classifier.score(model.train_x, model.train_y)

0.5811629811629812

In [26]:
# Score the fitted classifier on the held-out test split.
model.classifier.score(model.test_x, model.test_y)

0.5756049625353151

In [31]:
# NOTE(review): sklearn.naive_bayes is imported here but not referenced in the
# visible cells; the classifier below is still the random forest.
import sklearn.naive_bayes
# Refit the same classifier on the same training split and rescore it on the
# test split. The forest is created without a fixed random_state, so the score
# can differ from the earlier run.
model.classifier.fit(model.train_x, model.train_y)
model.classifier.score(model.test_x, model.test_y)


0.5709372312983663