In [51]:
from sklearn import preprocessing
from sklearn.naive_bayes import GaussianNB

filename = 'adult.data.txt'
x_data = []
less_than_50k = 0
more_than_50k = 0
unsure = 0

with open(filename,'r') as f:
    for line in f:
        if '?' in line:
            unsure += 1
            continue
        data = line.strip().split(', ')
        x_data.append(data)
#         print(data[-1])
        if data[-1] == '<=50K':
            less_than_50k += 1
        else:
            more_than_50k += 1

            
print('unsure:',unsure)
print('less_than_50k:',less_than_50k)
print('more_than_50k:',more_than_50k)
        

unsure: 2399
less_than_50k: 22654
more_than_50k: 7509


AttributeError: 'list' object has no attribute 'shape'

如果大部分数据点都属于一个类型，那么分类器就会倾向于这个类型。     
因此最好使用每个类型数据点相等的数据进行训练。     
对上述程序进行部分改动

In [54]:
from sklearn import preprocessing
from sklearn.naive_bayes import GaussianNB
import numpy as np

filename = 'adult.data.txt'
x_data = []
less_than_50k = 0
more_than_50k = 0
unsure = 0

with open(filename,'r') as f:
    for line in f:
        if '?' in line:
            unsure += 1
            continue
        data = line.strip().split(', ')
        if data[-1] == '<=50K' and less_than_50k < 10000:
            x_data.append(data)
            less_than_50k += 1
        elif data[-1] == '>50K' and more_than_50k < 10000: 
            x_data.append(data)
            more_than_50k += 1
        if less_than_50k >= 10000 and more_than_50k >= 10000:
            break

            
print('unsure:',unsure)
print('less_than_50k:',less_than_50k)
print('more_than_50k:',more_than_50k)

x_data = np.array(x_data)


unsure: 2399
less_than_50k: 10000
more_than_50k: 7508
['State-gov' 'Self-emp-not-inc' 'Private' ... 'Private' 'Private'
 'Self-emp-inc']


对于这个数据集，有14个影响收入的属性，属性既包括字符串也包括数字。     
数值数据是有价值的，这种情况下，不能使用编码器进行编码，需要设计一套既可以处理数值数据，也可以处理非数值数据的系统。     
我们需要把字符串属性转换成数值数据，同时需要保留原来的数值数据

In [55]:
label_encoder = []
x_encoded = np.empty(x_data.shape)
for i,item in enumerate(x_data[0]):
    if item.isdigit():
        x_encoded[:,i] = x_data[:,i]
    else:
        label_encoder.append(preprocessing.LabelEncoder())
        x_encoded[:,i] = label_encoder[-1].fit_transform(x_data[:,i])
        
x_values = x_encoded[:,:-1].astype(int)
y_values = x_encoded[:,-1].astype(int)
print(y_values)

[0 0 0 ... 1 1 1]


In [41]:
# 建立分类器
classifier = GaussianNB()

In [42]:
# 交叉验证
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test = train_test_split(x_values,y_values,test_size = 0.25,random_state = 5)
classifier.fit(x_train,y_train)
y_test_pred = classifier.predict(x_test)

In [43]:
# 计算分类器的F1得分
from sklearn.model_selection import cross_val_score
f1 = cross_val_score(classifier,x_values,y_values,scoring='f1_weighted',cv=5)
print('F1:',round(100*f1.mean(),2),'%')

F1: 74.98 %


In [57]:
# 对单一数据点进行编码测试
input_data = ['39', 'State-gov', '77516', 'Bachelors', '13', 
              'Never-married', 'Adm-clerical', 'Not-in-family', 'White', 'Male', '2174', '0', '40', 'United-States'] 

input_data_encoded = [-1]*len(input_data)
count = 0

for i,item in enumerate(input_data):
    contain = []
    contain.append(item)
    if item.isdigit():
        input_data_encoded[i] = int(item) # 注意，如果不化为int型则会报错
    else:    
        input_data_encoded[i] = int(label_encoder[count].transform(contain)) 
        count += 1
        
input_data_encoded = np.array(input_data_encoded).reshape(1,-1)

output_class = classifier.predict(input_data_encoded)
print('Output class:',label_encoder[-1].inverse_transform(output_class))

Output class: ['>50K']
