In [31]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [32]:
# 读入数据
car = pd.read_csv("D:/input/car.csv")
# 打印数据集的前五行
car.head(10)

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,acceptance
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc
5,vhigh,vhigh,2,2,med,high,unacc
6,vhigh,vhigh,2,2,big,low,unacc
7,vhigh,vhigh,2,2,big,med,unacc
8,vhigh,vhigh,2,2,big,high,unacc
9,vhigh,vhigh,2,4,small,low,unacc


In [33]:
# 查看数据集大小
car.shape

(1728, 7)

In [34]:
# 查看各个取值的数量分布
print(car["maint"].value_counts())

vhigh    432
high     432
med      432
low      432
Name: maint, dtype: int64


In [35]:
# 查看各个取值的数量分布
print(car["safety"].value_counts())

low     576
med     576
high    576
Name: safety, dtype: int64


In [36]:
# 查看各个取值的数量分布
print(car["acceptance"].value_counts())

unacc    1210
acc       384
good       69
vgood      65
Name: acceptance, dtype: int64


In [37]:
# 查看各个取值的数量分布
print(car["lug_boot"].value_counts())

small    576
med      576
big      576
Name: lug_boot, dtype: int64


In [38]:
# 查看各个取值的数量分布
print(car["persons"].value_counts())

2       576
4       576
more    576
Name: persons, dtype: int64


In [39]:
# 查看各个取值的数量分布
print(car["buying"].value_counts())

vhigh    432
high     432
med      432
low      432
Name: buying, dtype: int64


In [40]:
# 创建要进行数字编码的数据集
car_num_encoded = car.copy()
# 根据编码规则创建编码字典
acceptance_dict = {"unacc": 0, "acc": 1, "good": 2, "vgood": 3}
# 进行编码
car_num_encoded["acceptance"] = car_num_encoded["acceptance"].map(acceptance_dict)

In [41]:
# 查看编码列的前五行
car_num_encoded["acceptance"].head(10)

0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    0
9    0
Name: acceptance, dtype: int64

In [42]:
# 创建要进行数字编码的数据集
car_num_encoded = car.copy()
# 根据编码规则创建编码字典
buying_dict = {"low": 0, "med": 1, "high": 2, "vhigh": 3}
# 进行编码
car_num_encoded["buying"] = car_num_encoded["buying"].map(buying_dict)
# 查看编码列的前五行
car_num_encoded["buying"].head(10)

0    3
1    3
2    3
3    3
4    3
5    3
6    3
7    3
8    3
9    3
Name: buying, dtype: int64

In [43]:
# maint特征数字编码
maint_dict = {"low": 0, "med": 1, "high": 2, "vhigh": 3}
car_num_encoded["maint"] = car_num_encoded["maint"].map(maint_dict)

# lug_boot特征数字编码
lug_boot_dict = {"small": 0, "med": 1, "big": 2}
car_num_encoded["lug_boot"] = car_num_encoded["lug_boot"].map(lug_boot_dict)

# safety特征数字编码
safety_dict = {"low": 0, "med": 1, "high": 2}
car_num_encoded["safety"] = car_num_encoded["safety"].map(safety_dict)

In [44]:
car_num_encoded.head(20)

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,acceptance
0,3,3,2,2,0,0,unacc
1,3,3,2,2,0,1,unacc
2,3,3,2,2,0,2,unacc
3,3,3,2,2,1,0,unacc
4,3,3,2,2,1,1,unacc
5,3,3,2,2,1,2,unacc
6,3,3,2,2,2,0,unacc
7,3,3,2,2,2,1,unacc
8,3,3,2,2,2,2,unacc
9,3,3,2,4,0,0,unacc


In [45]:
# 将车门数量一列中，5more替换为6
car_num_encoded.doors.replace("5more", "6", inplace=True)
# 将核载人数一列中，more替换为6
car_num_encoded.persons.replace("more", "6", inplace=True)

In [46]:
print(car_num_encoded["doors"].value_counts())

print(car_num_encoded["persons"].value_counts())

2    432
3    432
4    432
6    432
Name: doors, dtype: int64
2    576
4    576
6    576
Name: persons, dtype: int64


In [47]:
print(car_num_encoded.doors.dtype, car_num_encoded.persons.dtype)

object object


In [48]:
# 将车门数量属性类型变更为整型
car_num_encoded["doors"] = car_num_encoded["doors"].astype("int")
# 将核载人数属性类型变更为整型
car_num_encoded["persons"] = car_num_encoded["persons"].astype("int")

In [49]:
print(car_num_encoded.doors.dtype, car_num_encoded.persons.dtype)

int32 int32


In [50]:
car_num_encoded.head(20)

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,acceptance
0,3,3,2,2,0,0,unacc
1,3,3,2,2,0,1,unacc
2,3,3,2,2,0,2,unacc
3,3,3,2,2,1,0,unacc
4,3,3,2,2,1,1,unacc
5,3,3,2,2,1,2,unacc
6,3,3,2,2,2,0,unacc
7,3,3,2,2,2,1,unacc
8,3,3,2,2,2,2,unacc
9,3,3,2,4,0,0,unacc


In [51]:
# 创建编码器
buying_encoder = LabelEncoder()
# 适配数据集并完成编码
encoded_buying = buying_encoder.fit_transform(car["buying"])

In [52]:
buying_encoder.inverse_transform([0, 1, 2, 3])

array(['high', 'low', 'med', 'vhigh'], dtype=object)

In [54]:
pd.get_dummies(car["buying"], prefix="buying_").head(10)

Unnamed: 0,buying__high,buying__low,buying__med,buying__vhigh
0,0,0,0,1
1,0,0,0,1
2,0,0,0,1
3,0,0,0,1
4,0,0,0,1
5,0,0,0,1
6,0,0,0,1
7,0,0,0,1
8,0,0,0,1
9,0,0,0,1


In [55]:
# 新建dataframe存储编码后的数据
car_onehot_encoded = pd.DataFrame()
# 对各个特征依次编码
for col in ["buying", "maint", "doors", "persons", "lug_boot", "safety"]:
    col_encoded = pd.get_dummies(car[col], prefix=col + "_")
    car_onehot_encoded = pd.concat([car_onehot_encoded, col_encoded], axis=1)

In [56]:
car_onehot_encoded = pd.concat([car_onehot_encoded, car["acceptance"]], axis=1)

In [57]:
car_onehot_encoded.head(20)

Unnamed: 0,buying__high,buying__low,buying__med,buying__vhigh,maint__high,maint__low,maint__med,maint__vhigh,doors__2,doors__3,...,persons__2,persons__4,persons__more,lug_boot__big,lug_boot__med,lug_boot__small,safety__high,safety__low,safety__med,acceptance
0,0,0,0,1,0,0,0,1,1,0,...,1,0,0,0,0,1,0,1,0,unacc
1,0,0,0,1,0,0,0,1,1,0,...,1,0,0,0,0,1,0,0,1,unacc
2,0,0,0,1,0,0,0,1,1,0,...,1,0,0,0,0,1,1,0,0,unacc
3,0,0,0,1,0,0,0,1,1,0,...,1,0,0,0,1,0,0,1,0,unacc
4,0,0,0,1,0,0,0,1,1,0,...,1,0,0,0,1,0,0,0,1,unacc
5,0,0,0,1,0,0,0,1,1,0,...,1,0,0,0,1,0,1,0,0,unacc
6,0,0,0,1,0,0,0,1,1,0,...,1,0,0,1,0,0,0,1,0,unacc
7,0,0,0,1,0,0,0,1,1,0,...,1,0,0,1,0,0,0,0,1,unacc
8,0,0,0,1,0,0,0,1,1,0,...,1,0,0,1,0,0,1,0,0,unacc
9,0,0,0,1,0,0,0,1,1,0,...,0,1,0,0,0,1,0,1,0,unacc


In [58]:
car_onehot_encoded.to_csv("D:/output/one-hot.csv")