# 葡萄酒分类数据集-数据处理Demo
参考：https://blog.csdn.net/u012735708/article/details/84000262

In [39]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression   #逻辑斯特回归，线性分类
from sklearn.linear_model import SGDClassifier        #随机梯度参数估计
from sklearn.svm import LinearSVC                     #支持向量机
from sklearn.naive_bayes import MultinomialNB         #朴素贝叶斯
from sklearn.neighbors import KNeighborsClassifier    #K近邻
from sklearn.tree import DecisionTreeClassifier       #决策树
from sklearn.ensemble import RandomForestClassifier   #随机森林
from sklearn.ensemble import GradientBoostingClassifier   #梯度提升决策树
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import ExtraTreesClassifier
 
from sklearn.preprocessing import MinMaxScaler   #最大最小归一化
from sklearn.preprocessing import StandardScaler   #标准化
from scipy.stats import pearsonr                    #皮尔森相关系数
from sklearn.model_selection import train_test_split     #划分数据集
from sklearn.model_selection import cross_val_score   
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
#计算排列和组合数所需要的包
from itertools import combinations
from scipy.special import comb


## 1、数据分析
当我们拿到一批原始的数据，首先要明确基本信息。（利用pandas、matplotlib.pyplot、seaborn）  

例如：样本数，特征维度，特征类型，各类别分布占比  
- 看数据：data.shape、data.head()、data.describe()  
- 看特征：data.info()、data.dtypes  
- 看类别分布是否均衡：data['category'].value_counts()  

In [3]:
import pandas as pd
# Column names follow the pattern "<index><name>"; the last column is the class label.
# NOTE: several names keep a trailing space (e.g. '1Malic acid ', '12Proline ').
# The feature slice in a later cell uses these exact strings, so do not strip them here.
columns=['0Alcohol','1Malic acid ','2Ash','3Alcalinity of ash',
         '4Magnesium','5Total phenols','6Flavanoid',
         '7Nonflavanoid phenols','8Proanthocyanins ','9Color intensity ','10Hue ','11OD280/OD315 of diluted wines' ,'12Proline ','13category']
# The CSV is read without a header row; `columns` supplies the names.
data= pd.read_csv("wine.csv",header=None,names=columns)

In [4]:
data.shape 

# (178, 14): 178 samples in total, 13 feature columns and 1 label column

(178, 14)

In [5]:
# Preview the first five rows to sanity-check the parsed columns.
data.head(5)

Unnamed: 0,0Alcohol,1Malic acid,2Ash,3Alcalinity of ash,4Magnesium,5Total phenols,6Flavanoid,7Nonflavanoid phenols,8Proanthocyanins,9Color intensity,10Hue,11OD280/OD315 of diluted wines,12Proline,13category
0,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065,1
1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050,1
2,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185,1
3,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480,1
4,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735,1


In [6]:
data.info() # summary of the frame: per-column dtype and non-null count

# Feature column dtypes: float64(11), int64(2)
# Label column dtype: int64

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 14 columns):
0Alcohol                          178 non-null float64
1Malic acid                       178 non-null float64
2Ash                              178 non-null float64
3Alcalinity of ash                178 non-null float64
4Magnesium                        178 non-null int64
5Total phenols                    178 non-null float64
6Flavanoid                        178 non-null float64
7Nonflavanoid phenols             178 non-null float64
8Proanthocyanins                  178 non-null float64
9Color intensity                  178 non-null float64
10Hue                             178 non-null float64
11OD280/OD315 of diluted wines    178 non-null float64
12Proline                         178 non-null int64
13category                        178 non-null int64
dtypes: float64(11), int64(3)
memory usage: 19.5 KB


In [11]:
data.dtypes # dtype of each column

0Alcohol                          float64
1Malic acid                       float64
2Ash                              float64
3Alcalinity of ash                float64
4Magnesium                          int64
5Total phenols                    float64
6Flavanoid                        float64
7Nonflavanoid phenols             float64
8Proanthocyanins                  float64
9Color intensity                  float64
10Hue                             float64
11OD280/OD315 of diluted wines    float64
12Proline                           int64
13category                          int64
dtype: object

In [8]:
data.describe() # summary statistics for every column

# What this tells us:
# 1. No feature has missing values
# 2. The numeric features are on different scales
# 3. Their variance also differs widely

Unnamed: 0,0Alcohol,1Malic acid,2Ash,3Alcalinity of ash,4Magnesium,5Total phenols,6Flavanoid,7Nonflavanoid phenols,8Proanthocyanins,9Color intensity,10Hue,11OD280/OD315 of diluted wines,12Proline,13category
count,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0
mean,13.000618,2.336348,2.366517,19.494944,99.741573,2.295112,2.02927,0.361854,1.590899,5.05809,0.957449,2.611685,746.893258,1.938202
std,0.811827,1.117146,0.274344,3.339564,14.282484,0.625851,0.998859,0.124453,0.572359,2.318286,0.228572,0.70999,314.907474,0.775035
min,11.03,0.74,1.36,10.6,70.0,0.98,0.34,0.13,0.41,1.28,0.48,1.27,278.0,1.0
25%,12.3625,1.6025,2.21,17.2,88.0,1.7425,1.205,0.27,1.25,3.22,0.7825,1.9375,500.5,1.0
50%,13.05,1.865,2.36,19.5,98.0,2.355,2.135,0.34,1.555,4.69,0.965,2.78,673.5,2.0
75%,13.6775,3.0825,2.5575,21.5,107.0,2.8,2.875,0.4375,1.95,6.2,1.12,3.17,985.0,3.0
max,14.83,5.8,3.23,30.0,162.0,3.88,5.08,0.66,3.58,13.0,1.71,4.0,1680.0,3.0


In [9]:
data['13category'].value_counts() # number of samples per value of the label column

# 3 classes with 59, 71 and 48 samples respectively; the classes are fairly balanced

2    71
1    59
3    48
Name: 13category, dtype: int64

In [13]:
data['13category'].unique() # distinct values taken by the label column

array([1, 2, 3], dtype=int64)

In [None]:
data.corr() # correlation matrix: the correlation coefficient between every pair of columns
# This gets slow on large data, so avoid it when the feature dimension is high

In [14]:
data.isnull().sum()  # number of missing values in each column

0Alcohol                          0
1Malic acid                       0
2Ash                              0
3Alcalinity of ash                0
4Magnesium                        0
5Total phenols                    0
6Flavanoid                        0
7Nonflavanoid phenols             0
8Proanthocyanins                  0
9Color intensity                  0
10Hue                             0
11OD280/OD315 of diluted wines    0
12Proline                         0
13category                        0
dtype: int64

In [15]:
data.isnull().sum(axis=1) # number of missing values in each row

0      0
1      0
2      0
3      0
4      0
5      0
6      0
7      0
8      0
9      0
10     0
11     0
12     0
13     0
14     0
15     0
16     0
17     0
18     0
19     0
20     0
21     0
22     0
23     0
24     0
25     0
26     0
27     0
28     0
29     0
      ..
148    0
149    0
150    0
151    0
152    0
153    0
154    0
155    0
156    0
157    0
158    0
159    0
160    0
161    0
162    0
163    0
164    0
165    0
166    0
167    0
168    0
169    0
170    0
171    0
172    0
173    0
174    0
175    0
176    0
177    0
Length: 178, dtype: int64

In [22]:
# Separate the feature columns from the label column.
# .loc label slicing includes the end column; note the trailing space in '12Proline ',
# which matches the column name exactly as it was declared when the CSV was loaded.
features = data.loc[:, '0Alcohol':'12Proline ']
labels = data['13category']

## 2、数据预处理

1） 数据类型分类处理转换  
2） 缺失值处理  
3） 异常值处理  
4） 无量纲化（规范化）  

### 缺失值统计与处理：

In [12]:
# Inspect missing values per column, most affected columns first.
# A bare expression in the middle of a cell is evaluated and then discarded,
# so the result is printed explicitly.
print(data.isnull().sum().sort_values(ascending=False))

# Find the columns whose missing rate (nulls / total rows) is at least 90%.
# This has to run BEFORE imputation: once the gaps are filled every rate is 0
# and the check can never flag anything.
missing_rate = data.isnull().mean()
isnull_cols = missing_rate[missing_rate >= 0.9].index.tolist()
feature_selected = [col for col in data.columns if col not in isnull_cols]
print('There exists ', len(isnull_cols), ' columns with much null value.')
# Drop the columns that are almost entirely missing
data = data[feature_selected]

# Fill the remaining gaps with each column's median.
# numeric_only=True keeps this working when non-numeric columns are present.
data = data.fillna(data.median(numeric_only=True))

# `train` is the cleaned frame that later cells work on
train = data


# Collect the feature columns whose missing rate exceeds a threshold, wrapped as a function.
# Dropping such nearly-empty columns has little effect on the results and saves
# time and memory, so using it is recommended.
def null_ratio(df, threshold=0.9):
    """Split the columns of ``df`` by their fraction of missing values.

    Args:
        df: pandas DataFrame to inspect.
        threshold: a column whose missing fraction is strictly greater than
            this value is reported for dropping (default 0.9).

    Returns:
        A ``(feature_selected, drop_index)`` tuple. ``feature_selected`` is a
        list of ``(column, missing_fraction)`` pairs for the columns to keep;
        ``drop_index`` is the list of column labels to drop.
    """
    n_rows = len(df)
    feature_selected = []
    drop_index = []
    for feat in df.columns:
        # Count the nulls directly instead of materialising a filtered copy
        # of the whole frame for every column.
        n_null = int(df[feat].isnull().sum())
        # A frame without rows has nothing missing; this also avoids 0/0.
        ratio = float(n_null) / n_rows if n_rows else 0.0
        if ratio > threshold:
            drop_index.append(feat)
        else:
            feature_selected.append((feat, ratio))
    return feature_selected, drop_index

0Alcohol                          0
1Malic acid                       0
2Ash                              0
3Alcalinity of ash                0
4Magnesium                        0
5Total phenols                    0
6Flavanoid                        0
7Nonflavanoid phenols             0
8Proanthocyanins                  0
9Color intensity                  0
10Hue                             0
11OD280/OD315 of diluted wines    0
12Proline                         0
13category                        0
dtype: int64

### 根据数据类型分别处理：
1字符串特征：label编码  
2数值特征：数据变换，基于多项式、指数函数、对数函数等  
3低基数类别特征（定性特征）：哑编码 和 one-hot编码  
4高基数类别特征：先降维，再编码  
5定量特征：二值化  
6时间特征：日期季度时间等划分（参考：https://blog.csdn.net/JR_lu/article/details/52987573?locationNum=3&fps=1）  

In [None]:
# NOTE: this cell is a collection of reference snippets. `df` is not defined in any
# visible cell of this notebook; substitute your own frame and column names before running.
# Imports come first so that every name is defined before it is used.
from scipy import sparse
from sklearn.preprocessing import Binarizer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# 1 Discretise a continuous feature into bins
df['pv_bins']=pd.cut(df['item_pv_level'],bins=[0,5,10,15,20]).astype('str')
df['pv_bins']=LabelEncoder().fit_transform(df['pv_bins'])

# 2 Binarise a continuous feature
# Pick a threshold: values above it become 1, values less than or equal to it become 0
# Variant 1:
df['item_pv_level']=Binarizer(threshold=10).fit_transform(df['item_pv_level'].values.reshape(-1,1))
# Variant 2:
features_new = Binarizer(threshold=3).fit_transform(features)

# 3 Label-encode string features
for feature in features.columns:
    data[feature] = LabelEncoder().fit_transform(data[feature])

# 4 Dummy (one-hot) encode categorical features
# Variant 1:
# If the feature is not numeric, encode it with LabelEncoder first
# (the original note: OneHotEncoder can only handle numeric input),
# then apply OneHotEncoder, which produces a sparse representation,
# then use sparse.hstack to join the continuous features with the sparse ones.
# `feature` below is whatever the loop above left it at, i.e. the last feature column.
data[feature] = LabelEncoder().fit_transform(data[feature].values)
enc = OneHotEncoder()
enc.fit(data[feature].values.reshape(-1, 1))
train_a=enc.transform(data[feature].values.reshape(-1, 1))
data_new= sparse.hstack((data, train_a))

# Variant 2: pd.get_dummies() (builds a dense matrix directly, so memory cost is high)
# Derives one 0/1 column per distinct value of the categorical feature
data=pd.get_dummies(df,columns=['user_gender_id'],dummy_na=True)
# Wrapped as a function:
def dumyuserfeature(train, max_levels=20, missing_value=-999, min_valid_ratio=0.1):
    """Append dummy (0/1) columns for every low-cardinality column of ``train``.

    Args:
        train: pandas DataFrame; it is not modified.
        max_levels: only columns with fewer distinct values than this are
            expanded (default 20; the original note says 10 or 15 behave the same).
        missing_value: sentinel compared against each value to decide whether
            it counts as valid (default -999).
        min_valid_ratio: a column is skipped when fewer than this fraction of
            its rows differ from ``missing_value`` (default 0.1).

    Returns:
        A copy of ``train`` with the dummy columns joined on the right. The new
        columns are named ``'<col>__<value>'``: the prefix already ends in an
        underscore and ``pd.get_dummies`` inserts its own separator.
    """
    train_copy = train.copy()
    n_rows = len(train)
    for col in train.columns:
        # Number of distinct (non-null) values in this column
        if train[col].nunique() >= max_levels:
            continue
        # Skip columns that consist almost entirely of the sentinel value
        n_valid = (train[col] != missing_value).sum()
        if n_valid < n_rows * min_valid_ratio:
            continue
        # str(col) keeps non-string column labels (e.g. integers) from raising
        # TypeError when the prefix is built.
        dummies = pd.get_dummies(train[col], prefix=str(col) + '_')
        # join merges the new columns into the frame on the shared row index
        train_copy = train_copy.join(dummies)
    return train_copy

# 5 Data transformations
# `preprocessing` is not brought in by the notebook's top import cell, so import it here;
# without this the two calls below raise NameError.
from sklearn import preprocessing
# 5.1 Polynomial expansion (combines the features within each sample row)
features_new = preprocessing.PolynomialFeatures().fit_transform(features)
# 5.2 Transformation with a custom function, using log1p as the example
features_new = preprocessing.FunctionTransformer(np.log1p).fit_transform(features)

### 规范化处理：（一般在划分数据集之后再标准化或归一化）
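（注意区分：标准化和区间缩放都按列（特征）处理，归一化（Normalizer）按行（样本）处理）  
- 区间缩放（MinMaxScaler）按特征矩阵的列处理数据，把每个特征线性映射到[0,1]或者[-1, 1]等固定区间内，从而消除特征之间的量纲差异  
- 归一化（Normalizer）按行处理数据，把每个样本的特征向量缩放为单位范数（“单位向量”）  
- 标准化（StandardScaler）是依照特征矩阵的列处理数据，其通过求z-score的方法，把每个特征转换为均值为0、方差为1的分布（只有当特征本身服从正态分布时，结果才是标准正态分布），和整体样本分布相关，每个样本点都能对标准化产生影响。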

In [None]:
from sklearn.preprocessing import MinMaxScaler,Normalizer,StandardScaler
# Three alternative scalers shown side by side; each assignment replaces the
# previous one, so only the last `scaler` survives if the cell is run as is.
scaler=StandardScaler() # zero mean, unit variance
scaler=MinMaxScaler(feature_range=(0, 1)) # rescale to [0,1] (any other fixed min/max range also works)
scaler=Normalizer(norm='l2') # 'l1', 'l2', or 'max', optional ('l2' by default)

# print(np.mean(features, axis=0))
# print(np.std(features, axis=0))
# 1. Standardisation: convert normally distributed feature values to the standard normal distribution (works on column vectors)
features_new = StandardScaler().fit_transform(features)
# 2 Interval scaling: rescale the feature values into the [0, 1] interval (works on column vectors)
features_new = MinMaxScaler().fit_transform(features)
    
# 3 Normalisation: turn each row vector into a "unit vector" (works on each sample)
features_new = Normalizer().fit_transform(features)

### 其它

In [None]:
# Drop the columns with very small variance (such features carry little information and are mostly noise)
import os

train_des = train.describe()
# 'std' is one of the rows of describe(); 1e-5 is the cut-off for "near-constant"
low_variance_cols = [col for col in train_des.columns if train_des.loc['std', col] < 1e-5]
print('There exists ', len(low_variance_cols), ' columns with low std.')
# Actually remove them; previously the columns were only counted, never dropped
train = train.drop(low_variance_cols, axis=1)

# Save the processed data
# Create the output directory first so to_csv does not fail when it is missing
os.makedirs('./data_deal', exist_ok=True)
train.to_csv('./data_deal/train_deal_1.csv',index = None)
# NOTE(review): `test` is not defined in any cell visible in this notebook;
# define the test frame before running this line.
test.to_csv('./data_deal/test.csv',index = None)

## 3、划分数据集 + 模型

若数据样本较好，可以先不做特征工程，直接带模型看看效果  
若预处理后数据样本维度较大或相关性较强，就先做特征工程，再带模型

1）训练集和测试集划分  
2）LR模型  
3）SVM模型  
4）最近邻分类模型  
5）决策树  
6）随机森林  
7）XGBoost  
8）LightGBM  

In [40]:

# Split into a training set and a test set (20% held out, fixed seed)
X_train,X_test,y_train,y_test=train_test_split(features,labels,test_size=0.2,random_state=0) 
# Min-max scaling is used here. StandardScaler() can be swapped in instead, but then
# the MultinomialNB() model can no longer be used.
# The scaler is fitted on the training split only and then applied to the test split.
ss=MinMaxScaler()
#ss=StandardScaler()                           
X_train=ss.fit_transform(X_train)
X_test=ss.transform(X_test)


In [46]:
from sklearn import metrics

# Candidate classifiers, each built with its default parameters
clfs = [
    LogisticRegression(),
    SGDClassifier(),
    LinearSVC(),
    MultinomialNB(),
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    GradientBoostingClassifier(),
    GaussianNB(),
    ExtraTreesClassifier(),
]

# For every model: show it with its parameters, train it, then report test accuracy
for model in clfs:
    print("==============================")
    print("模型及模型参数：")
    print(model)
    model.fit(X_train, y_train)
    print("模型准确率：")
    print(model.score(X_test, y_test))

模型及模型参数：
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
模型准确率：
0.972222222222
模型及模型参数：
SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False)
模型准确率：
1.0
模型及模型参数：
LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)
模型准确率：
1.0
模型及模型参数：
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
模型准确率：
0.944444444444
模型及模型参数：
KNeighborsClassifier(algorithm='auto', leaf_size=30, m

In [57]:
# Logistic regression
from sklearn import linear_model
from sklearn.metrics import classification_report

# fit() returns the estimator itself, so construction and training are chained
cls = linear_model.LogisticRegression().fit(X_train, y_train)
y_pred = cls.predict(X_test)

# Show the model with its parameters, then the per-class evaluation on the test split
print("模型及模型参数：")
print(cls)
print("模型评估：")
print(classification_report(y_test, y_pred))

模型及模型参数：
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
模型评估：
             precision    recall  f1-score   support

          1       1.00      1.00      1.00        14
          2       1.00      0.94      0.97        16
          3       0.86      1.00      0.92         6

avg / total       0.98      0.97      0.97        36



In [59]:
# SVM model
from sklearn.svm import SVR,SVC

# NOTE(review): max_iter=10 caps the solver at 10 iterations, so training probably
# stops before convergence; confirm this limit is intentional.
# fit() returns the estimator itself, so construction and training are chained
cls = SVC(C=0.1, kernel='rbf', probability=True, max_iter=10).fit(X_train, y_train)
y_pred = cls.predict(X_test)

# Show the model with its parameters, then the per-class evaluation on the test split
print("模型及模型参数：")
print(cls)
print("模型评估：")
print(classification_report(y_test, y_pred))

模型及模型参数：
SVC(C=0.1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=10, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
模型评估：
             precision    recall  f1-score   support

          1       1.00      0.79      0.88        14
          2       0.84      1.00      0.91        16
          3       1.00      1.00      1.00         6

avg / total       0.93      0.92      0.92        36





In [60]:
# K-nearest-neighbours classifier

# fit() returns the estimator itself, so construction and training are chained
cls = KNeighborsClassifier().fit(X_train, y_train)
y_pred = cls.predict(X_test)

# Show the model with its parameters, then the per-class evaluation on the test split
print("模型及模型参数：")
print(cls)
print("模型评估：")
print(classification_report(y_test, y_pred))

模型及模型参数：
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')
模型评估：
             precision    recall  f1-score   support

          1       1.00      1.00      1.00        14
          2       1.00      0.94      0.97        16
          3       0.86      1.00      0.92         6

avg / total       0.98      0.97      0.97        36



In [68]:
# Decision tree
from sklearn.tree import DecisionTreeClassifier

# No random_state is passed, so results may differ slightly between runs.
# fit() returns the estimator itself, so construction and training are chained
cls = DecisionTreeClassifier().fit(X_train, y_train)
y_pred = cls.predict(X_test)

# Show the model with its parameters, then the per-class evaluation on the test split
print("模型及模型参数：")
print(cls)
print("模型评估：")
print(classification_report(y_test, y_pred))

模型及模型参数：
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')
模型评估：
             precision    recall  f1-score   support

          1       0.93      1.00      0.97        14
          2       1.00      0.94      0.97        16
          3       1.00      1.00      1.00         6

avg / total       0.97      0.97      0.97        36



In [67]:
# Random forest
from sklearn.ensemble import RandomForestClassifier

# No random_state is passed, so results may differ slightly between runs.
# fit() returns the estimator itself, so construction and training are chained
cls = RandomForestClassifier().fit(X_train, y_train)
y_pred = cls.predict(X_test)

# Show the model with its parameters, then the per-class evaluation on the test split
print("模型及模型参数：")
print(cls)
print("模型评估：")
print(classification_report(y_test, y_pred))

模型及模型参数：
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
模型评估：
             precision    recall  f1-score   support

          1       0.93      1.00      0.97        14
          2       1.00      0.94      0.97        16
          3       1.00      1.00      1.00         6

avg / total       0.97      0.97      0.97        36



In [69]:
# Gradient-boosted decision trees
from sklearn.ensemble import GradientBoostingClassifier

# No random_state is passed, so results may differ slightly between runs.
# fit() returns the estimator itself, so construction and training are chained
cls = GradientBoostingClassifier().fit(X_train, y_train)
y_pred = cls.predict(X_test)

# Show the model with its parameters, then the per-class evaluation on the test split
print("模型及模型参数：")
print(cls)
print("模型评估：")
print(classification_report(y_test, y_pred))

模型及模型参数：
GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=100, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False)
模型评估：
             precision    recall  f1-score   support

          1       0.93      1.00      0.97        14
          2       1.00      0.88      0.93        16
          3       0.86      1.00      0.92         6

avg / total       0.95      0.94      0.94        36



In [65]:
# XGBoost
from sklearn.preprocessing import LabelEncoder
from xgboost.sklearn import XGBClassifier

# Recent XGBoost releases require class labels to be 0..n_classes-1, while the
# wine labels are 1..3. Encode the labels for training and decode the predictions,
# so the report below is still expressed in the original label values.
label_encoder = LabelEncoder()
cls=XGBClassifier()
cls.fit(X_train, label_encoder.fit_transform(y_train))
y_pred=label_encoder.inverse_transform(cls.predict(X_test))

# Show the model with its parameters, then the per-class evaluation on the test split
print("模型及模型参数：")   
print(str(cls))
print("模型评估：")
print(classification_report(y_test, y_pred))

模型及模型参数：
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)
模型评估：
             precision    recall  f1-score   support

          1       0.93      1.00      0.97        14
          2       1.00      0.88      0.93        16
          3       0.86      1.00      0.92         6

avg / total       0.95      0.94      0.94        36



## 4、特征工程（特征选择+降维）

## 5、模型调参

## 6、模型融合