In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm as tn

# 数据清理

In [2]:
iris = pd.read_csv('datas/iris_dataset.csv')

In [3]:
iris.head()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa


# 将自然语言标签转换为计算机语言
### l2n stands for labels to numbers

<div style='background-color:#78e08f'>
    <br>
</div>    

```py
temp_x = iris.loc[:,'sepal.length':'petal.width'].values
temp_y = iris.loc[:,'variety']


def l2n(dataframe, label_column_name):
    
    df = dataframe.copy(deep = True)
    temp_label = df.loc[:,label_column_name]
    #去重
    
    type_name = temp_label.drop_duplicates(keep = 'first').values
    type_num = temp_label.drop_duplicates(keep = 'first').shape[0]
    
    #生成字典
    dic = {x:y for x,y in zip(type_name, range(type_num))}
    #使用字典映射一个新的column(dtype = int)
    df['F_label'] = df[label_column_name].map(dic)
    return df
```    

# 数据分割

```py
def split_data(features,labels,test_size):
    '''
    参数：(features,labels,测试集的大小)
    返回值是一个tuple: (train_x, train_y, test_x, test_y)
    
    '''
    #
    train_x = np.array([])
    train_y = np.array([])
    
    test_x = np.array([])
    test_y = np.array([])
    
    
    train_size = features.shape[0] - test_size
    
    for _ in range(test_size):
        a = np.random.randint(low = 0,high = features.shape[0])
        #print('random:' , a)
        #print('feature_index:',features.shape[0])
        #向测试集添加元素
        test_x = np.append(test_x, features[a])
        test_y = np.append(test_y, labels[a])
        #删除已向测试集添加过的元素
        features = np.delete(features, a, 0)
        labels = np.delete(labels, a, 0)
    
    train_x = features
    train_y = labels
    
    print('数据分割完成, 其中训练集: {}, 测试集: {}'.format(train_x.shape[0], test_x.shape[0]))
    return (train_x, train_y, test_x.reshape(30, 4), test_y)
```   

<div style='background-color:#78e08f'>
    <br>
</div>    

In [62]:
# pandas自动化数据清理，封装
def data_clean(dataframe, label, test_size):
    '''
    pandas automatic data cleanning
    
    dataframe: a pandas dataframe object
    label: label column name
    test_size: test_size
    
    transfer String label intro numbers, and 
    return a tuple ((train_x, train_y, test_x, test_y))
    
    '''
    #数据清理
    
    df = dataframe.copy(deep = True)
    temp_label = df.loc[:,label]
    #去重
    
    type_name = temp_label.drop_duplicates(keep = 'first').values
    type_num = temp_label.drop_duplicates(keep = 'first').shape[0]
    
    #生成字典
    dic = {x:y for x,y in zip(type_name, range(type_num))}
    #使用字典映射一个新的column(dtype = int)
    df['F_label'] = df[label].map(dic)
    
    
    #数据分割
    f_col = df.columns.drop(['F_label',label])
    features = df.loc[:,f_col].values
    labels = df.loc[:,'F_label'].values
    
    
    train_x = np.array([])
    train_y = np.array([])
    
    test_x = np.array([])
    test_y = np.array([])
    
    
    train_size = features.shape[0] - test_size
    
    for _ in range(test_size):
        a = np.random.randint(low = 0,high = features.shape[0])
        
        #向测试集添加元素
        test_x = np.append(test_x, features[a])
        test_y = np.append(test_y, labels[a])
        #删除已向测试集添加过的元素
        features = np.delete(features, a, 0)
        labels = np.delete(labels, a, 0)
    
    train_x = features
    train_y = labels
    
    print('数据分割完成, 其中训练集: {}, 测试集: {}'.format(train_y.shape[0], test_y.shape[0]))
    return (train_x, train_y.reshape(120,1), test_x.reshape(30, 4), test_y.reshape(30,1))

In [63]:
train_x,train_y,test_x,test_y = data_clean(iris,'variety',30)

数据分割完成, 其中训练集: 120, 测试集: 30


In [64]:
print(train_x[0:5])
print(train_y[0:5])

[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.4 3.9 1.7 0.4]]
[[0]
 [0]
 [0]
 [0]
 [0]]


In [7]:
print(train_x.shape)
print(train_y.shape)

(120, 4)
(120,)


# 激活函数和损失函数

In [8]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [9]:
def MSE(prediction, label):
    return np.mean((prediction-label)**2)

In [10]:
def delta_E(prediction, label):
    return 2 * (prediction - label)

# 训练模型

$ y = x * w + b$

In [38]:
w = np.random.rand(4,1)
b = np.random.random()
t = np.dot(train_x,w)
t2= t+b
t2.shape

(120, 1)

In [77]:
def train (x , y , w , b , times):
    for _ in tn(range(times)):
        z = np.dot(x,w) + b
        #print(z.shape)
        A = sigmoid(z)
        delta = delta_E(A,y)
        
        
        w = w - 1 * np.mean(delta)
        b = b - 1 * delta
    print(MSE(A,y))

In [78]:
train(train_x,train_y,w,b,100000)

100%|██████████| 100000/100000 [00:04<00:00, 22799.77it/s]

0.5228910950014451



