# 4. Data Preparation

    เพื่อเตรียมข้อมูลให้พร้อมก่อนการนำไปวิเคราะห์ และสร้างโมเดล มีหลายเทคนิค เช่น Cleaning, Selecting, Transforming

                                                                                        Data Science ง่าย . . by New ew ew

In [None]:
!pip install Image

In [None]:
# Install scikit-learn; its PCA class is imported in the Principal Component Analysis section.
!pip install scikit-learn

## 1) Data Quality

แก้ไขปัญหาที่อาจเกิดจากข้อมูลที่คุณภาพไม่ดีต่างๆ เช่น Noise, Outliers, Missing values, Duplicate data

In [None]:
import pandas as pd
# Download the breast-cancer-wisconsin data file. It is read without a header
# row, so the column labels are attached by hand right after loading.
data = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data',
    header=None,
)
column_names = [
    'Sample code',
    'Clump Thickness',
    'Uniformity of Cell Size',
    'Uniformity of Cell Shape',
    'Marginal Adhesion',
    'Single Epithelial Cell Size',
    'Bare Nuclei',
    'Bland Chromatin',
    'Normal Nucleoli',
    'Mitoses',
    'Class',
]
data.columns = column_names

# Drop the 'Sample code' column, then report the size of what is left.
data = data.drop(columns=['Sample code'])
n_rows, n_cols = data.shape
print('Number of instances = %d' % (n_rows))
print('Number of attributes = %d' % (n_cols))
data.head()

In [None]:
# Show every row when a DataFrame is displayed. The fully qualified key is
# required: the short pattern "max_rows" can match more than one pandas option,
# in which case set_option raises OptionError.
pd.set_option("display.max_rows", None)
data

In [None]:
# Missing values
import numpy as np

# Replace every '?' placeholder with a real NaN so pandas can detect it as
# missing. `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
data = data.replace('?', np.nan)

print('Number of instances = %d' % (data.shape[0]))
print('Number of attributes = %d' % (data.shape[1]))

# Count the NaN entries of all columns in one pass, then report them per column.
missing_per_column = data.isna().sum()
print('Number of missing values:')
for col, n_missing in missing_per_column.items():
    print('\t%s: %d' % (col, n_missing))

In [None]:
# Convert 'Bare Nuclei' to a numeric dtype before computing its median (the
# same conversion is applied to this column again before the box plot).
# NOTE(review): the column appears to hold strings after the '?' replacement,
# and recent pandas versions refuse to take the median of strings — confirm.
data2 = pd.to_numeric(data['Bare Nuclei'])

print('Before replacing missing values:')
print(data2[20:25])
# Fill every missing entry with the median of the column.
data2 = data2.fillna(data2.median())

print('\nAfter replacing missing values:')
print(data2[20:25])

In [None]:
# Compare the row count before and after removing every row that contains a NaN.
rows_before = len(data)
print('Number of rows in original data = %d' % (rows_before))

data2 = data.dropna()
rows_after = len(data2)
print('Number of rows after discarding missing values = %d' % (rows_after))

In [None]:
# Outliers
%matplotlib inline

# Work on every column except 'Class'.
data2 = data.drop(columns=['Class'])
# Convert 'Bare Nuclei' to a numeric dtype before plotting.
bare_nuclei = pd.to_numeric(data2['Bare Nuclei'])
data2['Bare Nuclei'] = bare_nuclei
# One box per column, drawn in a wide, short figure.
data2.boxplot(figsize=(20, 3))

In [None]:
# Standardize each column: subtract its mean and divide by its standard deviation.
column_mean = data2.mean()
column_std = data2.std()
Z = (data2 - column_mean) / column_std
Z[20:25]

In [None]:
print('Number of rows before discarding outliers = %d' % (Z.shape[0]))

# Keep a row only when every standardized value lies in the interval (-3, 3].
# Using .all(axis=1) removes the hard-coded column count of 9. NaN compares
# False on both sides, so rows with a missing value are discarded as well.
within_bounds = (Z > -3) & (Z <= 3)
Z2 = Z.loc[within_bounds.all(axis=1), :]
# The message now names outliers, matching the "before" line above.
print('Number of rows after discarding outliers = %d' % (Z2.shape[0]))
print(Z2)

In [None]:
# Duplicate Data
# Boolean mask marking each row that repeats an earlier row exactly.
dups = data.duplicated()
duplicate_count = dups.sum()
print('Number of duplicate rows = %d' % (duplicate_count))
# Display rows 11 and 28 together (presumably an example of a duplicated pair).
data.loc[[11, 28]]

In [None]:
# Compare the row count before and after removing repeated rows.
rows_before = data.shape[0]
print('Number of rows before discarding duplicates = %d' % (rows_before))
data2 = data.drop_duplicates()
rows_after = data2.shape[0]
print('Number of rows after discarding duplicates = %d' % (rows_after))

## 2) Aggregation

เป็นการรวมข้อมูลหลายๆ ข้อมูลให้เหลือข้อมูลเดียว เพื่อ 1) ลดขนาดของข้อมูล 2) ปรับมุมมองการวิเคราะห์ข้อมูล 3) เพิ่มความเสถียรของข้อมูล

In [None]:
# Daily precipitation time series for the weather station at Detroit Metro Airport.
raw = pd.read_csv('04_DTW_prec.csv', header='infer')
# Index the rows by the parsed 'DATE' column, then keep only the 'PRCP' series.
raw.index = pd.to_datetime(raw['DATE'])
daily = raw['PRCP']
daily_variance = daily.var()
ax = daily.plot(kind='line', figsize=(15, 3))
ax.set_title('Daily Precipitation (variance = %.4f)' % (daily_variance))

In [None]:
# Sum the daily values within each month.
# NOTE(review): recent pandas versions deprecate the 'M' alias in favour of
# 'ME' — confirm against the installed version before changing it.
by_month = pd.Grouper(freq='M')
monthly = daily.groupby(by_month).sum()
monthly_variance = monthly.var()
ax = monthly.plot(kind='line', figsize=(15, 3))
ax.set_title('Monthly Precipitation (variance = %.4f)' % (monthly_variance))

In [None]:
# Sum the daily values within each year.
# NOTE(review): recent pandas versions deprecate the 'Y' alias in favour of
# 'YE' — confirm against the installed version before changing it.
by_year = pd.Grouper(freq='Y')
annual = daily.groupby(by_year).sum()
annual_variance = annual.var()
ax = annual.plot(kind='line', figsize=(15, 3))
ax.set_title('Annual Precipitation (variance = %.4f)' % (annual_variance))

## 3) Sampling

In [None]:
# Preview the first rows of the data set.
data.head()

In [None]:
# Draw 3 rows at random; no random_state is given, so the rows differ between runs.
sample = data.sample(n=3)
sample

In [None]:
# Draw 1% of the rows; random_state=1 fixes the seed so the draw is repeatable.
sample = data.sample(frac=0.01, random_state=1)
sample

In [None]:
# Draw 1% of the rows with replacement (the same row may be picked more than once), seeded for repeatability.
sample = data.sample(frac=0.01, replace=True, random_state=1)
sample

## 4) Discretization

In [None]:
# Look at the distribution of 'Clump Thickness': a 10-bin histogram plus the
# count of each distinct value (not sorted by frequency).
clump_thickness = data['Clump Thickness']
clump_thickness.hist(bins=10)
clump_thickness.value_counts(sort=False)

In [None]:
# Equal-width discretization: split the value range into 4 intervals of equal width.
bins = pd.cut(data['Clump Thickness'], bins=4)
bins.value_counts(sort=False)

In [None]:
# Equal-frequency discretization: 4 quantile-based intervals.
bins = pd.qcut(data['Clump Thickness'], q=4)
bins.value_counts(sort=False)

## 5) Principal Component Analysis

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import numpy as np
import PIL

numImages = 16
fig = plt.figure(figsize=(7, 7))
# One row per picture. Every picture must flatten to exactly 36963 values
# (for example 111 x 111 pixels x 3 channels — TODO confirm the real image size).
imgData = np.zeros(shape=(numImages, 36963))

# Pictures are numbered from 1; picture i goes into subplot i and row i-1.
for i in range(1, numImages + 1):
    filename = '04_pics/Picture' + str(i) + '.jpg'
    img = mpimg.imread(filename)
    ax = fig.add_subplot(4, 4, i)
    ax.imshow(img)
    ax.axis('off')
    ax.set_title(str(i))
    # Store all pixel values of the picture as one flat row.
    imgData[i - 1] = img.reshape(-1)

In [None]:
# Display the array of the last picture read by the loading loop.
img

In [None]:
# The same picture collapsed into a one-dimensional array.
img.flatten()

In [None]:
# The full matrix: one flattened picture per row.
imgData

In [None]:
# Number of rows in the matrix, i.e. one per picture.
len(imgData)

In [None]:
# Print the matrix dimensions and its Python type.
print(imgData.shape, type(imgData))

In [None]:
# Print the row at index 5, which was filled from Picture6.
print(imgData[5,:])

In [None]:
import pandas as pd
from sklearn.decomposition import PCA

# Fit a 2-component PCA on the flattened pictures and project them onto it.
numComponents = 2
pca = PCA(n_components=numComponents)
pca.fit(imgData)

scores = pca.transform(imgData)
image_ids = range(1, numImages + 1)
projected = pd.DataFrame(scores, columns=['pc1', 'pc2'], index=image_ids)
# Label the pictures in runs of four: 1-4 burger, 5-8 drink, 9-12 pasta, 13-16 chicken.
food_kinds = ('burger', 'drink', 'pasta', 'chicken')
projected['food'] = [kind for kind in food_kinds for _ in range(4)]
projected

In [None]:
import matplotlib.pyplot as plt

# Colour and marker shape used for each food label.
colors = {'burger': 'b', 'drink': 'r', 'pasta': 'g', 'chicken': 'k'}
markerTypes = {'burger': '+', 'drink': 'x', 'pasta': 'o', 'chicken': 's'}

# Plot one scatter series per food label in the pc1/pc2 plane.
for food_label, marker in markerTypes.items():
    subset = projected[projected['food'] == food_label]
    plt.scatter(subset['pc1'], subset['pc2'], c=colors[food_label], s=60, marker=marker)

## สรุปเนื้อหา

1. Data Quality : Noise, Outliers, Missing values, Duplicate data

2. Aggregation : Combine data to 1) Reduce data size 2) Change the perspective (scale) of the analysis 3) Improve the stability of the data

3. Sampling : Used for exploratory data analysis, for scaling algorithms up to big data applications, and for quantifying uncertainties

4. Discretization : Transform continuous values into categories

5. Principal Component Analysis : Reducing number of attributes in the data