# 数据预处理
## Scaling

In [None]:
# Load ./data/teenager_sns.csv into a pandas DataFrame.
import pandas as pd
teenager_sns = pd.read_csv(filepath_or_buffer='./data/teenager_sns.csv')

In [None]:
# Rescale the `age` column with four common normalisation schemes.
import math
from sklearn import preprocessing

teenager_sns_age = teenager_sns['age']

# z-score: (x - mean) / std.
# `with_mean` / `with_std` are boolean switches, not the statistics themselves.
# Passing the mean/std values only worked because non-zero floats are truthy,
# and it would silently disable centering for a column whose mean is 0.
teenager_sns_zscore = preprocessing.scale(X=teenager_sns_age, with_mean=True, with_std=True)

# 0-1 scaling: (x - min) / (max - min).
teenager_sns_age_01 = preprocessing.minmax_scale(teenager_sns_age)

# Decimal scaling: divide by 10**k, where k is the smallest integer such that
# max(|x|) / 10**k < 1. A hard-coded k is only correct for one value range, so
# derive it from the data (Series.max() skips NaN entries).
max_abs_age = teenager_sns_age.abs().max()
# Fall back to k = 0 when the column is all-zero or all-NaN (log10 is undefined).
k = math.floor(math.log10(max_abs_age)) + 1 if max_abs_age > 0 else 0
teenager_sns_age_decimal = teenager_sns_age / 10**k

# Logistic scaling: squash every value into (0, 1) with the sigmoid 1 / (1 + e^-x).
teenager_sns_age_logistic = 1 / (1 + math.e ** (-teenager_sns_age))

## Unsupervised Discretization

In [None]:
# Load the Boston data set and wrap its feature matrix in a DataFrame.
# NOTE(review): `load_boston` was deprecated in scikit-learn 1.0 and removed in
# 1.2 - confirm the installed version still provides it.
from sklearn import datasets
boston = datasets.load_boston()
boston_data, boston_feature_names = boston['data'], boston['feature_names']
boston_df = pd.DataFrame(boston_data, columns=boston_feature_names)

In [None]:
# Equal-width discretisation: split LSTAT into 4 intervals labelled 'a'..'d'
# and keep the computed bin edges.
boston_LSTAT = boston_df['LSTAT']
boston_LSTAT_cut, bins = pd.cut(
    boston_LSTAT,
    bins=4,
    labels=['a', 'b', 'c', 'd'],
    retbins=True,
)
# Plot how many samples fell into each interval.
boston_LSTAT_cut.hist()

In [None]:
# Equal-frequency discretisation: split LSTAT into 4 quantile-based bins
# labelled 0..3 and keep the computed bin edges.
boston_LSTAT_frequency, bins_fre = pd.qcut(
    boston_LSTAT,
    q=4,
    labels=range(4),
    retbins=True,
)

In [None]:
# k-means: run scipy's k-means on LSTAT with 4 clusters and display the result.
from scipy.cluster.vq import kmeans
kmeans(boston_LSTAT, 4)

In [None]:
# Discretise age into six one-standard-deviation-wide bins whose edges run
# from mean - 3*std to mean + 3*std.
u, std = teenager_sns_age.mean(), teenager_sns_age.std()
bins_3 = [u + n_sigma * std for n_sigma in range(-3, 4)]
teenager_sns_age_3std = pd.cut(x=teenager_sns_age, bins=bins_3)

## Data Redundancy 数据冗余

In [None]:
# Read ./data/teenager_sns.csv into `data_df` for the cells that follow.
import numpy as np
import pandas as pd
data_df = pd.read_csv(filepath_or_buffer='./data/teenager_sns.csv')

In [None]:
# Pull out the `age` and `gradyear` columns, then display the age column.
data_df_age, data_df_gradyear = data_df['age'], data_df['gradyear']
data_df_age

In [None]:
# Pairwise Pearson correlation between the columns of data_df.
# Select numeric/bool columns explicitly: pandas >= 2.0 no longer drops
# non-numeric columns silently in DataFrame.corr() and raises on them instead.
# On older pandas this yields the same matrix as the implicit behaviour.
data_df_pearson = data_df.select_dtypes(include=['number', 'bool']).corr()

In [None]:
# Annotated heat map of the top-left 5x5 corner of the correlation matrix.
import seaborn as sns
sns.heatmap(
    data=data_df_pearson.iloc[:5, :5],
    annot=True,
)

In [None]:
# Scatter-plot matrix over the first five columns of data_df.
sns.pairplot(
    data=data_df.iloc[:, :5],
)

## Missing Data

In [None]:
# Detect missing values in the age column.
import numpy as np

# Boolean mask: True where age is missing, False otherwise.
# Series.isna() replaces np.isnan(), which raises TypeError on object-dtype data.
data_df_age_isnan = data_df_age.isna()

# Only the last expression of a cell is displayed, so print the summary flags
# explicitly instead of leaving them as bare expressions that are discarded.
print('any missing:', data_df_age_isnan.any())
print('all missing:', data_df_age_isnan.all())

# Frequency of each observed age (value_counts() excludes NaN by default).
data_df_age.value_counts()

In [None]:
# dropna: axis=0 (the default) drops rows, axis=1 drops columns.
data_df_age.dropna(axis=0)  # drop the entries whose age is missing
data_df.dropna(axis='columns')  # drop every column that contains a missing value

In [None]:
# Strategy 1: replace every missing age with 0.
data_df_age_fill1 = data_df_age.fillna(0)
# Printed explicitly: a bare expression in the middle of a cell is not displayed.
print(data_df_age_fill1.value_counts())

# Strategy 2: replace every missing age with the mean of the observed ages.
data_df_age_fill2 = data_df_age.fillna(data_df_age.mean())
# Inspect the mean-filled series; the original re-displayed fill1 here
# (copy-paste slip), so the result of strategy 2 was never shown.
data_df_age_fill2.value_counts()

## Filling by Interpolation

In [None]:
# Compare linear and Lagrange interpolation on a coarsely sampled sine curve.
import numpy as np
import pylab as pl
from scipy import interpolate

# Sample data: sin(x) at 11 evenly spaced points on [0, 10].
x = np.linspace(0, 10, num=11)
y = np.sin(x)
pl.plot(x, y)

# Linear interpolation, evaluated on a denser 101-point grid.
x_new = np.linspace(0, 10, num=101)
func_line = interpolate.interp1d(x, y, kind='linear')
y_new = func_line(x_new)
pl.plot(x_new, y_new)

# Lagrange interpolation: one polynomial through all sample points,
# evaluated on the same dense grid.
func_line2 = interpolate.lagrange(x, y)
y_new2 = func_line2(x_new)
pl.plot(x_new, y_new2)

## Outlier Detection - Statistics-Based Methods

In [None]:
# Count missing (True) versus present (False) entries in the age column.
data_df_age.isnull().value_counts()

In [None]:
# Two ways to relabel missing ages as 'unknown' before counting the values:
# via fillna() and via replace() on NaN.
data_df_age.fillna(value='unknown').value_counts()
data_df_age.replace(to_replace=np.nan, value='unknown').value_counts()

In [None]:
# Histogram of the `sports` column, with a kernel-density estimate drawn
# against a secondary y-axis.
data_df['sports'].hist()
data_df['sports'].plot.kde(secondary_y=True)

In [None]:
# Anderson-Darling test of the `sports` column against a normal distribution.
# Import the submodule explicitly: on older SciPy releases a bare `import scipy`
# does not load `scipy.stats`, so the call only worked when some other import
# had already pulled the submodule in. `import scipy.stats` still binds `scipy`.
import scipy.stats
scipy.stats.anderson(data_df['sports'], dist='norm')

In [None]:
# Box plot of the `friends` column.
data_df.boxplot(
    column=['friends'],
)

## Outlier Detection - Metric-Based Methods (Local Outlier Factor)

In [None]:
# Local Outlier Factor on a tiny one-feature sample with one extreme value (90).
from sklearn.neighbors import LocalOutlierFactor as lof
X = [[value] for value in (1, 2, 3, 4, 90)]
model = lof(n_neighbors=2, metric='minkowski')  # configure the estimator
model.fit_predict(X)  # fit on X and predict a label for each sample
model.negative_outlier_factor_  # display the fitted per-sample scores