## Основы предобработки данных

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

import seaborn as sns

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

# Display settings: three decimal places in NumPy array and pandas output.
np.set_printoptions(precision=3)
# The short pattern 'precision' is ambiguous on pandas versions that also
# define 'styler.format.precision' and raises OptionError there, so the
# fully qualified option name is used.
pd.set_option('display.precision', 3)

# Apply seaborn's default theme to the matplotlib figures drawn below.
sns.set()

In [None]:
# If the seaborn import fails, install it once with: pip install seaborn

### Загрузка данных и предварительный анализ

Diabetes dataset: https://www.kaggle.com/uciml/pima-indians-diabetes-database

In [None]:
# Read the CSV from the working directory; pandas assigns a default integer index.
diabetes_df = pd.read_csv('diabetes.csv')
diabetes_df

In [None]:
# (rows, columns) of the loaded frame.
diabetes_df.shape

In [None]:
# First five rows.
diabetes_df.head()

In [None]:
# Column dtypes and non-null counts.
diabetes_df.info()

### Изучение и обработка количественных признаков

In [None]:
# Summary statistics for the numeric columns.
diabetes_df.describe()

In [None]:
# Per-column variance.
diabetes_df.var()

In [None]:
# Histograms of Age and Glucose.
diabetes_df[['Age','Glucose']].hist(figsize=(10, 4))

Histograms vs. Kernel Density Estimation <br>
https://mglerner.github.io/posts/histograms-and-kernel-density-estimation-kde-2.html?p=28 

In [None]:
# Kernel density estimates of Age and Glucose, one panel each with independent x-axes.
age_glucose = diabetes_df[['Age', 'Glucose']]
age_glucose.plot.density(subplots=True, layout=(1, 2), sharex=False, figsize=(10, 4))

In [None]:
# Histogram of Age with a KDE overlay. `distplot` is deprecated in seaborn;
# this call is the equivalent given in seaborn's migration guide
# (requires seaborn >= 0.11).
sns.histplot(diabetes_df['Age'], kde=True, stat='density', kde_kws=dict(cut=3))

### Обработка выбросов

In [None]:
# Box plot of the Insulin column.
sns.boxplot(x='Insulin', data=diabetes_df)

In [None]:
# One box per column, drawn on an explicitly created 12x7-inch axes.
_, box_ax = plt.subplots(figsize=(12, 7))
sns.boxplot(data=diabetes_df, ax=box_ax)

In [None]:
# Cap Insulin at its 5th and 95th percentiles: values outside that range are
# pulled to the nearest bound.
insulin_low, insulin_high = diabetes_df['Insulin'].quantile([0.05, 0.95])
diabetes_df['Insulin'] = diabetes_df['Insulin'].clip(lower=insulin_low, upper=insulin_high)

In [None]:
# Same per-column box plot, redrawn after Insulin was clipped.
_, clipped_ax = plt.subplots(figsize=(12, 7))
sns.boxplot(data=diabetes_df, ax=clipped_ax)

### Обработка пропущенных значений

#### Поиск пропусков

In [None]:
# Columns in which 0 is treated as a missing-value placeholder: everything
# except Outcome and Pregnancies. A list comprehension (instead of a set
# difference) keeps the frame's column order, so outputs that depend on this
# list are identical from one kernel run to the next.
no_zero_columns = [col for col in diabetes_df.columns
                   if col not in ('Outcome', 'Pregnancies')]

In [None]:
# Number of zero entries in each of the selected columns.
diabetes_df[no_zero_columns].eq(0).sum()

In [None]:
# Recode the zero placeholders as NaN so pandas treats them as missing.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
# Plain column assignment replaces the columns outright, letting integer
# columns become float so they can hold NaN.
diabetes_df[no_zero_columns] = diabetes_df[no_zero_columns].replace(0, np.nan)
diabetes_df.head()

In [None]:
# Missing values per column.
diabetes_df.isna().sum()

In [None]:
# Summary statistics of the current frame (describe skips NaN values).
diabetes_df.describe()

In [None]:
# Per-column box plot of the current state of the frame.
_, nan_ax = plt.subplots(figsize=(12, 7))
sns.boxplot(data=diabetes_df, ax=nan_ax)

#### Работа с пропусками

In [None]:
# "Bad" approaches: both throw data away.

# Drop every row that contains at least one missing value.
print(diabetes_df.dropna(axis='index', how='any').shape)

# Drop every column that contains at least one missing value.
print(diabetes_df.dropna(axis='columns', how='any').shape)

In [None]:
# Share of missing values per column.
diabetes_df.isna().mean()

In [None]:
# Keep only the columns where fewer than 25% of the values are missing.
missing_share = diabetes_df.isnull().mean()
diabetes_df.loc[:, missing_share < 0.25].shape

In [None]:
# Fill with an arbitrary sentinel value (preview only; the frame is not modified).
diabetes_df.fillna(value=-1).head()

In [None]:
# Impute the remaining gaps with each column's mean.
# - numeric_only=True keeps this cell runnable when the frame also holds
#   non-numeric columns (e.g. if it is re-run after the binning section).
# - Reassigning instead of inplace=True avoids mutating the frame in place.
diabetes_df = diabetes_df.fillna(diabetes_df.mean(numeric_only=True))

# Alternative per-column strategies:
# diabetes_df['Glucose'] = diabetes_df['Glucose'].fillna(diabetes_df['Glucose'].mode()[0])
# diabetes_df['BMI'] = diabetes_df['BMI'].fillna(diabetes_df['BMI'].mean())
# diabetes_df['Insulin'] = diabetes_df['Insulin'].fillna(diabetes_df['Insulin'].median())

diabetes_df.isnull().sum()

### Дискретизация данных (binning)

In [None]:
# Equal-width partitioning: split the Age range into five intervals of equal width.
age_values = diabetes_df['Age']
diabetes_df['Age_equal'] = pd.cut(age_values, bins=5)
diabetes_df.head()

In [None]:
# Expert partitioning: hand-picked age boundaries, one label per interval.
bins = [0, 12, 19, 35, 60, 100]
group_names = [
    'Child',
    'Teenager',
    'Young',
    'Adult',
    'Elderly',
]
diabetes_df['Age_expert'] = pd.cut(diabetes_df['Age'], bins=bins, labels=group_names)
diabetes_df.head()

### Изучение категориальных и бинарных признаков

In [None]:
# Number of rows per Outcome value.
diabetes_df['Outcome'].value_counts()

In [None]:
# Number of rows per equal-width age bin.
diabetes_df['Age_equal'].value_counts()

In [None]:
# Number of rows per expert-defined age group.
diabetes_df['Age_expert'].value_counts()

In [None]:
# Histogram of the Outcome column.
diabetes_df['Outcome'].hist(figsize=(10, 4))

In [None]:
# Bar chart of the Outcome value counts.
outcome_counts = diabetes_df['Outcome'].value_counts()
outcome_counts.plot.bar()

In [None]:
# Count of rows per Outcome value, drawn with seaborn.
sns.countplot(x='Outcome', data=diabetes_df)

In [None]:
# Two panels side by side: expert-defined groups on the left, equal-width bins on the right.
_, axes = plt.subplots(1, 2, figsize=(14, 5))
ax_expert, ax_equal = axes

sns.countplot(data=diabetes_df, x='Age_expert', ax=ax_expert)
sns.countplot(data=diabetes_df, x='Age_equal', ax=ax_equal)

### Изучение взаимодействия признаков

#### Только количественные признаки

In [None]:
# Glucose against Insulin, drawn through the explicit Axes interface.
_, scatter_ax = plt.subplots()
scatter_ax.scatter(diabetes_df['Glucose'], diabetes_df['Insulin'], color='red')
scatter_ax.set_title('Glucose and Insulin')
scatter_ax.set_xlabel('Glucose')
scatter_ax.set_ylabel('Insulin')

In [None]:
# Glucose vs. Insulin scatter with the marginal distributions on the axes.
sns.jointplot(x='Glucose', y='Insulin', data=diabetes_df, kind='scatter', color='orange')


In [None]:
# Quantitative features: every column except the target and the two binned
# age columns. A list comprehension (instead of a set difference) keeps the
# frame's column order, so the pair plot and the correlation matrix built
# from this list look the same on every run.
num_columns = [col for col in diabetes_df.columns
               if col not in ('Outcome', 'Age_expert', 'Age_equal')]
sns.pairplot(diabetes_df[num_columns])

In [None]:
# Pairwise correlations between the quantitative features (pandas default method).
corr_matrix = diabetes_df.loc[:, num_columns].corr()
corr_matrix

In [None]:
# Heat map of the correlation matrix with the coefficients written in each cell.
sns.heatmap(corr_matrix, annot=True, annot_kws=dict(size=12))

#### Количественные и качественные признаки

In [None]:
# Mean Glucose and Insulin for each Outcome value.
diabetes_df.pivot_table(
    values=['Glucose', 'Insulin'],
    index=['Outcome'],
    aggfunc='mean',
)

In [None]:
# Glucose distribution for each Outcome value.
sns.boxplot(x='Outcome', y='Glucose', data=diabetes_df)

In [None]:
# Glucose vs. Insulin coloured by Outcome, without a regression line.
# x and y are passed by keyword: current seaborn releases no longer accept
# them positionally.
sns.lmplot(x='Glucose', y='Insulin', data=diabetes_df, hue='Outcome', fit_reg=False);

#### Только качественные признаки

In [None]:
# Contingency table of equal-width age bin against Outcome, with row/column totals.
pd.crosstab(
    index=diabetes_df['Age_equal'],
    columns=diabetes_df['Outcome'],
    margins=True,
)

In [None]:
# Row counts per equal-width age bin, split by Outcome.
sns.countplot(x='Age_equal', hue='Outcome', data=diabetes_df)

### Нормализация данных

In [None]:
# Density of each feature before scaling, one panel per column.
# subplots=True is required for `layout` and `sharex` to take effect; without
# it pandas draws both curves on a single shared axes.
diabetes_df[['DiabetesPedigreeFunction', 'Insulin']].plot(
    kind='density', subplots=True, layout=(1, 2), sharex=False, figsize=(10, 4))

In [None]:
# One box per column, on the unscaled values.
sns.boxplot(data=diabetes_df[['DiabetesPedigreeFunction', 'Insulin']])

In [None]:
# Both scalers are fitted on, and applied to, the same two columns.
scaled_features = diabetes_df[['DiabetesPedigreeFunction', 'Insulin']]

# Standardization: per-column zero mean and unit variance.
std_scaler = StandardScaler()
cols_std_scaled = std_scaler.fit_transform(scaled_features)

# Min-max scaling with the default feature range.
minmax_scaler = MinMaxScaler()
cols_minmax_scaled = minmax_scaler.fit_transform(scaled_features)

type(cols_std_scaled)

In [None]:
# Column 0 of both scaled arrays is DiabetesPedigreeFunction, column 1 is Insulin.
# The standard-deviation line previously labelled column 0 as "Glucose".
print('Mean after standardization:\nDiabetesPedigreeFunction={:.2f}, Insulin={:.2f}'
      .format(cols_std_scaled[:,0].mean(), cols_std_scaled[:,1].mean()))
print('\nStandard deviation after standardization:\nDiabetesPedigreeFunction={:.2f}, Insulin={:.2f}\n'
      .format(cols_std_scaled[:,0].std(), cols_std_scaled[:,1].std()))

print('Mean after min-max scaling:\nDiabetesPedigreeFunction={:.2f}, Insulin={:.2f}'
      .format(cols_minmax_scaled[:,0].mean(), cols_minmax_scaled[:,1].mean()))
print('\nStandard deviation after min-max scaling:\nDiabetesPedigreeFunction={:.2f}, Insulin={:.2f}'
      .format(cols_minmax_scaled[:,0].std(), cols_minmax_scaled[:,1].std()))

In [None]:
# Overlay the standardized and the min-max-scaled versions of the two features.
fig, ax = plt.subplots(figsize=(8, 6))
# Uncomment to add the unscaled data for comparison:
# ax.scatter(diabetes_df['DiabetesPedigreeFunction'], diabetes_df['Insulin'], color='orange', label='input scale', alpha=0.5)
ax.scatter(cols_std_scaled[:, 0], cols_std_scaled[:, 1],
           color='red', label='Standardized', alpha=0.3)
ax.scatter(cols_minmax_scaled[:, 0], cols_minmax_scaled[:, 1],
           color='blue', label='Min-max scaling ', alpha=0.3)
ax.set_xlabel('DiabetesPedigreeFunction')
ax.set_ylabel('Insulin')
ax.legend(loc='upper left')


In [None]:
# Wrap the standardized array in a DataFrame, reusing the original row index
# so the scaled rows stay aligned with diabetes_df.
diabetes_df_scaled = pd.DataFrame(cols_std_scaled,
                                  columns=['DiabetesPedigreeFunction', 'Insulin'],
                                  index=diabetes_df.index)

# Density of each standardized feature, one panel per column.
# subplots=True is required for `layout` and `sharex` to take effect.
diabetes_df_scaled[['DiabetesPedigreeFunction', 'Insulin']].plot(
    kind='density', subplots=True, layout=(1, 2), sharex=False, figsize=(10, 4))

In [None]:
# One box per column of diabetes_df_scaled.
sns.boxplot(data=diabetes_df_scaled[['DiabetesPedigreeFunction', 'Insulin']])