## Dealing with missing values

In [None]:
# Drop columns and rows whose share of missing values reaches the threshold (70%).
threshold = 0.7

# Keep only the columns whose missing-value rate is below the threshold.
data = data[data.columns[data.isnull().mean() < threshold]]

# Keep only the rows whose missing-value rate is below the threshold.
data = data.loc[data.isnull().mean(axis=1) < threshold]


## Numerical missing value handling
# NOTE: the two fills below are alternatives. Once the zero fill has run there
# is nothing left for the median fill to replace, so use one or the other.
# Option 1: fill all missing values with 0.
data = data.fillna(0)
# Option 2: fill missing values with the medians of the columns.
# numeric_only=True skips non-numeric (e.g. categorical) columns, for which a
# median is undefined.
data = data.fillna(data.median(numeric_only=True))

## Categorical missing value handling
# Imputing a dedicated category like "Other" might be more sensible than the
# most frequent value, because in such a case the imputation is likely to
# amount to a random selection.
# Max fill for a categorical column: replace missing entries with the most
# frequent value. The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected column, which may act on a temporary
# copy and leave `data` unchanged.
most_frequent = data['column_name'].value_counts().idxmax()
data['column_name'] = data['column_name'].fillna(most_frequent)

## Handling Outliers

In [None]:
# NOTE: the three techniques below are alternatives shown one after another;
# 'column' is a placeholder for the name of the numerical column to clean.

# Dropping the outlier rows with standard deviation:
# keep rows strictly within `factor` standard deviations of the mean.
factor = 3
column_mean = data['column'].mean()
column_std = data['column'].std()
upper_lim = column_mean + column_std * factor
lower_lim = column_mean - column_std * factor

data = data[(data['column'] < upper_lim) & (data['column'] > lower_lim)]

# A z-score can be used instead of the formula above. The z-score (or standard
# score) standardizes the distance between a value and the mean using the
# standard deviation.

# Dropping the outlier rows with percentiles:
# keep rows strictly between the 5th and 95th percentiles.
upper_lim = data['column'].quantile(.95)
lower_lim = data['column'].quantile(.05)

data = data[(data['column'] < upper_lim) & (data['column'] > lower_lim)]

# Capping the outlier rows with percentiles.
# Capping can affect the distribution of the data, so it is better not to
# overdo it.
upper_lim = data['column'].quantile(.95)
lower_lim = data['column'].quantile(.05)
# Values above/below the limits are replaced by the limits themselves. The
# original indexed with `df[column]`, but neither `df` nor `column` is defined
# in this cell; the frame is `data` and the column label is 'column'.
data['column'] = data['column'].clip(lower=lower_lim, upper=upper_lim)

# Discover outliers with visualization tools.
# NOTE(review): `boston_df` is not created in this notebook; presumably it is
# the Boston housing DataFrame loaded elsewhere - confirm before running.

# Box plot
import matplotlib.pyplot as plt  # needed by the scatter plot below
import seaborn as sns
sns.boxplot(x=boston_df['DIS'])

# Scatter plot
fig, ax = plt.subplots(figsize=(16, 8))
ax.scatter(boston_df['INDUS'], boston_df['TAX'])
ax.set_xlabel('Proportion of non-retail business acres per town')
ax.set_ylabel('Full-value property-tax rate per $10,000')
plt.show()

# z-score
from scipy import stats
import numpy as np
z = np.abs(stats.zscore(boston_df))
threshold = 3
# Positions (row indices, column indices) of entries whose |z| exceeds the threshold.
print(np.where(z > threshold))

# IQR
# NOTE(review): `boston_df_o1` is not defined in this notebook; presumably it
# is a copy of / alias for `boston_df` - confirm.
Q1 = boston_df_o1.quantile(0.25)
Q3 = boston_df_o1.quantile(0.75)
IQR = Q3 - Q1
# Boolean frame marking the values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
# The whole expression must sit inside print(); the original closed the call
# after the first comparison and OR-ed print's return value (None) with the
# second comparison.
print((boston_df_o1 < (Q1 - 1.5 * IQR)) | (boston_df_o1 > (Q3 + 1.5 * IQR)))

# Removing outliers with the z-score and the IQR computed above.
# `z` was computed from `boston_df`, so the row mask is applied to that frame
# (the original indexed `boston_df_o` with itself before it was ever defined).
boston_df_o = boston_df[(z < threshold).all(axis=1)]

boston_df_out = boston_df_o1[~((boston_df_o1 < (Q1 - 1.5 * IQR)) | (boston_df_o1 > (Q3 + 1.5 * IQR))).any(axis=1)]
boston_df_out.shape


## BINNING CATEGORICAL & NUMERICAL

- The main motivation of binning is to make the model more robust and prevent overfitting, however, it has a cost to the performance. 
- The trade-off between performance and overfitting is the key point of the binning process. 
- For numerical columns, except for some obvious overfitting cases, binning might be redundant for some kinds of algorithms because of its effect on model performance.
- For categorical columns, labels with low frequencies probably affect the robustness of statistical models negatively. Thus, assigning a general category to these less frequent values helps to keep the model robust.

In [None]:
# Numerical binning example.
# include_lowest=True keeps a value of exactly 0 in the "Low" bin; pd.cut
# intervals are open on the left by default, so 0 would otherwise become NaN.
data['bin'] = pd.cut(data['value'], bins=[0, 30, 70, 100],
                     labels=["Low", "Mid", "High"], include_lowest=True)

# Categorical binning example.
# na=False makes a missing country count as "no match", so each condition is a
# plain boolean mask (np.select rejects non-boolean conditions) and such rows
# fall through to the 'Other' default.
conditions = [
    data['Country'].str.contains('Spain', na=False),
    data['Country'].str.contains('Italy', na=False),
    data['Country'].str.contains('Chile', na=False),
    data['Country'].str.contains('Brazil', na=False),
]

# choices[i] is the label assigned where conditions[i] is the first match.
choices = ['Europe', 'Europe', 'South America', 'South America']

data['Continent'] = np.select(conditions, choices, default='Other')

## Log Transform 

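- The data you apply the log transform to must contain only positive values; otherwise the result is undefined (NumPy emits a warning and returns NaN for negative values and -inf for zero). You can also add 1 to your data before transforming it; for non-negative data this ensures that the output of the transformation is non-negative.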
- Log(x+1)

In [None]:
# Log transform example.
data = pd.DataFrame({'value': [2, 45, -23, 85, 28, 2, 35, -12]})

# log(x + 1): entries with x + 1 <= 0 have no real logarithm.
data['log+1'] = np.log(data['value'] + 1)

# Negative values handling: shift the column so its minimum maps to 1 before
# taking the log. Note that the resulting values differ from 'log+1'.
data['log'] = np.log(data['value'] - data['value'].min() + 1)

## ONE HOT ENCODING AND LABEL ENCODER 

- Refer to the code from the previous projects.

## Grouping Operations 

In [None]:
# Categorical column grouping.
# Select the label with the highest frequency per group. In other words, this
# is the "max" operation for categorical columns.
def _most_frequent(values):
    """Return the most frequent non-missing value, or NaN if there is none."""
    counts = values.value_counts()
    # value_counts() skips missing values, so an all-missing group yields an
    # empty result and index[0] would raise IndexError.
    return counts.index[0] if len(counts) else np.nan


data.groupby('id').agg(_most_frequent)

# Pivot table. This approach resembles the encoding method in the preceding
# step, with a difference: instead of binary notation, the cells hold an
# aggregate of the values for each (grouped, encoded) pair.
# Pivot table pandas example. The aggregation is named by the string 'sum'
# rather than by passing np.sum, which recent pandas versions deprecate.
data.pivot_table(index='column_to_group', columns='column_to_encode',
                 values='aggregation_column', aggfunc='sum', fill_value=0)


# Numerical column grouping.
# Numerical columns are grouped using sum and mean functions in most cases.
# Either can be preferable depending on the meaning of the feature.
# sum_cols: list of columns to sum
# mean_cols: list of columns to average
grouped = data.groupby('column_to_group')

sums = grouped[sum_cols].sum().add_suffix('_sum')
avgs = grouped[mean_cols].mean().add_suffix('_avg')

# One row per group, with the summed and the averaged columns side by side.
new_df = pd.concat([sums, avgs], axis=1)

## Feature Split

- The split function is a good option; however, there is no single way of splitting features. How to split depends on the characteristics of the column. Let’s introduce it with two examples. First, a simple split function for an ordinary name column:

In [None]:
data['name']
#0  Luther N. Gonzalez
#1    Charles M. Young
#2        Terry Lawson
#3       Kristen White
#4      Thomas Logsdon

# Extracting first names: first token of the space-separated name.
# The .str[...] accessor passes missing names through as NaN, whereas
# map(lambda x: x[0]) raises on them.
data['name'].str.split(" ").str[0]
#0     Luther
#1    Charles
#2      Terry
#3    Kristen
#4     Thomas

# Extracting last names: last token of the space-separated name.
data['name'].str.split(" ").str[-1]
#0    Gonzalez
#1       Young
#2      Lawson
#3       White
#4     Logsdon

- Another case for split function is to extract a string part between two chars. The following example shows an implementation of this case by using two split functions in a row.

In [None]:
# String extraction example: pull out the text between "(" and ")".
data.title.head()
#0                      Toy Story (1995)
#1                        Jumanji (1995)
#2               Grumpier Old Men (1995)
#3              Waiting to Exhale (1995)
#4    Father of the Bride Part II (1995)

# First split: everything after the first "(" (column 1 of the expanded split).
after_open_paren = data.title.str.split("(", n=1, expand=True)[1]
# Second split: everything before the first ")" of that remainder (column 0).
after_open_paren.str.split(")", n=1, expand=True)[0]
#0    1995
#1    1995
#2    1995
#3    1995
#4    1995

## SCALING 

- Normalization (or min-max normalization) scales all values into a fixed range between 0 and 1. This transformation does not change the distribution of the feature, and because the standard deviations decrease, the effect of outliers increases. Therefore, it is recommended to handle the outliers before normalization.

In [None]:
data = pd.DataFrame({'value': [2, 45, -23, 85, 28, 2, 35, -12]})
# Min-max normalization: the minimum maps to 0 and the maximum to 1.
values = data['value']
data['normalized'] = (values - values.min()) / (values.max() - values.min())

- Standardization (or z-score normalization) scales the values while taking into account standard deviation. If the standard deviation of features is different, their range also would differ from each other. This reduces the effect of the outliers in the features.

In [None]:
# z-score: subtract the column mean, then divide by the column's standard
# deviation (Series.std() uses the sample standard deviation by default).
centered = data['value'] - data['value'].mean()
data['standardized'] = centered / data['value'].std()

## DATE AND TIME EXTRACTION

- Extracting the parts of the date into different columns: Year, month, day, etc.
- Extracting the time period between the current date and columns in terms of years, months, days, etc.
- Extracting some specific features from the date: Name of the weekday, Weekend or not, holiday or not, etc.
- If you transform the date column into extracted columns like the ones above, the information they carry is exposed and machine learning algorithms can easily make use of it.

In [3]:
from datetime import date
import pandas as pd

# Sample dates as day-month-year strings.
data = pd.DataFrame({'date': [
    '01-01-2017',
    '04-12-2008',
    '23-06-1988',
    '25-08-1999',
    '20-02-1993',
]})

# Transform string to date
data['date'] = pd.to_datetime(data['date'], format="%d-%m-%Y")

# Read the current date once so every "passed" column refers to the same day
# (separate date.today() calls could straddle a month or year boundary).
today = date.today()

# Extracting year
data['year'] = data['date'].dt.year

# Extracting month
data['month'] = data['date'].dt.month

# Extracting passed years since the date.
# This is a calendar-year difference; month and day are ignored.
data['passed_years'] = today.year - data['year']

# Extracting passed months since the date.
# This is a calendar-month difference; the day of month is ignored.
data['passed_months'] = (today.year - data['year']) * 12 + today.month - data['month']

# Extracting the weekday name of the date
data['day_name'] = data['date'].dt.day_name()

In [4]:
data

Unnamed: 0,date,year,month,passed_years,passed_months,day_name
0,2017-01-01,2017,1,3,40,Sunday
1,2008-12-04,2008,12,12,137,Thursday
2,1988-06-23,1988,6,32,383,Thursday
3,1999-08-25,1999,8,21,249,Wednesday
4,1993-02-20,1993,2,27,327,Saturday


## Outlier Removal Clustering 

- Outlier Removal Clustering (ORC) is an improved version of KMeans that removes outliers in each iteration. KMeans is sensitive to outliers, which can pull the centroids towards a poor local optimum. KMeans is the most commonly used algorithm for unsupervised clustering because it is effective and easy to implement, and it still works well on data with many outliers if an outlier-removal step is added to each iteration of the clustering. The resulting algorithm is called ORC. It ensures that the centroid calculation is not skewed by points far away from the cluster centroid.

In [None]:
# Imports required to understand the dataset, get initial
# intuition of how the data looks like.
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn import datasets
%matplotlib inline

# Read the sample points (the first csv column is used as the index) and
# show their summary statistics.
df = pd.read_csv("./2d-cluster.csv", index_col=0)
df.describe()

In [None]:
# Let's see the data distribution in terms of a histogram.
df1 = pd.DataFrame({'x': df.x, 'y': df.y}, columns=['x', 'y'])
# Draw onto an explicit axes. DataFrame plotting opens its own figure when no
# `ax` is given, so a bare plt.figure() beforehand leaves an empty extra figure.
fig, ax = plt.subplots()
df1.plot.hist(alpha=0.3, stacked=True, bins=10, ax=ax)

In [None]:
# Plot it on a 2D plane.
# df = pd.read_csv("data_2d.csv", )
# The former `from pandas.tools.plotting import scatter_plot` import was
# dropped: the name was never used and the `pandas.tools` module no longer
# exists in current pandas, so the import alone made this cell fail.
plt.scatter(df.x, df.y, s=20)

In [None]:
#Lets cluter the points with KMean
from sklearn.cluster import KMeans

# df = pd.read_csv("data_2d.csv")
# Baseline: plain KMeans with 3 clusters and at most 300 iterations; every
# other parameter is left at the library default. fit() returns the fitted
# estimator, so construction and fitting are chained.
model = KMeans(n_clusters=3, max_iter=300).fit(df)

# Plot the points coloured by the cluster label KMeans assigned to them.
colormap = np.array(['red', 'lime', 'blue'])
point_colors = colormap[model.labels_]
plt.scatter(df.x, df.y, c=point_colors, s=20)

In [None]:
#KMean with Outlier removal (ORC)
from collections import defaultdict
import math

# Number of clusters.
K = 3
# Maximum number of KMeans iterations per fit.
MAX_ITER = 100

# Cluster index -> list of the points assigned to that cluster.
CLUSTER = defaultdict(list)

# KMeans model configured with the constants above.
model = KMeans(n_clusters=K, max_iter=MAX_ITER)

# Anomaly threshold: a point is treated as an outlier when its distance to the
# centroid, relative to the largest such distance in its cluster, exceeds T.
# Needs to be tuned to avoid over / under fitting.
# T = 0.921
T = 0.95

# Data points loaded from csv (the first column is the index).
df = pd.read_csv("./2d-cluster.csv", index_col=0)


def distance(x, y):
    """
    Compute the straight-line (Euclidean) distance between two points in a plane.

    Only the first two coordinates of each point are read.

    @param x: 2D point (indexable: x[0], x[1]).
    @param y: 2D point (indexable: y[0], y[1]).

    @return euclidean distance between the two points.
    """
    dx, dy = x[0] - y[0], x[1] - y[1]
    return math.sqrt(dx * dx + dy * dy)


def print_cluster_details(clusters, centroids):
    """
    Print one line per cluster with its index and number of points.

    @param clusters: mapping of cluster index -> list of points.
    @param centroids: cluster centroids; accepted for the callers' sake but
                      not used here.
    """
    # items() and print() as a function work on both Python 2 and Python 3;
    # the original iteritems() / print statement were Python 2 only.
    for index, cluster in clusters.items():
        print("Cluster: {} size: {}".format(index, len(cluster)))

        
def dump_cluster_points(df, labels):
    """
    Dump the points of each cluster into a csv file named cluster_{#index}.csv

    Each line of a file holds the first two coordinates of one point,
    separated by a comma.

    @param df: dataframe of points.
    @param labels: cluster index for each row of df.
    """
    clusters = aggregate_cluster_points(df, labels)
    # items() works on both Python 2 and Python 3 (iteritems() is Python 2 only).
    for index, cluster in clusters.items():
        with open("cluster_{}.csv".format(index), "w") as f:
            f.write("\n".join("{},{}".format(p[0], p[1]) for p in cluster))
            
def aggregate_cluster_points(df, labels):
    """
    Helper method to aggregate the cluster points based on the label index.

    @param df: dataframe of points; row i belongs to cluster labels[i].
    @param labels: cluster index list for each element in points.

    @return defaultdict mapping cluster index -> list of the rows (as arrays)
            assigned to that cluster, in their original order.
    """
    # Materialise the underlying array once instead of once per row.
    points = df.values
    clusters = defaultdict(list)

    for index, label in enumerate(labels):
        clusters[label].append(points[index])

    return clusters
    

def get_outliers_and_strip_cluster(cluster_points, centroid):
    """
    Split a cluster into outliers and remaining points.

    Anomaly detection rule:-

    distance(point, centroid) / max distance in the cluster > T
    === True then it's an anomaly.

    Because the farthest point always has ratio 1, it is reported as an
    outlier whenever T < 1.

    @param cluster_points: List of points in this cluster.
    @param centroid: centroid of the cluster.
    @return: outliers, new_cluster (two lists; together they contain every
             input point exactly once, in the original order).
    """
    # Compute each distance once and reuse it for the maximum and the ratios.
    distances = [distance(centroid, point) for point in cluster_points]

    # Nothing to split in an empty cluster.
    if not distances:
        return [], []

    d_max = max(distances)
    # Every point coincides with the centroid: there is no spread, hence no
    # outliers. Without this guard the 0/0 ratio is NaN, which fails both the
    # "> T" and the "<= T" test and silently drops the points.
    if d_max == 0:
        return [], list(cluster_points)

    outliers = []
    new_cluster = []
    for point, dist in zip(cluster_points, distances):
        if dist / d_max > T:
            outliers.append(point)
        else:
            new_cluster.append(point)

    return outliers, new_cluster


def run_outlier_removal_clustering(df, max_iteration):
    """
    Run ORC (Outlier Removal Clustering) on the datapoints.

    Each iteration fits KMeans on the current points, groups the points by
    cluster label, strips the outliers of every cluster with
    get_outliers_and_strip_cluster, and continues with the remaining points.

    @param df: dataframe of points to cluster. The caller's frame is not
               modified; a new frame is built each iteration.
    @param max_iteration: number of fit / strip rounds to run.
    @return: (remaining points as a dataframe with the input's column labels,
              KMeans model fitted on those remaining points,
              list of all removed outlier points)
    """
    orc_model = KMeans(n_clusters=K, max_iter=MAX_ITER)
    # Remember the labels so rebuilt frames keep them instead of falling back
    # to positional 0, 1, ... column names.
    columns = df.columns
    all_outliers = []
    for _ in range(max_iteration):
        orc_model.fit(df)

        clusters = aggregate_cluster_points(df, orc_model.labels_)
        centroids = orc_model.cluster_centers_

        kept_points = []
        # items() works on both Python 2 and Python 3.
        for index, cluster in clusters.items():
            outlier, new_cluster = get_outliers_and_strip_cluster(
                cluster, centroids[index])

            all_outliers.extend(outlier)
            kept_points.extend(new_cluster)

        # Continue with only the points that survived this round.
        df = pd.DataFrame(data=kept_points, columns=columns)

    # Fit one more time: the last round removed points after its fit, so the
    # model would otherwise describe a larger point set than the returned df.
    orc_model.fit(df)

    return df, orc_model, all_outliers
    
    
# Run Clustering with Outlier removal algorithm (5 rounds).
df, orc_model, outliers = run_outlier_removal_clustering(df, 5)


# Report the final cluster sizes and the number of anomalies.
# print() with a single argument behaves the same on Python 2 and Python 3.
print_cluster_details(aggregate_cluster_points(df, orc_model.labels_),
                      orc_model.cluster_centers_)
print("Total anomalies: {}".format(len(outliers)))

# Dump the final cluster and anomalies into csv files.
dump_cluster_points(df, orc_model.labels_)
with open("anomalies.csv", 'w') as f:
    f.write("\n".join(["{},{}".format(p[0], p[1]) for p in outliers]))
# Announce the export only after the files have actually been written.
print("Exported the cluster and anomalies into csv files")
    

# Compare, side by side: the raw points, plain KMeans, and ORC.
colormap = np.array(['red', 'lime', 'blue', 'green', 'yellow'])
df.columns = ['x', 'y']

# Reload the untouched points and cluster them with the plain KMeans model.
data = pd.read_csv("./2d-cluster.csv", index_col=0)
_kmean = model.fit(data)

fig, (ax_raw, ax_kmean, ax_orc) = plt.subplots(1, 3, figsize=(12, 4))

# Panel 1: original points, uncoloured.
ax_raw.scatter(data.x, data.y, s=20)
ax_raw.set_title("Without clustering")

# Panel 2: original points coloured by their KMeans label.
ax_kmean.scatter(data.x, data.y, c=colormap[_kmean.labels_], s=20)
ax_kmean.set_title("KMean Clustering")

# Panel 3: points left after outlier removal, coloured by their ORC label.
ax_orc.scatter(df.x, df.y, c=colormap[orc_model.labels_], s=20)
ax_orc.set_title("ORC clustering")

In [None]:
#How I generated this sample cluster data.
from sklearn.datasets import make_blobs

# 1000 two-dimensional points around 3 centres drawn from the box (-5, 5).
X, y = make_blobs(n_samples=1000, n_features=2, centers=3, center_box=(-5, 5))
plt.scatter(X[:, 0], X[:, 1], marker='o', c=y)

# Save the generated samples to a csv file with columns x and y.
_d = pd.DataFrame(X, columns=['x', 'y'])
_d.to_csv("./2d-cluster_new.csv")
#_d = pd.read_csv("./2d-cluster.csv", index_col=0)

#REFERENCES:
#https://towardsdatascience.com/ways-to-detect-and-remove-the-outliers-404d16608dba
#https://towardsdatascience.com/understanding-feature-engineering-part-1-continuous-numeric-data-da4e47099a7b
#https://github.com/SharmaNatasha
# https://github.com/dipanjanS/practical-machine-learning-with-python/tree/master/notebooks/Ch04_Feature_Engineering_and_Selection
#https://haridas.in/outlier-removal-clustering.html