# Import packages

In [None]:
import pandas as pd
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
import seaborn as sns
from scipy import stats
from scipy.stats import zscore
import matplotlib.pyplot as plt
import numpy as np

# Apply seaborn's plot styling with color_codes enabled
# NOTE(review): presumably this remaps matplotlib's shorthand color codes to the seaborn palette — confirm in the seaborn docs
sns.set(color_codes=True)
# Let pandas display up to 500 characters per cell before truncating the text
pd.options.display.max_colwidth = 500

# Import required data

In [None]:
# Location of the 'segmentation.data' file on the UCI archive server
seg_data_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/image/segmentation.data' 
# Parse the comma-separated file, skipping its first two lines
segmentation_data = pd.read_csv(seg_data_url, sep=',', skiprows=2)
# Report the (rows, columns) shape of the loaded frame
print(segmentation_data.shape)

In [None]:
segmentation_data.head()

In [None]:
# Location of the 'segmentation.test' file on the UCI archive server
seg_test_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/image/segmentation.test' 
# Parse the comma-separated file, skipping its first two lines
segmentation_test = pd.read_csv(seg_test_url, sep=',', skiprows=2)
# Report the (rows, columns) shape of the loaded frame
print(segmentation_test.shape)

In [None]:
segmentation_test.head()

# Merge data into one file

In [None]:
# Stack the two frames row-wise; each frame keeps its own index labels (ignore_index is not set)
data = pd.concat([segmentation_data, segmentation_test])

In [None]:
# Move the current index into a regular column and label that column 'Class';
# the frame gets a fresh 0..n-1 integer index in the process
data = data.reset_index().rename(columns={'index': 'Class'})
print(data.shape)

In [None]:
data.head()

### number of instances  = 2310

In [None]:
print(data.shape[0])

### number of attributes = 19
*plus one extra column for the class name*

In [None]:
print(data.shape[1])

### number of classes = 7

In [None]:
# Distinct values found in the 'Class' column
classes = data['Class'].unique()
# Show the class labels and how many different ones there are
print("class names : ",  classes)
print("number of classes: ", len(classes))

# Data Exploration

## Histogram plots

In [None]:
def get_list_of_lists(class_name, source=None):
    """Return the attribute columns of one class as a list of lists.

    Parameters
    ----------
    class_name
        Value of the 'Class' column that selects the rows to keep.
    source : pandas.DataFrame, optional
        Frame to read from. It must contain a 'Class' column; its first
        column is left out of the result. When omitted, the notebook-level
        ``data`` frame is used, which keeps existing one-argument calls
        working unchanged.

    Returns
    -------
    list of list
        One inner list per column after the first, in column order, holding
        that column's values for the selected rows. The inner lists are
        empty when no row matches ``class_name``.
    """
    frame = data if source is None else source

    # Keep only the rows that belong to the requested class
    subset = frame.loc[frame['Class'] == class_name]

    # Position 0 is skipped; every remaining column becomes one plain list
    return [subset.iloc[:, pos].tolist() for pos in range(1, subset.shape[1])]

In [None]:
# Prepare the colors and the label names used by the plots
# Start from a 7-color "husl" palette ...
list1 = sns.color_palette("husl", 7)
# ... and extend it in place with seaborn's "Paired" palette (default size)
list1 += sns.color_palette("Paired")
colors = list1
# Every column label of the frame except the first one
column_names = list(data.columns.values)[1:]
print(column_names)

In [None]:
# Draw one figure per bin count; each figure holds one histogram panel per class.
# The panels are laid out on a two-column grid with just enough rows for all classes.
n_cols = 2
n_rows = -(-len(classes) // n_cols)  # ceiling division without importing math

# 'n_bins' is used as the loop name so the builtin bin() is not shadowed
for n_bins in [1, 5, 10, 15]:
    f = plt.figure(figsize=(40, 100))

    for i, class_name in enumerate(classes):
        # Explicit (rows, columns, index) form: readable, and unlike a packed
        # three-digit code it also works for more than nine panels
        ax = f.add_subplot(n_rows, n_cols, i + 1)

        # One data series per attribute, restricted to the rows of this class
        data_to_plot = get_list_of_lists(class_name)
        ax.hist(data_to_plot, bins=n_bins, density=True, color=colors, label=column_names)

        # The x axis carries the attribute values and, because density=True,
        # the y axis carries the normalized density
        ax.set_title('Class = %s' % class_name, size=30)
        ax.set_xlabel('Attribute value', size=22)
        ax.set_ylabel('Density', size=22)
        ax.legend()

    f.suptitle('Histogram with Bins = %d' % n_bins, size=50)
    plt.show()

## Boxplot

In [None]:
# Draw one box per column listed in column_names, using the raw (unscaled) values
boxplot = data.boxplot(column=column_names, figsize=(40,40))

## Correlation Matrix

In [None]:
def correlation_heatmap(data):
    """Show an annotated heatmap of the pairwise Pearson correlations in ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose numeric (and boolean) columns are correlated with each
        other. Other columns, such as a string label column, are left out:
        a correlation coefficient is not defined for them, and pandas 2.0+
        raises on them instead of silently skipping them.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # Restrict to the columns a correlation can be computed for
    numeric = data.select_dtypes(include=['number', 'bool'])
    correlations = numeric.corr(method='pearson')

    _, ax = plt.subplots(figsize=(20, 20))
    # Draw on the axes created above rather than relying on the implicit current axes
    sns.heatmap(correlations, vmax=1.0, center=0, fmt='.2f',
                square=True, linewidths=.5, annot=True, cbar_kws={"shrink": .70},
                ax=ax)
    plt.show()


correlation_heatmap(data)

# Preprocessing

## Normalization

### Min-Max scaler

In [None]:
# Columns left unscaled: 'Class' holds string labels and 'REGION-PIXEL-COUNT'
# is treated as a constant column
passthrough_cols = ['Class', 'REGION-PIXEL-COUNT']

# sort=False keeps the original column order (Index.difference sorts by default)
cols = data.columns.difference(passthrough_cols, sort=False)
x = data[cols].values
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)

# Start from a copy so the pass-through columns stay in place by name, then
# overwrite only the scaled columns; no hard-coded column positions are needed
data_normalized_Min_Max_scaler = data.copy()
data_normalized_Min_Max_scaler[cols] = x_scaled
data_normalized_Min_Max_scaler

### Box plot 

In [None]:
# Draw one box per column listed in column_names, using the min-max scaled values
boxplot = data_normalized_Min_Max_scaler.boxplot(column=column_names, figsize=(40,40))

### Conclusion

- After scaling, all values lie in a very narrow range, so they sit much closer to each other.
- Min-max normalization guarantees that every scaled feature falls between 0 and 1.
- It does not handle outliers: extreme values still define the range and compress the remaining values.

### Z-score

In [None]:
# Every column except the first one is standardized
cols = data.columns[1:]
x = data[cols]
# Columns with zero standard deviation are returned untouched; all others are z-scored
x_scaled = x.apply(lambda column: zscore(column) if np.std(column) != 0 else column)
data_normalized_Z_score = pd.DataFrame(x_scaled, columns=cols)
# Put the first column of the original frame back in front as 'Class'
data_normalized_Z_score.insert(0, 'Class', data.iloc[:, 0].values, allow_duplicates=True)
data_normalized_Z_score

### Box plot

In [None]:
# Draw one box per column listed in column_names, using the z-score standardized values
boxplot = data_normalized_Z_score.boxplot(column=column_names, figsize=(40,40))

### Conclusion

 - After scaling, the values are spread over a smaller range, so they are closer to each other.
 - A value exactly equal to the mean of its feature is normalized to 0; a value below the mean becomes negative, and a value above the mean becomes positive.

# Dimensionality reduction

## Feature Projection

In [None]:
# Build the ten column labels used for the PCA frame and the feature-selection frame
label_numbers = range(1, 11)
pca_columns = ['principal component {}'.format(number) for number in label_numbers]
feature_columns = ['Feture {}'.format(number) for number in label_numbers]

In [None]:
# Request the share of variance (0.95) the kept components should capture,
# then project every column after the first of the z-scored frame
pca = PCA(n_components=0.95)
principalComponents = pca.fit_transform(data_normalized_Z_score.iloc[:, 1:])

In [None]:
pca.n_components_

### variance ratio

In [None]:
pca.explained_variance_ratio_

In [None]:
principalComponents

### Correlation matrix Visualization

In [None]:
# The number of components is chosen by PCA from the data, so the labels are
# derived from the actual width of the projection instead of a fixed-length
# list, which would make the DataFrame constructor fail on a length mismatch
n_components = principalComponents.shape[1]
component_names = ['principal component {}'.format(number)
                   for number in range(1, n_components + 1)]

pca_data = pd.DataFrame(principalComponents, columns=component_names)
# Put the first column of the original frame in front as 'Class'
pca_data.insert(0, 'Class', data.iloc[:, 0].values, allow_duplicates=True)
correlation_heatmap(pca_data)

### Conclusion 

- Principal component analysis converts a set of observations of correlated variables into a set of linearly uncorrelated variables called principal components.

- About half as many components as there are attributes are enough to describe the data while retaining 95% of the variance (a ratio of 0.95).


## Feature Selection
applied on 'z-score' normalized data

In [None]:
# Inputs: every column after the first as features, the first column as the target
zscore_features = data_normalized_Z_score.iloc[:, 1:]
zscore_labels = data_normalized_Z_score.iloc[:, 0]

# Keep the ten highest-scoring features
sk_best = SelectKBest(k=10)
sk_best_data = sk_best.fit_transform(zscore_features, zscore_labels)

In [None]:
sk_best.scores_

### Correlation matrix Visualization

In [None]:
# Wrap the selected features in a labelled frame, put the first column of the
# original frame in front as 'Class', and plot the correlations
sk_best_data = pd.DataFrame(data=sk_best_data, columns=feature_columns)
sk_best_data.insert(loc=0, column='Class', value=data.iloc[:, 0].values, allow_duplicates=True)
correlation_heatmap(sk_best_data)

### Conclusion

- Feature selection keeps the features that contribute most to the prediction variable or output you are interested in.

- These features are highly correlated.