# Data Exploration - Iris Dataset
In this notebook we perform basic data exploration on the Iris data set:

https://en.wikipedia.org/wiki/Iris_flower_data_set

First, we load the libraries we need for the analysis.

In [4]:
# dataframe management
import pandas as pd             

# numerical computation
import numpy as np

# visualization library
import seaborn as sns
# global seaborn theme: "white" style with seaborn's color codes enabled
sns.set(style="white", color_codes=True)
# request 24 pt text, axis titles and axis labels
# NOTE(review): 'sans' is not one of matplotlib's generic family names
# ('sans-serif' is) - confirm that the "font.family" entry has the intended effect
sns.set_context(rc={"font.family":'sans',"font.size":24,"axes.titlesize":24,"axes.labelsize":24})   


# import matplotlib and allow it to plot inline
import matplotlib.pyplot as plt
%matplotlib inline

# seaborn can generate several warnings, we ignore them
# NOTE: the filter is process-wide, so warnings raised by every other library
# used in this notebook are silenced as well
import warnings 
warnings.filterwarnings("ignore")

# import the dataset library
from sklearn import datasets

Let's load the Iris dataset

In [5]:
# load the Iris data bundled with scikit-learn; the cells below read its
# .DESCR, .data, .feature_names, .target and .target_names attributes
dataset = datasets.load_iris()

In [7]:
# print the textual description shipped with the dataset
print(dataset.DESCR)

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

In [None]:
# Wrap the raw measurement matrix in a DataFrame, one named column per attribute.
iris = pd.DataFrame(
    data=dataset.data,
    columns=dataset.feature_names,
)

In [None]:
# create target variable
# dataset.target is used as an index array into dataset.target_names, so every
# row receives the name stored at its target position rather than the raw code
iris["Species"] = dataset.target_names[dataset.target]

In [None]:
# Name of the class column, and the remaining columns (everything except the
# class column) treated as model inputs.
target_variable = 'Species'
input_variables = iris.columns.drop(target_variable)

Let's get some statistics for continuous attributes

In [None]:
# summary statistics; the frame is the cell's last expression so it is
# rendered as a rich table
iris.describe()

And some statistics about the class attribute Species. We can use barplot to show the number of instances belonging to each class. As we can see, the dataset is completely balanced with 50 cases for each class.

In [None]:
# Count the instances of each class once, so the bar labels and the bar heights
# come from the same Series and can never get out of step. Pairing unique()
# (order of first appearance) with a separately sorted value_counts() only
# lines up when the classes happen to appear in sorted order.
class_counts = iris[target_variable].value_counts().sort_index()

ax = sns.barplot(x=class_counts.index.values, y=class_counts.values)
ax.set_xlabel(target_variable)
ax.set_ylabel("Number of instances");

We can also use bar plots to show summary statistics for each class value. For example, we can compute and plot the mean value of each attribute.

In [None]:
# Mean of every attribute, one row per class.
iris_gb = iris.groupby(target_variable).mean()

# DataFrame.plot opens a new figure unless it is given an axes, so a bare
# plt.figure(figsize=...) would only leave an empty figure behind. Create the
# axes explicitly and hand it over so the requested size is actually used.
fig, ax = plt.subplots(figsize=(12, 9))
iris_gb.plot(kind="bar", ax=ax)

ax.grid(color='black', linestyle='--', linewidth=.5)
ax.set_yticks(np.arange(0, 7, step=1.0))
ax.set_xlabel("")
# Anchor the legend just outside the right edge of the axes so it never covers
# the bars, whatever the figure size.
ax.legend(loc='upper left', bbox_to_anchor=(1.02, 1.0));

Before plotting the distribution of the sepal length, let's compute the range and the interquartile range of each numerical attribute.

In [None]:
from scipy.stats import iqr

# Every column except the class label is treated as numerical.
numerical_variables = iris.columns[iris.columns!='Species']

# Spread of each attribute measured as max - min.
print('Variable Range')
for name in numerical_variables:
    column = iris[name]
    spread = column.max() - column.min()
    print('%s\t%.3f'%(name, spread))

# Spread of each attribute measured as the interquartile range.
print('\n\nInterquartile Range')
for name in numerical_variables:
    quartile_spread = iqr(iris[name])
    print('%s\t%.3f'%(name, quartile_spread))


We can compute correlations among attributes.

In [None]:
# Correlate the numeric attributes only: the frame also holds the string
# 'Species' column, and recent pandas versions raise on non-numeric columns
# instead of silently dropping them.
corrmat = iris.select_dtypes(include="number").corr()

fig, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, square=True, cmap="Blues", annot=True, ax=ax)

# matplotlib 3.1.1 cut the first and last heatmap rows in half. Setting the
# limits to their correct absolute values (inverted y axis, one unit per row)
# repairs that release and is a no-op on every other one, whereas shifting the
# current limits by 0.5 adds blank half rows wherever the bug is absent.
ax.set_ylim(len(corrmat), 0);



We can further analyze the relations among variables by using clustermaps on the correlation matrix

In [None]:
# Correlate the numeric attributes only: the string 'Species' column makes
# DataFrame.corr() raise on recent pandas versions.
corr_numeric = iris.select_dtypes(include="number").corr()

# clustermap always builds its own figure, so the size has to be passed through
# figsize (a preceding plt.figure() call only leaves an empty figure behind).
# square=True is not passed: clustermap does not honour it because of its
# multi-axes layout.
cluster_grid = sns.clustermap(corr_numeric, annot=True, cmap="Blues", figsize=(8, 6))

# Work around the matplotlib 3.1.1 half-row clipping on the heatmap axes
# itself (not on whichever axes happens to be current), using absolute limits
# so the call is harmless on releases without the bug.
cluster_grid.ax_heatmap.set_ylim(len(corr_numeric), 0);


Or over the original dataset

In [None]:
# Cluster the raw measurements themselves (rows and columns), without row labels.
cm = sns.clustermap(
    iris[numerical_variables],
    cmap="Blues",
    center=0,
    figsize=(8, 8),
    yticklabels=False,
)
# Hide the colorbar axes of the cluster grid.
cm.cax.set_visible(False)

We check how features vary with each data input. The plot shows a sorting in the input values. 

In [None]:
# One line per attribute: value of the first four columns against the row index.
plt.figure(figsize=(8, 6))
for feature_name, values in iris.iloc[:, :4].items():
    plt.plot(values, label=feature_name)
plt.legend(loc='best');

In [None]:
# Histogram of the sepal length with 20 bins and no density curve.
plt.figure(figsize=(8, 6))
sns.set_style("whitegrid")
sns.set_context("notebook", font_scale=1.5, rc={"lines.linewidth": 2.5})

sepal_length = iris['sepal length (cm)']
dp = sns.distplot(sepal_length, bins=20, kde=False)

# Same adjustments as before, expressed on the returned axes instead of
# through the pyplot state machine.
dp.set_title('Distribution and Density of sepal length (20 bins)')
dp.figure.tight_layout()
dp.grid(axis='x')
dp.set_xlim([4, 8])
dp.set_yticks(np.arange(0, 20, step=2.0));

We can add the kernel density estimator to the plot, although it might not provide reliable information.

In [None]:
# Histogram of the sepal length with 20 bins plus the kernel density estimate.
plt.figure(figsize=(8, 6))
sns.set_style("whitegrid")
sns.set_context("notebook", font_scale=1.5, rc={"lines.linewidth": 2.5})

sepal_length = iris['sepal length (cm)']
dp = sns.distplot(sepal_length, bins=20)

# Axes-level equivalents of the pyplot calls, applied in the same order.
dp.set_title('Distribution and Density of Sepal Length (20 bins)')
dp.grid(axis='x')
dp.set_xlim([4, 8])
dp.figure.tight_layout();

In [None]:
# Histogram of the sepal length with 10 bins plus the kernel density estimate.
plt.figure(figsize=(8, 6))
sns.set_style("whitegrid")
sns.set_context("notebook", font_scale=1.5, rc={"lines.linewidth": 2.5})

sepal_length = iris['sepal length (cm)']
dp = sns.distplot(sepal_length, bins=10)

# Axes-level equivalents of the pyplot calls, applied in the same order.
dp.set_title('Distribution and Density of Sepal Length (10 bins)')
dp.grid(axis='x')
dp.set_xlim([4, 8])
dp.figure.tight_layout();

In [None]:
# Sepal length distribution restricted to the rows labelled 'setosa'.
plt.figure(figsize=(8, 6))
iris_is = iris.loc[iris['Species'] == 'setosa']
hist1 = sns.distplot(iris_is['sepal length (cm)'], bins=10)

# Fixed x/y window, applied on the returned axes.
hist1.set_title('Distribution and Density of Sepal Length for setosa')
hist1.set_xlim([0, 8])
hist1.set_ylim([0, 2])
hist1.set_yticks(np.arange(0, 2, 0.5))
hist1.grid(axis='x')

In [None]:
# Sepal width distribution restricted to the rows labelled 'setosa'.
plt.figure(figsize=(8, 6))
iris_is = iris.loc[iris['Species'] == 'setosa']
hist2 = sns.distplot(iris_is['sepal width (cm)'], bins=10)

# Fixed x/y window, applied on the returned axes.
hist2.set_title('Distribution and Density of Sepal Width for setosa')
hist2.set_xlim([0, 8])
hist2.set_ylim([0, 2])
hist2.set_yticks(np.arange(0, 2, 0.5))
hist2.grid(axis='x')

We can plot the distribution for each class.

In [None]:
# One density curve of the sepal length per species, coloured by species.
# FacetGrid creates its own figure, so no plt.figure() call is needed (it would
# only leave an empty figure in the output). The facet size is passed as
# `height`, the current name of the argument formerly called `size`.
sns.FacetGrid(iris, hue="Species", height=6) \
   .map(sns.kdeplot, "sepal length (cm)") \
   .add_legend();

And now we use scatter plots.

In [None]:
sns.set_context("notebook", font_scale=1.5, rc={"lines.linewidth": 2.5})
# Single scatter plot of sepal width against sepal length.
# `height` replaces the deprecated `size` argument. No legend is requested
# because there is no hue variable, so a legend would be an empty box.
sns.pairplot(iris, x_vars=["sepal length (cm)"], y_vars=["sepal width (cm)"], height=5)
plt.grid(False)

We can also add the information about the class.

In [None]:
sns.set_context("notebook", font_scale=1.5, rc={"lines.linewidth": 2.5})
# Same scatter plot, coloured by species.
# `height` replaces the deprecated `size` argument. pairplot already adds the
# legend when `hue` is given, so calling add_legend() again would draw a
# second one.
sns.pairplot(iris, x_vars=["sepal length (cm)"], y_vars=["sepal width (cm)"], hue="Species", height=5);
plt.grid(False)

We can combine scatter plots and histograms in the same figure.

In [None]:
# Scatter plot of the two sepal measurements with their marginal distributions.
sns.set_context("notebook", font_scale=1.5, rc={"lines.linewidth": 2.5});
sns.jointplot(
    data=iris,
    x="sepal length (cm)",
    y="sepal width (cm)",
);
plt.grid(False);

In [None]:
# Box plot of the sepal length, one box per species.
sns.set_context("notebook", font_scale=1.5, rc={"lines.linewidth": 1})
sns.boxplot(
    data=iris,
    x="Species",
    y="sepal length (cm)",
);

In [None]:
# pandas box plots of the frame's columns, grouped by the "Species" column
iris.boxplot(by="Species", figsize=(12, 6));

We can also add the scatter plot for every boxplot.

In [None]:
sns.set_context("notebook", font_scale=1.5, rc={"lines.linewidth": 1})

# Both layers share the same variables and data.
plot_spec = dict(x="Species", y="sepal length (cm)", data=iris)

# Box plot first, then the individual observations drawn on the same axes.
ax = sns.boxplot(**plot_spec)
ax = sns.stripplot(ax=ax, jitter=True, edgecolor="gray", **plot_spec);

In [None]:
# Violin plot of the sepal length, one violin per species.
sns.set_context("notebook", font_scale=1.5, rc={"lines.linewidth": 1})
sns.violinplot(
    data=iris,
    x="Species",
    y="sepal length (cm)",
);

We can plot the scatter plots for the pairwise attribute combinations.

In [None]:
sns.set_style("whitegrid", {'axes.grid' : False})
# Scatter plot for every pair of attributes, histograms on the diagonal.
# `height` is the current name of the per-facet size argument (formerly `size`).
sns.pairplot(iris, hue="Species", height=3, diag_kind="hist");

We can replace the diagonal bar plots with a gaussian kernel density estimate.

In [None]:
sns.set_style("whitegrid", {'axes.grid' : False})
# Scatter plot for every pair of attributes, density estimates on the diagonal.
# `height` is the current name of the per-facet size argument (formerly `size`).
sns.pairplot(iris, hue="Species", height=3, diag_kind="kde");

## Principal Component Analysis

So far we used only some data dimensions for visualization. We now apply Principal Component Analysis to project the four original dimensions into a two dimensional space.

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [None]:
# Attribute matrix (one column per numerical variable) and the class labels
# as a single-column array.
x = iris[numerical_variables].to_numpy()
y = iris[['Species']].to_numpy()

Principal component analysis is affected by attribute scale so we normalize all the attributes by eliminating the mean and scaling to unit variance.

In [None]:
# Standardize every attribute; x is replaced by its scaled version.
scaler = StandardScaler()
x = scaler.fit_transform(x)

In [None]:
# Project the data onto its first two principal components and keep the
# projection as a two-column frame.
pca = PCA(n_components=2)
new_data = pca.fit_transform(x)

component_names = ['principal component 1', 'principal component 2']
pca_iris = pd.DataFrame(new_data, columns=component_names)

In [None]:
# Build the labelled frame as a NEW object. A plain `pca_iris_complete = pca_iris`
# only creates a second name for the same frame, so adding the column would
# also change pca_iris and re-running cells would depend on execution order.
pca_iris_complete = pca_iris.assign(Species=iris['Species'])

In [None]:
# Scatter plot of the two principal components, one colour per species.
fig, ax = plt.subplots(figsize=(8, 8))
ax.set_xlabel('Principal Component 1', fontsize=15)
ax.set_ylabel('Principal Component 2', fontsize=15)
ax.set_title('Principal Component Analysis (2 Components)', fontsize=20)

targets = dataset.target_names
colors = ['r', 'g', 'b']
for target, color in zip(targets, colors):
    indicesToKeep = pca_iris_complete['Species'] == target
    # label= lets the legend pick up the species name from the artist itself
    ax.scatter(pca_iris_complete.loc[indicesToKeep, 'principal component 1'],
               pca_iris_complete.loc[indicesToKeep, 'principal component 2'],
               c=color, s=50, label=target)
ax.legend()

# Keep the symmetric, square window but widen it when needed: a fixed +/-3
# range hides every point whose coordinate falls outside it.
coordinates = pca_iris_complete[['principal component 1', 'principal component 2']].to_numpy()
limit = max(3, np.ceil(np.abs(coordinates).max()))
ax.axis([-limit, limit, -limit, limit])
ax.grid(True)

In [None]:
# Share of the total variance captured by each of the two components.
ratios = pca.explained_variance_ratio_
first_ratio, second_ratio = ratios[0], ratios[1]

print("Explained Variance")
print("   Component 1 %3.2f" % first_ratio)
print("   Component 2 %3.2f" % second_ratio)
print("   Total Explained Variance %3.2f" % sum(ratios))


In [None]:
# Print the weight vector of every principal component.
# Numbering starts at 1 so the labels agree with "Component 1" / "Component 2"
# used when reporting the explained variance and in the data frame columns.
print("Components")
for i,c in enumerate(pca.components_, start=1):
    print("Component %d\t%s"%(i,str(c)))

In [None]:
# Project the data by hand: multiply the attribute matrix by the transposed
# component matrix, then show the first five rows for comparison.
projection_matrix = pca.components_.T
data = x @ projection_matrix
data[:5]

In [None]:
# First five rows of the PCA frame (head() defaults to five rows).
pca_iris.head()

We can apply PCA with the same number of components as the original attributes and inspect how much variance each component explains.

In [None]:
# Rebuild the attribute matrix and the label array from the frame.
# NOTE(review): x is reassigned to the raw attribute values here, so this PCA
# (and any later cell that reads x) works on unscaled data - confirm that
# this is intended.
x = iris[numerical_variables].to_numpy()
y = iris[['Species']].to_numpy()

# PCA without n_components, fitted on x.
full_pca = PCA()
fitted = full_pca.fit_transform(x)

# Variance share of each component, shown as the cell output.
full_pca.explained_variance_ratio_

## t-SNE

In [None]:
import inspect

from sklearn.manifold import TSNE

perplexity=80

# The iteration-count argument was renamed from `n_iter` to `max_iter` in
# recent scikit-learn releases and the old name was later removed. Pick
# whichever name the installed version accepts so the cell runs on both.
iteration_kwarg = "max_iter" if "max_iter" in inspect.signature(TSNE.__init__).parameters else "n_iter"

# Fixed random_state keeps the embedding reproducible between runs.
tsne = TSNE(n_components=2, verbose=1, perplexity=perplexity,
            random_state=2867976, **{iteration_kwarg: 300})

# x is whatever attribute matrix the most recently executed cell assigned.
tsne_result = tsne.fit_transform(x)

In [None]:
# Two embedding coordinates per row, together with the species label.
iris_tsne = pd.DataFrame({
    'x': tsne_result[:, 0],
    'y': tsne_result[:, 1],
    'Species': iris['Species'],
})

In [None]:
# Preview the first rows only instead of rendering the whole frame.
iris_tsne.head()

In [None]:
# Scatter plot of the t-SNE embedding, one colour per species.
fig, ax = plt.subplots(figsize=(8, 8))
ax.set_xlabel('x', fontsize=15)
ax.set_ylabel('y', fontsize=15)
ax.set_title('t-SNE (2 Components)', fontsize=20)

targets = dataset.target_names
colors = ['r', 'g', 'b']
for target, color in zip(targets, colors):
    indicesToKeep = iris_tsne['Species'] == target
    # label= ties each legend entry to its own artist instead of relying on
    # the order in which the scatter calls were made
    ax.scatter(iris_tsne.loc[indicesToKeep, 'x'], iris_tsne.loc[indicesToKeep, 'y'],
               c=color, s=50, label=target)
ax.legend()

# The scale of a t-SNE embedding is arbitrary and changes with the library
# version and its parameters, so a hard-coded +/-3 window can hide part or all
# of the points. Keep the symmetric, square window but size it from the data.
limit = max(3, np.ceil(np.abs(iris_tsne[['x', 'y']].to_numpy()).max()))
ax.axis([-limit, limit, -limit, limit])
ax.grid(True)