# Fundamental data analysis in Python using Iris dataset

Let's look at doing some "data analysis" using pandas in Python.

First of all, we need to import the package we want to use: pandas.

In [1]:
# Importing the Python Library: pandas
# pandas is used as a data analysis and manipulation tool.
# For detailed info on pandas, please go to the website https://pandas.pydata.org/
import pandas as pd

## Opening the data file
Now we need to open the iris data file. We do this using pandas. You can search for these in the pandas help yourself, but the functions of interest are:
- `pandas.read_csv` and `DataFrame.to_csv` to read and write CSV files respectively.
- `pandas.read_excel` and `DataFrame.to_excel` to read and write MS Excel files respectively.

We're going to open the iris data file. Note: you will need to edit the code to ensure that it points to where you have downloaded the `iris.xls` file.

In [4]:
# Load the Iris dataset from the Excel workbook into a pandas DataFrame.
# Edit irisdata_path if iris.xls is stored somewhere else on your machine.
irisdata_path = "./Dataset/iris.xls"
iris_data = pd.read_excel(io=irisdata_path)
# Alternative: open iris.xls in Excel, use File > Save As to export it as iris.csv,
# then load the CSV file instead:
#iris_data = pd.read_csv("./Dataset/iris.csv")

In [5]:
# Display the DataFrame; a bare expression on the last line of a cell is rendered as its output.
iris_data

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


Details about the Iris Plants Database
====================

Notes
-----
Data Set Characteristics:
    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica

## Taking a quick look at the data
You can of course just display the variable or use `.head()` or `.tail()` to see the top or bottom of the dataset.

In [6]:
# Show the top of the dataset.
# head() returns the first 5 rows by default; pass another count to see more,
# e.g. head(10) returns the first 10 rows.
iris_data.head(n=5)

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [7]:
# Show the bottom of the dataset: tail() returns the last 5 rows by default.
iris_data.tail(n=5)

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica
149,5.9,3.0,5.1,1.8,virginica


In [8]:
# Draw 10 randomly chosen rows with DataFrame.sample(); the selection changes on every run.
iris_data.sample(n=10)

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
28,5.2,3.4,1.4,0.2,setosa
81,5.5,2.4,3.7,1.0,versicolor
49,5.0,3.3,1.4,0.2,setosa
113,5.7,2.5,5.0,2.0,virginica
59,5.2,2.7,3.9,1.4,versicolor
63,6.1,2.9,4.7,1.4,versicolor
26,5.0,3.4,1.6,0.4,setosa
19,5.1,3.8,1.5,0.3,setosa
94,5.6,2.7,4.2,1.3,versicolor
55,5.7,2.8,4.5,1.3,versicolor


In [9]:
# List the column labels of the DataFrame.
iris_data.columns

Index(['Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width',
       'Species'],
      dtype='object')

In [10]:
# Select a single column by its label; the result is a pandas Series.
iris_data.loc[:, 'Sepal.Length']

0      5.1
1      4.9
2      4.7
3      4.6
4      5.0
      ... 
145    6.7
146    6.3
147    6.5
148    6.2
149    5.9
Name: Sepal.Length, Length: 150, dtype: float64

In [11]:
# Build a boolean mask: True for every flower whose sepal length is below 6, False otherwise.
iris_data['Sepal.Length'].lt(6)

0       True
1       True
2       True
3       True
4       True
       ...  
145    False
146    False
147    False
148    False
149     True
Name: Sepal.Length, Length: 150, dtype: bool

In [12]:
# Keep only the rows whose Sepal.Length is strictly below 6
# (rows with a sepal length of 6 or more are dropped).
# The boolean Series from the previous cell acts as a mask: rows marked True are kept,
# rows marked False are removed.
iris_lessthan6 = iris_data.loc[iris_data['Sepal.Length'].lt(6)]
iris_lessthan6

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
113,5.7,2.5,5.0,2.0,virginica
114,5.8,2.8,5.1,2.4,virginica
121,5.6,2.8,4.9,2.0,virginica
142,5.8,2.7,5.1,1.9,virginica


In [13]:
# Keep only the rows whose Sepal.Length is strictly above 6
# (rows with a sepal length of 6 or less are dropped).
iris_greater6 = iris_data.loc[iris_data['Sepal.Length'].gt(6)]
iris_greater6

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
50,7.0,3.2,4.7,1.4,versicolor
51,6.4,3.2,4.5,1.5,versicolor
52,6.9,3.1,4.9,1.5,versicolor
54,6.5,2.8,4.6,1.5,versicolor
56,6.3,3.3,4.7,1.6,versicolor
...,...,...,...,...,...
144,6.7,3.3,5.7,2.5,virginica
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica


In [14]:
# Keep only the rows where BOTH conditions hold: Sepal.Length < 6 AND Sepal.Width < 3.
# A row is dropped as soon as either measurement reaches its threshold.
# `&` combines the two boolean masks element by element.
sepal_len_width_subset = iris_data.loc[
    iris_data['Sepal.Length'].lt(6) & iris_data['Sepal.Width'].lt(3)
]
sepal_len_width_subset

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
8,4.4,2.9,1.4,0.2,setosa
41,4.5,2.3,1.3,0.3,setosa
53,5.5,2.3,4.0,1.3,versicolor
55,5.7,2.8,4.5,1.3,versicolor
57,4.9,2.4,3.3,1.0,versicolor
59,5.2,2.7,3.9,1.4,versicolor
60,5.0,2.0,3.5,1.0,versicolor
64,5.6,2.9,3.6,1.3,versicolor
67,5.8,2.7,4.1,1.0,versicolor
69,5.6,2.5,3.9,1.1,versicolor


In [15]:
# Keep the rows where AT LEAST ONE condition holds: Sepal.Length < 6 OR Sepal.Width < 3.
# A row is dropped only when both measurements reach their thresholds.
# `|` combines the two boolean masks element by element.
sepal_len_or_width = iris_data.loc[
    iris_data['Sepal.Length'].lt(6) | iris_data['Sepal.Width'].lt(3)
]
sepal_len_or_width

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
133,6.3,2.8,5.1,1.5,virginica
134,6.1,2.6,5.6,1.4,virginica
142,5.8,2.7,5.1,1.9,virginica
146,6.3,2.5,5.0,1.9,virginica


In [16]:
# Encode the categorical Species labels as integer codes:
# setosa -> 1, versicolor -> 2, virginica -> 3.
# The result is assigned back to the column instead of calling
# `iris_data['Species'].replace(..., inplace=True)`: that form modifies an intermediate
# Series, which triggers a FutureWarning on recent pandas versions and leaves iris_data
# unchanged once Copy-on-Write is enabled (the default from pandas 3.0).
# Re-running this cell is harmless because the codes 1-3 are not keys of the mapping.
species_codes = {'setosa': 1, 'versicolor': 2, 'virginica': 3}
iris_data['Species'] = iris_data['Species'].replace(species_codes)
iris_data

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
0,5.1,3.5,1.4,0.2,1
1,4.9,3.0,1.4,0.2,1
2,4.7,3.2,1.3,0.2,1
3,4.6,3.1,1.5,0.2,1
4,5.0,3.6,1.4,0.2,1
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,3
146,6.3,2.5,5.0,1.9,3
147,6.5,3.0,5.2,2.0,3
148,6.2,3.4,5.4,2.3,3


In [17]:
# Drop every virginica row (encoded as 3 in the previous cell),
# keeping only setosa (1) and versicolor (2).
iris_sub_species = iris_data.loc[iris_data['Species'].ne(3)]
iris_sub_species

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
0,5.1,3.5,1.4,0.2,1
1,4.9,3.0,1.4,0.2,1
2,4.7,3.2,1.3,0.2,1
3,4.6,3.1,1.5,0.2,1
4,5.0,3.6,1.4,0.2,1
...,...,...,...,...,...
95,5.7,3.0,4.2,1.2,2
96,5.7,2.9,4.2,1.3,2
97,6.2,2.9,4.3,1.3,2
98,5.1,2.5,3.0,1.1,2


# Combining Pandas with General Python

In [18]:
# Print the average of every attribute for each species.
# This goes somewhat beyond the week 2 content, especially if you have never used Python before;
# it is included to show how powerful the combination of general programming syntax and
# pandas functions can be.
# The loop is left commented out: remove the leading '#' from the two lines below to run it.
# It walks over the numeric species codes 1-3 and prints the column means of the matching rows.

#for i in range(1,4):
#    print(f'{i}\n{iris_data[iris_data["Species"] == i].mean()}')