## **Analyzing the Iris dataset using Python**

### Import Python packages:

In [1]:

# packages will be updated according to the task progression.

import csv
# for dataframes 
import pandas as pd
# machine Learning Library that contains datasets.
import sklearn as skl 



### Output a summary of each Iris dataset variable to a single text file  


* Load dataset

In [2]:
# Load the iris dataset.
# using the file path as I already downloaded the IRIS dataset in csv format.
# the method is explained in Pandas documentation 
# https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html#
# pandas.read_csv(filepath_or_buffer, *, sep=<no_default>, delimiter=None  )

* Import dataset previously downloaded

In [3]:
# as we already imported pd and downloaded the dataset, I went to the folder where the file is saved, and through properties I did copy the file path.
# assigning a value 
file_path = (r'C:\Users\marou\Desktop\pands\pands-project\iris dataset\iris.data')
# read the data
iris_data = pd.read_csv(file_path )

* Dataframe

In [4]:
# show dataframe
# dataset shape shows 5 columns with 149 rows. I can see that the target column (species) of the dataset is included, while the column names are not. 
# reference : https://archive.ics.uci.edu/dataset/53/iris
iris_data

Unnamed: 0,5.1,3.5,1.4,0.2,Iris-setosa
0,4.9,3.0,1.4,0.2,Iris-setosa
1,4.7,3.2,1.3,0.2,Iris-setosa
2,4.6,3.1,1.5,0.2,Iris-setosa
3,5.0,3.6,1.4,0.2,Iris-setosa
4,5.4,3.9,1.7,0.4,Iris-setosa
...,...,...,...,...,...
144,6.7,3.0,5.2,2.3,Iris-virginica
145,6.3,2.5,5.0,1.9,Iris-virginica
146,6.5,3.0,5.2,2.0,Iris-virginica
147,6.2,3.4,5.4,2.3,Iris-virginica


In [5]:
# add meaningful columns names
# https://archive.ics.uci.edu/dataset/53/iris ( see variables table )
column_names = ['sepal length in cm' , 'sepal width in cm' , 'petal length in cm' , 'petal width in cm', 'species']
iris_data = pd.read_csv(file_path, header= None, names=column_names)
# show data frame
iris_data

Unnamed: 0,sepal length in cm,sepal width in cm,petal length in cm,petal width in cm,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


* Summary of each Iris dataset variable to a single text file  

In [6]:
# summary of the 4 variables of Iris dataset.
# The describe() method returns description of the data in the dataframe. As the dataframe contains numerical data, the description contains these information for each column: 
# count, mean, std, min, 25%, 50%, 75%, max
# https://www.w3schools.com/python/pandas/ref_df_describe.asp#:~:text=The%20describe()%20method%20returns,The%20average%20(mean)%20value.   (see .describe() method )
iris_data.describe()


Unnamed: 0,sepal length in cm,sepal width in cm,petal length in cm,petal width in cm
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [7]:
# I can also access separatly and output a summary of each variable of iris dataset
# examples :
# sepal length variable
sepal_length = iris_data['sepal length in cm']
# summary of sepal length 
sepal_length_summary = sepal_length.describe()
#show
sepal_length_summary



count    150.000000
mean       5.843333
std        0.828066
min        4.300000
25%        5.100000
50%        5.800000
75%        6.400000
max        7.900000
Name: sepal length in cm, dtype: float64

In [9]:
# create a variable ' summary ' to store the result of dataframe describe method. 
summary = iris_data.describe()
# save the iris dataset variables summary into a .txt 
summary.to_csv('summary_iris.txt' , ',',  header=True, index=True)

# showing the header in txt file by setting the parameter header to True, hiding the index by setting it to False
# https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html ( explore .read_csv() documentation )



  summary.to_csv('summary_iris.txt' , ',',  header=True, index=True)
