In [1]:
import pandas as pd

# Task 1: Load and Explore the Dataset

# Loading the dataset
dataset_path = 'iris.csv'
data = pd.read_csv(dataset_path)

# To display the first few rows of the dataset
print("First few rows of the dataset:")
print(data.head())

# To explore the structure of the dataset.
# DataFrame.info() writes its report directly to stdout and returns None, so
# it is called on its own; wrapping it in print() appends a stray "None" line.
print("\nDataset Info:")
data.info()

# To check for missing values (count of nulls per column)
print("\nMissing values in the dataset:")
print(data.isnull().sum())

# Cleaning: drop every row that contains at least one missing value.
# The result is bound to a new name so the raw frame `data` stays untouched.
data_cleaned = data.dropna()

print("\nDataset after cleaning:")
print(data_cleaned.head())

First few rows of the dataset:
   sepal.length  sepal.width  petal.length  petal.width variety
0           5.1          3.5           1.4          0.2  Setosa
1           4.9          3.0           1.4          0.2  Setosa
2           4.7          3.2           1.3          0.2  Setosa
3           4.6          3.1           1.5          0.2  Setosa
4           5.0          3.6           1.4          0.2  Setosa

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal.length  150 non-null    float64
 1   sepal.width   150 non-null    float64
 2   petal.length  150 non-null    float64
 3   petal.width   150 non-null    float64
 4   variety       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB
None

Missing values in the dataset:
sepal.length    0
sepal.width     0
petal.length    0
petal.width     0
vari

In [2]:
import numpy as np
import pandas as pd

# Load the dataset
iris = pd.read_csv('iris.csv')

# The four measurement columns analysed throughout this cell.
numeric_columns = ['sepal.length', 'sepal.width', 'petal.length', 'petal.width']

# Basic statistics for numerical columns
print("Basic Statistics:")
print(iris.describe())

# Group by variety and compute mean of numerical columns.
# The result is kept because the findings at the end are derived from it.
mean_by_variety = iris.groupby('variety')[numeric_columns].mean()
print("\nMean by Variety:")
print(mean_by_variety)

# Additional statistics by variety
print("\nAdditional Statistics by Variety:")
for column in numeric_columns:
    print(f"\nColumn: {column}")
    # sort=False keeps the varieties in order of first appearance in the file.
    for variety, group in iris.groupby('variety', sort=False)[column]:
        print(f"{variety}:")
        print(f"  Mean: {np.mean(group):.2f}")
        print(f"  Median: {np.median(group):.2f}")
        # ddof=0 is the population standard deviation (np.std's default);
        # describe() above reports the sample standard deviation (ddof=1),
        # so the two figures are not directly comparable.
        print(f"  Std Dev: {np.std(group, ddof=0):.2f}")
        print(f"  Range: {np.ptp(group):.2f}")

# Interesting findings.
# Every statement below is computed from `mean_by_variety` instead of being
# typed in by hand, so the text cannot drift out of sync with the data.
print("\nInteresting Findings:")

petal_length_means = mean_by_variety['petal.length']
petal_width_means = mean_by_variety['petal.width']
sepal_length_means = mean_by_variety['sepal.length']
sepal_width_means = mean_by_variety['sepal.width']

shortest_petal = petal_length_means.idxmin()
print(f"1. {shortest_petal} has the shortest petals "
      f"(mean length {petal_length_means[shortest_petal]:.2f}, "
      f"mean width {petal_width_means[shortest_petal]:.2f})")

longest_petal = petal_length_means.idxmax()
print(f"2. {longest_petal} has the longest petals "
      f"(mean length {petal_length_means[longest_petal]:.2f}, "
      f"mean width {petal_width_means[longest_petal]:.2f})")

widest_sepal = sepal_width_means.idxmax()
shortest_sepal = sepal_length_means.idxmin()
finding_3 = (f"3. {widest_sepal} has the widest sepals "
             f"(mean width {sepal_width_means[widest_sepal]:.2f})")
if widest_sepal == shortest_sepal:
    finding_3 += " despite having the shortest sepal length"
else:
    finding_3 += f"; {shortest_sepal} has the shortest sepal length"
print(finding_3)

# Show the actual ordering per measurement rather than asserting that one
# variety is always in the middle (it is not, e.g. for sepal width).
print("4. Varieties ordered by mean, smallest to largest:")
for column in numeric_columns:
    ordering = mean_by_variety[column].sort_values().index
    print(f"   {column}: {' < '.join(map(str, ordering))}")

# Separation score: distance between the lowest and highest variety mean,
# expressed in units of the column's overall standard deviation.
separation = (mean_by_variety.max() - mean_by_variety.min()) / iris[numeric_columns].std()
print("5. Spread of variety means relative to overall std dev "
      "(higher = clearer separation between varieties):")
for column, score in separation.sort_values(ascending=False).items():
    print(f"   {column}: {score:.2f}")

Basic Statistics:
       sepal.length  sepal.width  petal.length  petal.width
count    150.000000   150.000000    150.000000   150.000000
mean       5.843333     3.057333      3.758000     1.199333
std        0.828066     0.435866      1.765298     0.762238
min        4.300000     2.000000      1.000000     0.100000
25%        5.100000     2.800000      1.600000     0.300000
50%        5.800000     3.000000      4.350000     1.300000
75%        6.400000     3.300000      5.100000     1.800000
max        7.900000     4.400000      6.900000     2.500000

Mean by Variety:
            sepal.length  sepal.width  petal.length  petal.width
variety                                                         
Setosa             5.006        3.428         1.462        0.246
Versicolor         5.936        2.770         4.260        1.326
Virginica          6.588        2.974         5.552        2.026

Additional Statistics by Variety:

Column: sepal.length
Setosa:
  Mean: 5.01
  Median: 5.00
  Std 

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Load the dataset and convert any infinite values to NaN once, up front.
# Every plot below works from this already-cleaned frame.
iris = pd.read_csv('iris.csv').replace([np.inf, -np.inf], np.nan)

## Visualization 1: Line Chart (Trend of Measurements by Index)
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(iris.index, iris['sepal.length'], label='Sepal Length', marker='o', markersize=3)
ax.plot(iris.index, iris['petal.length'], label='Petal Length', marker='o', markersize=3)
ax.set_title('Trend of Sepal and Petal Length Measurements', fontsize=14)
ax.set_xlabel('Observation Index', fontsize=12)
ax.set_ylabel('Length (cm)', fontsize=12)
ax.legend()
ax.grid(True, linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()

## Visualization 2: Bar Chart (Average Measurements by Variety)
# DataFrame.plot() opens a brand-new figure unless it is handed an Axes, which
# would leave a blank figure behind and ignore the requested size. Passing
# ax=ax draws the bars on the figure created here.
fig, ax = plt.subplots(figsize=(10, 6))
iris.groupby('variety').mean(numeric_only=True).plot(kind='bar', ax=ax, rot=0)
ax.set_title('Average Measurements by Iris Variety', fontsize=14)
ax.set_xlabel('Variety', fontsize=12)
ax.set_ylabel('Measurement (cm)', fontsize=12)
# Anchor the legend outside the axes so it does not cover the bars.
ax.legend(title='Measurement', bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(True, axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()

# `iris` already has infinities replaced, so one copy serves both seaborn
# plots; the name `iris_clean` is kept in case later cells refer to it.
iris_clean = iris.copy()

## Visualization 3: Histogram (Distribution of Sepal Length)
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=iris_clean, x='sepal.length', hue='variety', element='step', kde=True, ax=ax)
ax.set_title('Distribution of Sepal Length by Variety', fontsize=14)
ax.set_xlabel('Sepal Length (cm)', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)
ax.grid(True, linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()

## Visualization 4: Scatter Plot (Sepal Length vs Petal Length)
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=iris_clean, x='sepal.length', y='petal.length', hue='variety',
                style='variety', s=100, alpha=0.8, ax=ax)
ax.set_title('Sepal Length vs Petal Length by Variety', fontsize=14)
ax.set_xlabel('Sepal Length (cm)', fontsize=12)
ax.set_ylabel('Petal Length (cm)', fontsize=12)
ax.legend(title='Variety', bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(True, linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()