In [None]:
# Step 1: Import necessary libraries
import pandas as pd

# Step 2: Read the example Iris CSV from its hosted URL into a DataFrame
url = "https://raw.githubusercontent.com/uiuc-cse/data-fa14/gh-pages/data/iris.csv"
df = pd.read_csv(url)

# Step 3: Preview the leading rows (DataFrame.head() returns five by default);
# the heading and the preview are emitted on separate lines by one print call
print("First 5 rows of the dataset:", df.head(), sep="\n")

# Step 4: Check the structure of the dataset
# DataFrame.info() writes its summary straight to stdout and returns None,
# so it is called on its own; wrapping it in print() would append a stray
# "None" line after the summary.
print("\nDataset Info:")
df.info()

# Step 5: Count the missing entries in every column
missing_before = df.isnull().sum()
print("\nMissing values in each column:")
print(missing_before)

# Step 6: Clean the dataset by discarding any row that holds a missing value
# (the Iris data has none, so this is kept only for completeness).
# axis=0 / how="any" spell out the dropna() defaults: drop rows, on any NaN.
df_cleaned = df.dropna(axis=0, how="any")

# Re-count on the cleaned frame to confirm nothing is missing any more
missing_after = df_cleaned.isnull().sum()
print("\nMissing values after cleaning:")
print(missing_after)


In [None]:
# Basic statistics of numerical columns
print("Basic Statistics:")
print(df.describe())

# Grouping by a categorical column ('species') and computing the mean of numerical columns.
# numeric_only=True makes the "numerical columns" intent explicit: since
# pandas 2.0 the default is numeric_only=False, so any non-numeric column
# left in the frame would otherwise be aggregated too (or raise TypeError
# for strings).
print("\nMean values grouped by species:")
grouped_means = df.groupby('species').mean(numeric_only=True)
print(grouped_means)


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Fetch the Iris measurements from the hosted CSV
url = "https://raw.githubusercontent.com/uiuc-cse/data-fa14/gh-pages/data/iris.csv"
df = pd.read_csv(url)

# Attach a simulated 'Date' column so the data can be drawn as a time series:
# one consecutive calendar day per row, starting on 2023-01-01
simulated_dates = pd.date_range(start='2023-01-01', periods=len(df), freq='D')
df['Date'] = simulated_dates

# Apply seaborn's white-grid style to the figures drawn afterwards
sns.set(style="whitegrid")

# 1. Line chart: petal length drawn against the simulated 'Date' column
line_fig, line_ax = plt.subplots(figsize=(10, 5))
line_ax.plot(
    df['Date'],
    df['petal_length'],
    color='teal',
    label='Petal Length',
)
line_ax.set_title('Petal Length Over Time')
line_ax.set_xlabel('Date')
line_ax.set_ylabel('Petal Length (cm)')
line_ax.legend()
line_fig.tight_layout()
plt.show()

# 2. Bar chart (average petal length per species)
# sns.barplot aggregates the rows of each species (mean by default) into one bar.
plt.figure(figsize=(8, 5))
# Passing `palette` without `hue` is deprecated in seaborn 0.13 and emits a
# FutureWarning. Mapping the x variable to `hue` keeps one colour per species;
# dodge=False keeps the bars full-width and centred on older seaborn releases,
# where a hue variable would otherwise split each category into offset bars.
bar_ax = sns.barplot(
    data=df,
    x='species',
    y='petal_length',
    hue='species',
    palette='Set2',
    dodge=False,
)
# The hue mapping duplicates the x-axis labels, so drop the legend if this
# seaborn version drew one (the original chart had no legend).
bar_legend = bar_ax.get_legend()
if bar_legend is not None:
    bar_legend.remove()
plt.title('Average Petal Length per Species')
plt.xlabel('Species')
plt.ylabel('Average Petal Length (cm)')
plt.tight_layout()
plt.show()

# 3. Histogram: how the sepal-width values are distributed,
# binned into 15 bars with a kernel density estimate drawn on top
hist_fig, hist_ax = plt.subplots(figsize=(8, 5))
sepal_widths = df['sepal_width']
sns.histplot(sepal_widths, bins=15, kde=True, color='salmon', ax=hist_ax)
hist_ax.set_title('Distribution of Sepal Width')
hist_ax.set_xlabel('Sepal Width (cm)')
hist_ax.set_ylabel('Frequency')
hist_fig.tight_layout()
plt.show()

# 4. Scatter plot: sepal length against petal length, one colour per species
scatter_fig, scatter_ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(
    data=df,
    x='sepal_length',
    y='petal_length',
    hue='species',
    palette='viridis',
    ax=scatter_ax,
)
scatter_ax.set_title('Sepal Length vs Petal Length')
scatter_ax.set_xlabel('Sepal Length (cm)')
scatter_ax.set_ylabel('Petal Length (cm)')
# Rebuild the legend so it carries an explicit title
scatter_ax.legend(title='Species')
scatter_fig.tight_layout()
plt.show()
