# Data Analysis and Visualization in Python


This notebook covers:
1. **Data loading and exploration** using pandas.
2. **Basic data analysis** including statistics and groupings.
3. **Data visualization** using Matplotlib and Seaborn.


In [None]:

import pandas as pd
from sklearn.datasets import load_iris

# Load the Iris dataset and put its feature matrix in a DataFrame,
# one column per feature name.
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)

# `iris.target` holds integer class codes; index `iris.target_names` with them
# so the column stores the species name for each row. Names keep the printed
# tables, group keys and plot legends self-explanatory.
df['species'] = iris.target_names[iris.target]

# Display the first few rows of the dataset
print(df.head())

# Explore the structure of the dataset
print("\nData types:")
print(df.dtypes)

# Count missing values per column
print("\nMissing values:")
print(df.isnull().sum())

# Drop any rows that contain a missing value (alternatively, fill them with
# df.fillna()). `dropna` returns a new frame, so `df` itself is left untouched;
# cells that want the cleaned data must read `df_cleaned`.
df_cleaned = df.dropna()


In [None]:

# Summary statistics for the measurement columns of the cleaned frame.
# The species label is excluded: it identifies the class and is not a
# measurement, so its mean/quantiles would be meaningless.
print("\nBasic Statistics:")
print(df_cleaned.drop(columns='species').describe())

# Mean of every measurement within each species. `grouped` is indexed by
# species and is reused by the plotting cell below.
print("\nGroup by species and compute the mean:")
grouped = df_cleaned.groupby('species').mean()
print(grouped)

# Identifying interesting findings
# For instance, compare the average petal length across species
print("\nPetal length by species:")
print(grouped['petal length (cm)'])


In [None]:

import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's theme; `set_theme` is the current name for `sns.set`.
sns.set_theme(style="whitegrid")

# One figure size shared by every plot in this cell.
FIGSIZE = (10, 6)

# Line chart - trend over time, using a made-up monthly sales series
# (illustrative only; it is not derived from the Iris data).
# An explicit MonthEnd offset produces the same month-end dates as the 'M'
# alias without the deprecation warning newer pandas emits for that alias.
time = pd.date_range(start='1/1/2020', periods=10, freq=pd.offsets.MonthEnd())
sales = [200, 220, 250, 300, 320, 400, 450, 500, 550, 600]
fig, ax = plt.subplots(figsize=FIGSIZE)
ax.plot(time, sales, marker='o')
ax.set_title("Sales Trend Over Time")
ax.set_xlabel("Date")
ax.set_ylabel("Sales")
ax.tick_params(axis='x', labelrotation=45)
plt.show()

# Bar chart - average petal length by species (from the grouped means)
fig, ax = plt.subplots(figsize=FIGSIZE)
sns.barplot(x=grouped.index, y=grouped['petal length (cm)'], ax=ax)
ax.set_title("Average Petal Length by Species")
ax.set_xlabel("Species")
ax.set_ylabel("Average Petal Length (cm)")
plt.show()

# Histogram - distribution of sepal length, with a KDE curve overlaid
fig, ax = plt.subplots(figsize=FIGSIZE)
sns.histplot(data=df_cleaned, x='sepal length (cm)', bins=15, kde=True, ax=ax)
ax.set_title("Distribution of Sepal Length")
ax.set_xlabel("Sepal Length (cm)")
ax.set_ylabel("Frequency")
plt.show()

# Scatter plot - sepal length vs petal length, coloured by species.
# A qualitative palette is requested explicitly so each species gets a
# distinct colour even when the species column holds numeric codes
# (numeric hues otherwise get a sequential colour ramp).
fig, ax = plt.subplots(figsize=FIGSIZE)
sns.scatterplot(
    data=df_cleaned,
    x='sepal length (cm)',
    y='petal length (cm)',
    hue='species',
    palette='deep',
    ax=ax,
)
ax.set_title("Sepal Length vs Petal Length")
ax.set_xlabel("Sepal Length (cm)")
ax.set_ylabel("Petal Length (cm)")
ax.legend(title='Species')
plt.show()
