# In [None]:  (Jupyter cell prompt left over from the notebook export)
# Step 1: Set Up the Environment
# Objective: Ensure you have the right tools and packages installed.

# 1. Install the required libraries.
# 2. Verify the installation by importing the libraries in a Python script or Jupyter notebook





# Step 2: Load & Explore the Dataset
# Objective: Load data into a pandas DataFrame and obtain a basic understanding of its structure.

# 3. Load a CSV file into a DataFrame.
# 4. Display the first few records to understand the structure.
# 5. Get a summary of the dataset.






# Step 3: Perform NumPy Operations
# Objective: Utilize NumPy for basic numerical operations and array manipulations.

# 6. Convert a DataFrame column to a NumPy array and perform array operations like mean and sum.
# 7. Create a NumPy array and calculate the variance and standard deviation.
# 8. Use NumPy to filter based on conditions.








# Step 4: Data Manipulation with Pandas
# Objective: Use Pandas to clean and manipulate dataset for analysis.

# 9. Handle missing data by filling or dropping.
# 10. Create new columns or modify existing ones.
# 11. Use groupby to aggregate data.







# Step 5: Data Visualization with Matplotlib & Seaborn
# Objective: Visualize the data to identify patterns, trends, and insights.

# 12. Use Matplotlib to create a basic plot.
# 13. Create a histogram using Seaborn.
# 14. Plot a box plot for a clear view of data distribution.



# Step 1: Set Up the Environment
# Objective: Ensure you have the right tools and packages installed.

# 1. Install the required libraries
# You can run this in your terminal or Jupyter cell (uncomment if needed)
# !pip install numpy pandas matplotlib seaborn

# 2. Verify the installation by importing the libraries.
# A missing package raises ImportError at its import line, so reaching the
# print below means all four imports succeeded.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

print("Libraries imported successfully!")




# Step 2: Load & Explore the Dataset
# Objective: Load data into a pandas DataFrame and obtain a basic understanding of its structure.

# 3. Load a CSV file into a DataFrame.
# The iris dataset shipped through seaborn stands in for a real file here;
# swap in pd.read_csv('your_dataset.csv') to load your own data.
df = sns.load_dataset("iris")

# 4. Display the first few records
print("\nFirst 5 rows of the dataset:")
print(df.head())

# 5. Get a summary of the dataset
print("\nDataset info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info()

print("\nStatistical summary:")
print(df.describe())




# Step 3: Perform NumPy Operations
# Objective: Utilize NumPy for basic numerical operations and array manipulations.

# 6. Pull one DataFrame column out as a NumPy array, then aggregate it
petal_length_array = df['petal_length'].to_numpy()
petal_length_mean = np.mean(petal_length_array)
petal_length_total = np.sum(petal_length_array)
print("\nMean of petal_length:", petal_length_mean)
print("Sum of petal_length:", petal_length_total)

# 7. Build a small NumPy array and measure its spread.
# np.var / np.std are called with their defaults here (no ddof argument).
sample_array = np.array([10, 20, 30, 40, 50])
sample_variance = np.var(sample_array)
sample_std_dev = np.std(sample_array)
print("\nSample array variance:", sample_variance)
print("Sample array standard deviation:", sample_std_dev)

# 8. Boolean-mask filtering: keep only the petal lengths above 4.0
longer_than_four = petal_length_array > 4.0
filtered_array = petal_length_array[longer_than_four]
print("\nFiltered petal lengths > 4.0:", filtered_array)




# Step 4: Data Manipulation with Pandas
# Objective: Use Pandas to clean and manipulate dataset for analysis.

# 9. Handle missing data by filling or dropping
# Work on a copy so the gaps injected for this demo never touch df itself.
df_missing = df.copy()
# .loc slices by label and includes both endpoints, so this blanks the rows
# labelled 0 through 3.
df_missing.loc[0:3, 'sepal_length'] = np.nan

print("\nMissing values before filling:")
print(df_missing.isnull().sum())

# Fill missing values with the column median.
# The filled Series is assigned back to the column instead of calling
# fillna(inplace=True) on df_missing['sepal_length']: that chained in-place
# form acts on an intermediate object, triggers a FutureWarning on pandas 2.x
# and leaves df_missing unchanged once Copy-on-Write is enabled.
sepal_length_median = df_missing['sepal_length'].median()
df_missing['sepal_length'] = df_missing['sepal_length'].fillna(sepal_length_median)

print("\nMissing values after filling:")
print(df_missing.isnull().sum())

# 10. Derive a new column: petal length divided by sepal length, row by row
ratio_preview_columns = ['petal_length', 'sepal_length', 'petal_sepal_ratio']
df['petal_sepal_ratio'] = df['petal_length'].div(df['sepal_length'])
print("\nAdded new column 'petal_sepal_ratio':")
print(df[ratio_preview_columns].head())

# 11. Use groupby to aggregate data
# numeric_only=True restricts the mean to numeric columns. Without it,
# pandas >= 2.0 raises TypeError as soon as the frame holds any other
# non-numeric column besides the grouping key (e.g. after swapping in
# your own CSV).
grouped = df.groupby('species').mean(numeric_only=True)
print("\nGrouped mean values by species:")
print(grouped)




# Step 5: Data Visualization with Matplotlib & Seaborn
# Objective: Visualize the data to identify patterns, trends, and insights.

# 12. Use Matplotlib to create a basic plot
plt.figure(figsize=(8, 5))
# Markers only ('o'): each row is an independent observation, so joining the
# points in row order (as 'o-' would) draws a zig-zag with no meaning.
plt.plot(df['sepal_length'], df['sepal_width'], 'o', color='blue')
plt.title("Sepal Length vs Sepal Width")
plt.xlabel("Sepal Length")
plt.ylabel("Sepal Width")
plt.grid(True)
plt.show()

# 13. Histogram of petal length with a KDE curve overlaid (Seaborn)
plt.figure(figsize=(8, 5))
# histplot draws on the current figure's axes and returns them, so the labels
# can be set on that Axes object directly.
hist_ax = sns.histplot(df['petal_length'], bins=20, kde=True, color='green')
hist_ax.set_title("Distribution of Petal Length")
hist_ax.set_xlabel("Petal Length")
hist_ax.set_ylabel("Count")
plt.show()

# 14. Plot a box plot for a clear view of data distribution
plt.figure(figsize=(8, 5))
# Passing palette without hue is deprecated in seaborn 0.13, so the x variable
# is also assigned to hue. dodge=False keeps one full-width box per species.
box_ax = sns.boxplot(
    x='species',
    y='petal_length',
    hue='species',
    data=df,
    palette='Set2',
    dodge=False,
)
# Older seaborn releases add a legend that merely repeats the x tick labels;
# drop it when present so the figure looks the same across versions.
box_legend = box_ax.get_legend()
if box_legend is not None:
    box_legend.remove()
plt.title("Boxplot of Petal Length by Species")
plt.show()


