Question 1: Understanding the Dataset 
<br>
Description: Load a dataset and understand its basic properties, including data types, dimensions, and the first few rows.

In [None]:
import pandas as pd
from sklearn.datasets import fetch_california_housing

# Fetch the California Housing data; as_frame=True asks for pandas objects
data = fetch_california_housing(as_frame=True)

# Work with the DataFrame carried on the loaded object
df = data.frame

# Each entry pairs a banner with the property printed beneath it:
# dimensions first, then per-column dtypes, then a preview of the top rows.
overview = (
    ("=== Dataset shape ===", df.shape),
    ("\n=== Data types ===", df.dtypes),
    ("\n=== First 5 rows ===", df.head()),
)

for banner, value in overview:
    print(banner)
    print(value)


Question 2: Checking for Missing Values
<br>
Description: Identify missing values in the dataset.

In [None]:
import pandas as pd
from sklearn.datasets import fetch_california_housing

# Load the California Housing data and take its DataFrame
data = fetch_california_housing(as_frame=True)
df = data.frame

# Mark every null cell, then total the marks column by column
null_mask = df.isnull()
missing_values = null_mask.sum()

# Heading and per-column counts, one per line
print("Missing values in each column:", missing_values, sep="\n")


Question 3: Descriptive Statistics
<br>
Description: Calculate descriptive statistics for numerical columns.

In [None]:
import pandas as pd
from sklearn.datasets import fetch_california_housing

# Load the California Housing data and take its DataFrame
data = fetch_california_housing(as_frame=True)
df = data.frame

# Summary statistics as produced by DataFrame.describe()
desc_stats = df.describe()

# Heading followed by the statistics table
print("Descriptive statistics for numerical columns:", desc_stats, sep="\n")


Question 4: Handling Outliers
<br>
Description: Identify outliers in numerical columns using box plots.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_california_housing

# Load dataset
data = fetch_california_housing(as_frame=True)
df = data.frame

# Select numerical columns explicitly so a non-numeric column can never reach
# the box plot call below.
numerical_cols = df.select_dtypes(include=['number']).columns

# Derive the subplot grid from the number of columns instead of hard-coding it:
# a fixed grid leaves blank rows when there are few columns and makes
# plt.subplot raise once there are more columns than grid cells.
n_grid_cols = 3
n_grid_rows = -(-len(numerical_cols) // n_grid_cols)  # ceiling division

# Draw one box plot per numerical column; points beyond the whiskers are the
# candidate outliers.
plt.figure(figsize=(15, 10))

for i, col in enumerate(numerical_cols, 1):
    plt.subplot(n_grid_rows, n_grid_cols, i)
    sns.boxplot(x=df[col])
    plt.title(col)

plt.tight_layout()
plt.show()



Question 5: Categorical Data Analysis
<br>
Description: Explore the counts of categorical variables.

In [None]:
import pandas as pd

# Demonstration data: two label-valued columns and one numeric column
sample_columns = {
    'Category1': ['A', 'B', 'A', 'C', 'B', 'A'],
    'Category2': ['X', 'X', 'Y', 'Y', 'X', 'Z'],
    'Numeric': [1, 2, 3, 4, 5, 6],
}
df = pd.DataFrame(sample_columns)

# Keep only the columns whose dtype is object or category
categorical_cols = df.select_dtypes(include=['object', 'category']).columns

# For each selected column: a heading, the frequency of each distinct value,
# then a blank separator line.
for name in categorical_cols:
    counts = df[name].value_counts()
    print(f"Value counts for '{name}':", counts, "", sep="\n")


Question 6: Data Transformation
<br>
Description: Transform a categorical column into numerical using Label Encoding.

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# One-column demonstration frame holding the labels to encode
color_values = ['Red', 'Blue', 'Green', 'Blue', 'Red', 'Green']
df = pd.DataFrame({'Color': color_values})

# Fit the encoder on the labels and convert them in a single step
le = LabelEncoder()
encoded = le.fit_transform(df['Color'])
df['Color_encoded'] = encoded

# Show the original labels, then the encoded column, each under its heading
report = (
    ("Original Data:", 'Color'),
    ("\nLabel Encoded Data:", 'Color_encoded'),
)
for heading, column in report:
    print(heading)
    print(df[[column]])


Question 7: Visualizing Data Distributions
<br>
Description: Plot histograms for numerical columns to understand distributions.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing

# Load dataset
data = fetch_california_housing(as_frame=True)
df = data.frame

# Select numerical columns
numerical_cols = df.select_dtypes(include=['number']).columns

# Derive the subplot grid from the number of columns instead of hard-coding it:
# a fixed grid leaves blank rows when there are few columns and makes
# plt.subplot raise once there are more columns than grid cells.
n_grid_cols = 3
n_grid_rows = -(-len(numerical_cols) // n_grid_cols)  # ceiling division

# Plot one 30-bin histogram per numerical column
plt.figure(figsize=(15, 10))
for i, col in enumerate(numerical_cols, 1):
    plt.subplot(n_grid_rows, n_grid_cols, i)
    plt.hist(df[col], bins=30, color='skyblue', edgecolor='black')
    plt.title(col)
    plt.xlabel(col)
    plt.ylabel('Frequency')

plt.tight_layout()
plt.show()


Question 8: Correlation Analysis
<br>
Description: Calculate and visualize the correlation matrix for numerical features.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_california_housing

# Load the California Housing data and take its DataFrame
data = fetch_california_housing(as_frame=True)
df = data.frame

# Pairwise correlations between all columns, printed under a heading
corr_matrix = df.corr()
print("Correlation matrix:", corr_matrix, sep="\n")

# Render the same matrix as an annotated heatmap: two-decimal cell labels,
# square cells, and the 'coolwarm' colour map.
heatmap_options = dict(annot=True, fmt=".2f", cmap='coolwarm', square=True)

plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, **heatmap_options)
plt.title("Correlation Matrix Heatmap")
plt.show()


Question 9: Feature Engineering
<br>
Description: Create a new feature by combining or transforming existing features.

In [None]:
import pandas as pd
from sklearn.datasets import fetch_california_housing

# Load the California Housing data and take its DataFrame
data = fetch_california_housing(as_frame=True)
df = data.frame

# New feature: element-wise ratio of the AveRooms column to the AveOccup column
rooms = df['AveRooms']
occupants = df['AveOccup']
df['Rooms_per_Occupant'] = rooms / occupants

# Preview the two inputs next to the derived column
preview_cols = ['AveRooms', 'AveOccup', 'Rooms_per_Occupant']
print(df[preview_cols].head())


Question 10: Advanced Outlier Detection
<br>
Description: Use the Z-score method to identify and handle outliers.

In [None]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing
from scipy.stats import zscore

# Load the California Housing data and take its DataFrame
data = fetch_california_housing(as_frame=True)
df = data.frame

# A value counts as an outlier when its Z-score magnitude exceeds this cutoff
threshold = 3

# Standardise every column independently
z_scores = df.apply(zscore)

# A row is an outlier when at least one of its columns is beyond the cutoff
# in either direction.
beyond_cutoff = z_scores.abs() > threshold
outliers = beyond_cutoff.any(axis=1)

print(f"Number of outliers detected: {outliers.sum()}")

# Optional handling step: keep only the rows that were not flagged
df_no_outliers = df.loc[~outliers]

print(f"Shape before removing outliers: {df.shape}")
print(f"Shape after removing outliers: {df_no_outliers.shape}")
