1.   Creating and loading different types of datasets in Python using the required libraries.
2.   Creating a dataset using pandas
3. Loading CSV dataset files using Pandas
4. Loading datasets using sklearn
5. Write a Python program to compute the Mean, Median, Mode, Variance, and Standard Deviation using datasets
6. Demonstrate various data pre-processing techniques for a given dataset.
7. Write a Python program to perform
    *   Reshaping the data
    *   Filtering the data
    *   Merging the data
    *   Handling the missing values in datasets
    *   Feature Normalization: Min-max normalization

In [7]:
import pandas as pd
import numpy as np

# 1. Creating and loading different types of datasets in Python using the
#    required libraries.
#    This heading is the umbrella for steps 2-4 below (building a DataFrame
#    with pandas, loading a CSV file, loading a bundled sklearn dataset).
#    The housing CSV is read once, under step 3; reading it here as well
#    would parse the same file twice and overwrite the first result unused.

# 2. Creating a dataset using pandas
# Each dictionary key becomes a column label and each list supplies that
# column's values, so the resulting frame has four rows and two columns.
data = {
    'col1': [1, 2, 3, 4],
    'col2': [5, 6, 7, 8],
}
df_from_dict = pd.DataFrame(data)

# Heading and frame are emitted on separate lines, exactly as two print calls would.
print("\nDataFrame created from dictionary:", df_from_dict, sep="\n")

# 3. Loading a CSV dataset file using pandas
# The path is relative, so it is resolved against the current working directory.
df = pd.read_csv(
    filepath_or_buffer='dataset/california_housing_train.csv',
)

# 4. Loading a dataset using sklearn
from sklearn.datasets import load_iris

# load_iris() returns a bundle holding the feature matrix, the feature names
# and the class label of every sample.
iris = load_iris()

# Build the feature frame and append the class labels as a 'target' column.
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(
    target=iris.target
)

print("\nIris dataset loaded using sklearn (first 5 rows):")
print(iris_df.head())

# 5. Compute the mean, median, mode, variance and standard deviation of a dataset
# describe() gives a per-column summary of the whole frame in one call.
print("\nDescriptive Statistics for Iris dataset:")
print(iris_df.describe())

# The same kind of statistics, computed one by one for a single column.
sepal_length = iris_df['sepal length (cm)']

print("\nManual calculation for 'sepal length (cm)':")
print(f"Mean: {sepal_length.mean()}")
print(f"Median: {sepal_length.median()}")
# mode() returns a Series because several values can tie for most frequent.
print(f"Mode: {sepal_length.mode().tolist()}")
print(f"Variance: {sepal_length.var()}")
print(f"Standard Deviation: {sepal_length.std()}")

# 6. Demonstrate various data pre-processing techniques for a given dataset.
# Using the Iris dataset
# Inspect the frame's structure first, before any pre-processing step is applied.
print("\nOriginal Iris DataFrame info:")
# DataFrame.info() writes its report directly to stdout and returns None,
# so it is called on its own rather than wrapped in print().
iris_df.info()

# 7. Write a Python program to perform:
#     *   Reshaping the data
#     *   Filtering the data
#     *   Merging the data
#     *   Handling the missing values in datasets
#     *   Feature Normalization: Min-max normalization

# Reshaping: melt the two sepal measurements from wide to long form, keeping
# 'target' as the identifier column on every resulting row.
print("\nMelted Iris DataFrame (example):")
iris_melted = iris_df.melt(
    id_vars=['target'],
    value_vars=['sepal length (cm)', 'sepal width (cm)'],
)
print(iris_melted.head())

# Filtering: keep only the rows whose sepal length is strictly above 5.0.
print("\nFiltered Iris DataFrame (sepal length > 5.0):")
is_long_sepal = iris_df['sepal length (cm)'] > 5.0
filtered_df = iris_df.loc[is_long_sepal]
print(filtered_df.head())

# Merging: a small lookup table that maps each numeric class label to a name.
data2 = {
    'target': [0, 1, 2],
    'species_name': ['setosa', 'versicolor', 'virginica'],
}
species_df = pd.DataFrame(data2)

# Join on the shared 'target' column so every sample gains its species name.
print("\nMerging Iris DataFrame with Species names:")
merged_df = iris_df.merge(species_df, on='target')
print(merged_df.head())

# Handling missing values: work on a copy so iris_df itself is left untouched.
df_with_nan = iris_df.copy()
# Blank out rows 0-4 of the first column and rows 10-14 of the third column.
df_with_nan.iloc[:5, 0] = np.nan
df_with_nan.iloc[10:15, 2] = np.nan

print("\nDataFrame with introduced NaNs (head):")
print(df_with_nan.head(15))

# Strategy 1: impute each gap with the mean of the column it sits in.
column_means = df_with_nan.mean()
df_filled_mean = df_with_nan.fillna(column_means)
print("\nDataFrame with NaNs filled by mean (head):")
print(df_filled_mean.head(15))

# Strategy 2: discard every row that contains at least one NaN.
df_dropped_nan = df_with_nan.dropna(how='any')
print(f"\nOriginal rows: {len(df_with_nan)}, Rows after dropping NaNs: {len(df_dropped_nan)}")
print("DataFrame after dropping NaNs (head):")
print(df_dropped_nan.head())


# Feature Normalization: Min-max normalization
from sklearn.preprocessing import MinMaxScaler

print("\nApplying Min-Max Normalization:")

# Every column except the class label is treated as a feature to rescale.
cols_to_normalize = iris_df.columns.drop('target')

# Fit the scaler on the feature columns and rescale them in a single step.
scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(iris_df[cols_to_normalize])

# Store the rescaled values in a copy so iris_df keeps its original units.
iris_df_normalized = iris_df.copy()
iris_df_normalized[cols_to_normalize] = scaled_features

print("\nIris DataFrame after Min-Max Normalization (head):")
print(iris_df_normalized.head())

# Summarise only the rescaled feature columns to show their new range.
print("\nDescriptive Statistics after Normalization:")
print(iris_df_normalized[cols_to_normalize].describe())


DataFrame created from dictionary:
   col1  col2
0     1     5
1     2     6
2     3     7
3     4     8

Iris dataset loaded using sklearn (first 5 rows):
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

   target  
0       0  
1       0  
2       0  
3       0  
4       0  

Descriptive Statistics for Iris dataset:
       sepal length (cm)  sepal width (cm)  petal length (cm)  \
count         150.000000        150.000000         150.000000   
mean            5.843333          3.057333           3.758000   
std             0.828066          0.435866           1.7652