# Exploratory Data Analysis (EDA)

## Import Libraries

In [1]:
from collections import namedtuple
import gin
import logging
import sys

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


from mlops.data.data_loader import DataLoader

### Configuration

In [2]:
# Detach handlers left on the root logger by earlier runs; basicConfig() does
# nothing when the root logger already has handlers.
for stale_handler in list(logging.root.handlers):
    logging.root.removeHandler(stale_handler)

# Send INFO-and-above records to stdout as "<timestamp> | <LEVEL> : <message>"
logging.basicConfig(
    stream=sys.stdout,
    level=logging.INFO,
    format='%(asctime)s | %(levelname)s : %(message)s',
)

# Root logger shared by the remaining cells
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.info("Logger Configured")

2024-10-01 13:41:50,451 | INFO : Logger Configured


In [3]:
# Load the configuration using gin.
# Interactive mode is switched on before parsing; NOTE(review): presumably so the
# cell can be re-run in the same kernel without gin complaining - confirm.
gin.enter_interactive_mode()

# Minimal stand-in for parsed command-line arguments: only `config` is needed here.
Args = namedtuple('Args', ['config'])

# Forward slashes resolve on both Windows and POSIX. The previous non-raw
# backslash literal relied on invalid escape sequences (`\.`, `\m`, `\c`),
# which emit a warning on current Python versions, and only worked on Windows.
args = Args(config='../../mlops/config/config.gin')
gin.parse_config_file(args.config)

# Override the DataLoader `reload` parameter to False for this notebook
gin.bind_parameter('DataLoader.reload', False)

## Load Data

In [4]:
# Build the loader; no arguments are passed here, the gin configuration parsed
# earlier is expected to supply them.
data_loader = DataLoader()

# Load the amphibians data; the result is used as a DataFrame by the cells below.
# NOTE(review): the saved output of this cell ends with
# "NameError: name 'extracted_file_name' is not defined", logged during load().
# Until that is fixed in the loader, `amphibians_df` is never assigned and every
# later cell fails.
amphibians_df = data_loader.load()

2024-10-01 13:41:50,485 | INFO : Base directory set to: C:\Users\arman\PycharmProjects\MLOps
2024-10-01 13:41:50,487 | INFO : Resolved local_csv_path to: None
2024-10-01 13:41:50,488 | INFO : Resolved local_zip_path to: C:\Users\arman\PycharmProjects\MLOps\data\amphibians.zip
2024-10-01 13:41:50,488 | INFO : Resolved extract_dir to: C:\Users\arman\PycharmProjects\MLOps\data\extracted
2024-10-01 13:41:50,490 | INFO : Reload is set to False and zip file already exists. Using existing zip file: C:\Users\arman\PycharmProjects\MLOps\data\amphibians.zip
2024-10-01 13:41:50,494 | INFO : Files in the archive: ['dataset.csv']
2024-10-01 13:41:50,495 | INFO : The file 'amphibians.csv' already exists. Skipping rename.
2024-10-01 13:41:50,496 | ERROR : An error occurred: name 'extracted_file_name' is not defined


NameError: name 'extracted_file_name' is not defined

## Preliminary Data Analysis

In [None]:
# Preview the first rows of the loaded frame
amphibians_df.head()

In [None]:
# Print the column names, dtypes and non-null counts of the raw frame
amphibians_df.info()

## Initial Data Preprocessing

In [None]:
# Column groups, declared first so that the conversions below are driven by them
numerical_columns = ['ID', 'SR', 'NR', 'OR', 'RR', 'BR']
categorical_columns = ['MV', 'TR', 'VR', 'SUR1', 'SUR2', 'SUR3', 'UR', 'FR', 'MR', 'CR',
                       'Green frogs', 'Brown frogs', 'Common toad', 'Fire-bellied toad',
                       'Tree frog', 'Common newt', 'Great crested newt']

# Replace the headers positionally with short feature codes and label names.
# The assignment raises if the frame does not have exactly these 23 columns.
amphibians_df.columns = [
    'ID', 'MV', 'SR', 'NR', 'TR', 'VR', 'SUR1', 'SUR2', 'SUR3', 'UR', 'FR',
    'OR', 'RR', 'BR', 'MR', 'CR',
    'Green frogs', 'Brown frogs', 'Common toad', 'Fire-bellied toad',
    'Tree frog', 'Common newt', 'Great crested newt',
]

# Numeric features: entries that cannot be parsed become NaN (errors='coerce')
for numeric_col in numerical_columns:
    amphibians_df[numeric_col] = pd.to_numeric(amphibians_df[numeric_col], errors='coerce')

# The remaining columns are stored with the pandas 'category' dtype
for categorical_col in categorical_columns:
    amphibians_df[categorical_col] = amphibians_df[categorical_col].astype('category')

## Exploratory Data Analysis (EDA)

### Summary Statistics

In [None]:
# Summary statistics of the frame, using describe()'s default column selection
amphibians_df.describe()

### Missing Values

In [None]:
# Per-column null counts, keeping only the columns that actually have nulls
missing_values = (
    amphibians_df
    .isnull()
    .sum()
    .loc[lambda null_counts: null_counts > 0]
)

# Log a heading followed by the filtered counts
logger.info("\nColumns with Missing Values:")
logger.info(missing_values)

### Distribution of Numerical Columns

In [None]:
# One histogram with a KDE overlay per numerical column, on a 3x2 grid
plt.figure(figsize=(16, 12))
for slot, feature in enumerate(numerical_columns, start=1):
    ax = plt.subplot(3, 2, slot)
    sns.histplot(amphibians_df[feature], bins=20, kde=True, color='blue', ax=ax)
    ax.set_title(f'Distribution of {feature}')
plt.tight_layout()
plt.show()

### Correlation Heatmap

In [None]:
# Pairwise correlations between the numerical columns
correlation_matrix = amphibians_df[numerical_columns].corr()

# Annotated heatmap of the correlation coefficients
plt.figure(figsize=(10, 8))
heatmap_ax = sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    linewidths=0.5,
)
heatmap_ax.set_title('Correlation Heatmap of Numerical Features')
plt.show()

### Pair Plot for Numerical Columns

In [None]:
# Scatter matrix of the numerical columns with KDE curves on the diagonal
pair_grid = sns.pairplot(
    amphibians_df[numerical_columns],
    diag_kind='kde',
    plot_kws={'alpha': 0.7},
)
# y=1.02 places the title slightly above the top row of panels
pair_grid.figure.suptitle('Pair Plot of Selected Numerical Features', y=1.02)
plt.show()

### Count Plot for the Categorical MV Column

In [None]:
# Number of rows per 'MV' category.
# hue is set to the same column as x so the palette colours each bar;
# legend=False drops the legend that hue would otherwise add.
plt.figure(figsize=(10, 6))
mv_ax = sns.countplot(
    data=amphibians_df,
    x='MV',
    hue='MV',
    palette='viridis',
    legend=False,
)
mv_ax.set_title('Count Plot of Motorway (MV) Categories')
plt.xticks(rotation=45)
plt.show()

### Box Plot for Numerical vs. Categorical

In [None]:
# Box plot of a numerical feature (SR) for each category of MV.
# hue is set to the same column as x so the palette colours each box;
# legend=False drops the legend that hue would otherwise add.
plt.figure(figsize=(12, 6))
sns.boxplot(x='MV', y='SR', data=amphibians_df, hue='MV', palette='Set2', legend=False)
# SR is the surface area of the water reservoir in the UCI Amphibians data set,
# not a "speed rating"; the title is corrected so the figure is labelled accurately.
plt.title('Box Plot of Reservoir Surface Area (SR) vs. Motorway (MV)')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Count plots for the seven label columns, one panel each on a 4x2 grid
label_columns = (
    'Green frogs', 'Brown frogs', 'Common toad', 'Fire-bellied toad',
    'Tree frog', 'Common newt', 'Great crested newt',
)

plt.figure(figsize=(16, 10))
for slot, label in enumerate(label_columns, start=1):
    panel = plt.subplot(4, 2, slot)
    # hue mirrors x so each bar gets its own colour; the legend is suppressed
    sns.countplot(data=amphibians_df, x=label, hue=label, legend=False, ax=panel)
    panel.set_title(f'Count of {label}')
    panel.set_xlabel(label)
plt.tight_layout()
plt.show()
