In [4]:
# Notebook: ACIS Insurance Analytics - Data Preprocessing
# File: notebooks/01_eda.ipynb

# --- CELL 1: Setup and Imports ---
print("Setting up environment...")

import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
pd.set_option('display.max_rows', 50)

# Get the current directory
current_dir = os.getcwd()
print(f"Current working directory: {current_dir}")

# Check directory structure
print("\nChecking directory structure:")
for item in os.listdir('.'):
    if os.path.isdir(item):
        print(f"  üìÅ {item}/")
    else:
        print(f"  üìÑ {item}")

# Add src directory to Python path
src_path = os.path.join(current_dir, 'src')
if os.path.exists(src_path):
    sys.path.insert(0, src_path)
    print(f"\n‚úì Added src directory to Python path: {src_path}")
else:
    # Check if we're in notebooks directory
    parent_dir = os.path.dirname(current_dir)
    src_path = os.path.join(parent_dir, 'src')
    if os.path.exists(src_path):
        sys.path.insert(0, src_path)
        print(f"\n‚úì Added src directory to Python path: {src_path}")
    else:
        print(f"\n‚úó src directory not found. Creating it...")
        os.makedirs('src', exist_ok=True)
        sys.path.insert(0, os.path.abspath('src'))
        print("‚úì Created and added src directory")

print("‚úì Environment setup complete")

Setting up environment...
Current working directory: d:\Python\Week-3\Insurance-Analytics-Week-3-\notebooks

Checking directory structure:
  üìÑ eda_analysis.ipynb
  üìÑ statistical_analysis.ipynb

‚úì Added src directory to Python path: d:\Python\Week-3\Insurance-Analytics-Week-3-\src
‚úì Environment setup complete


In [5]:
# --- CELL 2: Create the data_preprocessing.py file directly in the notebook ---
# If the module file doesn't exist, we'll create it

src_dir = 'src'
if not os.path.exists(src_dir):
    os.makedirs(src_dir)

data_preprocessing_code = '''
"""
Data Preprocessing Module for ACIS Insurance Analytics
"""

import pandas as pd
import numpy as np
from datetime import datetime
import os
import warnings
warnings.filterwarnings('ignore')

class InsuranceDataPreprocessor:
    def __init__(self, file_path=None):
        self.file_path = file_path
        self.df = None
        self.raw_df = None
        self.metadata = {}
        self.categorical_cols = []
        self.numerical_cols = []
        
    def load_data(self, file_path=None):
        if file_path:
            self.file_path = file_path
            
        if self.file_path is None:
            self.file_path = 'data/raw/MachineLearningRating_v3.txt'
            
        print(f"Loading data from: {self.file_path}")
        
        try:
            if not os.path.exists(self.file_path):
                print(f"File not found. Checking alternative locations...")
                # Try to find the file
                possible_locations = [
                    self.file_path,
                    os.path.join('..', self.file_path),
                    os.path.join('../..', self.file_path),
                    'MachineLearningRating_v3.txt',
                    os.path.join('data', 'raw', 'MachineLearningRating_v3.txt')
                ]
                
                for location in possible_locations:
                    if os.path.exists(location):
                        self.file_path = location
                        print(f"Found file at: {location}")
                        break
                else:
                    raise FileNotFoundError("Could not find data file")
            
            # Try to read the file
            try:
                self.raw_df = pd.read_csv(self.file_path, sep='\\t', encoding='utf-8')
            except:
                try:
                    self.raw_df = pd.read_csv(self.file_path, sep=',', encoding='utf-8')
                except:
                    self.raw_df = pd.read_csv(self.file_path, encoding='utf-8')
            
            self.df = self.raw_df.copy()
            print(f"‚úì Data loaded: {self.df.shape[0]} rows, {self.df.shape[1]} columns")
            return self.df
            
        except Exception as e:
            print(f"Error loading data: {e}")
            return None
    
    def get_data(self):
        return self.df
    
    def get_summary(self):
        if self.df is None:
            return "No data loaded"
        
        summary = {
            'shape': self.df.shape,
            'columns': list(self.df.columns),
            'dtypes': dict(self.df.dtypes),
            'missing_values': self.df.isnull().sum().sum(),
            'memory_usage_mb': self.df.memory_usage(deep=True).sum() / 1024**2
        }
        return summary
'''

# Write the module file
module_path = os.path.join(src_dir, 'data_preprocessing.py')
with open(module_path, 'w') as f:
    f.write(data_preprocessing_code)

print(f"‚úì Created data_preprocessing.py at: {module_path}")

UnicodeEncodeError: 'charmap' codec can't encode character '\u2713' in position 2062: character maps to <undefined>

In [None]:
# --- CELL 3: Import and test the module ---
print("Testing the module import...")

try:
    from data_preprocessing import InsuranceDataPreprocessor
    print("‚úì Successfully imported InsuranceDataPreprocessor")
    
    # Test creating an instance
    preprocessor = InsuranceDataPreprocessor()
    print("‚úì Created preprocessor instance")
    
except Exception as e:
    print(f"‚úó Error importing module: {e}")
    print("\nTrying alternative import method...")
    
    # Alternative: import directly from file
    import importlib.util
    spec = importlib.util.spec_from_file_location("data_preprocessing", module_path)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    InsuranceDataPreprocessor = module.InsuranceDataPreprocessor
    preprocessor = InsuranceDataPreprocessor()
    print("‚úì Imported module directly from file")

In [None]:
# --- CELL 4: Load and explore the data ---
print("Loading data...")

# First, let's check if we can find the data file
import glob

# Search for the data file
data_files = glob.glob('**/MachineLearningRating*.txt', recursive=True)
data_files += glob.glob('**/MachineLearningRating*.csv', recursive=True)

if data_files:
    print(f"Found data files: {data_files}")
    data_path = data_files[0]
else:
    print("Could not find data file. Please update the path.")
    # You'll need to update this path based on where your data is
    data_path = '../data/raw/MachineLearningRating_v3.txt'

# Load the data
df = preprocessor.load_data(data_path)

if df is not None:
    print("\nData loaded successfully!")
    print(f"Shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")
    
    # Display first few rows
    print("\nFirst 5 rows:")
    display(df.head())
    
    # Display data types
    print("\nData types:")
    print(df.dtypes)
else:
    print("Failed to load data. Please check the file path.")