In [7]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import logging
import json
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

# Configure logging
logging.basicConfig(
    filename='healthcare_data_validation.log',
    level=logging.INFO,
    format='%(asctime)s:%(levelname)s:%(message)s'
)

# Load the healthcare dataset
try:
    df = pd.read_csv('healthcare_data.csv')  # Ensure this file exists in your working directory
    logging.info("Healthcare dataset loaded successfully.")
except Exception as e:
    logging.error(f"Error loading healthcare dataset: {e}")
    raise

# Define validation rules
def validate_data(data):
    issues = {}

    # Rule 1: Check for missing values
    missing_values = data.isnull().sum()
    issues['missing_values'] = missing_values[missing_values > 0].to_dict()

    # Rule 2: Check for duplicate records
    duplicate_count = data.duplicated().sum()
    issues['duplicate_records'] = duplicate_count

    # Rule 3: Check for out-of-range values (example: age should be between 0 and 120)
    if 'age' in data.columns:
        out_of_range_age = data[(data['age'] < 0) | (data['age'] > 120)].shape[0]
        issues['out_of_range_age'] = out_of_range_age

    # Rule 4: Check for inconsistent categorical entries (example: gender should be 'Male' or 'Female')
    if 'gender' in data.columns:
        valid_genders = ['Male', 'Female']
        invalid_genders = data[~data['gender'].isin(valid_genders)].shape[0]
        issues['invalid_genders'] = invalid_genders

    return issues

# Perform data validation
validation_issues = validate_data(df)
logging.info(f"Data Validation Issues: {validation_issues}")

# Handle data quality issues
# Drop duplicate records
df = df.drop_duplicates()
logging.info("Duplicate records removed.")

# Handle missing values (example: drop rows with missing values)
df = df.dropna()
logging.info("Rows with missing values removed.")

# Remove out-of-range age values
if 'age' in df.columns:
    df = df[(df['age'] >= 0) & (df['age'] <= 120)]
    logging.info("Out-of-range age values removed.")

# Filter valid gender entries
if 'gender' in df.columns:
    df = df[df['gender'].isin(['Male', 'Female'])]
    logging.info("Invalid gender entries removed.")

# Encode categorical variables
if 'gender' in df.columns:
    df['gender'] = df['gender'].map({'Male': 0, 'Female': 1})

# Feature selection (example: selecting relevant features for prediction)
# Assume 'diagnosis', 'age', 'gender' are features and 'outcome' is the target variable
features = ['diagnosis', 'age', 'gender']
target = 'outcome'

# Ensure all required columns are present
for col in features + [target]:
    if col not in df.columns:
        logging.error(f"Required column '{col}' is missing from the dataset.")
        raise ValueError(f"Required column '{col}' is missing from the dataset.")

X = df[features]
y = df[target]

# Handle categorical variables in features
X = pd.get_dummies(X, columns=['diagnosis'], drop_first=True)

# Feature scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Save the scaler for future use
joblib.dump(scaler, 'scaler.pkl')

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

# Train a Random Forest classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Save the trained model for future use
joblib.dump(model, 'healthcare_prediction_model.pkl')

# Evaluate the model
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred, output_dict=True)
logging.info(f"Model Evaluation Report:\n{json.dumps(report, indent=2)}")

# Visualize feature importance
importances = model.feature_importances_
feature_names = X.columns
feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=feature_importance_df)
plt.title('Feature Importance')
plt.tight_layout()
plt.savefig('feature_importance.png')
plt.show()

ERROR:root:Error loading healthcare dataset: [Errno 2] No such file or directory: 'healthcare_data.csv'


FileNotFoundError: [Errno 2] No such file or directory: 'healthcare_data.csv'