<a href="https://colab.research.google.com/github/NEPatriots-Coder/Colab-Notebooks/blob/main/Copy_of_InsuranceAssesment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive so files stored there (the insurance CSV
# read later in this notebook lives under /content/drive/MyDrive) are readable.
# The google.colab package is provided by the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# Prepare the Kaggle Medical Cost Personal Dataset for risk analysis
# Dataset columns: age, sex, bmi, children, smoker, region, charges

def prepared_data(insurance_data, quantile=0.75):
  """Return a copy of the insurance data with a binary ``risk_level`` column.

  A row is labelled high risk (1) when its ``charges`` value is strictly
  greater than the given quantile of the ``charges`` column, and low risk (0)
  otherwise. The input frame is not modified.

  Args:
    insurance_data: DataFrame containing a numeric ``charges`` column.
    quantile: Quantile of ``charges`` used as the high-risk cut-off.
      Defaults to 0.75, so roughly the top quarter of charges is high risk.

  Returns:
    A new DataFrame with every original column plus ``risk_level``.

  Raises:
    KeyError: If ``insurance_data`` has no ``charges`` column.
  """
  if 'charges' not in insurance_data.columns:
    raise KeyError("insurance_data must contain a 'charges' column")

  df = insurance_data.copy()

  charge_threshold = df['charges'].quantile(quantile)
  df['risk_level'] = (df['charges'] > charge_threshold).astype(int)

  return df


# Alias: expose the verb-form name as well, so code that calls either
# ``prepare_data`` or ``prepared_data`` works on a freshly restarted kernel.
prepare_data = prepared_data




In [None]:
class EnhancedInsuranceRiskScorer:
    """Random-forest scorer that labels insurance records as high or low risk.

    Input frames must provide the columns ``age``, ``sex``, ``bmi``,
    ``children``, ``smoker`` and ``region``; training additionally needs a
    binary ``risk_level`` target column.
    """

    # Integer encodings for the two binary categorical columns.
    SEX_CODES = {'male': 0, 'female': 1}
    SMOKER_CODES = {'no': 0, 'yes': 1}

    def __init__(self, n_estimators=100, random_state=42):
        """Create an untrained scorer.

        Args:
            n_estimators: Number of trees in the random forest.
            random_state: Seed shared by the forest and the train/test split.
        """
        self.random_state = random_state
        self.model = RandomForestClassifier(n_estimators=n_estimators,
                                            random_state=random_state)
        self.scaler = StandardScaler()
        # Ordered feature names, captured on the first preprocessing pass.
        self.feature_columns = None

    @staticmethod
    def _encode_binary(values, codes, column):
        """Map string labels to integer codes, rejecting unknown labels."""
        encoded = values.map(codes)
        # A non-null input that maps to NaN is a label we do not know about;
        # fail loudly instead of feeding NaN into the model.
        unknown = values[encoded.isna() & values.notna()]
        if not unknown.empty:
            found = sorted(set(map(str, unknown)))
            raise ValueError(
                f"Unrecognised {column!r} value(s) {found}; "
                f"expected one of {sorted(codes)}")
        return encoded

    def preprocess_data(self, data):
        """Encode and engineer features for risk scoring.

        Works on a copy, so the caller's frame is never modified and the same
        frame can safely be preprocessed more than once.

        Args:
            data: DataFrame with ``age``, ``sex``, ``bmi``, ``smoker`` and
                ``region`` columns (other columns pass through untouched).

        Returns:
            A new DataFrame with ``sex``/``smoker`` encoded as integers and
            one-hot columns for age group, BMI category and region.

        Raises:
            ValueError: If ``sex`` or ``smoker`` holds an unrecognised label.
        """
        data = data.copy()

        # Convert categorical variables to numeric
        data['sex'] = self._encode_binary(data['sex'], self.SEX_CODES, 'sex')
        data['smoker'] = self._encode_binary(data['smoker'], self.SMOKER_CODES,
                                             'smoker')

        # Create age groups and BMI categories for risk factors
        data['age_group'] = pd.cut(data['age'], bins=[0, 25, 35, 50, 65, 100],
                                   labels=['0-25', '26-35', '36-50', '51-65', '65+'])
        data['bmi_category'] = pd.cut(data['bmi'], bins=[0, 18.5, 25, 30, 100],
                                      labels=['underweight', 'normal', 'overweight', 'obese'])

        # One-hot encode categorical variables
        data = pd.get_dummies(data, columns=['age_group', 'bmi_category', 'region'])

        if self.feature_columns is None:
            # First pass: remember which columns are model inputs. The raw
            # charges and the target derived from them are never features.
            self.feature_columns = [col for col in data.columns
                                    if col not in ['charges', 'risk_level']]

        return data

    def train(self, data, test_size=0.2):
        """Train the risk scoring model and report its accuracy.

        Args:
            data: Labelled DataFrame including a ``risk_level`` column.
            test_size: Fraction of rows held out for evaluation.

        Returns:
            Accuracy on the held-out test split.
        """
        processed_data = self.preprocess_data(data)

        features = processed_data[self.feature_columns]
        target = processed_data['risk_level']

        # Split BEFORE scaling so the scaler's statistics come from the
        # training rows only; fitting it on all rows leaks test information.
        X_train, X_test, y_train, y_test = train_test_split(
            features, target, test_size=test_size,
            random_state=self.random_state)

        X_train_scaled = self.scaler.fit_transform(X_train)
        X_test_scaled = self.scaler.transform(X_test)

        self.model.fit(X_train_scaled, y_train)

        # Calculate and print model performance metrics
        train_accuracy = self.model.score(X_train_scaled, y_train)
        test_accuracy = self.model.score(X_test_scaled, y_test)

        print("\nModel Performance Metrics:")
        print(f"Training Accuracy: {train_accuracy:.2f}")
        print(f"Testing Accuracy: {test_accuracy:.2f}")

        # Print feature importance analysis
        self.analyze_feature_importance()

        return test_accuracy

    def predict_risk(self, new_data):
        """Predict risk levels for new insurance applications.

        Args:
            new_data: DataFrame with the same raw columns used for training
                (``charges`` and ``risk_level`` are not required).

        Returns:
            Tuple ``(risk_predictions, risk_probabilities)`` as returned by
            the model's ``predict`` and ``predict_proba``.

        Raises:
            sklearn.exceptions.NotFittedError: If ``train`` has not been run.
        """
        # Check up front: preprocessing an unseen frame on an untrained scorer
        # would otherwise record the wrong feature columns before failing.
        from sklearn.utils.validation import check_is_fitted
        check_is_fitted(self.model)

        processed_data = self.preprocess_data(new_data)

        # Select and order features; one-hot columns absent from this batch
        # (e.g. a region nobody in it lives in) are filled with 0.
        features = processed_data.reindex(columns=self.feature_columns,
                                          fill_value=0)

        # Scale features and predict
        X_scaled = self.scaler.transform(features)
        risk_predictions = self.model.predict(X_scaled)
        risk_probabilities = self.model.predict_proba(X_scaled)

        return risk_predictions, risk_probabilities

    def analyze_feature_importance(self, top_n=10):
        """Print and return the model's feature importances.

        Args:
            top_n: How many of the most important features to print.

        Returns:
            DataFrame with ``feature`` and ``importance`` columns covering
            every feature, sorted by descending importance.
        """
        feature_importance = pd.DataFrame({
            'feature': self.feature_columns,
            'importance': self.model.feature_importances_
        })
        feature_importance = feature_importance.sort_values('importance',
                                                            ascending=False)

        print(f"\nTop {top_n} Most Important Risk Factors:")
        print(feature_importance.head(top_n))

        return feature_importance

In [None]:
def main(data_path="/content/drive/MyDrive/DataCSV's/insurance.csv"):
    """Train the risk scorer on the insurance CSV and score two sample applicants.

    Args:
        data_path: Location of the insurance CSV. Defaults to the Google Drive
            path this notebook was written against.
    """
    insurance_data = pd.read_csv(data_path)

    # Label every record with a risk level. The helper in this notebook is
    # named ``prepared_data``; the result gets a distinct local name so the
    # function itself is not shadowed inside main().
    labelled_data = prepared_data(insurance_data)

    # Initialize and train the model
    risk_scorer = EnhancedInsuranceRiskScorer()
    risk_scorer.train(labelled_data)

    # Example: Make predictions for new applications.
    # No 'charges' column: it is unknown for new applicants and the scorer
    # excludes it from its feature columns.
    new_applications = pd.DataFrame({
        'age': [30, 50],
        'sex': ['female', 'male'],
        'bmi': [23.5, 29.8],
        'children': [1, 2],
        'smoker': ['no', 'yes'],
        'region': ['southwest', 'northeast'],
    })

    predictions, probabilities = risk_scorer.predict_risk(new_applications)

    print("\nRisk Predictions for New Applications:")
    for number, (pred, prob) in enumerate(zip(predictions, probabilities), start=1):
        risk_level = "High Risk" if pred == 1 else "Low Risk"
        risk_probability = prob[1]  # Probability of being high risk
        print(f"\nApplication {number}:")
        print(f"Risk Level: {risk_level}")
        print(f"Probability of High Risk: {risk_probability:.2%}")

if __name__ == "__main__":
    main()




Model Performance Metrics:
Training Accuracy: 1.00
Testing Accuracy: 0.93

Top 10 Most Important Risk Factors:
               feature  importance
4               smoker    0.567449
2                  bmi    0.148945
0                  age    0.111245
3             children    0.048266
1                  sex    0.021038
16    region_southeast    0.012725
14    region_northeast    0.012556
15    region_northwest    0.009838
13  bmi_category_obese    0.009343
5       age_group_0-25    0.009215

Risk Predictions for New Applications:

Application 1:
Risk Level: Low Risk
Probability of High Risk: 2.00%

Application 2:
Risk Level: High Risk
Probability of High Risk: 98.00%
