# Data Loading

In [None]:
# Import necessary libraries
import pandas as pd

# Load the dataset from the provided link
dataset_url = "https://github.com/dsrscientist/dataset4/raw/main/medical_cost_insurance.csv"
insurance_data = pd.read_csv(dataset_url)

# Display the first few rows of the dataset to understand its structure
print(insurance_data.head())

# Get information about the dataset, including data types and non-null counts.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
insurance_data.info()

# Summary statistics for numerical columns
print(insurance_data.describe())

# Count of missing values per column
print(insurance_data.isnull().sum())


# Exploratory Data Analysis (EDA)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Set the style for plots
sns.set(style="whitegrid")

# Distribution of charges
plt.figure(figsize=(12, 6))
sns.histplot(insurance_data['charges'], bins=30, kde=True, color='skyblue')
plt.title('Distribution of Insurance Charges')
plt.xlabel('Charges')
plt.ylabel('Frequency')
plt.show()

# Boxplot of charges by smoker status
plt.figure(figsize=(12, 6))
sns.boxplot(x='smoker', y='charges', data=insurance_data, palette='coolwarm')
plt.title('Insurance Charges by Smoker Status')
plt.xlabel('Smoker')
plt.ylabel('Charges')
plt.show()

# Pairplot to visualize relationships between numerical features
sns.pairplot(insurance_data[['age', 'bmi', 'children', 'charges']])
plt.suptitle('Pairplot of Numerical Features', y=1.02)
plt.show()

# Correlation heatmap.
# The frame still contains the un-encoded categorical columns at this point
# (they are one-hot encoded later, during preprocessing). DataFrame.corr()
# raises on non-numeric columns in pandas >= 2.0, so the correlation is
# computed on the numeric columns only.
numeric_columns = insurance_data.select_dtypes(include='number')
plt.figure(figsize=(10, 8))
sns.heatmap(numeric_columns.corr(), annot=True, cmap='coolwarm', linewidths=.5)
plt.title('Correlation Heatmap')
plt.show()


# Data Preprocessing

In [None]:
from sklearn.preprocessing import StandardScaler

# Handle missing values (if any)
# Check for missing values
print("Missing values before handling:")
print(insurance_data.isnull().sum())

# Fill missing values in 'bmi' with the column mean (median or other
# strategies are equally possible).
# The result is assigned back rather than calling fillna(inplace=True) on the
# column selection: that chained in-place form is deprecated in pandas 2.x and
# has no effect on the parent frame once copy-on-write is enabled.
insurance_data['bmi'] = insurance_data['bmi'].fillna(insurance_data['bmi'].mean())

# Verify that missing values have been handled
print("\nMissing values after handling:")
print(insurance_data.isnull().sum())

# Encode categorical variables using one-hot encoding.
# drop_first=True drops one level per variable to avoid redundant columns.
insurance_data = pd.get_dummies(insurance_data, columns=['sex', 'smoker', 'region'], drop_first=True)

# Scale numerical features (if necessary)
# Select numerical features to scale
numerical_features = ['age', 'bmi', 'children']

# Create a StandardScaler object
scaler = StandardScaler()

# Fit and transform the selected numerical features.
# NOTE(review): the scaler is fitted on the full dataset, before the
# train/test split, so test-set statistics influence the scaling. Consider
# fitting on the training rows only.
insurance_data[numerical_features] = scaler.fit_transform(insurance_data[numerical_features])

# Display the preprocessed dataset
print("\nPreprocessed dataset:")
print(insurance_data.head())


# Feature Engineering

In [None]:
# Feature Engineering: Create new features or transformations

# np.log1p is used below, but no earlier cell imports numpy; without this
# import the cell fails with NameError.
import numpy as np

# Interaction term between age and smoker status
# ('age' has already been standardized and 'smoker_yes' is the one-hot flag).
insurance_data['age_smoker_interaction'] = insurance_data['age'] * insurance_data['smoker_yes']

# Square of the (standardized) BMI
insurance_data['bmi_squared'] = insurance_data['bmi'] ** 2

# Log-transform of charges (if appropriate, considering it's a regression problem).
# This column is derived from the target itself: it is an alternative target,
# not a predictor, and must be excluded from the feature matrix.
insurance_data['log_charges'] = np.log1p(insurance_data['charges'])

# Display the dataset after feature engineering
print("Dataset after feature engineering:")
print(insurance_data.head())


# Data Splitting

In [None]:
from sklearn.model_selection import train_test_split

# Define features (X) and target variable (y).
# 'log_charges' is log1p of the target, so leaving it in X would leak the
# answer into the features and make every model look near-perfect. Both the
# target and its transform are therefore removed from the feature matrix.
# errors='ignore' keeps the cell usable when 'log_charges' was never created.
target_derived_columns = ['charges', 'log_charges']
X = insurance_data.drop(columns=target_derived_columns, errors='ignore')
y = insurance_data['charges']

# Split the data into training and testing sets (80% training, 20% testing);
# random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the shapes of the training and testing sets
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)


# Model Selection

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Instantiate the three candidate regressors; the tree-based ones are seeded
# so their results are reproducible.
linear_reg_model = LinearRegression()
decision_tree_model = DecisionTreeRegressor(random_state=42)
random_forest_model = RandomForestRegressor(random_state=42)

# Fit every candidate on the training split
linear_reg_model.fit(X_train, y_train)
decision_tree_model.fit(X_train, y_train)
random_forest_model.fit(X_train, y_train)

# Predict on the held-out split with each fitted model
linear_reg_predictions = linear_reg_model.predict(X_test)
decision_tree_predictions = decision_tree_model.predict(X_test)
random_forest_predictions = random_forest_model.predict(X_test)

# Evaluate models
def evaluate_model(predictions, y_true):
    """Print MAE, MSE and R-squared of ``predictions`` against ``y_true``.

    Each metric is printed on its own line with two decimals. Nothing is
    returned.
    """
    mae = mean_absolute_error(y_true, predictions)
    mse = mean_squared_error(y_true, predictions)
    r2 = r2_score(y_true, predictions)

    # Assemble the report first, then emit it one metric per line.
    report_lines = (
        f'Mean Absolute Error: {mae:.2f}',
        f'Mean Squared Error: {mse:.2f}',
        f'R-squared: {r2:.2f}',
    )
    print(*report_lines, sep='\n')

# Report the held-out metrics of each candidate, one titled section per model.
print("Linear Regression Model:")
evaluate_model(predictions=linear_reg_predictions, y_true=y_test)

print("\nDecision Tree Model:")
evaluate_model(predictions=decision_tree_predictions, y_true=y_test)

print("\nRandom Forest Model:")
evaluate_model(predictions=random_forest_predictions, y_true=y_test)


# Model Training

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# Train each regressor on the training split. fit() returns the fitted
# estimator, so construction and training are written as one expression.

# Linear Regression
linear_reg_model = LinearRegression().fit(X_train, y_train)

# Decision Tree (seeded for reproducibility)
decision_tree_model = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)

# Random Forest (seeded for reproducibility)
random_forest_model = RandomForestRegressor(random_state=42).fit(X_train, y_train)


# Model Evaluation

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Function to evaluate a model
def evaluate_model(model, X_test, y_test):
    """Predict on ``X_test`` with ``model`` and print its error metrics.

    Prints the mean absolute error, mean squared error and R-squared of the
    predictions against ``y_test``, each with two decimals. Nothing is
    returned.
    """
    predictions = model.predict(X_test)

    # Score the same predictions with each metric, in report order.
    mae, mse, r2 = (
        metric(y_test, predictions)
        for metric in (mean_absolute_error, mean_squared_error, r2_score)
    )

    print(f'Mean Absolute Error: {mae:.2f}')
    print(f'Mean Squared Error: {mse:.2f}')
    print(f'R-squared: {r2:.2f}')

# Score every trained model on the held-out split, one titled section each.

# Linear Regression
print("Linear Regression Model Evaluation:")
evaluate_model(model=linear_reg_model, X_test=X_test, y_test=y_test)

# Decision Tree
print("\nDecision Tree Model Evaluation:")
evaluate_model(model=decision_tree_model, X_test=X_test, y_test=y_test)

# Random Forest
print("\nRandom Forest Model Evaluation:")
evaluate_model(model=random_forest_model, X_test=X_test, y_test=y_test)


# Prediction

In [None]:
# Predict with the trained Random Forest model.
# 'new_data' must be a DataFrame with the same feature columns the model was
# trained on, i.e. already encoded, scaled and feature-engineered like X_train.

# No earlier cell defines 'new_data', so a few held-out rows are used as a
# runnable example instead of failing with NameError. Define 'new_data' before
# running this cell to score real data.
if 'new_data' not in globals():
    new_data = X_test.head()

# Fail early with a clear message when a training feature is absent.
missing_features = [column for column in X_train.columns if column not in new_data.columns]
if missing_features:
    raise ValueError(f"new_data is missing feature columns: {missing_features}")

# Select the training columns in training order so the input layout matches
# what the model was fitted on (extra columns are ignored).
new_data_predictions = random_forest_model.predict(new_data[X_train.columns])

# Display the predictions
print("Predictions for New Data:")
print(new_data_predictions)
