# U.S. Medical Insurance Costs

In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Path of the insurance dataset (CSV), relative to the working directory
file_path = 'insurance.csv'

# Load the dataset into a DataFrame
df = pd.read_csv(file_path)

# Quick look at the data: first rows, schema, and summary statistics
print("Head of the DataFrame:")
print(df.head())

print("\nInfo about the DataFrame:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()

print("\nSummary statistics:")
print(df.describe())

# Bind each column to its own module-level name for the analysis below
age = df['age']
sex = df['sex']
bmi = df['bmi']
children = df['children']
smoker = df['smoker']
region = df['region']
charges = df['charges']

# Echo every column so its values and dtype can be eyeballed
print("Age:", age)
print("Sex:", sex)
print("BMI:", bmi)
print("Children:", children)
print("Smoker:", smoker)
print("Region:", region)
print("Charges:", charges)

def read_data(file_path):
    """Load the CSV at ``file_path`` and return it as a pandas DataFrame."""
    frame = pd.read_csv(file_path)
    return frame

def display_summary_statistics(df):
    """Return the ``describe()`` summary table for ``df``.

    Despite the name, nothing is printed here; the caller decides how to
    show the returned table.
    """
    stats_table = df.describe()
    return stats_table

def analyze_average_charges_by_smoker(df):
    """Return the mean of ``charges`` for each distinct ``smoker`` value.

    The result is a Series indexed by the values found in the ``smoker``
    column.
    """
    charges_by_status = df.groupby('smoker')['charges']
    return charges_by_status.mean()

def find_most_influential_features(df):
    """Rank numeric features by absolute correlation with ``charges``.

    Only numeric columns take part in the correlation (``DataFrame.corr``
    with its default method); non-numeric columns are ignored entirely, so
    they never appear in the ranking.

    Args:
        df: DataFrame containing a numeric ``charges`` column.

    Returns:
        List of numeric column names other than ``charges``, ordered from
        the strongest to the weakest absolute correlation with ``charges``.

    Raises:
        KeyError: if ``df`` has no numeric ``charges`` column.
    """
    numeric_frame = df.select_dtypes(include=['number'])
    charges_correlation = numeric_frame.corr()['charges']

    # Remove the target by label rather than by position: slicing off the
    # first sorted entry assumes 'charges' sorts first, which can fail when a
    # feature ties with it at |r| == 1 and would then drop a real feature.
    feature_strength = charges_correlation.drop('charges').abs()

    return list(feature_strength.sort_values(ascending=False).index)

def make_predictions(df):
    """Fit a linear regression of ``charges`` and return its predictions.

    The model uses the ``age``, ``bmi`` and ``children`` columns as inputs
    and is fitted on every row of ``df``; the returned predictions are for
    those same rows (no held-out split is made here).
    """
    feature_columns = ['age', 'bmi', 'children']
    features = df[feature_columns]
    target = df['charges']

    # fit() returns the estimator itself, so construction and training chain
    regressor = LinearRegression().fit(features, target)

    return regressor.predict(features)

def explore_bias(df):
    """Count rows per (``sex``, ``region``) pair as a sex-by-region table.

    Rows of the result are ``sex`` values and columns are ``region`` values;
    combinations that never occur in ``df`` are reported as 0.
    """
    group_sizes = df.groupby(['sex', 'region']).size()
    counts_table = group_sizes.unstack(fill_value=0)
    return counts_table

def calculate_cost(y_true, y_pred):
    """Return the mean squared error between ``y_true`` and ``y_pred``."""
    return mean_squared_error(y_true, y_pred)

# Run each analysis step on a freshly loaded copy of the dataset
insurance_data = read_data(file_path)

# Summary statistics table
summary_stats = display_summary_statistics(insurance_data)
print("\nSummary Statistics:", summary_stats, sep="\n")

# Mean charges split by smoker status
average_charges_by_smoker = analyze_average_charges_by_smoker(insurance_data)
print("\nAverage Charges by Smoker Status:", average_charges_by_smoker, sep="\n")

# Numeric features ranked by correlation with charges
influential_features = find_most_influential_features(insurance_data)
print("\nMost Influential Features:", influential_features, sep="\n")

# Linear-regression predictions for every row of the dataset
y_pred = make_predictions(insurance_data)
print("\nPredictions:", y_pred, sep="\n")

# Row counts per sex/region combination
bias_analysis = explore_bias(insurance_data)
print("\nBias Analysis:", bias_analysis, sep="\n")

# Mean squared error of the predictions against the actual charges
y_true = insurance_data['charges']
cost = calculate_cost(y_true, y_pred)
print("\nCost (Mean Squared Error):", cost, sep="\n")


Head of the DataFrame:
   age     sex     bmi  children smoker     region      charges
0   19  female  27.900         0    yes  southwest  16884.92400
1   18    male  33.770         1     no  southeast   1725.55230
2   28    male  33.000         3     no  southeast   4449.46200
3   33    male  22.705         0     no  northwest  21984.47061
4   32    male  28.880         0     no  northwest   3866.85520

Info about the DataFrame:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB
None

Summary statistics:
               age  