## 1)Importing Libraries & Data

dataset Link :- https://www.kaggle.com/datasets/mirichoi0218/insurance


In [2]:
import pandas as pd
import numpy as np

url='https://raw.githubusercontent.com/aniruddha7599/DAIICT-/refs/heads/main/insurance.csv'

df=pd.read_csv(url)
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [3]:
df.dtypes

Unnamed: 0,0
age,int64
sex,object
bmi,float64
children,int64
smoker,object
region,object
charges,float64


In [4]:
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [5]:
df.isnull().sum()

Unnamed: 0,0
age,0
sex,0
bmi,0
children,0
smoker,0
region,0
charges,0


##Since there are no missing values no need to treat missing values

# 2)Outlier Detecting

In [6]:
def outlier_iqr(df):
    total_outliers = 0
    outlier_summary = {}
    numerical_columns = ['age', 'bmi', 'children', 'charges']
    # Iterate over each numeric column
    for col in numerical_columns:
        q25, q75 = np.quantile(df[col], 0.25), np.quantile(df[col], 0.75)
        iqr = q75 - q25
        cut_off = iqr * 1.5
        lower, upper = q25 - cut_off, q75 + cut_off

        # Find outliers below and above
        outliers_below = df[df[col] < lower].shape[0]
        outliers_above = df[df[col] > upper].shape[0]

        # Total outliers for this column
        total_outliers_for_col = outliers_below + outliers_above
        total_outliers += total_outliers_for_col

        # Store the outlier count for this column
        outlier_summary[col] = total_outliers_for_col

        # Print details for each column

    print(f'Total outliers across all columns: {total_outliers}')
    return outlier_summary
outlier_iqr(df)

Total outliers across all columns: 148


{'age': 0, 'bmi': 9, 'children': 0, 'charges': 139}

##Outlier Treatment using IQR

In [7]:
def remove_outliers_iqr(df):
    numerical_columns = ['age', 'bmi', 'children', 'charges']
    for col in numerical_columns:
        q25, q75 = np.quantile(df[col], 0.25), np.quantile(df[col], 0.75)
        iqr = q75 - q25
        cut_off = iqr * 1.5
        lower, upper = q25 - cut_off, q75 + cut_off

        # Remove rows with outliers
        df = df[(df[col] >= lower) & (df[col] <= upper)]

    return df

# Apply the function to remove outliers
df_cleaned = remove_outliers_iqr(df)
df_cleaned.shape

(1191, 7)

In [8]:
df_cleaned.dtypes

Unnamed: 0,0
age,int64
sex,object
bmi,float64
children,int64
smoker,object
region,object
charges,float64


In [9]:
outlier_iqr(df_cleaned)

Total outliers across all columns: 60


{'age': 0, 'bmi': 7, 'children': 0, 'charges': 53}

In [10]:
df_cleaned = remove_outliers_iqr(df_cleaned)
outlier_iqr(df_cleaned)

Total outliers across all columns: 20


{'age': 0, 'bmi': 0, 'children': 0, 'charges': 20}

In [11]:
df_cleaned = remove_outliers_iqr(df_cleaned)
outlier_iqr(df_cleaned)

Total outliers across all columns: 6


{'age': 0, 'bmi': 0, 'children': 0, 'charges': 6}

In [12]:
df_cleaned = remove_outliers_iqr(df_cleaned)
outlier_iqr(df_cleaned)

Total outliers across all columns: 0


{'age': 0, 'bmi': 0, 'children': 0, 'charges': 0}

In [13]:
# @title Numerical Feature Scaling using sklearn
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df_cleaned[['age', 'bmi', 'children']] = scaler.fit_transform(
    df_cleaned[['age', 'bmi', 'children']])


In [14]:
# @title Feature Selection using selectkbest and f_regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.metrics import mean_squared_error

# Assuming df_cleaned is your preprocessed DataFrame

# Define features (X) and target (y)
X = df_cleaned[['age', 'bmi', 'children']]  # Features
y = df_cleaned['charges']  # Target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize SelectKBest to select the top k features
k = 3  # Number of top features you want to select
selector = SelectKBest(score_func=f_regression, k=k)

# Fit the selector to the training data
X_train_selected = selector.fit_transform(X_train, y_train)

# Transform the test set to keep only the selected features
X_test_selected = selector.transform(X_test)

# Print the selected feature names
selected_features = X.columns[selector.get_support()]
print(f"Selected features: {selected_features.tolist()}")

# Initialize and fit the regression model
model = LinearRegression()
model.fit(X_train_selected, y_train)

# Make predictions
predictions = model.predict(X_test_selected)

# Evaluate the model
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')


Selected features: ['age', 'bmi', 'children']
Mean Squared Error: 20846598.562740713


In [15]:
# @title PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error
import pandas as pd

# Assuming df_cleaned is your preprocessed DataFrame

# Define features (X) and target (y)
X = df_cleaned[['age', 'bmi', 'children']]  # Features
y = df_cleaned['charges']  # Target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize PCA for feature reduction
n_components = 3  # Number of principal components you want to keep
pca = PCA(n_components=n_components)

# Fit PCA on the training data and transform both train and test sets
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Print the explained variance ratio for each component
print(f"Explained variance ratio by each component: {pca.explained_variance_ratio_}")

# Initialize and fit the regression model
model = LinearRegression()
model.fit(X_train_pca, y_train)

# Make predictions
predictions = model.predict(X_test_pca)

# Evaluate the model
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')


Explained variance ratio by each component: [0.37999397 0.33316148 0.28684455]
Mean Squared Error: 20846598.562740713


In [16]:
# @title RFE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.metrics import mean_squared_error

# Assuming df_cleaned is your preprocessed DataFrame

# Define features (X) and target (y)
X = df_cleaned[['age', 'bmi', 'children']]  # Features
y = df_cleaned['charges']  # Target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the regression model
model = LinearRegression()

# Initialize RFE to select the top k features
k = 3  # Number of top features you want to select
rfe = RFE(estimator=model, n_features_to_select=k)

# Fit RFE on the training data
X_train_rfe = rfe.fit_transform(X_train, y_train)

# Transform the test set to keep only the selected features
X_test_rfe = rfe.transform(X_test)

# Print the selected feature names
selected_features = X.columns[rfe.get_support()]
print(f"Selected features: {selected_features.tolist()}")

# Fit the regression model using the selected features
model.fit(X_train_rfe, y_train)

# Make predictions
predictions = model.predict(X_test_rfe)

# Evaluate the model
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')


Selected features: ['age', 'bmi', 'children']
Mean Squared Error: 20846598.562740713


In [17]:
# @title Feature Importance
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import mean_squared_error

# Assuming df_cleaned is your preprocessed DataFrame

# Define features (X) and target (y)
X = df_cleaned[['age', 'bmi', 'children']]  # Features
y = df_cleaned['charges']  # Target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Extra Trees Regressor
model = ExtraTreesRegressor(n_estimators=100, random_state=42)

# Fit the model to the training data
model.fit(X_train, y_train)

# Use SelectFromModel to select important features based on the fitted model
selector = SelectFromModel(model, threshold="mean")
X_train_selected = selector.fit_transform(X_train, y_train)
X_test_selected = selector.transform(X_test)

# Print the selected feature names
selected_features = X.columns[selector.get_support()]
print(f"Selected features: {selected_features.tolist()}")

# Fit the regression model using the selected features
model.fit(X_train_selected, y_train)

# Make predictions
predictions = model.predict(X_test_selected)

# Evaluate the model
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')


Selected features: ['age', 'bmi']
Mean Squared Error: 32634129.98330079
