<a href="https://colab.research.google.com/github/Manshrishevde/linear-regression/blob/main/Linear_Regression_for_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the insurance dataset into a DataFrame.
# The path points at a file uploaded to the Colab runtime under /content;
# change it to wherever the CSV lives when running elsewhere.
data = pd.read_csv('/content/new_insurance_data (1).csv')


In [None]:
# Preview the first five rows to sanity-check the columns and their values.
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,Claim_Amount,past_consultations,num_of_steps,Hospital_expenditure,NUmber_of_past_hospitalizations,Anual_Salary,region,charges
0,18.0,male,23.21,0.0,no,29087.54313,17.0,715428.0,4720920.992,0.0,55784970.05,southeast,1121.8739
1,18.0,male,30.14,0.0,no,39053.67437,7.0,699157.0,4329831.676,0.0,13700885.19,southeast,1131.5066
2,18.0,male,33.33,0.0,no,39023.62759,19.0,702341.0,6884860.774,0.0,73523107.27,southeast,1135.9407
3,18.0,male,33.66,0.0,no,28185.39332,11.0,700250.0,4274773.55,0.0,75819679.6,southeast,1136.3994
4,18.0,male,34.1,0.0,no,14697.85941,16.0,711584.0,3787293.921,0.0,23012320.01,southeast,1137.011


In [None]:
# (rows, columns) of the raw frame, before any cleaning.
data.shape

(1338, 13)

In [None]:
# Total number of cells in the frame (rows * columns).
data.size

17394

In [None]:
#missing values
# Text columns are filled with their most frequent value, every other column
# with its mean. Note that this runs over all columns, including the target.
for col_name in data.columns:
    series = data[col_name]
    fill_value = series.mode()[0] if series.dtypes == "object" else series.mean()
    data[col_name] = series.fillna(fill_value)

In [None]:
# Remove outliers
def remove_outliers(column, df=None):
    """Return the rows whose ``column`` value lies strictly inside the 1.5*IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``, computed from the
    frame being filtered. Both comparisons are strict, so values exactly on a
    fence are dropped, as are missing values (comparisons with NaN are False).

    Parameters
    ----------
    column : str
        Name of the numeric column to screen.
    df : pandas.DataFrame, optional
        Frame to filter. When omitted, the notebook-level ``data`` frame is
        used, which keeps existing ``remove_outliers(column)`` calls working.

    Returns
    -------
    pandas.DataFrame
        The rows that pass the filter; the input frame is not modified.
    """
    # Resolve the frame at call time so the default always tracks the current `data`.
    frame = data if df is None else df
    values = frame[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    return frame[(values > lower_bound) & (values < upper_bound)]

# Columns screened for outliers, in the order the filters are applied.
columns_to_check = ['bmi', 'Hospital_expenditure', 'Anual_Salary', 'charges']
# Each pass rebinds `data` to the filtered frame, so the next column's quartiles
# are computed on rows that survived the previous filters. Re-running this cell
# filters the already-filtered frame again and can remove additional rows.
for column in columns_to_check:
    data = remove_outliers(column)

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Candidate predictors: every non-text column except the target.
col_list = [
    name for name in data.columns
    if data[name].dtypes != "object" and name != "charges"
]

x = data[col_list]
# NOTE(review): `x` is passed without a constant column; confirm against the
# statsmodels docs whether VIFs computed without an intercept are intended.
vif_data = pd.DataFrame({
    'features': x.columns,
    'VIF': [variance_inflation_factor(x.values, idx) for idx in range(x.shape[1])],
})
print(vif_data)

                          features        VIF
0                              age  15.452095
1                              bmi  26.330788
2                         children   2.029618
3                     Claim_Amount   5.678660
4               past_consultations   6.258017
5                     num_of_steps  61.574692
6             Hospital_expenditure   5.204376
7  NUmber_of_past_hospitalizations  12.052060
8                     Anual_Salary   5.481823


In [None]:
# Drop 'bmi' from the working frame (rebinding instead of mutating in place).
data = data.drop(columns=["bmi"])

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Recompute VIFs on the remaining non-text columns, excluding the target.
col_list = [
    name for name in data.columns
    if data[name].dtypes != "object" and name != "charges"
]

x = data[col_list]
# NOTE(review): `x` is passed without a constant column; confirm against the
# statsmodels docs whether VIFs computed without an intercept are intended.
vif_data = pd.DataFrame({
    'features': x.columns,
    'VIF': [variance_inflation_factor(x.values, idx) for idx in range(x.shape[1])],
})
print(vif_data)

                          features        VIF
0                              age  15.353726
1                         children   2.029618
2                     Claim_Amount   5.678659
3               past_consultations   6.256382
4                     num_of_steps  28.672704
5             Hospital_expenditure   5.202946
6  NUmber_of_past_hospitalizations  11.660111
7                     Anual_Salary   5.158081


In [None]:
# Drop 'num_of_steps' from the working frame (rebinding instead of mutating in place).
data = data.drop(columns=["num_of_steps"])

In [None]:
# Drop 'age' from the working frame (rebinding instead of mutating in place).
data = data.drop(columns=["age"])

In [None]:
# Drop 'NUmber_of_past_hospitalizations' from the working frame
# (rebinding instead of mutating in place).
data = data.drop(columns=["NUmber_of_past_hospitalizations"])

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Final VIF check on the non-text columns that are left, excluding the target.
col_list = [
    name for name in data.columns
    if data[name].dtypes != "object" and name != "charges"
]

x = data[col_list]
# NOTE(review): `x` is passed without a constant column; confirm against the
# statsmodels docs whether VIFs computed without an intercept are intended.
vif_data = pd.DataFrame({
    'features': x.columns,
    'VIF': [variance_inflation_factor(x.values, idx) for idx in range(x.shape[1])],
})
print(vif_data)

               features       VIF
0              children  1.713101
1          Claim_Amount  4.152224
2    past_consultations  4.670212
3  Hospital_expenditure  4.320876
4          Anual_Salary  4.279483


In [None]:
# Columns remaining in the working frame at this point in the notebook.
data.columns

Index(['sex', 'children', 'smoker', 'Claim_Amount', 'past_consultations',
       'Hospital_expenditure', 'Anual_Salary', 'region', 'charges'],
      dtype='object')

In [None]:
# Define X (features) and y (target).
# Only the target is removed from X. The text columns ('sex', 'smoker',
# 'region') stay in so that the object-dtype column list derived from X is
# non-empty and the one-hot branch of the ColumnTransformer actually encodes
# them; dropping them here left that branch with no columns to work on.
X = data.drop(columns=['charges'])
y = data['charges']

In [None]:
# Partition X's columns by dtype: object columns go to the categorical branch,
# everything else to the numerical branch.
categorical_cols = X.select_dtypes(include="object").columns
numerical_cols = X.select_dtypes(exclude="object").columns

In [None]:
#Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score,mean_absolute_percentage_error

In [None]:
# Numerical branch: a single StandardScaler step named 'scaler'.
numerical_pipeline = Pipeline([('scaler', StandardScaler())])

In [None]:
# Categorical branch: one-hot encode; handle_unknown='ignore' is set so that
# categories not seen during fit do not raise at transform time.
categorical_pipeline = Pipeline([('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [None]:
# Route each column group through its own sub-pipeline.
preprocessor = ColumnTransformer([
    ('num', numerical_pipeline, numerical_cols),
    ('cat', categorical_pipeline, categorical_cols),
])

In [None]:
# Full model: preprocessing followed by an ordinary least-squares regressor.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', LinearRegression()),
    ]
)

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit the pipeline to the training data.
# The preprocessing steps and the regressor are both fitted on X_train only;
# the call is left as the cell's last expression so the fitted pipeline is displayed.
pipeline.fit(X_train, y_train)

In [None]:
# Make predictions on the held-out test data using the fitted pipeline
# (the same preprocessing is applied to X_test before the regressor predicts).
y_pred = pipeline.predict(X_test)

In [None]:
# Score the held-out predictions: R-squared and mean absolute percentage error
# (MAPE is reported as a fraction, not multiplied by 100).
r2 = r2_score(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
print(f"R-squared: {r2}", f"MAPE: {mape}", sep="\n")

R-squared: 0.8536798102869023
MAPE: 0.31717987528096186
