In [1]:
import pandas as pd
import numpy as np
import pickle
import mlflow.sklearn
import mlflow.tracking
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import logging
import warnings
warnings.filterwarnings('ignore')

In [4]:

test_data = pd.read_csv('test.csv')

In [5]:
test_data.isnull().sum()

id                           0
Age                      12489
Gender                       0
Annual Income            29860
Marital Status           12336
Number of Dependents     73130
Education Level              0
Occupation              239125
Health Score             49449
Location                     0
Policy Type                  0
Previous Claims         242802
Vehicle Age                  3
Credit Score             91451
Insurance Duration           2
Policy Start Date            0
Customer Feedback        52276
Smoking Status               0
Exercise Frequency           0
Property Type                0
dtype: int64

In [6]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

import warnings
warnings.filterwarnings('ignore')


class DataPreprocessor:
    def __init__(self):
        self.label_encoders = {}
        self.one_hot_encoder = OneHotEncoder(sparse_output=False)
        

    def preprocess_data(self, df):
        
        numerical_cols = [
            'Age', 'Annual Income', 'Health Score', 
            'Vehicle Age', 'Credit Score', 'Insurance Duration', 'Previous Claims'
        ]
        
        categorical_cols = [
            'Marital Status', 'Number of Dependents', 'Occupation', 'Customer Feedback'
        ]
        
        # Fill missing numerical values with mean
        for col in numerical_cols:
            df[col].fillna(df[col].mean(), inplace=True)
        
        # Fill missing categorical values with mode
        for col in categorical_cols:
            df[col].fillna(df[col].mode()[0], inplace=True)
        
        # Define categorical mappings
        mappings = {
            "Education Level": {
                "High School": 0, "Bachelor's": 1, "Master's": 2, "PhD": 3
            },
            "Customer Feedback": {
                'Poor': 0, 'Average': 1, 'Good': 2
            },
            "Exercise Frequency": {
                'Rarely': 0, 'Weekly': 1, 'Monthly': 2, 'Daily': 3
            },
        }
        
        # Apply categorical mappings
        for col, mapping in mappings.items():
            df[col] = df[col].map(mapping)

        return df

    def label_encode(self, df, columns):
        for col in columns:
            le = LabelEncoder()
            df[col] = le.fit_transform(df[col])
            self.label_encoders[col] = le
        return df

    def one_hot_encode(self, df, columns):
        one_hot_encoded = self.one_hot_encoder.fit_transform(df[columns])
        one_hot_df = pd.DataFrame(one_hot_encoded, columns=self.one_hot_encoder.get_feature_names_out(columns))
        df = pd.concat([df, one_hot_df], axis=1)
        df = df.drop(columns, axis=1)
        return df
    

# Load the data
df = pd.read_csv('test.csv')

# Initialize the preprocessor
preprocessor = DataPreprocessor()

# Preprocess the data
df = preprocessor.preprocess_data(df)
df.drop(['id', 'Policy Start Date'], axis=1, inplace=True)
df = preprocessor.label_encode(df, ['Gender', 'Smoking Status', 'Marital Status', 'Policy Type'])
df_encoded = preprocessor.one_hot_encode(df, df.select_dtypes(include=['object']).columns.tolist())


In [7]:
df_encoded.head()

Unnamed: 0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Health Score,Policy Type,Previous Claims,Vehicle Age,...,Exercise Frequency,Occupation_Employed,Occupation_Self-Employed,Occupation_Unemployed,Location_Rural,Location_Suburban,Location_Urban,Property Type_Apartment,Property Type_Condo,Property Type_House
0,28.0,0,2310.0,2,4.0,1,7.657981,0,1.004873,19.0,...,1,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,31.0,0,126031.0,1,2.0,2,13.381379,2,1.004873,14.0,...,0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,47.0,0,17092.0,0,0.0,3,24.354527,1,1.004873,16.0,...,2,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
3,28.0,0,30424.0,0,3.0,3,5.136225,1,1.0,3.0,...,3,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,24.0,1,10863.0,0,2.0,0,11.844155,2,1.004873,14.0,...,1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0


In [8]:
def remove_outliers_iqr(df, columns):
    for col in columns:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        df[col] = df[col].clip(lower=lower_bound, upper=upper_bound)

    return df

In [9]:
df_treated = remove_outliers_iqr(df_encoded, df_encoded.select_dtypes(include=['float64', 'int64']).columns.tolist())
df_treated.head()

Unnamed: 0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Health Score,Policy Type,Previous Claims,Vehicle Age,...,Exercise Frequency,Occupation_Employed,Occupation_Self-Employed,Occupation_Unemployed,Location_Rural,Location_Suburban,Location_Urban,Property Type_Apartment,Property Type_Condo,Property Type_House
0,28.0,0,2310.0,2,4.0,1,7.657981,0,1.004873,19.0,...,1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,31.0,0,96827.125,1,2.0,2,13.381379,2,1.004873,14.0,...,0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,47.0,0,17092.0,0,0.0,3,24.354527,1,1.004873,16.0,...,2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,28.0,0,30424.0,0,3.0,3,5.136225,1,1.0,3.0,...,3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,24.0,1,10863.0,0,2.0,0,11.844155,2,1.004873,14.0,...,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [11]:
def log_transform(data, columns_to_transform):
    for col in columns_to_transform:
        data[f'{col}_log'] = np.log1p(data[col])
        data.drop(columns=[col], inplace=True)  
        data.rename(columns={f'{col}_log': col}, inplace=True)  
    
    return data

In [12]:
transformed_data = log_transform(df_treated, ['Annual Income'])

In [13]:
def scaling(data, columns_to_transform):
    scale = MinMaxScaler()
    for col in columns_to_transform:
        data[f'{col}_log'] = scale.fit_transform(data[[col]])
        data.drop(columns=[col], inplace=True)  
        data.rename(columns={f'{col}_log': col}, inplace=True)

In [14]:
scaled_data = scaling(transformed_data, ['Annual Income'])

In [15]:
with open(r"C:\Users\gowth\Desktop\B27_DS\Insurance Premium\best_model.pkl", "rb") as file:
    model = pickle.load(file)

In [16]:
predictions = model.predict(scaled_data)

ValueError: Expected 2D array, got scalar array instead:
array=nan.
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.