In [1]:
import logging
import os
import pickle
import warnings

import mlflow.sklearn
import mlflow.tracking
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
warnings.filterwarnings('ignore')

In [2]:
# Load the test set into a DataFrame.
# NOTE(review): the path is absolute and machine-specific; consider a configurable data directory.
test_file = 'F:\\Guvi Projects\\Smart_Premium\\playground-series-s4e12\\test.csv'
test_data = pd.read_csv(test_file)

In [3]:
# Remove the two columns that are not used as features ('id' is therefore
# absent from every later view of test_data).
test_data.drop(columns=['id', 'Policy Start Date'], inplace=True)

In [4]:
# Impute missing values: column mean for numeric features, most frequent
# value for categorical (object) features.
#
# The result of fillna() is assigned back to the column. Calling fillna on a
# column selection with inplace=True is deprecated (and a no-op under
# copy-on-write), and calling it without assignment discards the result.
numeric_cols = test_data.select_dtypes(include=['int64', 'float64']).columns
for col in numeric_cols:
    test_data[col] = test_data[col].fillna(test_data[col].mean())

categorical_cols = test_data.select_dtypes(include='object').columns
for col in categorical_cols:
    modes = test_data[col].mode()
    # mode() is empty when the whole column is missing; leave such a column as is.
    if not modes.empty:
        test_data[col] = test_data[col].fillna(modes.iloc[0])

In [5]:
def age_category(data):
    """Return the age-bucket label for a numeric age.

    Buckets are open on the left and closed on the right:
    (18, 30] -> '18-30', (30, 40] -> '31-40', (40, 50] -> '41-50',
    (50, 64] -> '51-64'. Every other value (18 or less, above 64, NaN)
    gets the catch-all label '<64'.
    """
    buckets = (
        (18, 30, '18-30'),
        (30, 40, '31-40'),
        (40, 50, '41-50'),
        (50, 64, '51-64'),
    )
    for lower, upper, label in buckets:
        if lower < data <= upper:
            return label
    return '<64'

In [6]:
def dependent_category(data):
    """Return the bucket label for a number of dependents.

    0 -> '0', (0, 2] -> '0-2', (2, 3] -> '2-3'; any other value
    (negative, above 3, NaN) -> '<3'.
    """
    if data == 0:
        return '0'
    if 0 < data <= 2:
        return '0-2'
    return '2-3' if 2 < data <= 3 else '<3'

In [7]:
def health_category(data):
    """Return the bucket label for a health score.

    (0, 15] -> '0-15', (15, 25] -> '15-25', (25, 35] -> '15-35';
    any other value (0 or less, above 35, NaN) -> '<35'.
    """
    buckets = (
        (0, 15, '0-15'),
        (15, 25, '15-25'),
        (25, 35, '15-35'),
    )
    for lower, upper, label in buckets:
        if lower < data <= upper:
            return label
    return '<35'

In [8]:
def claims(data):
    """Return the bucket label for a previous-claims count.

    (0, 1] -> '0-1', (1, 2] -> '1-2'; any other value
    (0 or less, above 2, NaN) -> '<2'.
    """
    if 0 < data <= 1:
        return '0-1'
    return '1-2' if 1 < data <= 2 else '<2'

In [9]:
def vehicle(data):
    """Return the bucket label for a vehicle age.

    (0, 5] -> '0-5', (5, 10] -> '5-10', (10, 20] -> '10-20';
    any other value (0 or less, above 20, NaN) -> '<20'.
    """
    buckets = (
        (0, 5, '0-5'),
        (5, 10, '5-10'),
        (10, 20, '10-20'),
    )
    for lower, upper, label in buckets:
        if lower < data <= upper:
            return label
    return '<20'

In [10]:
def credit(data):
    """Return the bucket label for a credit score.

    (0, 300] -> '0-300', (300, 600] -> '300-600', (600, 800) -> '600-800'.
    The last bucket excludes 800 itself; 800 and above, 0 and below, and
    NaN all get '<800'.
    """
    # Everything outside the open interval (0, 800) is the catch-all;
    # `not (data > 0)` also catches NaN.
    if (not data > 0) or data >= 800:
        return '<800'
    if data <= 300:
        return '0-300'
    if data <= 600:
        return '300-600'
    return '600-800'

In [11]:
def insurance(data):
    """Return the bucket label for an insurance duration.

    (0, 3] -> '0-3', (3, 6] -> '3-6', (6, 9) -> '6-9'.
    The last bucket excludes 9 itself; 9 and above, 0 and below, and
    NaN all get '<9'.
    """
    # Everything outside the open interval (0, 9) is the catch-all;
    # `not (data > 0)` also catches NaN.
    if (not data > 0) or data >= 9:
        return '<9'
    if data <= 3:
        return '0-3'
    if data <= 6:
        return '3-6'
    return '6-9'

In [12]:
# Derive one bucketed column per numeric feature:
# (new column, source column, bucketing function).
group_specs = (
    ('Age_Group', 'Age', age_category),
    ('Dependent_Group', 'Number of Dependents', dependent_category),
    ('Health_Group', 'Health Score', health_category),
    ('Prev_Claims_Group', 'Previous Claims', claims),
    ('Vehicle_Group', 'Vehicle Age', vehicle),
    ('Credit_Group', 'Credit Score', credit),
    ('Insurance_Group', 'Insurance Duration', insurance),
)
for group_col, source_col, bucket_fn in group_specs:
    test_data[group_col] = test_data[source_col].apply(bucket_fn)

In [13]:
# Per-column lookup tables that turn category labels into integer codes.
# Outer keys are column names; inner dicts map label -> code.
# NOTE(review): "Exercise Frequency" codes Weekly (1) below Monthly (2) — confirm this
# matches the mapping the model was trained with.
mappings = {
    "Education Level": {"High School": 0, "Bachelor's": 1, "Master's": 2, "PhD": 3},
    "Customer Feedback": {"Poor": 0, "Average": 1, "Good": 2},
    "Exercise Frequency": {"Rarely": 0, "Weekly": 1, "Monthly": 2, "Daily": 3},
    "Policy Type": {"Basic": 0, "Comprehensive": 1, "Premium": 2}
}

In [14]:
# Apply the per-column label -> code tables, then let pandas re-infer dtypes for the
# object columns whose values were replaced.
test_data = test_data.replace(mappings).infer_objects(copy=False)

In [15]:
# Columns that still hold string labels and need integer encoding.
encode_names = [
    'Age_Group', 'Dependent_Group', 'Health_Group', 'Prev_Claims_Group',
    'Vehicle_Group', 'Credit_Group', 'Insurance_Group',
    'Gender', 'Marital Status', 'Occupation', 'Location',
    'Smoking Status', 'Property Type',
]
columns_to_encode = test_data[encode_names]

In [16]:
# Integer-encode every selected column. The single encoder is refit on each
# column of the test data in turn, so after the loop it only remembers the
# classes of the last column.
# NOTE(review): codes come from the labels present in this file — confirm
# they line up with the encoding used when the model was trained.
le = LabelEncoder()
for feature in columns_to_encode.columns.tolist():
    encoded_values = le.fit_transform(test_data[feature])
    test_data[feature] = encoded_values

In [17]:
# Assemble the model input frame. Keys are the output column names (in
# order); values are the test_data columns they are taken from. Bucketed
# features reuse the name of the raw feature they replace.
feature_sources = {
    'Age': 'Age_Group',
    'Gender': 'Gender',
    'Annual Income': 'Annual Income',
    'Marital Status': 'Marital Status',
    'Number of Dependents': 'Dependent_Group',
    'Education Level': 'Education Level',
    'Occupation': 'Occupation',
    'Health Score': 'Health_Group',
    'Location': 'Location',
    'Policy Type': 'Policy Type',
    'Previous Claims': 'Prev_Claims_Group',
    'Vehicle Age': 'Vehicle_Group',
    'Credit Score': 'Credit_Group',
    'Insurance Duration': 'Insurance_Group',
    'Customer Feedback': 'Customer Feedback',
    'Smoking Status': 'Smoking Status',
    'Exercise Frequency': 'Exercise Frequency',
    'Property Type': 'Property Type',
}
encoded_test_data = pd.DataFrame(
    {out_name: test_data[source_name] for out_name, source_name in feature_sources.items()}
)

In [18]:
def log_transform(data, columns_to_transform):
    """Replace each listed column with log1p of its values, in place.

    The transformed column is written under a temporary '<col>_log' name,
    the original is removed, and the temporary is renamed back, so every
    transformed column ends up as the last column of the frame.

    Parameters:
        data: DataFrame to modify (mutated in place).
        columns_to_transform: iterable of column names to transform.

    Returns:
        The same DataFrame object that was passed in.
    """
    for name in columns_to_transform:
        staged = f'{name}_log'
        data[staged] = np.log1p(data[name])
        del data[name]
        data.rename(columns={staged: name}, inplace=True)

    return data

In [19]:
# log_transform modifies the frame in place and returns it, so transformed_data is the
# same object as encoded_test_data (with 'Annual Income' now the last column).
transformed_data = log_transform(encoded_test_data, ['Annual Income'])

In [20]:
def scaling(data, columns_to_transform):
    """Min-max scale each listed column of ``data`` in place and return it.

    The scaler is fit on the values of ``data`` itself, one column at a
    time. Each scaled column is written under a temporary name, the
    original is dropped, and the temporary is renamed back, so scaled
    columns end up as the last columns of the frame.

    NOTE(review): if the model was trained on features scaled with a scaler
    fit on the training set, that fitted scaler should be reused here
    instead of refitting on the test data — confirm.

    Parameters:
        data: DataFrame to modify (mutated in place).
        columns_to_transform: iterable of column names to scale.

    Returns:
        The same DataFrame object that was passed in (previously the
        function returned None, leaving the caller's result unusable).
    """
    scale = MinMaxScaler()
    for col in columns_to_transform:
        staged = f'{col}_scaled'
        # fit_transform returns an (n, 1) array; flatten it to a plain column.
        data[staged] = scale.fit_transform(data[[col]]).ravel()
        data.drop(columns=[col], inplace=True)
        data.rename(columns={staged: col}, inplace=True)

    return data

In [21]:
# scaling() rewrites the column on the frame it is given; transformed_data is the same
# object as encoded_test_data, which is what the model is fed below.
scaled_data = scaling(transformed_data, ['Annual Income'])

In [22]:
# Load the trained model from disk.
# NOTE: pickle.load executes arbitrary code from the file; only load
# pickles that come from a trusted source.
model_path = "F:\\Guvi Projects\\Smart_Premium\\pickles\\best_model.pkl"
with open(model_path, "rb") as model_file:
    model = pickle.load(model_file)

In [23]:
# encoded_test_data has already been log-transformed and scaled in place by the cells above.
predictions = model.predict(encoded_test_data)

In [24]:
# Attach the predictions as a new column; requires one prediction per row of test_data.
test_data['Predicted_Premium_Amount'] = predictions

In [25]:
# Persist the preprocessed test frame together with its predictions.
# NOTE(review): 'id' was dropped earlier, so the saved file carries no row identifier.
test_data.to_csv("F:\\Guvi Projects\\Smart_Premium\\research_data\\Test_Predictions.csv", index=False)

In [26]:
# Serialize the raw predictions next to the model pickle.
pickle_path = 'F:\\Guvi Projects\\Smart_Premium\\pickles\\test_predictions.pkl'

with open(pickle_path, "wb") as out_file:
    pickle.dump(predictions, out_file)

print('test_predictions.pkl saved successfully...')

test_predictions.pkl saved successfully...


In [27]:
# Start the MLflow server before running the next cell: run "mlflow server --host localhost --port 5005" in a terminal. P.S.: you always forget this crucial step.

In [28]:
# Log the model, the predictions file and a few sample predictions to the
# MLflow tracking server, which must already be running on localhost:5005.
mlflow.set_tracking_uri("http://localhost:5005")

# MLflow takes its HTTP retry/timeout settings from environment variables.
# Assigning ad-hoc private attributes on an MlflowClient instance does not
# affect the requests made by the fluent mlflow.* calls below.
os.environ["MLFLOW_HTTP_REQUEST_MAX_RETRIES"] = "5"
os.environ["MLFLOW_HTTP_REQUEST_TIMEOUT"] = "60"

client = mlflow.tracking.MlflowClient()

# Plain Python floats are needed for log_metric and for the CSV below.
predictions = model.predict(encoded_test_data).tolist()

pd.DataFrame(predictions, columns=["Predicted Premium"]).to_csv("Test_Predictions.csv", index=False)

# Silence MLflow's own log output below ERROR level.
logging.getLogger("mlflow").setLevel(logging.ERROR)

with mlflow.start_run():
    mlflow.sklearn.log_model(model, "Insurance_Premium_Model")
    mlflow.log_params({"model_name": model.__class__.__name__})
    mlflow.log_artifact("Test_Predictions.csv")

    # Record up to the first five predictions as individual metrics;
    # slicing keeps this safe when fewer than five rows were predicted.
    for i, value in enumerate(predictions[:5]):
        mlflow.log_metric(f"Predicted Premium {i}", value)

print("Predictions logged in MLflow and saved to Test_Predictions.csv.")

MlflowException: API request to http://localhost:5005/api/2.0/mlflow/runs/create failed with exception HTTPConnectionPool(host='localhost', port=5005): Max retries exceeded with url: /api/2.0/mlflow/runs/create (Caused by ResponseError('too many 500 error responses'))