# Data Scaling and Encoding

In [11]:
import numpy as np
import pandas as pd
from sklearn import preprocessing

In [12]:
# Read the cleaned training set and the raw test set in one pass.
# Paths are relative to the notebook's working directory.
trainset_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../../data/processed/train_data_cleaned.csv",
        "../../data/raw/test.csv",
    )
)
trainset_df.head()

Unnamed: 0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Health Score,Location,Policy Type,Vehicle Age,Insurance Duration,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount
0,19.0,Female,10049.0,Married,1.0,Bachelor's,22.598761,Urban,Premium,17.0,5.0,Poor,No,Weekly,House,2869.0
1,39.0,Female,31678.0,Divorced,3.0,Master's,15.569731,Rural,Comprehensive,12.0,2.0,Average,Yes,Monthly,House,1483.0
2,23.0,Male,25602.0,Divorced,3.0,High School,47.177549,Suburban,Premium,14.0,3.0,Good,Yes,Weekly,House,567.0
3,21.0,Male,141855.0,Married,2.0,Bachelor's,10.938144,Rural,Basic,0.0,1.0,Poor,Yes,Daily,Apartment,765.0
4,21.0,Male,39651.0,Single,1.0,Bachelor's,20.376094,Rural,Premium,8.0,4.0,Poor,Yes,Weekly,House,2022.0


### Codificación de Variables Categóricas Nominales

In [13]:
# Nominal categorical features -> integer codes.
#
# Each encoder is fitted on the explicit list of expected categories. LabelEncoder
# stores its classes sorted, so codes follow alphabetical order
# (e.g. "Female" -> 0, "Male" -> 1; "Apartment" -> 0, "Condo" -> 1, "House" -> 2).
# transform() raises if the column holds a value outside the fitted classes.
#
# The codes are written with plain column assignment instead of `.loc[:, col] = ...`:
# on pandas >= 2.0 the `.loc` form writes into the existing object-dtype column in
# place, so the integer codes would stay stored as `object` rather than an integer dtype.
le_gender = preprocessing.LabelEncoder()
le_gender.fit(["Female", "Male"])
trainset_df["Gender"] = le_gender.transform(trainset_df["Gender"])

le_marital = preprocessing.LabelEncoder()
le_marital.fit(["Single", "Married", "Divorced"])
trainset_df["Marital Status"] = le_marital.transform(trainset_df["Marital Status"])

le_policy = preprocessing.LabelEncoder()
le_policy.fit(["Basic", "Comprehensive", "Premium"])
trainset_df["Policy Type"] = le_policy.transform(trainset_df["Policy Type"])

le_smoking = preprocessing.LabelEncoder()
le_smoking.fit(["Yes", "No"])
trainset_df["Smoking Status"] = le_smoking.transform(trainset_df["Smoking Status"])

le_property = preprocessing.LabelEncoder()
le_property.fit(["House", "Apartment", "Condo"])
trainset_df["Property Type"] = le_property.transform(trainset_df["Property Type"])

trainset_df.head()

Unnamed: 0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Health Score,Location,Policy Type,Vehicle Age,Insurance Duration,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount
0,19.0,0,10049.0,1,1.0,Bachelor's,22.598761,Urban,2,17.0,5.0,Poor,0,Weekly,2,2869.0
1,39.0,0,31678.0,0,3.0,Master's,15.569731,Rural,1,12.0,2.0,Average,1,Monthly,2,1483.0
2,23.0,1,25602.0,0,3.0,High School,47.177549,Suburban,2,14.0,3.0,Good,1,Weekly,2,567.0
3,21.0,1,141855.0,1,2.0,Bachelor's,10.938144,Rural,0,0.0,1.0,Poor,1,Daily,0,765.0
4,21.0,1,39651.0,2,1.0,Bachelor's,20.376094,Rural,2,8.0,4.0,Poor,1,Weekly,2,2022.0


### Codificación de Variables Categóricas Ordinales

In [14]:
# Ordinal categorical features -> integer ranks (0 = lowest level).
# Series.map with a dict leaves NaN wherever a value has no entry in the mapping.
education_mapping = {"High School": 0, "Bachelor's": 1, "Master's": 2, "PhD": 3}
location_mapping = {"Rural": 0, "Suburban": 1, "Urban": 2}
customer_mapping = {"Poor": 0, "Average": 1, "Good": 2}
exercise_mapping = {"Rarely": 0, "Monthly": 1, "Weekly": 2, "Daily": 3}

for ordinal_col, level_codes in (
    ("Education Level", education_mapping),
    ("Location", location_mapping),
    ("Customer Feedback", customer_mapping),
    ("Exercise Frequency", exercise_mapping),
):
    trainset_df[ordinal_col] = trainset_df[ordinal_col].map(level_codes)

trainset_df.head()

Unnamed: 0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Health Score,Location,Policy Type,Vehicle Age,Insurance Duration,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount
0,19.0,0,10049.0,1,1.0,1,22.598761,2,2,17.0,5.0,0,0,2,2,2869.0
1,39.0,0,31678.0,0,3.0,2,15.569731,0,1,12.0,2.0,1,1,1,2,1483.0
2,23.0,1,25602.0,0,3.0,0,47.177549,1,2,14.0,3.0,2,1,2,2,567.0
3,21.0,1,141855.0,1,2.0,1,10.938144,0,0,0.0,1.0,0,1,3,0,765.0
4,21.0,1,39651.0,2,1.0,1,20.376094,0,2,8.0,4.0,0,1,2,2,2022.0


### Transformaciones
Debido a que la variable a predecir "Premium Amount" tiene sesgo positivo (cola a la derecha), se aplica una transformación logarítmica (`log1p`) para que su distribución se aproxime más a una normal.

In [5]:
# Replace the target with log(1 + target) to compress its right tail.
# Not idempotent: running this cell a second time transforms the already-transformed column.
trainset_df["Premium Amount"] = trainset_df["Premium Amount"].pipe(np.log1p)

trainset_df.head()

Unnamed: 0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Health Score,Location,Policy Type,Vehicle Age,Insurance Duration,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount
0,19.0,0,10049.0,1,1.0,1,22.598761,2,2,17.0,5.0,0,0,2,2,7.962067
1,39.0,0,31678.0,0,3.0,2,15.569731,0,1,12.0,2.0,1,1,1,2,7.302496
2,23.0,1,25602.0,0,3.0,0,47.177549,1,2,14.0,3.0,2,1,2,2,6.342121
3,21.0,1,141855.0,1,2.0,1,10.938144,0,0,0.0,1.0,0,1,3,0,6.641182
4,21.0,1,39651.0,2,1.0,1,20.376094,0,2,8.0,4.0,0,1,2,2,7.612337


In [15]:
# Persist the transformed training frame without writing the index as a column.
# NOTE(review): the log1p cell above carries an older execution count than this cell;
# restart the kernel and run all cells in order so the saved file includes that step.
trainset_df.to_csv(
    path_or_buf="../../data/processed/train_data_transformed.csv",
    index=False,
)

## Testset
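Se repiten los pasos de codificación anteriores para el conjunto de prueba (test data). La transformación logarítmica no se aplica aquí porque el conjunto de prueba no incluye la variable "Premium Amount".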

In [7]:
# Drop the test-set columns that do not appear in the training frame.
# Not idempotent: a second run raises KeyError because the columns are already gone.
test_df = test_df.drop(
    ["id", "Occupation", "Previous Claims", "Credit Score", "Policy Start Date"],
    axis="columns",
)

In [8]:
def _encode_nominal(column, encoder):
    """Map a nominal column to the integer codes of a fitted LabelEncoder.

    The label -> code table is built once from ``encoder.classes_`` and applied
    with a vectorized ``Series.map`` instead of calling ``encoder.transform``
    once per row.

    Parameters
    ----------
    column : pandas.Series
        Raw categorical values; may contain missing values.
    encoder : sklearn.preprocessing.LabelEncoder
        Encoder that has already been fitted.

    Returns
    -------
    pandas.Series
        Integer codes, with NaN wherever ``column`` was missing.

    Raises
    ------
    ValueError
        If ``column`` holds a non-missing value the encoder was not fitted on.
    """
    labels = encoder.classes_.tolist()
    codes = dict(zip(labels, encoder.transform(encoder.classes_).tolist()))
    # Series.map would silently turn unknown labels into NaN; keep failing loudly instead.
    unseen = set(column.dropna().unique()) - set(codes)
    if unseen:
        raise ValueError(
            f"y contains previously unseen labels: {sorted(map(str, unseen))}"
        )
    return column.map(codes)


# Encode 'Gender' with a LabelEncoder (missing values stay NaN)
le_gender = preprocessing.LabelEncoder()
le_gender.fit(["Female", "Male"])
test_df["Gender"] = _encode_nominal(test_df["Gender"], le_gender)

# Encode 'Marital Status' with a LabelEncoder
le_marital = preprocessing.LabelEncoder()
le_marital.fit(["Single", "Married", "Divorced"])
test_df["Marital Status"] = _encode_nominal(test_df["Marital Status"], le_marital)

# Encode 'Education Level' with an ordinal mapping; unmapped/missing values become NaN
education_mapping = {"High School": 0, "Bachelor's": 1, "Master's": 2, "PhD": 3}
test_df["Education Level"] = test_df["Education Level"].map(education_mapping)

# Encode 'Location' with an ordinal mapping
location_mapping = {"Urban": 2, "Suburban": 1, "Rural": 0}
test_df["Location"] = test_df["Location"].map(location_mapping)

# Encode 'Policy Type' with a LabelEncoder
le_policy = preprocessing.LabelEncoder()
le_policy.fit(["Basic", "Comprehensive", "Premium"])
test_df["Policy Type"] = _encode_nominal(test_df["Policy Type"], le_policy)

# Encode 'Customer Feedback' with an ordinal mapping
customer_mapping = {"Poor": 0, "Average": 1, "Good": 2}
test_df["Customer Feedback"] = test_df["Customer Feedback"].map(customer_mapping)

# Encode 'Smoking Status' with a LabelEncoder
le_smoking = preprocessing.LabelEncoder()
le_smoking.fit(["Yes", "No"])
test_df["Smoking Status"] = _encode_nominal(test_df["Smoking Status"], le_smoking)

# Encode 'Exercise Frequency' with an ordinal mapping
exercise_mapping = {"Daily": 3, "Weekly": 2, "Monthly": 1, "Rarely": 0}
test_df["Exercise Frequency"] = test_df["Exercise Frequency"].map(exercise_mapping)

# Encode 'Property Type' with a LabelEncoder
le_property = preprocessing.LabelEncoder()
le_property.fit(["House", "Apartment", "Condo"])
test_df["Property Type"] = _encode_nominal(test_df["Property Type"], le_property)

In [9]:
# Preview the first five encoded test rows.
test_df.head(n=5)

Unnamed: 0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Health Score,Location,Policy Type,Vehicle Age,Insurance Duration,Customer Feedback,Smoking Status,Exercise Frequency,Property Type
0,28.0,0,2310.0,,4.0,1,7.657981,0,0,19.0,1.0,0.0,1,2,2
1,31.0,0,126031.0,1.0,2.0,2,13.381379,1,2,14.0,8.0,2.0,1,0,0
2,47.0,0,17092.0,0.0,0.0,3,24.354527,2,1,16.0,9.0,1.0,1,1,1
3,28.0,0,30424.0,0.0,3.0,3,5.136225,1,1,3.0,5.0,0.0,1,3,2
4,24.0,1,10863.0,0.0,2.0,0,11.844155,1,2,14.0,7.0,1.0,0,2,2


In [10]:
# Persist the encoded test frame without writing the index as a column.
test_df.to_csv(
    path_or_buf="../../data/processed/test_data_transformed.csv",
    index=False,
)