# Training on imputed datasets

Purpose: The goal of this notebook is to retrain the baseline models on the full dataset with 2 versions of imputed data:

1. nulls filled by median 
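2. nulls filled by mean after clipping outliers (the code in section 2 loads `03_imputation_clipped_mean.csv` and fills the test set with column means)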

## 0. imports/config values

In [1]:
import os

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.tree import DecisionTreeRegressor


# Column that every model in this notebook is trained to predict.
target_column = "Premium Amount"

# Raw competition test file, read once here; the submission sections below
# work on copies of this frame.
test_data = pd.read_csv("../data/raw_dataset/test.csv")

## 1. nulls filled by median values

For this dataset we again compare two validation schemes: a basic random train-test split and a split stratified on the binned target.

### 1.1 basic train split

In [1]:
# Training table used throughout section 1. Judging by the file name this is the
# median-imputed version of the data; it is presumably written by an earlier
# preprocessing step outside this notebook -- confirm.
base_dataset = pd.read_csv("../data/03_imputation_median.csv")

In [7]:
# Separate features (X) and target (y)
X = base_dataset.drop(columns=[target_column])
y = base_dataset[target_column]

# Plain random 80/20 hold-out split; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Baseline models evaluated on the unstratified split.
unstratified_models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    # Seeded: the tree permutes features at every split, so ties between equally
    # good splits would otherwise be broken differently from run to run.
    "Decision Tree": DecisionTreeRegressor(random_state=42),
}

# Fit each model on the training part and score it on the hold-out part.
unstratified_results = []

for name, model in unstratified_models.items():
    print(f"begin {name} training")
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # One record per model; the list is turned into a DataFrame once, below.
    unstratified_results.append({
        "Model": name,
        "MSE": mse,
        "MAE": mae,
        "R^2": r2
    })
    print(f"{name} training complete")

# Best model (highest R^2) first; the bare last expression renders the table.
unstratified_results_df = pd.DataFrame(unstratified_results).sort_values(by="R^2", ascending=False)
unstratified_results_df

begin Linear Regression training
Linear Regression training complete
begin Ridge Regression training
Ridge Regression training complete
begin Decision Tree training
Decision Tree training complete


Unnamed: 0,Model,MSE,MAE,R^2
1,Ridge Regression,745247.2,667.274782,0.002728
0,Linear Regression,745247.2,667.274782,0.002728
2,Decision Tree,1520630.0,896.447588,-1.034871


### 1.2 stratified split
The stratified split yields marginally better validation results (linear regression R² ≈ 0.00299 vs 0.00273 with the basic split), so it is likely the better choice.

In [None]:
# Quantile-bin the continuous target so the split can be stratified on it.
# duplicates='drop' merges bins whose edges coincide, so fewer than n_bins
# bins may result.
n_bins = 20
y_binned = pd.qcut(y, q=n_bins, duplicates='drop', labels=False)

# Stratified 80/20 split. The bin labels go straight to `stratify` instead of
# being written into X as a temporary column, so the shared feature frame is
# never mutated and never carries a target-derived column.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y_binned
)

# Baseline models evaluated on the stratified split.
stratified_models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    # Seeded: the tree permutes features at every split, so ties between equally
    # good splits would otherwise be broken differently from run to run.
    "Decision Tree": DecisionTreeRegressor(random_state=42),
}

# Fit each model on the training part and score it on the hold-out part.
stratified_results = []

for name, model in stratified_models.items():
    print(f"begin {name} training")
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # One record per model; the list is turned into a DataFrame once, below.
    stratified_results.append({
        "Model": name,
        "MSE": mse,
        "MAE": mae,
        "R^2": r2
    })
    print(f"{name} training complete")

# Best model (highest R^2) first; the bare last expression renders the table.
stratified_results_df = pd.DataFrame(stratified_results).sort_values(by="R^2", ascending=False)
stratified_results_df

begin Linear Regression training
Linear Regression training complete
begin Ridge Regression training
Ridge Regression training complete
begin Decision Tree training
Decision Tree training complete


Unnamed: 0,Model,MSE,MAE,R^2
0,Linear Regression,744217.0,667.146285,0.00299
1,Ridge Regression,744217.0,667.146286,0.00299
2,Decision Tree,1518349.0,894.944854,-1.034096


### 1.3 No test set
Realistically, once we find a solution that performs well on the validation set(s), we should retrain it on the full dataset (no split) and submit that version.

In [10]:
# Refit on every row of the median-imputed data: no hold-out, so nothing is
# scored in this cell.
y = base_dataset[target_column]
X = base_dataset.drop(columns=[target_column])

# Only the model chosen for submission is refit.
full_models = {"Linear Regression": LinearRegression()}

# Kept for parity with the evaluation cells; nothing is appended to it here.
full_results = []

for name, model in full_models.items():
    print(f"begin {name} training")
    model.fit(X, y)


begin Linear Regression training


### 1.4 Submitting results

Since the competition is limited to 5 submissions a day, I'll take the top performer from each of the two training versions (linear regression in both cases) and submit them.

#### 1.4.1 prepare test set

In [13]:
# Work on a copy so the raw test frame stays untouched for the other section.
median_test = test_data.copy()

# Minimal preprocessing so the fitted models can score the test set.

# Replace the policy start date with an elapsed duration in minutes.
# NOTE(review): the reference time is the moment this cell runs, so the feature
# changes from run to run; the training CSV is built elsewhere -- confirm it
# used a compatible reference.
median_test['Policy Start Date'] = pd.to_datetime(median_test['Policy Start Date'])
median_test['Policy Duration Mins'] = ((pd.Timestamp.now() - median_test['Policy Start Date']).dt.total_seconds())/60
median_test = median_test.drop(columns=['Policy Start Date'])

# Cast object columns to category, then one-hot encode them (this is one-hot
# encoding, not label encoding; drop_first removes one level per column).
# NOTE(review): the resulting columns are assumed to match the training
# features in name and order -- confirm against the training CSV.
categorical_cols = [col for col, dtype in median_test.dtypes.items() if dtype == 'object']
median_test[categorical_cols] = median_test[categorical_cols].astype('category')
median_test = pd.get_dummies(median_test, drop_first=True)

# Fill remaining nulls with each column's median.
median_test = median_test.fillna(median_test.median())

# Model inputs: everything except the first column (presumably `id` -- confirm).
# Named `median_features` because that is the name the submission cell reads;
# the previous name `features` left that cell with an undefined variable.
median_features = median_test.iloc[:, 1:]


#### 1.4.2 submit to competition

In [14]:
# Score the prepared test set with every fully-trained model, write one CSV
# per model and hand it to the competition.
results_directory = "../results"

for name, model in full_models.items():
    filename = f"{name}_median_imputation.csv"
    results_full_path = os.path.join(results_directory, filename)
    submission_comment = f"{name} with median imputed values"

    y_pred = model.predict(median_features)

    # Submission table: the test ids next to the predicted premium.
    results = pd.DataFrame({'id': median_test['id'], 'Premium Amount': y_pred})
    results.to_csv(results_full_path, index=False)

    # NOTE(review): `submit` is not defined or imported in the visible part of
    # this notebook. It is given the bare file name, not results_full_path;
    # presumably it resolves the results directory itself -- confirm.
    submit(filename, submission_comment)

Competition Name: playground-series-s4e12
Full Submission File Path: ../results\Linear Regression_median_imputation.csv


100%|██████████| 21.2M/21.2M [00:09<00:00, 2.33MB/s]


Submission to 'playground-series-s4e12' successful!
submission result: Successfully submitted to Regression with an Insurance Dataset


## 2. Nulls filled by mean values after clipping

In [3]:
# Training table used throughout section 2. Judging by the file name, outliers
# were clipped and nulls filled with the mean upstream; the file is presumably
# written by an earlier preprocessing step outside this notebook -- confirm.
clipped_dataset = pd.read_csv("../data/03_imputation_clipped_mean.csv")

### 2.1 Train (using stratified split)

In [34]:
# Separate features (X) and target (y)
X = clipped_dataset.drop(columns=[target_column])
y = clipped_dataset[target_column]

# Quantile-bin the continuous target so the split can be stratified on it.
# duplicates='drop' merges bins whose edges coincide, so fewer than n_bins
# bins may result.
n_bins = 20
y_binned = pd.qcut(y, q=n_bins, duplicates='drop', labels=False)

# Stratified 80/20 split. The bin labels go straight to `stratify` instead of
# being written into X as a temporary column, so the feature frame never
# carries a target-derived column.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y_binned
)

# Baseline models evaluated on the stratified split.
stratified_models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    # Seeded: the tree permutes features at every split, so ties between equally
    # good splits would otherwise be broken differently from run to run.
    "Decision Tree": DecisionTreeRegressor(random_state=42),
}

# Fit each model on the training part and score it on the hold-out part.
stratified_results = []

for name, model in stratified_models.items():
    print(f"begin {name} training")
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # One record per model; the list is turned into a DataFrame once, below.
    stratified_results.append({
        "Model": name,
        "MSE": mse,
        "MAE": mae,
        "R^2": r2
    })
    print(f"{name} training complete")

# Best model (highest R^2) first; the bare last expression renders the table.
stratified_results_df = pd.DataFrame(stratified_results).sort_values(by="R^2", ascending=False)
stratified_results_df

begin Linear Regression training
Linear Regression training complete
begin Ridge Regression training
Ridge Regression training complete
begin Decision Tree training
Decision Tree training complete


Unnamed: 0,Model,MSE,MAE,R^2
0,Linear Regression,744138.0,667.085915,0.003096
1,Ridge Regression,744138.0,667.085916,0.003096
2,Decision Tree,1519755.0,896.557454,-1.03598


### 2.2 Retraining on full dataset

In [4]:
# Refit on every row of the clipped dataset: no hold-out, so nothing is scored
# in this cell.
y = clipped_dataset[target_column]
X = clipped_dataset.drop(columns=[target_column])

# Only the model chosen for submission is refit.
full_models = {"Linear Regression": LinearRegression()}

# Kept for parity with the evaluation cells; nothing is appended to it here.
full_results = []

for name, model in full_models.items():
    print(f"begin {name} training")
    model.fit(X, y)


begin Linear Regression training


### 2.3 submission

#### 2.3.1 prepare test set

In [2]:
# Work on a copy so the raw test frame stays untouched for the other section.
mean_test = test_data.copy()

# Minimal preprocessing so the fitted models can score the test set.

# Replace the policy start date with an elapsed duration in minutes.
# NOTE(review): the reference time is the moment this cell runs, so the feature
# changes from run to run; the training CSV is built elsewhere -- confirm it
# used a compatible reference.
mean_test['Policy Start Date'] = pd.to_datetime(mean_test['Policy Start Date'])
elapsed = pd.Timestamp.now() - mean_test['Policy Start Date']
mean_test['Policy Duration Mins'] = (elapsed.dt.total_seconds())/60
mean_test = mean_test.drop(columns=['Policy Start Date'])

# Cast object columns to category, then one-hot encode them (drop_first
# removes one level per column).
categorical_cols = [col for col, dtype in mean_test.dtypes.items() if dtype == 'object']
mean_test[categorical_cols] = mean_test[categorical_cols].astype('category')
mean_test = pd.get_dummies(mean_test, drop_first=True)

# Fill remaining nulls with each column's mean.
mean_test = mean_test.fillna(mean_test.mean())

# Model inputs: everything except the first column (presumably `id` -- confirm).
mean_features = mean_test.iloc[:, 1:]



#### 2.3.2 submit to competition

In [6]:
# Score the prepared test set with every fully-trained model, write one CSV
# per model and hand it to the competition.
results_directory = "../results"

for name, model in full_models.items():
    filename = f"{name}_mean_imputation.csv"
    results_full_path = os.path.join(results_directory, filename)
    submission_comment = f"{name} with clipping and mean imputed values"

    y_pred = model.predict(mean_features)

    # Submission table: the test ids next to the predicted premium.
    results = pd.DataFrame({'id': mean_test['id'], 'Premium Amount': y_pred})
    results.to_csv(results_full_path, index=False)

    # NOTE(review): `submit` is not defined or imported in the visible part of
    # this notebook. It is given the bare file name, not results_full_path;
    # presumably it resolves the results directory itself -- confirm.
    submit(filename, submission_comment)

Competition Name: playground-series-s4e12
Full Submission File Path: ../results\Linear Regression_mean_imputation.csv


100%|██████████| 21.2M/21.2M [00:21<00:00, 1.03MB/s]


Submission to 'playground-series-s4e12' successful!
submission score: None
