# Train Base Models

Purpose:

Now that we have a minimally processed dataset, we'll try out a few baseline models:
-    Linear Regression
-    Ridge Regression
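-    Decision Tree
-    Random Forest (defined in the training cell but currently commented out)
-    Support Vector Regressor (defined in the training cell but currently commented out)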

In [None]:
%pip install -r ../../requirements.txt

## Load the datasets

In [15]:
# pandas is imported in this cell because it sits above the notebook's import
# cell; without it a fresh "Restart Kernel -> Run All" stops here with a
# NameError on `pd`.
import pandas as pd

# Training data for the baseline models; the target column is split off in
# the training cell.
base_dataset = pd.read_csv('../data/baseline_model_dataset.csv')
# Raw test file; it is encoded and imputed in a later cell before being
# passed to model.predict.
test_dataset = pd.read_csv('../data/raw_dataset/test.csv')

In [2]:
import os

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor


In [6]:
# Fit every baseline regressor on an 80/20 split of base_dataset and collect
# hold-out MSE / MAE / R^2 into a table sorted by R^2 (best first).

# One seed shared by the split and by every seedable estimator, so re-running
# the cell reproduces the same metrics table.
SEED = 42

# Explicitly define the target column
target_column = 'Premium Amount'

# Separate features (X) and target (y)
X = base_dataset.drop(columns=[target_column])
y = base_dataset[target_column]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

# Define baseline models. The dict is reused by the submission cell, so the
# fitted estimators stay available after this cell finishes.
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    # Seeded: the tree permutes features at each split, so an unseeded tree
    # can give different scores from run to run.
    "Decision Tree": DecisionTreeRegressor(random_state=SEED),
    # NOTE(review): the two models below are disabled -- presumably because of
    # training time on this dataset; confirm before re-enabling.
    # "Random Forest": RandomForestRegressor(n_estimators=100),
    # "Support Vector Regressor": SVR()
}

# Train and evaluate each model; one dict of metrics per model.
results = []

for name, model in models.items():
    print(f"begin {name} training")
    # Train the model
    model.fit(X_train, y_train)

    # Make predictions on the hold-out split
    y_pred = model.predict(X_test)

    # Evaluate performance and store the row
    results.append({
        "Model": name,
        "MSE": mean_squared_error(y_test, y_pred),
        "MAE": mean_absolute_error(y_test, y_pred),
        "R^2": r2_score(y_test, y_pred),
    })
    print(f"{name} training complete")

# Display results
results_df = pd.DataFrame(results).sort_values(by="R^2", ascending=False)
results_df

begin Linear Regression training
Linear Regression training complete
begin Ridge Regression training
Ridge Regression training complete
begin Decision Tree training
Decision Tree training complete


Unnamed: 0,Model,MSE,MAE,R^2
1,Ridge Regression,751455.9,671.03236,0.003285
0,Linear Regression,751455.9,671.03236,0.003285
2,Decision Tree,1529073.0,898.257432,-1.028129


In [16]:
# Minimally process the raw test set so the fitted baseline models can score it.
#
# The work is done on a copy of test_dataset so this cell can be re-run: the
# 'Policy Start Date' column is dropped below, and an in-place version would
# raise a KeyError on the second run.
date_column = 'Policy Start Date'
test_processed = test_dataset.copy()

# Parse the start date BEFORE label encoding. Read from CSV without
# parse_dates it is a string (object) column, so encoding it first would hand
# integer category codes to pd.to_datetime, which reads integers as
# nanoseconds since 1970-01-01 and turns the duration into a near-constant.
# NOTE(review): confirm the training dataset derives its duration feature from
# the real dates as well, otherwise train and test features will disagree.
start_dates = pd.to_datetime(test_processed[date_column])
test_processed = test_processed.drop(columns=[date_column])

# Label-encode the remaining object-dtype columns with pandas category codes
# (missing values are encoded as -1, so the median fill below skips them).
# NOTE(review): codes are derived from the categories present in this file
# only; verify that they line up with the encoding used for the training data.
categorical_cols = [
    col for col, dtype in test_processed.dtypes.items() if dtype == 'object'
]
for col in categorical_cols:
    test_processed[col] = test_processed[col].astype('category').cat.codes

# Convert the policy start time to a duration in minutes, measured up to now.
# NOTE(review): this value depends on when the cell is executed; confirm the
# training feature was built against a comparable reference time.
reference_time = pd.Timestamp.now()
test_processed['Policy Duration Mins'] = (
    (reference_time - start_dates).dt.total_seconds() / 60
)

# Fill remaining nulls with each column's median
test_processed = test_processed.fillna(test_processed.median())

# Everything after the first column is a model input; the first column is
# assumed to be the row id (the submission cell reads test_dataset['id']) --
# TODO confirm.
features = test_processed.iloc[:, 1:]

In [17]:
#generate results and submit to competition
results_directory = "../results"

for name, model in models.items():
    
    y_pred = model.predict(features)

    results = pd.DataFrame({
        'id': test_dataset['id'],  
        'Premium Amount': y_pred   
    })

    filename = f"{name}_baseline.csv"
    results_full_path = os.path.join(results_directory,filename)
    
    results.to_csv(results_full_path, index=False)

    submission_comment = f"baseline: {name} + minimally processed dataset"
    submit(filename,submission_comment) #custom function
    # print(f"successfully submitted baseline {name}")


Competition Name: playground-series-s4e12
Full Submission File Path: ../results\Linear Regression_baseline.csv


100%|██████████| 21.2M/21.2M [00:17<00:00, 1.25MB/s]


Submission to 'playground-series-s4e12' successful!
successfully submitted baseline Linear Regression
Competition Name: playground-series-s4e12
Full Submission File Path: ../results\Ridge Regression_baseline.csv


100%|██████████| 21.2M/21.2M [00:18<00:00, 1.18MB/s]


Submission to 'playground-series-s4e12' successful!
successfully submitted baseline Ridge Regression
Competition Name: playground-series-s4e12
Full Submission File Path: ../results\Decision Tree_baseline.csv


100%|██████████| 11.8M/11.8M [00:10<00:00, 1.24MB/s]


Submission to 'playground-series-s4e12' successful!
successfully submitted baseline Decision Tree
