# Train Base Models

### Purpose:

Repeat notebook 3.1, but use a train/test split that is stratified on the binned target.

### Result:
The stratified split helped a decent amount: RMSLE improved from 1.16918 (the new score is missing from this note).

However, the median heuristic still outperforms these models, with a score of 1.10815.

In [None]:
%pip install -r ../../requirements.txt

# load dataset

In [16]:
# This cell runs before the notebook's main import cell, so pandas is brought
# into scope here; otherwise `pd` is undefined on a fresh kernel
# ("Restart & Run All").
import pandas as pd

# Training data (one-hot encoded, per the file name); the target column is
# separated from the features in a later cell.
base_dataset = pd.read_csv('../data/baseline_dataset_one_hot.csv')
# Raw test set; it is preprocessed further down before prediction.
test_dataset = pd.read_csv('../data/raw_dataset/test.csv')

In [17]:
import os

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR


In [18]:
# Explicitly define the target column
target_column = 'Premium Amount'

# Separate features (X) and target (y)
X = base_dataset.drop(columns=[target_column])
y = base_dataset[target_column]

# Bin the continuous target into quantile buckets so that a regression target
# can drive stratification. `duplicates='drop'` merges repeated bin edges, so
# fewer than n_bins buckets may result.
n_bins = 20
y_binned = pd.qcut(y, q=n_bins, duplicates='drop', labels=False)

# Stratified train-test split. The bin labels are passed straight to
# `stratify` instead of being written into X as a temporary column: the
# feature frame never holds a target-derived column, so nothing has to be
# dropped afterwards and it cannot leak into training by accident.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y_binned
)

# Define baseline models, keyed by the display name used in the results table
# and in the submission file names.
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    # Seeded to match the seeded train/test split, so repeated runs of the
    # notebook produce the same tree and the same metrics.
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    # Currently disabled (reason not recorded here; presumably training
    # cost -- confirm before re-enabling):
    # "Random Forest": RandomForestRegressor(n_estimators=100),
    # "Support Vector Regressor": SVR()
}

# Fit every baseline on the training split and score it on the held-out split.
# Each label below becomes one column of the summary table.
metric_fns = {
    "MSE": mean_squared_error,
    "MAE": mean_absolute_error,
    "R^2": r2_score,
}

results = []
for name, model in models.items():
    print(f"begin {name} training")
    model.fit(X_train, y_train)

    # Score the held-out predictions with every metric
    y_pred = model.predict(X_test)
    row = {"Model": name}
    for label, metric in metric_fns.items():
        row[label] = metric(y_test, y_pred)
    results.append(row)

    print(f"{name} training complete")

# Summary table, best R^2 first; the bare name on the last line makes the
# notebook render it.
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by="R^2", ascending=False)
results_df

begin Linear Regression training
Linear Regression training complete
begin Ridge Regression training
Ridge Regression training complete
begin Decision Tree training
Decision Tree training complete


Unnamed: 0,Model,MSE,MAE,R^2
1,Ridge Regression,751997.1,670.205323,0.0035
0,Linear Regression,751997.1,670.205322,0.0035
2,Decision Tree,1527455.0,897.573078,-1.024089


In [19]:
#minimally process the test dataset to get model predictions

#convert the policy start time to duration in mins
# NOTE(review): the duration is measured against the current wall-clock time,
# so this feature changes from run to run and may not use the same reference
# time as the training data -- confirm against the training pipeline.
test_dataset['Policy Start Date'] = pd.to_datetime(test_dataset['Policy Start Date'])
test_dataset['Policy Duration Mins'] = ((pd.Timestamp.now() - test_dataset['Policy Start Date']).dt.total_seconds())/60
test_dataset = test_dataset.drop(columns=['Policy Start Date'])

#mark every object-dtype column as categorical
categorical_cols = [col for col, dtype in test_dataset.dtypes.items() if dtype == 'object']
test_dataset[categorical_cols] = test_dataset[categorical_cols].astype('category')
# Convert categorical to one hot encodings
test_dataset = pd.get_dummies(test_dataset, drop_first=True)
#fill nulls with median values (medians are computed on the test set itself)
test_dataset = test_dataset.fillna(test_dataset.median())

# Align the test features with the columns the models were fitted on rather
# than slicing by position: one-hot encoding the test set on its own can
# yield a different set or order of columns than the training data.
# Training columns absent from the test set are filled with 0 and reported;
# test-only columns (including 'id' when it is not a training feature) are
# left out of `features` but stay available in `test_dataset`.
missing_cols = [col for col in X_train.columns if col not in test_dataset.columns]
if missing_cols:
    print(f"warning: {len(missing_cols)} training columns absent from the test set, filled with 0: {missing_cols}")
features = test_dataset.reindex(columns=X_train.columns, fill_value=0)

In [20]:
#generate results and submit to competition
results_directory = "../results"
# Create the output folder up front so to_csv does not fail on a fresh checkout.
os.makedirs(results_directory, exist_ok=True)

for name, model in models.items():

    y_pred = model.predict(features)

    # Named `submission` so the metrics list `results` built by the training
    # cell is not overwritten by a DataFrame.
    submission = pd.DataFrame({
        'id': test_dataset['id'],
        'Premium Amount': y_pred
    })

    # One CSV per model, named after the model's display name
    filename = f"{name}_baseline_one_hot_stratified.csv"
    results_full_path = os.path.join(results_directory, filename)

    submission.to_csv(results_full_path, index=False)

    # submission_comment = f"baseline: {name} + minimally processed one-hot encoded dataset"
    # submit(filename,submission_comment) #custom function
    # print(f"successfully submitted baseline {name}")
