# Random forest baseline

Now that we have identified what works best for this dataset (see the key findings below), we train a single random forest on a stratified subsample of the training data (first 1/10th, then 25%) and compare its performance against the earlier baselines.

### Key findings so far:

- **imputation**: clipping and filling nulls with mean  
- **encoding**: one-hot encoding
- **validation scores**: more reliable after performing stratified split
- **baseline**: logistic regression is best baseline model
- **heuristic baseline**: predicting the median value is the **BEST SCORE SO FAR! -- 1.10815**

### Results
10% of dataset: 1.15910 (about 4.5 minutes for training + validation)

25% of dataset: _TODO – score not yet recorded_

In [1]:
# pandas (pd.* in every cell) and os (os.path when writing results) are used in this
# notebook but were never imported in it; import them here so it runs on a fresh kernel.
import os

import pandas as pd

# Training set after clipping + mean imputation (per the file name).
train_dataset = pd.read_csv("../data/03_imputation_clipped_mean.csv")

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor



# Name of the column the models are trained to predict
target_column = "Premium Amount"

# Raw test set; it is preprocessed in a later cell before predictions are made
test_data = pd.read_csv(filepath_or_buffer="../data/raw_dataset/test.csv")

In [3]:
# Separate features (X) and target (y)
X = train_dataset.drop(columns=[target_column])
y = train_dataset[target_column]

# Bin the continuous target into quantiles so the split can be stratified on it.
# duplicates='drop' discards non-unique bin edges, so fewer than n_bins bins may result.
n_bins = 20
y_binned = pd.qcut(y, q=n_bins, duplicates='drop', labels=False)

# Stratified split: 25% of the rows are used for training, the remaining 75% for validation.
# Stratifying directly on y_binned gives the same split as stratifying on a helper column,
# without mutating X (the helper column used to stay on X and leak into later cells).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.25, random_state=42, stratify=y_binned
)

# Define advanced baseline stratified_models
stratified_models = {
    # n_jobs=-1 fits and predicts with all available cores; it only affects wall-clock
    # time, the fitted forest is still controlled by random_state.
    "Random Forest": RandomForestRegressor(random_state=42, n_jobs=-1)
}

# Train and evaluate each model, collecting one metrics row per model
stratified_results = []

for name, model in stratified_models.items():
    print(f"begin {name} training")
    # Train the model on the 25% training split
    model.fit(X_train, y_train)
    print("training complete!")
    # Predict on the held-out validation split
    y_pred = model.predict(X_test)
    print("predictions complete!")

    # Evaluate performance on the validation split
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Store stratified_results
    stratified_results.append({
        "Model": name,
        "MSE": mse,
        "MAE": mae,
        "R^2": r2
    })
    print(f"{name} training complete")


# Display stratified_results, best R^2 first
stratified_results_df = pd.DataFrame(stratified_results).sort_values(by="R^2", ascending=False)
stratified_results_df

begin Random Forest training
training complete!
predictions complete!
Random Forest training complete


Unnamed: 0,Model,MSE,MAE,R^2
0,Random Forest,731608.176086,657.973315,0.021928


In [4]:
# Work on a copy so the raw test frame loaded earlier stays untouched
mean_test = test_data.copy()


# Minimally process the test dataset to get model predictions

# Convert the policy start date to an elapsed duration in minutes.
# NOTE(review): pd.Timestamp.now() makes this feature depend on when the cell is run;
# confirm the training data used a comparable reference time, or pin a fixed timestamp.
mean_test['Policy Start Date'] = pd.to_datetime(mean_test['Policy Start Date'])
mean_test['Policy Duration Mins'] = ((pd.Timestamp.now() - mean_test['Policy Start Date']).dt.total_seconds())/60
mean_test = mean_test.drop(columns=['Policy Start Date'])

# One-hot encode the string (object) columns -- this is one-hot, not label, encoding
categorical_cols = mean_test.select_dtypes(include='object').columns.tolist()
mean_test[categorical_cols] = mean_test[categorical_cols].astype('category')
mean_test = pd.get_dummies(mean_test, drop_first=True)

# Fill remaining nulls with the training-set column means (matched by column name).
# numeric_only=True keeps this from raising if X ever carries a non-numeric column.
mean_test = mean_test.fillna(X.mean(numeric_only=True))

# Select exactly the columns the model was fitted on, in the same order, instead of
# dropping the first column by position. Training columns absent from the test frame
# (e.g. a dummy for a category that never occurs in test) are added as 0.
train_columns = X_train.columns
missing_cols = [col for col in train_columns if col not in mean_test.columns]
unused_cols = [col for col in mean_test.columns if col not in train_columns and col != 'id']
if missing_cols or unused_cols:
    # Surface any mismatch so a naming problem is not silently papered over with zeros
    print(f"WARNING: test/train feature mismatch -- filled with 0: {missing_cols}; dropped: {unused_cols}")

# mean_test keeps its 'id' column, which is needed when building the submission file
mean_features = mean_test.reindex(columns=train_columns, fill_value=0)



In [5]:
# Generate predictions for the test set, save them as CSV, and submit to the competition
results_directory = "../results"
# DataFrame.to_csv does not create parent folders, so make sure the output directory exists
os.makedirs(results_directory, exist_ok=True)

for name, model in stratified_models.items():

    # Predict with the model fitted in the training cell
    y_pred = model.predict(mean_features)

    # Submission frame: one row per test id with its predicted premium
    results = pd.DataFrame({
        'id': mean_test['id'],
        'Premium Amount': y_pred
    })

    filename = f"{name}_advanced_baseline_25_percent_train.csv"
    results_full_path = os.path.join(results_directory,filename)

    results.to_csv(results_full_path, index=False)

    submission_comment = f"Advanced baseline model {name} using 25% of clipped mean dataset"
    # NOTE(review): submit() is not defined or imported anywhere in this notebook --
    # presumably a project helper; confirm it is in scope and that it expects the bare
    # file name rather than results_full_path.
    submit(filename,submission_comment)