In [7]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_log_error, make_scorer
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import randint

In [4]:
data = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
test_ids = test['id']

Label_encoder = LabelEncoder()

data['Sex_encoded'] = Label_encoder.fit_transform(data['Sex'])
data.drop('Sex', axis=1, inplace=True)

test['Sex_encoded'] = Label_encoder.fit_transform(test['Sex'])
test.drop('Sex', axis=1, inplace=True)

y = data['Rings']
X = data.drop('Rings', axis = 1)

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# Define custom scorer for RMSLE
def rmsle_score(y_true, y_pred):
    rmsle = np.sqrt(mean_squared_log_error(y_true, y_pred))
    return -rmsle  # Negate RMSLE to maximize (since GridSearchCV maximizes by default)

# Make RMSLE scorer
rmsle_scorer = make_scorer(rmsle_score)

# Create RandomForestRegressor
rf_regressor = RandomForestRegressor(random_state=42)

# Define parameter distributions for RandomizedSearchCV
param_dist_rf = {
    'n_estimators': randint(50, 200),
    'max_depth': [None] + list(range(10, 21)),
    'min_samples_split': randint(2, 11),
    'min_samples_leaf': randint(1, 5),
    'bootstrap': [True, False]
}

# Create RandomizedSearchCV object
random_search_rf = RandomizedSearchCV(estimator=rf_regressor, param_distributions=param_dist_rf, 
                                      n_iter=20, scoring=rmsle_scorer, cv=5, random_state=42)

# Perform randomized search
random_search_rf.fit(X_train, y_train)

# Get best estimator
best_rf_classifier_random = random_search_rf.best_estimator_

# Print best parameters and score
print("Best Parameters for Random Forest (Randomized Search):", random_search_rf.best_params_)
print("Best Score (RMSLE) for Random Forest (Randomized Search):", -random_search_rf.best_score_)


Best Parameters for Random Forest (Randomized Search): {'bootstrap': True, 'max_depth': 12, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 180}
Best Score (RMSLE) for Random Forest (Randomized Search): 0.15097594885090923


In [10]:
predictions = best_rf_classifier_random.predict(X_val)

# Ensure predictions are non-negative
predictions = np.clip(predictions, 0, None)

# Calculate RMSLE
rmsle = np.sqrt(mean_squared_log_error(np.log1p(predictions), np.log1p(y_val)))
print("RMSLE Score:", rmsle)

RMSLE Score: 0.04623680404693483


In [11]:
submission_preds = best_rf_classifier_random.predict(test)

# Ensure submission predictions are non-negative
submission_preds = np.clip(submission_preds, 0, None)

# Round the predictions and convert them to integers
submission_preds_rounded = np.round(submission_preds).astype(int)

In [12]:
df = pd.DataFrame({'id': test_ids.values, 
                   'Rings': submission_preds_rounded
                   })

In [17]:
df.to_csv('submission2.csv', index=False)

In [16]:
# Check if any target value is negative
negative_targets_exist = (df['Rings'] < 0).any()

if negative_targets_exist:
    print("Negative target values exist.")
else:
    print("No negative target values.")


No negative target values.
