# RANSAC

In [58]:
from sklearn.linear_model import RANSACRegressor, Ridge
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, r2_score
from sklearn.preprocessing import StandardScaler
import numpy as np

# Splitting data
X = dfFeature.drop(['Balance'], axis=1)
y = dfFeature['Balance']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [61]:

# Define RANSAC regressor with Ridge as the base estimator
ransac = RANSACRegressor(
    estimator=Ridge(alpha=1.0, random_state=42),
    random_state=42,
    min_samples=0.5  # Or an integer like `min_samples=50`
)
# Fit the RANSAC model
ransac.fit(X_train_scaled, y_train)

# Predict and evaluate
y_pred = ransac.predict(X_test_scaled)

mse = mean_squared_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse}")
print(f"Mean Absolute Percentage Error (MAPE): {mape}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared (R²): {r2}")
print(f"Inlier mask: {ransac.inlier_mask_}")


Mean Squared Error (MSE): 3082793753.619123
Mean Absolute Percentage Error (MAPE): 4.923177322686527e+19
Root Mean Squared Error (RMSE): 55522.911969916735
R-squared (R²): 0.20108604047983236
Inlier mask: [ True  True  True ...  True False  True]


RANSAC regression did not perform well in this case, shown by the high errors and the low R squared. The target variable likely has a complex relationship with the features that RANSAC couldn't capture effectively.

# TheilSen

In [63]:
from sklearn.linear_model import TheilSenRegressor

# Splitting data
X = dfFeature.drop(['Balance'], axis=1)
y = dfFeature['Balance']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define Theil-Sen regressor
theil_sen = TheilSenRegressor(random_state=42)

# Fit the Theil-Sen model
theil_sen.fit(X_train_scaled, y_train)

# Predict and evaluate
y_pred = theil_sen.predict(X_test_scaled)

mse = mean_squared_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse}")
print(f"Mean Absolute Percentage Error (MAPE): {mape}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared (R²): {r2}")

Mean Squared Error (MSE): 2842160216.0976925
Mean Absolute Percentage Error (MAPE): 7.295105910908207e+19
Root Mean Squared Error (RMSE): 53311.9143916038
R-squared (R²): 0.26344684292693066


The model still fails to capture the underlying relationships effectively.
Extremelly high MAPE, and poor performance overall.