In [7]:
import pandas as pd
import numpy as np

In [10]:
df_clear = pd.read_csv("D:/DataSets/CrabAge/data_clear.csv")
df_clear.drop(columns = "Unnamed: 0", inplace = True)
df_clear.head()

Unnamed: 0,Sex,Length,Diameter,Height,Weight,Shucked Weight,Viscera Weight,Shell Weight,Age
0,I,0.9125,0.6875,0.3375,6.80388,3.061746,1.261553,2.083688,7
1,F,1.3,1.0,0.325,17.704263,6.095142,5.854172,4.819415,15
2,F,1.3,1.0,0.325,17.704263,6.095142,5.854172,4.819415,15
3,M,0.825,0.5375,0.1875,3.246018,1.275727,0.751262,0.992232,6
4,F,1.3,1.0,0.325,17.704263,6.095142,5.854172,4.819415,15


# **Preparacion de Datos**

# One Hot Encoding

> Nuestra variable categorica `Sex` vamos a darle valores numéricos, para esto vamos a utilizar `one hot encoding`

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [8]:
encoder = OneHotEncoder(sparse_output = False)

In [11]:
encoded_data = encoder.fit_transform(df_clear[["Sex"]])
encoded_data

array([[0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       ...,
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.]], shape=(3636, 3))

In [12]:
encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(["Sex"]))
encoded_df

Unnamed: 0,Sex_F,Sex_I,Sex_M
0,0.0,1.0,0.0
1,1.0,0.0,0.0
2,1.0,0.0,0.0
3,0.0,0.0,1.0
4,1.0,0.0,0.0
...,...,...,...
3631,0.0,1.0,0.0
3632,0.0,0.0,1.0
3633,1.0,0.0,0.0
3634,1.0,0.0,0.0


In [13]:
df_clear.head()

Unnamed: 0,Sex,Length,Diameter,Height,Weight,Shucked Weight,Viscera Weight,Shell Weight,Age
0,I,0.9125,0.6875,0.3375,6.80388,3.061746,1.261553,2.083688,7
1,F,1.3,1.0,0.325,17.704263,6.095142,5.854172,4.819415,15
2,F,1.3,1.0,0.325,17.704263,6.095142,5.854172,4.819415,15
3,M,0.825,0.5375,0.1875,3.246018,1.275727,0.751262,0.992232,6
4,F,1.3,1.0,0.325,17.704263,6.095142,5.854172,4.819415,15


In [14]:
encoded_df.head()

Unnamed: 0,Sex_F,Sex_I,Sex_M
0,0.0,1.0,0.0
1,1.0,0.0,0.0
2,1.0,0.0,0.0
3,0.0,0.0,1.0
4,1.0,0.0,0.0


In [15]:
df_clear = pd.concat([df_clear, encoded_df], axis=1)

df_clear.drop("Sex", axis = 1, inplace=True)
df_clear

Unnamed: 0,Length,Diameter,Height,Weight,Shucked Weight,Viscera Weight,Shell Weight,Age,Sex_F,Sex_I,Sex_M
0,0.9125,0.6875,0.3375,6.803880,3.061746,1.261553,2.083688,7,0.0,1.0,0.0
1,1.3000,1.0000,0.3250,17.704263,6.095142,5.854172,4.819415,15,1.0,0.0,0.0
2,1.3000,1.0000,0.3250,17.704263,6.095142,5.854172,4.819415,15,1.0,0.0,0.0
3,0.8250,0.5375,0.1875,3.246018,1.275727,0.751262,0.992232,6,0.0,0.0,1.0
4,1.3000,1.0000,0.3250,17.704263,6.095142,5.854172,4.819415,15,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
3631,1.5875,1.2500,0.4125,42.212406,20.269893,9.766403,10.248344,13,0.0,1.0,0.0
3632,0.8250,0.5375,0.1875,3.246018,1.275727,0.751262,0.992232,6,0.0,0.0,1.0
3633,1.5500,1.1625,0.3500,28.661344,13.579410,6.761356,7.229122,8,1.0,0.0,0.0
3634,1.3000,1.0000,0.3250,17.704263,6.095142,5.854172,4.819415,15,1.0,0.0,0.0


# **Modelado**

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [17]:
X = df_clear.drop("Age", axis = 1)
y = df_clear["Age"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 17)
X_train

Unnamed: 0,Length,Diameter,Height,Weight,Shucked Weight,Viscera Weight,Shell Weight,Sex_F,Sex_I,Sex_M
1075,0.9125,0.6875,0.3375,6.803880,3.061746,1.261553,2.083688,0.0,1.0,0.0
734,0.8250,0.5375,0.1875,3.246018,1.275727,0.751262,0.992232,0.0,0.0,1.0
513,0.8250,0.5375,0.1875,3.246018,1.275727,0.751262,0.992232,0.0,0.0,1.0
3010,0.9125,0.6875,0.3375,6.803880,3.061746,1.261553,2.083688,0.0,1.0,0.0
3588,1.3625,1.0500,0.4375,21.375523,7.257472,5.032036,7.796112,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
406,1.3000,1.0000,0.3250,17.704263,6.095142,5.854172,4.819415,1.0,0.0,0.0
1414,1.5875,1.2500,0.4125,42.212406,20.269893,9.766403,10.248344,0.0,1.0,0.0
2191,1.3250,1.0125,0.3750,23.572609,9.979024,5.301356,7.158249,0.0,0.0,1.0
3313,0.8625,0.6375,0.2250,5.684075,2.664853,0.836310,1.786018,0.0,0.0,1.0


In [18]:
rf_model = RandomForestRegressor(n_estimators = 100, random_state = 17)
rf_model.fit(X_train, y_train)

lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

In [19]:
rf_predictions = rf_model.predict(X_test).round()
lr_predictions = lr_model.predict(X_test).round()

In [20]:
rf_mse = mean_squared_error(y_test, rf_predictions)
lr_mse = mean_squared_error(y_test, lr_predictions)

print("Random Forest MSE:", rf_mse)
print("Linear Regression MSE:", lr_mse)

Random Forest MSE: 0.0
Linear Regression MSE: 0.3241758241758242


In [21]:
print("\nSample Predictions:")
num_samples = 5

for i in range(num_samples):
    print(f"   - Random Forest => Predicted : {rf_predictions[i]}, Actual: {y_test.iloc[i]}")
    print(f"   - linear Regression => Predicted : {lr_predictions[i]}, Actual: {y_test.iloc[i]}")
    print()


Sample Predictions:
   - Random Forest => Predicted : 10.0, Actual: 10
   - linear Regression => Predicted : 11.0, Actual: 10

   - Random Forest => Predicted : 10.0, Actual: 10
   - linear Regression => Predicted : 10.0, Actual: 10

   - Random Forest => Predicted : 10.0, Actual: 10
   - linear Regression => Predicted : 11.0, Actual: 10

   - Random Forest => Predicted : 6.0, Actual: 6
   - linear Regression => Predicted : 6.0, Actual: 6

   - Random Forest => Predicted : 6.0, Actual: 6
   - linear Regression => Predicted : 6.0, Actual: 6



# **Validación**

# **Despliegue**