# Neural Network


In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import mean_absolute_error

In [23]:
# Load the training and test data
train_data_path = '../resources/train.csv'

df_train = pd.read_csv(train_data_path)

In [44]:
columns_to_drop = ['id', 'color_exterior', 'color_interior', 'tipo_combustible', 'accidente', 'marca']
df_train = df_train.drop(columns=[col for col in columns_to_drop if col in df_train.columns])

df_train['sin_daños'] = df_train['sin_daños'].fillna("Unknown")

# Separate features and target
X = df_train.drop(columns=['precio'])
y = df_train['precio']

# Encode categorical features
for col in X.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])

# Scale numeric features
scaler = StandardScaler()
X[X.select_dtypes(include=['int64', 'float64']).columns] = scaler.fit_transform(X.select_dtypes(include=['int64', 'float64']))

# Define KFold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Initialize model performance lists
mae_scores = []

# Cross-validation loop
for train_index, val_index in kf.split(X):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # Build the model
    model = Sequential([
        Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
        Dense(32, activation='relu'),
        Dense(1)  # Output layer for regression
    ])
    model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mae'])

    # Train the model
    model.fit(X_train, y_train, epochs=10, batch_size=32, verbose=1, validation_data=(X_val, y_val))

    # Predict and evaluate
    y_pred = model.predict(X_val).flatten()
    mae = mean_absolute_error(y_val, y_pred)
    mae_scores.append(mae)
    print(f"Fold MAE: {mae}")

# Report the cross-validated mean MAE
print(f"\nCross-validated Mean MAE: {np.mean(mae_scores):.4f}")

Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m4242/4242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 618us/step - loss: 7523160576.0000 - mae: 34797.7852 - val_loss: 4100219904.0000 - val_mae: 22463.0840
Epoch 2/10
[1m4242/4242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 581us/step - loss: 6348420608.0000 - mae: 23327.4648 - val_loss: 4056389888.0000 - val_mae: 21197.4082
Epoch 3/10
[1m4242/4242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 588us/step - loss: 5998876672.0000 - mae: 22074.8906 - val_loss: 4037700864.0000 - val_mae: 20737.5137
Epoch 4/10
[1m4242/4242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 584us/step - loss: 5672191488.0000 - mae: 21415.7227 - val_loss: 4030234368.0000 - val_mae: 20432.3555
Epoch 5/10
[1m4242/4242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 580us/step - loss: 6110620672.0000 - mae: 21845.3105 - val_loss: 4026340096.0000 - val_mae: 20310.4336
Epoch 6/10
[1m4242/4242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 580us/step - loss: 658

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m4242/4242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 605us/step - loss: 8173855232.0000 - mae: 35530.1992 - val_loss: 5592975360.0000 - val_mae: 23512.6230
Epoch 2/10
[1m4242/4242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 575us/step - loss: 5025401856.0000 - mae: 22890.6055 - val_loss: 5551115264.0000 - val_mae: 22438.3281
Epoch 3/10
[1m4242/4242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 563us/step - loss: 5361023488.0000 - mae: 21766.1445 - val_loss: 5524818944.0000 - val_mae: 22040.4277
Epoch 4/10
[1m4242/4242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 571us/step - loss: 5361196544.0000 - mae: 21548.3535 - val_loss: 5512519168.0000 - val_mae: 21635.3398
Epoch 5/10
[1m4242/4242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 574us/step - loss: 5328053248.0000 - mae: 21104.5723 - val_loss: 5507100160.0000 - val_mae: 21605.2871
Epoch 6/10
[1m4242/4242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 592us/step - loss: 599

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m4242/4242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 584us/step - loss: 7063368704.0000 - mae: 34949.1523 - val_loss: 6737541632.0000 - val_mae: 24103.1641
Epoch 2/10
[1m4242/4242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 566us/step - loss: 5015931392.0000 - mae: 22573.3145 - val_loss: 6680872960.0000 - val_mae: 22315.9199
Epoch 3/10
[1m4242/4242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 560us/step - loss: 5170360320.0000 - mae: 21415.2773 - val_loss: 6654256640.0000 - val_mae: 21816.1621
Epoch 4/10
[1m4242/4242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 564us/step - loss: 5373192192.0000 - mae: 21182.6465 - val_loss: 6645300224.0000 - val_mae: 21767.7676
Epoch 5/10
[1m4242/4242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 562us/step - loss: 5449452544.0000 - mae: 21389.2617 - val_loss: 6640987136.0000 - val_mae: 21708.4004
Epoch 6/10
[1m4242/4242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 567us/step - loss: 490

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m4242/4242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 580us/step - loss: 6964722688.0000 - mae: 35561.6406 - val_loss: 6608337408.0000 - val_mae: 23666.5098
Epoch 2/10
[1m4242/4242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 578us/step - loss: 5926472192.0000 - mae: 23414.4414 - val_loss: 6563680768.0000 - val_mae: 22480.7539
Epoch 3/10
[1m4242/4242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 621us/step - loss: 5041139712.0000 - mae: 21980.2793 - val_loss: 6539256320.0000 - val_mae: 21783.4531
Epoch 4/10
[1m4242/4242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 673us/step - loss: 5386603520.0000 - mae: 21309.8457 - val_loss: 6529076736.0000 - val_mae: 21710.8848
Epoch 5/10
[1m4242/4242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 607us/step - loss: 4953451520.0000 - mae: 21086.4668 - val_loss: 6524550656.0000 - val_mae: 21881.9062
Epoch 6/10
[1m4242/4242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 581us/step - loss: 548

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m4242/4242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 548us/step - loss: 7431709184.0000 - mae: 35369.4609 - val_loss: 5255010816.0000 - val_mae: 23163.4062
Epoch 2/10
[1m4242/4242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 531us/step - loss: 6188959744.0000 - mae: 23431.6445 - val_loss: 5212799488.0000 - val_mae: 22124.8828
Epoch 3/10
[1m4242/4242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 528us/step - loss: 5697991680.0000 - mae: 22253.4590 - val_loss: 5186149888.0000 - val_mae: 21367.3086
Epoch 4/10
[1m4242/4242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 578us/step - loss: 5967199232.0000 - mae: 21844.6895 - val_loss: 5172986368.0000 - val_mae: 21197.7500
Epoch 5/10
[1m4242/4242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 564us/step - loss: 5697931264.0000 - mae: 21517.6328 - val_loss: 5166777344.0000 - val_mae: 20915.9961
Epoch 6/10
[1m4242/4242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 528us/step - loss: 598

In [45]:
print(X_train.info())

<class 'pandas.core.frame.DataFrame'>
Index: 135744 entries, 0 to 169678
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   numero       135744 non-null  float64
 1   modelo       135744 non-null  float64
 2   año_modelo   135744 non-null  float64
 3   millaje      135744 non-null  float64
 4   motor        135744 non-null  float64
 5   transmisión  135744 non-null  float64
 6   sin_daños    135744 non-null  float64
dtypes: float64(7)
memory usage: 8.3 MB
None


In [52]:
# Load the guess.csv
guess_df = pd.read_csv('../resources/guess.csv')

# Drop the same columns as in training
columns_to_drop = ['color_exterior', 'color_interior', 'tipo_combustible', 'accidente', 'marca', 'precio']
guess_df = guess_df.drop(columns=[col for col in columns_to_drop if col in guess_df.columns])

guess_df['sin_daños'] = guess_df['sin_daños'].fillna("Unknown")

# Separate the 'id' column to use it later in the output
id_column = guess_df['id']
guess_df = guess_df.drop(columns=['id'])

# Encode categorical features in the guess data
for col in guess_df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    guess_df[col] = le.fit_transform(guess_df[col])

# Scale numeric features (same scaling used for training)
scaler = StandardScaler()
guess_df[guess_df.select_dtypes(include=['int64', 'float64']).columns] = scaler.fit_transform(guess_df.select_dtypes(include=['int64', 'float64']))

print(guess_df.info())
# Make predictions
predictions = model.predict(guess_df).flatten()

# Create a DataFrame with 'id' and 'prediction' columns
output_df = pd.DataFrame({
    'id': id_column,
    'prediction': predictions
})

# Save to CSV
output_df.to_csv('../output/predictions.csv', index=False)

print("Predictions have been saved to 'predictions.csv'")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18854 entries, 0 to 18853
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   numero       18854 non-null  float64
 1   modelo       18854 non-null  float64
 2   año_modelo   18854 non-null  float64
 3   millaje      18854 non-null  float64
 4   motor        18854 non-null  float64
 5   transmisión  18854 non-null  float64
 6   sin_daños    18854 non-null  float64
dtypes: float64(7)
memory usage: 1.0 MB
None
[1m590/590[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 402us/step
Predictions have been saved to 'predictions.csv'
