# Machine Learning Pipeline - Model Training


In [None]:
# Clone the project repository into ./deploy_model_fastapi (shell escape).
!git clone https://github.com/LuisAybar06/deploy_model_fastapi.git

In [None]:
mv deploy_model_fastapi/* .

In [None]:
rm -rf deploy_model_fastapi

In [None]:
# to handle datasets
import pandas as pd
import numpy as np

# for plotting
import matplotlib.pyplot as plt

# to save the model
import joblib

# to build the model
from sklearn.linear_model import Lasso

# to evaluate the model
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Show every column when a DataFrame is displayed (None = no column limit).
# Uses the public `pd.set_option` entry point rather than `pd.pandas`, which
# is not part of the documented pandas API.
pd.set_option('display.max_columns', None)

In [None]:
# Load the train/test splits from CSV files in the working directory.
# Features and the training target are kept as DataFrames exactly as read.
X_train = pd.read_csv('xtrain.csv')
Y_train = pd.read_csv('ytrain.csv')
X_test = pd.read_csv('xtest.csv')

# The test target is reduced to its 'SalePrice' column (a Series).
y_test = pd.read_csv('ytest.csv')['SalePrice']

In [None]:
# Load the selected feature names: they are stored in the CSV column
# labelled '0' and converted to a plain Python list.
features = pd.read_csv('selected_features.csv')['0'].to_list()

In [None]:
# Restrict both feature sets to the selected feature columns
# (all rows kept, columns taken in the order given by `features`).
X_train = X_train.loc[:, features]
X_test = X_test.loc[:, features]

## TRAINING


In [None]:
# Fit a Lasso regression on the selected training features.
# random_state is pinned to 0 and the L1 penalty strength (alpha) is 0.001.
lin_model = Lasso(random_state=0, alpha=0.001)
lin_model.fit(X=X_train, y=Y_train)

## SAVE THE MODEL

In [None]:
# Persist the fitted model to 'linear_regression.joblib' in the working directory.
joblib.dump(value=lin_model, filename='linear_regression.joblib')

In [None]:
import joblib

# Reload the persisted model to check that the saved file can be read back.
# Only the load call sits inside `try`, so the error message below reports
# load failures and nothing else; the success message runs in `else`.
try:
    model = joblib.load('linear_regression.joblib')
except Exception as e:
    # Best-effort check: report the problem and let the notebook continue.
    print(f"Error al cargar el modelo: {e}")
else:
    print("Modelo cargado correctamente.")


## PREDICTION


In [None]:
# Predict on the test features with the in-memory fitted model (`lin_model`).
y_pred = lin_model.predict(X=X_test)

## METRICS


In [None]:
# Evaluation metrics on the test set: mean squared error, mean absolute
# error and the coefficient of determination (R²).
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Report the three metrics, one per line.
print(f'MSE: {mse}', f'R²: {r2}', f'MAE: {mae}', sep='\n')

In [None]:
from sklearn.model_selection import learning_curve

# Learning curve for the model on the training data with cv=5.
train_sizes, train_scores, test_scores = learning_curve(
    estimator=lin_model, X=X_train, y=Y_train, cv=5
)

# Average the scores along axis 1 so each training size gets a single
# train value and a single validation value.
plt.figure(figsize=(10, 6))
plt.plot(train_sizes, train_scores.mean(axis=1), label='Train Score')
plt.plot(train_sizes, test_scores.mean(axis=1), label='Validation Score')
plt.title('Curva de Aprendizaje')
plt.xlabel('Tamaño de Entrenamiento')
plt.ylabel('Puntuación')
plt.legend()
plt.show()


In [None]:
# Scatter plot of predictions against actual values.
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, color='blue')
# Reference diagonal (prediction == actual value) spanning the range of the
# actual values. The bounds come from pandas' own reductions, computed once,
# instead of the Python builtins iterating the Series element by element.
diag_limits = [y_test.min(), y_test.max()]
plt.plot(diag_limits, diag_limits, color='red', linewidth=2)
plt.xlabel('Valores Reales')
plt.ylabel('Predicciones')
plt.title('Predicciones vs. Valores Reales')
plt.show()

In [None]:
# Histogram of the prediction errors (actual value minus prediction).
errors = y_test - y_pred
plt.figure(figsize=(10, 6))
plt.hist(x=errors, bins=25, edgecolor='black', color='blue')
plt.title('Distribución de Errores')
plt.xlabel('Error')
plt.ylabel('Frecuencia')
plt.show()

In [None]:
import pandas as pd

# Read the actual target values into their own array instead of rebinding
# `y_test`, so the `y_test` used by the metric and plotting cells keeps its
# type and those cells can still be re-run after this one.
y_test_values = pd.read_csv('ytest.csv')['SalePrice'].to_numpy()

# Per-row comparison of predictions against actual values.
df_results = pd.DataFrame({
    'Prediccion': y_pred,
    'Valor_Real': y_test_values,
    # Signed deviation of the prediction from the actual value, in percent.
    'Porcentaje_Desviacion': ((y_pred - y_test_values) / y_test_values) * 100,
    # 1 when the prediction exceeds the actual value, 0 otherwise
    # (vectorized comparison instead of a Python-level loop).
    'Flag': (y_pred > y_test_values).astype(int),
})


print(df_results.head())