In [38]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [39]:
# Read the fact table CSV (path is relative to the notebook's working directory)
# into a DataFrame.
FACT_TABLE_CSV = "../data/ml/fact_table_featured.csv"
fact_table = pd.read_csv(FACT_TABLE_CSV)

In [40]:
# Show the loaded frame as the cell output to inspect its columns and a sample of rows.
fact_table

Unnamed: 0,plan,duration,mb_used,message_count,messages_included,mb_per_month_included,minutes_included,usd_monthly_pay,usd_per_gb,usd_per_message,usd_per_minute,extra_minutes,extra_mb,extra_messages,extra_cost,total_cost,prev_total_cost,prev_extra_minutes,prev_extra_mb,prev_extra_messages
0,ultimate,0.00,0.00,0.0,1000,30720,3000,70,7,0.01,0.01,0.0,0.00,0.0,0.000000,70.000000,70.000000,0.0,0.00,0.0
1,ultimate,0.00,0.00,0.0,1000,30720,3000,70,7,0.01,0.01,0.0,0.00,0.0,0.000000,70.000000,70.000000,0.0,0.00,0.0
2,ultimate,0.00,0.00,0.0,1000,30720,3000,70,7,0.01,0.01,0.0,0.00,0.0,0.000000,70.000000,70.000000,0.0,0.00,0.0
3,ultimate,0.00,0.00,0.0,1000,30720,3000,70,7,0.01,0.01,0.0,0.00,0.0,0.000000,70.000000,70.000000,0.0,0.00,0.0
4,ultimate,0.00,0.00,0.0,1000,30720,3000,70,7,0.01,0.01,0.0,0.00,0.0,0.000000,70.000000,70.000000,0.0,0.00,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,surf,0.00,0.00,0.0,50,15360,500,20,10,0.03,0.03,0.0,0.00,0.0,0.000000,20.000000,20.000000,0.0,0.00,0.0
4996,surf,330.37,12984.76,0.0,50,15360,500,20,10,0.03,0.03,0.0,0.00,0.0,0.000000,20.000000,20.000000,0.0,0.00,0.0
4997,surf,363.28,19492.43,0.0,50,15360,500,20,10,0.03,0.03,0.0,4132.43,0.0,40.355762,60.355762,20.000000,0.0,0.00,0.0
4998,surf,288.56,16813.83,0.0,50,15360,500,20,10,0.03,0.03,0.0,1453.83,0.0,14.197559,34.197559,60.355762,0.0,4132.43,0.0


In [41]:
# Convert `prev_extra_mb` from MB to GB (1 GB = 1024 MB).
# The original MB values are stashed once in a helper column so that re-running
# this cell always converts from the raw values instead of dividing the already
# converted column by 1024 again.
# NOTE: the column keeps its `_mb` name even though it holds GB after this cell.
MB_PER_GB = 1024
if "prev_extra_mb_raw" not in fact_table.columns:
    fact_table["prev_extra_mb_raw"] = fact_table["prev_extra_mb"]
fact_table["prev_extra_mb"] = fact_table["prev_extra_mb_raw"] / MB_PER_GB

# Linear Regression

In [42]:
# Choose the predictor columns and the target column.
features = [
    "prev_total_cost",
    "prev_extra_minutes",
    "prev_extra_mb",
    "prev_extra_messages",
    "usd_monthly_pay",
]
y = fact_table["total_cost"]
X = fact_table[features]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize the predictors: the scaler is fit on the training rows only and
# the same fitted scaler is then applied to the test rows.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Fit the linear regression on the standardized training data.
model = LinearRegression().fit(X_train, y_train)

# Predict the target for the held-out rows.
y_pred = model.predict(X_test)

# Score the predictions against the true test targets.
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

# Print the evaluation report, one metric per line with four decimals.
report_lines = [
    "Evaluación del Modelo de Regresión Lineal:",
    f"MAE: {mae:.4f}",
    f"RMSE: {rmse:.4f}",
    f"R^2: {r2:.4f}",
]
print("\n".join(report_lines))


Evaluación del Modelo de Regresión Lineal:
MAE: 11.8012
RMSE: 26.2048
R^2: 0.5493


# Stochastic Gradient Descent

In [44]:
# Predictor columns and target column.
features = [
    "prev_total_cost",
    "prev_extra_minutes",
    "prev_extra_mb",
    "prev_extra_messages",
    "usd_monthly_pay",
]
X, y = fact_table[features], fact_table["total_cost"]

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardize the predictors before SGD: the scaler is fit on the training rows
# and the same fitted scaler is applied to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Configure and train the SGD regressor (adaptive learning-rate schedule
# starting at eta0, seeded for reproducibility).
sgd_params = dict(
    max_iter=1000,
    tol=1e-3,
    learning_rate="adaptive",
    eta0=0.01,
    random_state=42,
)
model = SGDRegressor(**sgd_params)
model.fit(X_train, y_train)

# Predict on the held-out rows.
y_pred = model.predict(X_test)

# Compute the evaluation metrics on the test set.
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

# Print the evaluation report, one metric per line with four decimals.
print(
    "Evaluación del Modelo de Stochastic Gradient Descent:",
    f"MAE: {mae:.4f}",
    f"RMSE: {rmse:.4f}",
    f"R^2: {r2:.4f}",
    sep="\n",
)


Evaluación del Modelo de Stochastic Gradient Descent:
MAE: 11.7925
RMSE: 26.2156
R^2: 0.5489


# Conclusions

Un MAE de ~11.8 indica que, en promedio, la predicción de la facturación se desvía unos 11.8 dólares (en valor absoluto) del valor real.

Un RMSE de ~26.2, más del doble del MAE, sugiere que algunas predicciones tienen errores mucho más altos que el promedio, ya que el RMSE penaliza más los errores grandes.

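Un R² de ~0.55 significa que el modelo explica aproximadamente el 55% de la variabilidad de la facturación en el conjunto de prueba. La Regresión Lineal y SGD obtienen métricas prácticamente idénticas.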

## Segmentación de Clientes

- Identificar usuarios con facturación alta y estable para ofrecerles planes premium.
- Detectar clientes con cambios bruscos en su consumo y ofrecerles paquetes personalizados.

## Ofertas y Promociones Personalizadas

- Si un usuario supera constantemente su cuota de datos, ofrecerle un plan con más GB a menor costo adicional.
- Los usuarios con pocos minutos excedentes pueden recibir promociones de minutos gratis para fidelizarlos.

## Detección de Riesgo de Churn (Abandono de Clientes)

- Si el modelo predice un descenso fuerte en la facturación de un usuario, puede indicar que está considerando cambiar de proveedor.
- Se pueden activar descuentos o beneficios exclusivos para retenerlo.

## Optimización de la Estructura de Planes

- Si muchos clientes pagan costos adicionales altos, la empresa puede rediseñar sus planes para ajustarse mejor al consumo real.
- Crear paquetes adicionales más accesibles basados en patrones de consumo.