# Preparación de los datos:

In [60]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Carga los datos desde el archivo CSV
data = pd.read_csv("../data/bike_buyers.csv")
data.dropna()
# Lidiar con valores faltantes
imputer = SimpleImputer(strategy='mean')  # Reemplaza los valores faltantes con la media
data['Income'] = imputer.fit_transform(data[['Income']])
data['Children'] = imputer.fit_transform(data[['Children']])
data['Age'] = imputer.fit_transform(data[['Age']])

# Codificar variables categóricas
encoder = OneHotEncoder(drop='first', sparse=False)  # Codifica variables categóricas en variables numéricas
categorical_cols = ['Marital Status', 'Gender', 'Education', 'Occupation', 'Home Owner', 'Commute Distance', 'Region']
encoder = OneHotEncoder(drop='first', sparse=False)
encoded_cols = pd.DataFrame(encoder.fit_transform(data[categorical_cols]))
encoded_cols.columns = encoder.get_feature_names_out(categorical_cols)
data = pd.concat([data.drop(categorical_cols, axis=1), encoded_cols], axis=1)


# Dividir los datos en conjunto de entrenamiento y prueba
X = data.drop('Purchased Bike', axis=1)
y = data['Purchased Bike']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)




# Selección de la técnica de Data Mining:

In [61]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

from sklearn.pipeline import make_pipeline

# Crear un pipeline que incluya la imputación y el modelo de regresión logística
pipeline = make_pipeline(
    SimpleImputer(strategy='mean'),  # Estrategia de imputación para reemplazar NaN con la media
    LogisticRegression()
)

# Entrenar el modelo utilizando el pipeline
pipeline.fit(X_train, y_train)

# Realizar predicciones en el conjunto de prueba
y_pred = pipeline.predict(X_test)

# Evaluar el modelo
print(classification_report(y_test, y_pred))



              precision    recall  f1-score   support

          No       0.57      0.63      0.60       106
         Yes       0.53      0.47      0.50        94

    accuracy                           0.56       200
   macro avg       0.55      0.55      0.55       200
weighted avg       0.55      0.56      0.55       200



In [62]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Paso 1: Preparación de los datos
df = pd.read_csv('../data/bike_buyers.csv')  # Lee el archivo CSV
df = df.drop(['ID'], axis=1)  # Elimina la columna 'ID' irrelevante

# Codificación de variables categóricas
categorical_cols = ['Marital Status', 'Gender', 'Education', 'Occupation', 'Home Owner', 'Commute Distance', 'Region']
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Manejo de valores perdidos
df_encoded = df_encoded.dropna()  # Elimina las filas con valores perdidos

# Reemplazar etiquetas de clase 'No' y 'Yes' con 0 y 1
df_encoded['Purchased Bike'] = df_encoded['Purchased Bike'].replace({'No': 0, 'Yes': 1})

# División del conjunto de datos en entrenamiento y prueba
X = df_encoded.drop('Purchased Bike', axis=1)
y = df_encoded['Purchased Bike']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Paso 2: Generación del plan de prueba

# Paso 3: Construcción del modelo
model = LogisticRegression()
model.fit(X_train, y_train)

# Evaluación del modelo
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

# Paso 4: Análisis de los clientes que compran una bicicleta
feature_importances = pd.Series(model.coef_[0], index=X_train.columns).sort_values(ascending=False)
# print("Importancia de las características:")
# print(feature_importances)

# Paso 5: Scoring de venta
y_pred_proba = model.predict_proba(X_test)[:, 1]  # Probabilidad de la clase positiva (compra)
scoring = y_pred_proba * 100  # Escala el valor a una puntuación de 0 a 100
results = pd.DataFrame({'Customer': X_test.index, 'Scoring': scoring})
results = results.sort_values(by='Scoring', ascending=False)
print("Scoring de venta:")
print(results['Scoring'].describe())


Accuracy: 0.5282051282051282
Precision: 0.6052631578947368
Recall: 0.23
F1 Score: 0.33333333333333337
Scoring de venta:
count    195.000000
mean      45.401405
std        5.069633
min       31.940096
25%       42.731019
50%       45.568979
75%       48.707382
max       60.560676
Name: Scoring, dtype: float64


In [64]:
df_pred = pd.DataFrame(X_test)
df_pred = df_pred.assign(Scoring = results['Scoring'])
# df_pred = df_pred.assign(Pred = y_pred.flatten().tolist())
df_pred

Unnamed: 0,Income,Children,Cars,Age,Marital Status_Single,Gender_Male,Education_Graduate Degree,Education_High School,Education_Partial College,Education_Partial High School,...,Occupation_Professional,Occupation_Skilled Manual,Home Owner_Yes,Commute Distance_1-2 Miles,Commute Distance_10+ Miles,Commute Distance_2-5 Miles,Commute Distance_5-10 Miles,Region_North America,Region_Pacific,Scoring
173,10000.0,0.0,1.0,27.0,0,0,0,0,1,0,...,0,0,1,0,0,1,0,0,1,45.223882
862,50000.0,0.0,0.0,32.0,0,1,1,0,0,0,...,0,1,1,1,0,0,0,1,0,
78,80000.0,2.0,2.0,50.0,0,1,0,1,0,0,...,0,1,0,1,0,0,0,0,1,45.459238
72,130000.0,3.0,4.0,52.0,0,0,0,1,0,0,...,1,0,1,0,0,0,0,0,0,45.425756
90,30000.0,0.0,1.0,29.0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,52.952263
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
994,80000.0,5.0,3.0,46.0,0,1,0,0,1,0,...,1,0,1,1,0,0,0,1,0,
656,60000.0,2.0,2.0,50.0,0,1,0,1,0,0,...,1,0,0,0,0,0,1,1,0,
578,60000.0,4.0,2.0,59.0,0,1,0,0,0,0,...,0,0,1,0,0,1,0,1,0,
35,10000.0,5.0,2.0,41.0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,48.606320
