# ***Great American Coffee Taste Test***

In [None]:
import pandas as pd

df=pd.read_csv('GACTT_RESULTS_ANONYMIZED_v2.csv')

In [None]:
# Mostrar los nombres de las columnas
print(df.columns)

Cargar las bibliotecas necesarias:

Pandas para la manipulación de datos.
StandardScaler para escalar los datos.
train_test_split para dividir el dataset en entrenamiento y prueba.
DecisionTreeClassifier para crear el modelo de árbol de decisión.
accuracy_score para calcular la precisión del modelo.

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Cargar el dataset (Great American Coffee Taste Test)
df = pd.read_csv('GACTT_RESULTS_ANONYMIZED_v2.csv')

# Verificar los nombres de las columnas relacionadas con "relacion Calidad Precio"
print([col for col in df.columns if 'value for your money' in col])

# Conversión de la columna '¿Cuántas tazas de café bebes normalmente al día?' a numérico
# Reemplazamos cualquier valor no numérico con NaN, luego calculamos la media
df['How many cups of coffee do you typically drink per day?'] = pd.to_numeric(df['How many cups of coffee do you typically drink per day?'], errors='coerce')

# Rellenar los valores nulos con la media de la columna
df['How many cups of coffee do you typically drink per day?'].fillna(df['How many cups of coffee do you typically drink per day?'].mean(), inplace=True)

# Convertir variables categóricas en variables dummies (one-hot encoding) 0 y 1
df = pd.get_dummies(df)

# Imprimir los nombres de columna después de la codificación para identificar cambios
print(df.columns)

# Separación de características (X) y etiquetas (Y)
# Usamos el nombre de columna correcto después de la codificación (revisa la salida anterior)
#¿Siente que está obteniendo una buena relación calidad-precio cuando compra café en una cafetería?
x = df.drop('Do you feel like you’re getting good value for your money when you buy coffee at a cafe?_Yes', axis=1)  # INPUTS
y = df['Do you feel like you’re getting good value for your money when you buy coffee at a cafe?_Yes']  # OUTPUTS

# Dividir el dataset en datos de entrenamiento (80%) y prueba (20%)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Escalar los datos
#para que tengan una media de 0 y una desviación estándar de 1
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

# Modelo de árbol de decisión
modelo = DecisionTreeClassifier()

# Entrenar el modelo
modelo.fit(x_train, y_train)

# Predicciones
y_pred = modelo.predict(x_test)

# Calcula el modelo con precisión
precision = accuracy_score(y_test, y_pred)
print('Precisión: ', precision)

['Do you feel like you’re getting good value for your money when you buy coffee at a cafe?', 'Do you feel like you’re getting good value for your money with regards to your coffee equipment?']


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['How many cups of coffee do you typically drink per day?'].fillna(df['How many cups of coffee do you typically drink per day?'].mean(), inplace=True)


Index(['How many cups of coffee do you typically drink per day?',
       'What kind of flavorings do you add?',
       'What kind of flavorings do you add? (Vanilla Syrup)',
       'What kind of flavorings do you add? (Caramel Syrup)',
       'What kind of flavorings do you add? (Hazelnut Syrup)',
       'What kind of flavorings do you add? (Cinnamon (Ground or Stick))',
       'What kind of flavorings do you add? (Peppermint Syrup)',
       'What kind of flavorings do you add? (Other)',
       'What other flavoring do you use?',
       'Lastly, how would you rate your own coffee expertise?',
       ...
       'Employment Status_Student', 'Employment Status_Unemployed',
       'Number of Children_1', 'Number of Children_2', 'Number of Children_3',
       'Number of Children_More than 3', 'Political Affiliation_Democrat',
       'Political Affiliation_Independent',
       'Political Affiliation_No affiliation',
       'Political Affiliation_Republican'],
      dtype='object', length=148

  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


Precisión:  0.9851668726823238


# Resultados de la Precisión

Precisión: 0.9851668726823238

Interpretación: Esto significa que el modelo de árbol de decisión clasificó correctamente aproximadamente el 98.52% de las instancias en el conjunto de prueba. En otras palabras, casi todas las predicciones que realizó fueron correctas.