In [2]:
pip install factor_analyzer




In [17]:
import pandas as pd
from factor_analyzer import FactorAnalyzer, calculate_kmo
from scipy.stats import bartlett
import numpy as np
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Cargar el conjunto de datos
file_path = "../Starbucks satisfactory survey encode cleaned.csv"
df = pd.read_csv(file_path)

# Seleccionar las columnas relevantes para el análisis
data = df[['chooseRate', 'promoMethodApp', 'promoMethodSoc', 'promoMethodEmail', 'promoMethodDeal', 
           'promoMethodFriend', 'promoMethodDisplay', 'promoMethodBillboard', 'promoMethodOthers']]

# Imputar valores faltantes con la media de cada columna
data.fillna(data.mean(), inplace=True)

# Normalizar los datos
data = (data - data.mean()) / data.std()

# Verificar si aún hay valores NaN o infinitos
print(data.isna().sum())
print(np.isinf(data).sum())

# Cálculo de la medida KMO y prueba de Esfericidad de Bartlett
kmo_all, kmo_model = calculate_kmo(data)
print(f'KMO: {kmo_model:.2f}')

chi2, p_value = bartlett(*data.values.T)
print(f'Chi2: {chi2:.2f}, p-value: {p_value:.2f}')

# Análisis Factorial Exploratorio (EFA)
fa = FactorAnalyzer(n_factors=3, rotation='varimax')
fa.fit(data)

# Cargas factoriales
loadings = fa.loadings_
print('Cargas factoriales:')
print(loadings)

# Simulación de datos para CFA
data['Factor1'] = np.dot(data.iloc[:, :3], loadings[:3, 0])
data['Factor2'] = np.dot(data.iloc[:, 3:7], loadings[3:7, 1])
data['Factor3'] = np.dot(data.iloc[:, 7:], loadings[7:, 2])

# Modelo CFA
model = ols('Factor1 ~ chooseRate + promoMethodApp + promoMethodSoc', data=data).fit()
print(model.summary())

# Índices de ajuste para CFA
def calc_fit_indices(model):
    chi2 = model.ssr / model.df_resid
    rmsea = np.sqrt(chi2 / (model.nobs - model.df_model - 1))
    return {
        'Chi2': chi2,
        'RMSEA': rmsea,
        'CFI': sm.regression.linear_model.OLS(data['Factor1'], sm.add_constant(data.iloc[:, :5])).fit().rsquared
    }

fit_indices = calc_fit_indices(model)
print('Índices de ajuste:')
print(fit_indices)


chooseRate                0
promoMethodApp          113
promoMethodSoc          113
promoMethodEmail        113
promoMethodDeal         113
promoMethodFriend       113
promoMethodDisplay      113
promoMethodBillboard    113
promoMethodOthers         0
dtype: int64
chooseRate              0
promoMethodApp          0
promoMethodSoc          0
promoMethodEmail        0
promoMethodDeal         0
promoMethodFriend       0
promoMethodDisplay      0
promoMethodBillboard    0
promoMethodOthers       0
dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.fillna(data.mean(), inplace=True)
  r = _umath_linalg.det(a, signature=signature)


LinAlgError: SVD did not converge

In [18]:
import pandas as pd

# Cargar el conjunto de datos
file_path = "../Starbucks satisfactory survey encode cleaned.csv"
df = pd.read_csv(file_path)

# Seleccionar las columnas relevantes para el análisis
data = df[['chooseRate', 'promoMethodApp', 'promoMethodSoc', 'promoMethodEmail', 'promoMethodDeal', 
           'promoMethodFriend', 'promoMethodDisplay', 'promoMethodBillboard', 'promoMethodOthers']]

# Imputar valores faltantes con la mediana de cada columna
data.fillna(data.median(), inplace=True)

# Mostrar estadísticas descriptivas
print(data.describe())


       chooseRate  promoMethodApp  promoMethodSoc  promoMethodEmail  \
count  113.000000           113.0           113.0             113.0   
mean     3.539823             1.0             1.0               1.0   
std      1.026744             0.0             0.0               0.0   
min      1.000000             1.0             1.0               1.0   
25%      3.000000             1.0             1.0               1.0   
50%      4.000000             1.0             1.0               1.0   
75%      4.000000             1.0             1.0               1.0   
max      5.000000             1.0             1.0               1.0   

       promoMethodDeal  promoMethodFriend  promoMethodDisplay  \
count            113.0              113.0               113.0   
mean               1.0                1.0                 1.0   
std                0.0                0.0                 0.0   
min                1.0                1.0                 1.0   
25%                1.0             

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.fillna(data.median(), inplace=True)


In [20]:
import pandas as pd
from factor_analyzer import FactorAnalyzer, calculate_kmo
from scipy.stats import bartlett
import numpy as np
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Cargar el conjunto de datos
file_path = "../Starbucks satisfactory survey encode cleaned.csv"
df = pd.read_csv(file_path)

# Seleccionar las columnas relevantes para el análisis
data = df[['chooseRate', 'promoMethodApp', 'promoMethodSoc', 'promoMethodEmail', 'promoMethodDeal', 
           'promoMethodFriend', 'promoMethodDisplay', 'promoMethodBillboard', 'promoMethodOthers']]

# Imputar valores faltantes con la mediana de cada columna
data.fillna(data.median(), inplace=True)

# Mostrar estadísticas descriptivas
print("Estadísticas descriptivas:")
print(data.describe())

# Eliminar columnas con baja variabilidad (desviación estándar igual a 0)
data = data.loc[:, data.std() > 0]

# Verificar si quedan al menos dos columnas para el análisis
if data.shape[1] < 2:
    raise ValueError("No hay suficientes columnas válidas para realizar el análisis factorial.")

# Normalizar los datos
data = (data - data.mean()) / data.std()

# Verificar si aún hay valores NaN o infinitos
print(data.isna().sum())
print(np.isinf(data).sum())

# Calcular la matriz de correlación de Spearman
corr_matrix = data.corr(method='spearman')
print("Matriz de correlación de Spearman:")
print(corr_matrix)

# Eliminar columnas con valores NaN en la matriz de correlación
columns_to_drop = corr_matrix.columns[corr_matrix.isna().any()].tolist()
data = data.drop(columns=columns_to_drop)

# Verificar si quedan al menos dos columnas para el análisis
if data.shape[1] < 2:
    raise ValueError("No hay suficientes columnas válidas para realizar el análisis factorial.")

# Calcular KMO y la prueba de Esfericidad de Bartlett
kmo_all, kmo_model = calculate_kmo(data)
print(f'KMO: {kmo_model:.2f}')

chi2, p_value = bartlett(*data.values.T)
print(f'Chi2: {chi2:.2f}, p-value: {p_value:.2f}')

# Análisis Factorial Exploratorio (EFA)
fa = FactorAnalyzer(n_factors=2, rotation='varimax')
fa.fit(data)

# Cargas factoriales
loadings = fa.loadings_
print('Cargas factoriales:')
print(loadings)

# Simulación de datos para CFA
data['Factor1'] = np.dot(data.iloc[:, :2], loadings[:2, 0])
# Verificar si hay suficientes columnas para Factor2
if data.shape[1] > 2:
    data['Factor2'] = np.dot(data.iloc[:, 2:], loadings[2:, 1])

# Modelo CFA
if 'Factor2' in data.columns:
    model = ols('Factor1 ~ chooseRate + promoMethodOthers + Factor2', data=data).fit()
else:
    model = ols('Factor1 ~ chooseRate + promoMethodOthers', data=data).fit()
print(model.summary())

# Índices de ajuste para CFA
def calc_fit_indices(model):
    chi2 = model.ssr / model.df_resid
    rmsea = np.sqrt(chi2 / (model.nobs - model.df_model - 1))
    return {
        'Chi2': chi2,
        'RMSEA': rmsea,
        'CFI': sm.regression.linear_model.OLS(data['Factor1'], sm.add_constant(data.iloc[:, :2])).fit().rsquared
    }

fit_indices = calc_fit_indices(model)
print('Índices de ajuste:')
print(fit_indices)


Estadísticas descriptivas:
       chooseRate  promoMethodApp  promoMethodSoc  promoMethodEmail  \
count  113.000000           113.0           113.0             113.0   
mean     3.539823             1.0             1.0               1.0   
std      1.026744             0.0             0.0               0.0   
min      1.000000             1.0             1.0               1.0   
25%      3.000000             1.0             1.0               1.0   
50%      4.000000             1.0             1.0               1.0   
75%      4.000000             1.0             1.0               1.0   
max      5.000000             1.0             1.0               1.0   

       promoMethodDeal  promoMethodFriend  promoMethodDisplay  \
count            113.0              113.0               113.0   
mean               1.0                1.0                 1.0   
std                0.0                0.0                 0.0   
min                1.0                1.0                 1.0   
25%     

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.fillna(data.median(), inplace=True)


ValueError: shapes (113,1) and (0,) not aligned: 1 (dim 1) != 0 (dim 0)