## 207 - Applied Machine Learning Project: Predicting Attrition of an Online Store Site

#### Authors:

Diego Moss
Sammy Cayo
Conor Huh
Roz Huang
Jasmine Lau



## Loading Data from BigQuery and Binding Rows

In [None]:
## code block for initial data loading

## Initial Data Pre-Processing and Feature Extraction

In [None]:
## code block for preprocessing

## after binding rows we will need to extract the values from the columns that hold multiple fields and put each field into its own column

## should encode categorical variables with one-hot encoding

## we need to manipulate and aggregate each user's data for each month for all the manipulated features in our list

## we should take a look at how much data is missing and decide between imputation and removal




## shaping the data in the right way for the LSTM model (3 dimensions)

def create_sequences(df, sequence_length):
    """Build per-user sliding windows of rows and the target that follows each window.

    For every user, rows are sorted by ``month`` and a window of
    ``sequence_length`` consecutive rows is slid over them one row at a time.
    Each window becomes one sample; its target is the
    ``days_until_next_visit`` value of the row immediately after the window.
    Users with ``sequence_length`` rows or fewer therefore contribute no samples.

    Args:
        df: DataFrame containing at least the columns ``user_id``, ``month``
            and ``days_until_next_visit``. Every other column (and ``month``
            itself) is kept as a feature.
        sequence_length: Number of consecutive rows in each window.

    Returns:
        A tuple ``(sequences, targets)`` where ``sequences`` is a list of 2-D
        arrays of shape ``(sequence_length, n_features)`` and ``targets`` is a
        1-D NumPy array with one entry per sequence.
    """
    target_column = 'days_until_next_visit'  # target can change depending on our attrition statistics
    sequences = []
    targets = []

    # A single groupby pass replaces re-filtering the whole frame once per user.
    # sort=False keeps users in order of first appearance, like Series.unique().
    for _, user_data in df.groupby('user_id', sort=False):
        user_data = user_data.sort_values(by='month')  # Ensure data is sorted by time

        # Convert once per user instead of once per window.
        features = user_data.drop(columns=['user_id', target_column]).values
        target_values = user_data[target_column].values

        for start in range(len(user_data) - sequence_length):
            end = start + sequence_length
            sequences.append(features[start:end])
            targets.append(target_values[end])

    return sequences, np.array(targets)

# Window size: each sample covers 12 rows (one per month)
sequence_length = 12

# Build the sliding windows and their matching targets from the full frame
data_sequences, targets = create_sequences(df, sequence_length)

# Pad at the end ('post') so every window has exactly `sequence_length`
# timesteps, and cast the values to float32
data_sequences_padded = pad_sequences(
    data_sequences,
    maxlen=sequence_length,
    padding='post',
    dtype='float32',
)

# Store the model inputs as X and the targets as y, both as arrays
X = np.array(data_sequences_padded)
y = np.array(targets)



## Splitting Data

In [None]:
## splitting data code block

## need to see distribution of target variable and distribution of months in the data. If they are unequal, we need to sample so that the training set has roughly equal targets and months

## may need to manually split the data, make sure we are splitting by user and not by month


## More Data Processing and EDA

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


## EDA - visualizing distributions and correlations


    # histograms for each feature


    # correlations matrix/plot/heatmap






## standardizing features based on the training distributions



# defining lists of indices for the numerical and categorical features and telling it to ignore one-hot encoded variables
# (the arrays are reshaped below to (samples * timesteps, features), so these are positions on the last/feature axis)

# TODO: replace with indices for all the numeric columns to be standardized
numerical_features_indices = [None,None]

# TODO: replace with indices for all the one-hot encoded columns to be ignored
one_hot_encoded_indices = [None,None]




# making a preprocessor to standardize and ignore features (also this is where we can impute data if we'd like)
numerical_transformer = Pipeline(steps=[ # this says to do the below manipulations to each specified column
    ('imputer', SimpleImputer(strategy='median')),  # This is the handling of missing values, placeholder for now
    ('scaler', StandardScaler()) # this standardizes
])

# Combine transformers into a ColumnTransformer
# NOTE(review): ColumnTransformer emits columns in transformer order (numeric first, then passthrough)
# and by default drops columns listed in neither group -- confirm every feature index appears in exactly
# one list, otherwise the reshape back to 3 dimensions below will fail and feature positions will shift.
preprocessor = ColumnTransformer( # this takes the pipeline from before and maps which columns to apply the numerical transformations
    transformers=[
        ('num', numerical_transformer, numerical_features_indices),
        ('cat', 'passthrough', one_hot_encoded_indices)
    ]
)



# fitting on 2d reshaped X_train because the scaler won't work on 3 dimensions, then returning back into 3 dimensions
# (the right-hand side is evaluated before the assignment, so X_train.shape is still the original 3d shape)
X_train = preprocessor.fit_transform(X_train.reshape(-1, X_train.shape[-1])).reshape(X_train.shape)
# doing same for test set, using only the statistics learned from the training set;
# the row count is inferred from X_test itself (-1) because train and test sizes differ
X_test = preprocessor.transform(X_test.reshape(-1, X_test.shape[-1])).reshape(X_test.shape)




## Model Fitting and Architecture

In [None]:
from keras.models import Model, Sequential
from keras.layers import Concatenate, Dense, Input, LSTM, RepeatVector, TimeDistributed


# may need to turn this into a function so we can use in hyperparameter tuning


## Defining Model

# need to separate the stable features from the dynamic features fix this once we know which features
# TODO: these are placeholders; the shape lookups below will fail until real arrays are assigned
X_train_dynamic = None
X_train_stable = None


# Input for dynamic features: one sequence of (timesteps, features) per sample
dynamic_input = Input(shape=(X_train_dynamic.shape[1], X_train_dynamic.shape[2]))
# calling the layer on dynamic_input applies the LSTM to the dynamic features only;
# return_sequences=True keeps one output vector per timestep
x = LSTM(units=50, return_sequences=True)(dynamic_input)


# Input for stable features: project them, repeat once per timestep, then map each
# timestep to 50 units so the result lines up with the LSTM output on the time axis
stable_input = Input(shape=(X_train_stable.shape[1],))
stable_repeated = Dense(X_train_dynamic.shape[1])(stable_input)
stable_repeated = RepeatVector(X_train_dynamic.shape[1])(stable_repeated)
stable_repeated = TimeDistributed(Dense(50))(stable_repeated)


# Concatenate LSTM output with the repeated stable features
# (the raw stable_input has no time axis, so it cannot be concatenated with the per-timestep LSTM output)
x = Concatenate()([x, stable_repeated])

# TimeDistributed Dense layer to get prediction for each timestep
output = TimeDistributed(Dense(1, activation='sigmoid'))(x)

# Define the model
model = Model(inputs=[dynamic_input, stable_input], outputs=output)

model.compile(optimizer='adam', loss='binary_crossentropy', metrics = ['accuracy', 'precision'])
# we track precision alongside accuracy
# NOTE(review): precision measures how many predicted attritions were real; if the priority is
# catching as many attriting users as possible, recall is the matching metric -- confirm which we want

# Train the model
# NOTE(review): y_train_scaled is not defined in this section; binary_crossentropy with a sigmoid output
# expects 0/1 labels shaped like the per-timestep output -- confirm how the target is built
model.fit([X_train_dynamic, X_train_stable], y_train_scaled, epochs=10, validation_split=0.2)



## Hyperparameter Tuning

In [None]:
## placeholder

## we need to see how long it takes to fit one epoch, then do some math to see how many sets of hyperparameters we can test

## Evaluation Metrics

In [None]:
## placeholder