In [1]:
import numpy as np

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from tensorflow.keras.layers import Dense, Dropout, Input, LeakyReLU
from tensorflow.keras.models import Sequential

# my package
from utils.sparrowpy.data_engg import sql
from utils.sparrowpy.data_science import modeling

In [2]:
# Build the modelling table: one row per customer (grouped by c.customer_id),
# with per-customer aggregates as features and `active` as the label.
#
# NOTE(review): `payment` and `rental` are each joined on customer_id only, so
# every payment row of a customer is paired with every rental row of that
# customer before grouping. This looks like an unintended row fan-out; confirm
# whether payment should be joined through rental_id instead.
# NOTE(review): country_id, category_id, store_id and staff_id are identifiers
# that are averaged here as if they were numeric quantities; confirm that this
# encoding is intended.
df = sql.get_table_df(
    query = """
    select 
        round(avg(ccnt.country_id), 2) as country_id,
        round(avg(cat.category_id), 2) as category_id,
        round(avg(st.store_id), 2) as store_id,
        round(avg(stf.staff_id), 2) as staff_id,
        round(avg(pay.amount), 2) as amount,
        round(avg(EXTRACT(DAY FROM (rnt.return_date - rnt.rental_date))- film.rental_duration), 2) AS return_delay,
        round(max(c.active), 2) as active
    from
        customer c
        join address cad 
            on c.address_id = cad.address_id
        join city
            on cad.city_id = city.city_id
        join country as ccnt
            on city.country_id = ccnt.country_id
        join store as st
            on c.store_id = st.store_id
        join payment as pay
            on c.customer_id = pay.customer_id
        join rental as rnt
            on c.customer_id = rnt.customer_id
        join inventory as inv
            on rnt.inventory_id = inv.inventory_id
        join film
            on inv.film_id = film.film_id
        join film_category as cat
            on film.film_id = cat.film_id
        join staff as stf
            on rnt.staff_id = stf.staff_id


    group by
        c.customer_id
    """
)
# Show only a preview: the last expression of the cell is rendered by the
# notebook, which avoids dumping the whole frame as plain text.
df.head()

postgresql+psycopg2://postgres:postgres@localhost:5432/dvd_rental

    select 
        round(avg(ccnt.country_id), 2) as country_id,
        round(avg(cat.category_id), 2) as category_id,
        round(avg(st.store_id), 2) as store_id,
        round(avg(stf.staff_id), 2) as staff_id,
        round(avg(pay.amount), 2) as amount,
        round(avg(EXTRACT(DAY FROM (rnt.return_date - rnt.rental_date))- film.rental_duration), 2) AS return_delay,
        round(max(c.active), 2) as active
    from
        customer c
        join address cad 
            on c.address_id = cad.address_id
        join city
            on cad.city_id = city.city_id
        join country as ccnt
            on city.country_id = ccnt.country_id
        join store as st
            on c.store_id = st.store_id
        join payment as pay
            on c.customer_id = pay.customer_id
        join rental as rnt
            on c.customer_id = rnt.customer_id
        join inventory as inv
            on rnt.inventory_id

In [3]:
# Label column; every remaining column of `df` is used as a model input.
target = 'active'

# Coerce inputs and label to float ndarrays before handing them to scikit-learn.
X = np.array(df.drop(columns=target), dtype=float)
y = np.array(df[target], dtype=float)

# Width of the feature matrix, needed later to size the network input.
num_features = X.shape[1]

# Hold out 20% of the rows, keeping the label proportions equal in both splits
# and fixing the seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=69,
)

In [4]:
import tensorflow as tf
from sklearn.metrics import f1_score

def f1(y_true, y_pred):
    """Compute the F1 score of a batch for binary classification.

    Both inputs are cast to float32 before they are combined, so the function
    also works when labels and predictions arrive with different dtypes
    (TensorFlow does not promote mixed dtypes in arithmetic).

    Args:
        y_true: Ground-truth labels; the arithmetic below assumes values of 0 or 1.
        y_pred: Predicted scores; they are rounded with ``tf.round`` to obtain
            hard 0/1 predictions.

    Returns:
        A scalar float32 tensor holding the harmonic mean of precision and
        recall. Keras' epsilon is added to every denominator, so an empty
        positive class yields 0 instead of a division by zero.
    """
    y_true = tf.cast(y_true, tf.float32)
    # Round first, then cast, so the thresholding happens at the input precision.
    y_pred = tf.cast(tf.round(y_pred), tf.float32)

    tp = tf.reduce_sum(y_true * y_pred)  # True Positives
    fp = tf.reduce_sum((1.0 - y_true) * y_pred)  # False Positives
    fn = tf.reduce_sum(y_true * (1.0 - y_pred))  # False Negatives

    eps = tf.keras.backend.epsilon()
    precision = tp / (tp + fp + eps)
    recall = tp / (tp + fn + eps)

    return 2 * (precision * recall) / (precision + recall + eps)

In [5]:

# Feature scaling: the scaler learns its statistics from the training split
# only, and the same fitted transform is then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)


In [6]:
# Build the ANN model with Dropout.
# The input width is declared with an explicit Input layer instead of passing
# `input_dim` to the first Dense layer, which recent Keras versions warn about
# for Sequential models.
model = Sequential()
model.add(Input(shape=(num_features,)))

# First hidden block: 8 units, dropout, leaky ReLU.
model.add(Dense(units=8))
model.add(Dropout(0.3))
model.add(LeakyReLU(alpha=0.1))

# Second hidden block: 4 units, leaky ReLU, dropout.
model.add(Dense(units=4))
model.add(LeakyReLU(alpha=0.1))
model.add(Dropout(0.3))

model.add(Dense(units=1, activation='sigmoid'))  # Sigmoid for binary classification

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model; 10% of the training rows are held back for validation.
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.1)

# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.2333 - loss: 1.0452 - val_accuracy: 0.1667 - val_loss: 0.8966
Epoch 2/10
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.2900 - loss: 0.9902 - val_accuracy: 0.2917 - val_loss: 0.8406
Epoch 3/10
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.3004 - loss: 0.9257 - val_accuracy: 0.3542 - val_loss: 0.7907
Epoch 4/10
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.3212 - loss: 0.8904 - val_accuracy: 0.3958 - val_loss: 0.7496
Epoch 5/10
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.4789 - loss: 0.8316 - val_accuracy: 0.5417 - val_loss: 0.7159
Epoch 6/10
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.4666 - loss: 0.7989 - val_accuracy: 0.6458 - val_loss: 0.6890
Epoch 7/10
[1m14/14[0m [32m━━━━━━━━━

None


In [7]:
# Evaluate the model on the held-out test split.

# Scores strictly above 0.5 are labelled 1, everything else 0.
test_scores = model.predict(X_test)
y_pred = (test_scores > 0.5).astype("int32")

accuracy, f1score = accuracy_score(y_test, y_pred), f1_score(y_test, y_pred)

for line in (f"Test Accuracy: {accuracy:.4f}", f"Test f1 score: {f1score:.4f}"):
    print(line)

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
Test Accuracy: 0.8750
Test f1 score: 0.9333


In [8]:
# Label -> id lookup tables for the identifier features, keyed by feature name.
encoders = {}

# All three lookup queries share one shape and differ only in identifiers.
# The placeholders are filled from the constants in `encoder_specs` below,
# never from external input.
encoder_query = """
    select 
        {id_col},
        {label_col}
    from
        {table}
    group by {id_col}
    order by {order_col}

    """

# (table, id column, label column, column the rows are ordered by)
encoder_specs = [
    ('category', 'category_id', 'name', 'category_id'),
    ('country', 'country_id', 'country', 'country'),
    ('staff', 'staff_id', 'first_name', 'first_name'),
]

for table, id_col, label_col, order_col in encoder_specs:
    df_encoder = sql.get_table_df(
        query = encoder_query.format(
            table=table,
            id_col=id_col,
            label_col=label_col,
            order_col=order_col,
        )
    )
    # Labels become the dictionary keys, so rows that share a label collapse
    # into a single entry.
    # NOTE(review): confirm the label columns are unique, notably staff.first_name.
    result_dict = df_encoder.set_index(label_col)[id_col].to_dict()
    print(result_dict)
    encoders[id_col] = result_dict


encoders

postgresql+psycopg2://postgres:postgres@localhost:5432/dvd_rental

    select 
        category_id,
        name
    from
        category
    group by category_id
    order by category_id

    
{'Action': 1, 'Animation': 2, 'Children': 3, 'Classics': 4, 'Comedy': 5, 'Documentary': 6, 'Drama': 7, 'Family': 8, 'Foreign': 9, 'Games': 10, 'Horror': 11, 'Music': 12, 'New': 13, 'Sci-Fi': 14, 'Sports': 15, 'Travel': 16}
postgresql+psycopg2://postgres:postgres@localhost:5432/dvd_rental

    select 
        country_id,
        country
    from
        country
    group by country_id
    order by country

    
{'Afghanistan': 1, 'Algeria': 2, 'American Samoa': 3, 'Angola': 4, 'Anguilla': 5, 'Argentina': 6, 'Armenia': 7, 'Australia': 8, 'Austria': 9, 'Azerbaijan': 10, 'Bahrain': 11, 'Bangladesh': 12, 'Belarus': 13, 'Bolivia': 14, 'Brazil': 15, 'Brunei': 16, 'Bulgaria': 17, 'Cambodia': 18, 'Cameroon': 19, 'Canada': 20, 'Chad': 21, 'Chile': 22, 'China': 23, 'Colombia': 24, 'Congo, The Democratic R

{'category_id': {'Action': 1,
  'Animation': 2,
  'Children': 3,
  'Classics': 4,
  'Comedy': 5,
  'Documentary': 6,
  'Drama': 7,
  'Family': 8,
  'Foreign': 9,
  'Games': 10,
  'Horror': 11,
  'Music': 12,
  'New': 13,
  'Sci-Fi': 14,
  'Sports': 15,
  'Travel': 16},
 'country_id': {'Afghanistan': 1,
  'Algeria': 2,
  'American Samoa': 3,
  'Angola': 4,
  'Anguilla': 5,
  'Argentina': 6,
  'Armenia': 7,
  'Australia': 8,
  'Austria': 9,
  'Azerbaijan': 10,
  'Bahrain': 11,
  'Bangladesh': 12,
  'Belarus': 13,
  'Bolivia': 14,
  'Brazil': 15,
  'Brunei': 16,
  'Bulgaria': 17,
  'Cambodia': 18,
  'Cameroon': 19,
  'Canada': 20,
  'Chad': 21,
  'Chile': 22,
  'China': 23,
  'Colombia': 24,
  'Congo, The Democratic Republic of the': 25,
  'Czech Republic': 26,
  'Dominican Republic': 27,
  'Ecuador': 28,
  'Egypt': 29,
  'Estonia': 30,
  'Ethiopia': 31,
  'Faroe Islands': 32,
  'Finland': 33,
  'France': 34,
  'French Guiana': 35,
  'French Polynesia': 36,
  'Gambia': 37,
  'Germany': 38

In [9]:
def result(prediction):
    """Turn a raw model output into a human-readable churn message.

    Args:
        prediction: Nested, indexable model output; only ``prediction[0][0]``
            is read, converted to ``float`` and rounded to the nearest integer.

    Returns:
        The "will not churn" message when the rounded score is at least 0.5,
        otherwise the "will churn" message.
    """
    rounded_score = round(float(prediction[0][0]))
    messages = {
        True: "The model predicts: **The customer will not churn**",
        False: "The model predicts: **The customer will churn**",
    }
    return messages[rounded_score >= 0.5]

In [10]:
# Feature names in the order of the columns of `df`, with the label column removed.
feature_names = df.columns.drop(target).tolist()

# Hand the trained model and its companions (scaler, feature names, label
# encoders and the output formatter) to the project's save helper.
modeling.save_model(
    model_name='churn',
    model_format='keras',
    model=model,
    scaler=scaler,
    target=target,
    features=feature_names,
    encoders=encoders,
    result=result,
)