In [59]:
import pandas as pd
import numpy as np
from faker import Faker
from faker.providers import internet
fake = Faker(['en_US'])
fake.add_provider(internet)

In [60]:
interaction_probs = {
    'brand': 0.7,
    'product_type': 0.5,
    'size': 0.9,
    'lens_color': 0.7,
    'price_range': 0.65,
    'prescription_glasses': 0.95,
    'polarized_glasses': 0.6
}

In [61]:
users = pd.read_csv('../data/users.csv')
products = pd.read_csv('../data/product.csv')

In [62]:
users.shape

(3000, 8)

In [63]:
products.head()

Unnamed: 0,Brand,Product_Name,Price,Polarized Glasses,Prescibed Glasses,Frame_Color,Lense_Color,Eye_Size,Item ID
0,Ray-Ban,Aviator RX6489,201.6,No,Yes,Silver,Clear,55,79189032
1,Valley,Tomahawk,260.0,No,No,Crystal with Silver Metal,Black,55,39878751
2,Oakley,Radar EV Path,333.9,No,No,Matte Grey Ink,Prizm Deep Water Polarised,55,39775104
3,Ray-Ban,Jim RB3694,219.6,Yes,No,Havana on Gunmetal,Vintage Brown Black Gradient Glass,55,31485446
4,Oakley,BiSphaera,300.6,No,No,Matte Carbon,Prizm,59,61448395


In [64]:
users.rename(columns={'User ID': 'user_id', 'First Name': 'first_name', 'Last Name': 'last_name', 'Email': 'email', 'Gender': 'gender', 'Postcode': 'postcode', 'Country': 'country'}, inplace=True)
products.rename(columns={'Brand': 'brand', 'Product_Name': 'product_name', 'Price': 'price', 'Polarized Glasses': 'polarized_glasses', 'Lense_Color': 'lens_color', 'Frame_Color': 'frame_color', 'Prescibed Glasses': 'prescribed_glasses', 'Item ID': 'item_id', 'Eye_Size': 'eye_size'}, inplace=True)

In [65]:
products['price_range'] = pd.cut(products['price'], bins=[0, 150, 400, 1000], labels=['simple', 'medium', 'high'])
products.drop(columns=['price'], inplace=True)


In [66]:
products.head()

Unnamed: 0,brand,product_name,polarized_glasses,prescribed_glasses,frame_color,lens_color,eye_size,item_id,price_range
0,Ray-Ban,Aviator RX6489,No,Yes,Silver,Clear,55,79189032,medium
1,Valley,Tomahawk,No,No,Crystal with Silver Metal,Black,55,39878751,medium
2,Oakley,Radar EV Path,No,No,Matte Grey Ink,Prizm Deep Water Polarised,55,39775104,medium
3,Ray-Ban,Jim RB3694,Yes,No,Havana on Gunmetal,Vintage Brown Black Gradient Glass,55,31485446,medium
4,Oakley,BiSphaera,No,No,Matte Carbon,Prizm,59,61448395,medium


In [67]:
products.describe()

Unnamed: 0,eye_size,item_id
count,649.0,649.0
mean,54.701079,53993910.0
std,3.280604,26258490.0
min,48.0,10270840.0
25%,53.0,31478330.0
50%,54.0,52381430.0
75%,57.0,77334390.0
max,62.0,99883680.0


In [68]:
products.head()

Unnamed: 0,brand,product_name,polarized_glasses,prescribed_glasses,frame_color,lens_color,eye_size,item_id,price_range
0,Ray-Ban,Aviator RX6489,No,Yes,Silver,Clear,55,79189032,medium
1,Valley,Tomahawk,No,No,Crystal with Silver Metal,Black,55,39878751,medium
2,Oakley,Radar EV Path,No,No,Matte Grey Ink,Prizm Deep Water Polarised,55,39775104,medium
3,Ray-Ban,Jim RB3694,Yes,No,Havana on Gunmetal,Vintage Brown Black Gradient Glass,55,31485446,medium
4,Oakley,BiSphaera,No,No,Matte Carbon,Prizm,59,61448395,medium


In [69]:
products.price_range.value_counts()

price_range
medium    457
high      175
simple     17
Name: count, dtype: int64

In [41]:
interactions = []
for user in users.itertuples():
    num_interactions = max(0, int(np.random.normal(2, 3)))
    for _ in range(num_interactions):
        product = products.sample(1).iloc[0]
        action = fake.random_element(elements=('purchased', 'added to cart', 'viewed'))
        interactions.append((user.user_id, product.item_id, action))

In [62]:
interactions_df = pd.DataFrame(interactions, columns=['user_id', 'item_id', 'action'])

In [63]:
interactions_df.head()

Unnamed: 0,user_id,item_id,action
0,9182230,16254540,purchased
1,9182230,47809062,viewed
2,9182230,59058335,added to cart
3,9182230,56177215,viewed
4,9182230,30551853,purchased


In [64]:
interactions_df.shape

(6264, 3)

In [32]:
interactions_df[interactions_df['user_id'] == 40769].value_counts().sum()

np.int64(5)

In [33]:
# show the prouct table with item_id as 58991390, 32575973, 79943660, 88600039, 30551853
products[products['item_id'].isin([58991390, 32575973, 79943660, 88600039, 30551853])]


Unnamed: 0,brand,product_name,price,polarized,prescription,frame_color,Lense_Color,eye_size,item_id
189,Prada Linea Rossa,PS06PV,324.9,No,Prescribed Glasses,Black Rubber,Clear,,88600039
307,Dolce & Gabbana,DG4416,334.8,No,Prescribed Sunglasses,Leopard,Dark Grey,,79943660
449,Oakley Youth,Capacitor,133.0,No,Prescribed Sunglasses,Polished Stonewash,Prizm Road,,58991390
564,Coach,HC8392U,215.9,Yes,Prescribed Sunglasses,Black,Grey Gradient,,30551853
638,Michael Kors,Karlie MK2170U,228.6,Yes,Prescribed Sunglasses,Black,Grey Gradient Polarised,,32575973


In [34]:
interactions_df['action'].value_counts()

action
added to cart    2206
viewed           2171
purchased        2155
Name: count, dtype: int64

---

In [70]:
import numpy as np
import pandas as pd
from faker import Faker

# Initialize Faker instance
fake = Faker()

# Define interaction probabilities for product attributes
interaction_probs = {
    'brand': 0.7,
    'product_name': 0.5,
    'eye_size': 0.9,
    'lens_color': 0.7,
    'price_range': 0.65,
    'prescribed_glasses': 0.95,
    'polarized_glasses': 0.6
}

# Define action probabilities
action_probs = {
    'purchased': 0.5,
    'added_to_cart': 0.3,
    'viewed': 0.2
}

# Generate number of interactions per user with a normal distribution, ensuring range [1, 50]
num_users = len(users)
target_interactions = 100000  # Target total interactions

# Generate normal distribution interaction counts (mean=25, std=10), limit between 1 and 50
user_interactions = np.random.normal(loc=25, scale=10, size=num_users)
user_interactions = np.clip(user_interactions, 1, 50).astype(int)  # Keep interactions between 1 and 50

# Rescale the interactions to match the target number of interactions
scaled_interactions = (user_interactions / user_interactions.sum()) * target_interactions
scaled_interactions = np.round(scaled_interactions).astype(int)

# Ensure that after scaling, no user exceeds 50 interactions
scaled_interactions = np.clip(scaled_interactions, 1, 50)

# Initialize dictionary to store previous user preferences
user_preferences = {}

interactions = []

# Product attributes to track for each user
tracked_attributes = ['brand', 'product_name', 'eye_size', 'lens_color', 'price_range', 'polarized_glasses', 'prescribed_glasses']

# Iterate over each user and generate their interactions
for user, num_interactions in zip(users.itertuples(), scaled_interactions):
    # Initialize user preferences if they don't exist
    user_id = user.user_id
    if user_id not in user_preferences:
        user_preferences[user_id] = {attr: None for attr in tracked_attributes}
    
    for _ in range(num_interactions):
        # Sample a product for this interaction (new random product)
        product = products.sample(1).iloc[0]
        
        # For each tracked attribute, decide whether to stick with the previous choice or pick a new one
        selected_attributes = {}
        for attr in tracked_attributes:
            if user_preferences[user_id][attr] and np.random.rand() < interaction_probs[attr]:
                # Stick with the previous choice
                selected_attributes[attr] = user_preferences[user_id][attr]
            else:
                # Choose a new attribute from the product
                selected_attributes[attr] = product[attr]
        
        # Update user preferences with the selected product attributes
        for attr in tracked_attributes:
            user_preferences[user_id][attr] = selected_attributes[attr]
        
        # Use np.random.choice to generate a random action based on probabilities
        action = np.random.choice(
            ['purchased', 'added_to_cart', 'viewed'],
            p=[action_probs['purchased'], action_probs['added_to_cart'], action_probs['viewed']]
        )
        
        # Append the interaction tuple (user_id, item_id, Action) to the interactions list
        interactions.append((user.user_id,
                             product.item_id, 
                             action,
                             selected_attributes['brand'],
                             selected_attributes['product_name'],
                             selected_attributes['eye_size'],
                             selected_attributes['lens_color'],
                             selected_attributes['price_range'],
                             selected_attributes['polarized_glasses'],
                             selected_attributes['prescribed_glasses']))

# Convert to a DataFrame for further processing or storage
interactions_df = pd.DataFrame(interactions, columns=[
    'user_id', 'item_id', 'action', 'brand', 'product_name', 'eye_size', 'lens_color', 'price_range', 'polarized_glasses', 'prescribed_glasses'
])

# Check if interactions are within valid limits
assert interactions_df['user_id'].nunique() == num_users, "Not all users are included!"
assert interactions_df.groupby('user_id').size().max() <= 50, "Some users have more than 50 interactions!"
assert interactions_df.groupby('user_id').size().min() >= 1, "Some users have less than 1 interaction!"

In [71]:
interactions_df.shape

(97814, 10)

In [76]:
interactions_df.user_id.nunique()

3000

In [77]:
interactions_df.user_id.value_counts().min()

np.int64(1)

In [78]:
interactions_df.user_id.value_counts().max()

np.int64(50)

In [79]:
users = pd.read_csv('../data/users.csv')

users.rename(columns={'User ID': 'user_id', 'First Name': 'first_name', 'Last Name': 'last_name', 'Email': 'email', 'Age': 'age', 'Gender': 'gender', 'Postcode': 'postcode', 'Country': 'country'}, inplace=True)

# add the user details to the interactions_df
new_df = users.merge(interactions_df, on='user_id')

In [80]:
# mix the data
save_df = new_df.sample(frac=1).reset_index(drop=True)

In [81]:
save_df.shape

(97814, 17)

In [82]:
save_df.head()

Unnamed: 0,user_id,first_name,last_name,email,age,gender,postcode,country,item_id,action,brand,product_name,eye_size,lens_color,price_range,polarized_glasses,prescribed_glasses
0,8356259,James,Salas,James.Salas.DDS@ross.net,42,Male,3720,Australia,43796976,added_to_cart,Versace,1739/F/SK,56,Clear,medium,No,Yes
1,6612383,Rebecca,Patterson,Rebecca.Patterson@wade.net,61,Male,3059,Australia,96224516,viewed,Prada,PRA08V,50,Clear,medium,No,Yes
2,3844909,Meagan,Kirk,Meagan.Kirk@carey-manning.org,69,Female,2582,Australia,46937700,purchased,Ray-Ban,Kat RX7327,56,Clear,medium,No,Yes
3,3094472,Leah,Mitchell,Leah.Mitchell@fisher.com.au,33,Female,4892,Australia,68796712,added_to_cart,Dolce & Gabbana,PR15WS,57,Clear,medium,No,No
4,6234966,Katie,Richards,Katie.Richards@wilson.biz,20,Female,6323,Australia,63577055,added_to_cart,Valley,VE3338,61,Dark Green Polarised,high,No,Yes


In [83]:
save_df.to_csv('../data/merged_data.csv', index=False)

In [84]:
save_df.item_id.value_counts().sort_values(ascending=False).head()

item_id
68135025    189
72385789    189
94007454    187
61448395    183
63579042    183
Name: count, dtype: int64

---

In [131]:
import pandas as pd
import numpy as np
from faker import Faker

# Define interaction probabilities
interaction_probs = {
    'brand': 0.7,
    'product_name': 0.5,
    'eye_size': 0.9,
    'lens_color': 0.7,
    'price': 0.65,
    'prescription': 0.95,
    'polarized': 0.6
}

interaction_type_probs = ['purchased'] * 60 + ['added_to_cart'] * 30 + ['viewed'] * 10

interactions = []
total_interactions = 100000
user_interactions = np.random.normal(2, 3, len(users))
user_interactions = np.clip(user_interactions, 0, 10).astype(int)

# Adjust the total number of interactions to sum up to 100,000
scaling_factor = total_interactions / user_interactions.sum()
user_interactions = (user_interactions * scaling_factor).astype(int)

# Ensure total interactions are exactly 100,000
while user_interactions.sum() != total_interactions:
    diff = total_interactions - user_interactions.sum()
    indices = np.random.choice(len(user_interactions), abs(diff))
    user_interactions[indices] += np.sign(diff)

# Ensure all user IDs are used
user_interactions = np.clip(user_interactions, 0, 10)
remaining_interactions = total_interactions - user_interactions.sum()
additional_interactions = np.random.choice(len(users), remaining_interactions)
for idx in additional_interactions:
    if user_interactions[idx] < 10:
        user_interactions[idx] += 1

for user, num_interactions in zip(users.itertuples(), user_interactions):
    for _ in range(num_interactions):
        product = products.sample(1).iloc[0]
        action = np.random.choice(interaction_type_probs)
        interactions.append((user.user_id, product.item_id, action))

# Create User Interaction Table
interactions_df = pd.DataFrame(interactions, columns=['user_id', 'item_id', 'Action'])

In [132]:
interactions_df.shape

(30000, 3)

In [1]:
interactions_df.user_id.value_counts().mean()

NameError: name 'interactions_df' is not defined

In [125]:
interactions_df.user_id.nunique()

3000

In [127]:
interactions_df[interactions_df['user_id']==33257].value_counts().sum()

np.int64(10)

In [113]:
users[users['user_id'] == 78804]

Unnamed: 0,user_id,first_name,last_name,email,Age,gender,postcode,country
2361,78804,Dakota,Henry,Dakota.Henry@osborne.edu,67,Female,3685,Australia


In [114]:
products[products['item_id'].isin([24718532,98350457,29419077,74716998,72027396])]

Unnamed: 0,brand,product_name,price,polarized,prescription,frame_color,Lense_Color,eye_size,item_id
167,Oakley,Port Bow OX8164,174.6,No,Prescribed Glasses,Polished Clear,Clear,53 Eye Size,29419077
274,Miu Miu,MU56ZS,538.2,No,Prescribed Sunglasses,Gold,Light Purple Brown,,74716998
364,Ray-Ban,New Round RB3637,232.2,No,Prescribed Sunglasses,Gold,Blue Gradient Glass,53 Eye Size,98350457
432,Oakley,Cables,179.1,No,Prescribed Sunglasses,Blue Steel,Prizm Sapphire,,24718532
532,Ray-Ban,RB3751,196.2,Yes,Prescribed Sunglasses,Black,Green Glass,58 Eye Size,72027396
