In [5]:
import pandas as pd

# Relative path of the raw event log (the 2019-Oct CSV of the store dataset).
file_path = 'ecommerce-behavior-data-from-multi-category-store/2019-Oct.csv'

# Read the complete CSV into one in-memory DataFrame.
df = pd.read_csv(file_path)

# Preview the top rows to check the column layout.
preview = df.head()
print("First 5 rows of the dataset:")
print(preview)

# Per-column dtypes and non-null counts, plus the overall memory footprint.
print("\nDataset information:")
df.info(show_counts=True)


First 5 rows of the dataset:
                event_time event_type  product_id          category_id  \
0  2019-10-01 00:00:00 UTC       view    44600062  2103807459595387724   
1  2019-10-01 00:00:00 UTC       view     3900821  2053013552326770905   
2  2019-10-01 00:00:01 UTC       view    17200506  2053013559792632471   
3  2019-10-01 00:00:01 UTC       view     1307067  2053013558920217191   
4  2019-10-01 00:00:04 UTC       view     1004237  2053013555631882655   

                         category_code     brand    price    user_id  \
0                                  NaN  shiseido    35.79  541312140   
1  appliances.environment.water_heater      aqua    33.20  554748717   
2           furniture.living_room.sofa       NaN   543.10  519107250   
3                   computers.notebook    lenovo   251.74  550050854   
4               electronics.smartphone     apple  1081.98  535871217   

                           user_session  
0  72d76fde-8bb3-4e00-8c23-a032dfed738c  
1  9333df

In [4]:
# --- 1. Discard events that carry no session id ---
# Kept in place on purpose: the frame is mutated rather than rebound.
df.dropna(subset=['user_session'], inplace=True)

# --- 2. Replace missing labels with an explicit placeholder ---
# Each filled column is assigned back so the change is visible in `df`.
for column in ('category_code', 'brand'):
    df[column] = df[column].fillna('unknown')

# --- 3. Tighten the dtypes ---
# Timestamps become real datetimes; the label columns become categoricals.
df['event_time'] = pd.to_datetime(df['event_time'])
for column in ('event_type', 'category_code', 'brand'):
    df[column] = df[column].astype('category')

# --- 4. Check the result ---
print("Cleaned dataset information:")
df.info(show_counts=True)

Cleaned dataset information:
<class 'pandas.core.frame.DataFrame'>
Index: 42448762 entries, 0 to 42448763
Data columns (total 9 columns):
 #   Column         Non-Null Count     Dtype              
---  ------         --------------     -----              
 0   event_time     42448762 non-null  datetime64[ns, UTC]
 1   event_type     42448762 non-null  category           
 2   product_id     42448762 non-null  int64              
 3   category_id    42448762 non-null  int64              
 4   category_code  42448762 non-null  category           
 5   brand          42448762 non-null  category           
 6   price          42448762 non-null  float64            
 7   user_id        42448762 non-null  int64              
 8   user_session   42448762 non-null  object             
dtypes: category(3), datetime64[ns, UTC](1), float64(1), int64(3), object(1)
memory usage: 2.4+ GB


In [5]:
# Order the events by session and then by timestamp, so that the product lists
# built below follow the order in which the events happened.
print("Sorting events by session and time...")
sort_keys = ['user_session', 'event_time']
df.sort_values(by=sort_keys, inplace=True)

# Collect every session's product ids into one Python list per session.
print("Grouping events into sessions...")
products_by_session = df.groupby('user_session')['product_id']
sessions = products_by_session.apply(list)

print("\nHere are the first 5 sessions we created:")
print(sessions.head())

# One row per session, so the row count is the number of distinct sessions.
session_count = sessions.shape[0]
print(f"\nTotal unique sessions in October: {session_count:,}")

Sorting events by session and time...
Grouping events into sessions...

Here are the first 5 sessions we created:
user_session
00000042-3e3f-42f9-810d-f3d264139c50                                 [54900011, 54900011]
00000056-a206-40dd-b174-a072550fa38c    [1005115, 1005105, 1005105, 5100816, 1004858, ...
00000083-8816-4d58-a9b8-f52f54186edc    [1004768, 1005098, 1005073, 1004871, 1004751, ...
000001fd-1f89-45e8-a3ce-fe3218cabfad    [1004856, 1004856, 1004863, 1004834, 1004834, ...
000003eb-b63e-45d9-9f26-f229057c654a                                            [2501061]
Name: product_id, dtype: object

Total unique sessions in October: 9,244,421


In [6]:
# Collapse runs of identical neighbouring items into a single item
def remove_consecutive_duplicates(item_list):
    """Return a new list in which adjacent repeats of the same item are merged.

    Only neighbouring duplicates are removed; an item may still appear again
    later in the result (e.g. ``[1, 1, 2, 1]`` becomes ``[1, 2, 1]``).
    The input is not modified. An empty (or otherwise falsy) input gives ``[]``.
    """
    if not item_list:
        return []
    deduped = []
    for position, candidate in enumerate(item_list):
        # The first item is always kept; later ones only when they differ
        # from the most recently kept item.
        if position == 0 or candidate != deduped[-1]:
            deduped.append(candidate)
    return deduped

# Merge repeated neighbouring product ids inside every session.
print("Cleaning sessions by removing consecutive duplicates...")
cleaned_sessions = sessions.apply(remove_consecutive_duplicates)

# Keep only the sessions that still contain at least two products.
print("Filtering out short sessions...")
session_lengths = cleaned_sessions.map(len)
has_enough_items = session_lengths >= 2
final_sessions = cleaned_sessions[has_enough_items]


# Before/after counts and a small sample of what is left.
print(f"\nOriginal number of sessions: {len(sessions):,}")
print(f"Number of sessions after cleaning: {len(final_sessions):,}")
print("\nHere are the first 5 cleaned sessions:")
print(final_sessions.head())

Cleaning sessions by removing consecutive duplicates...
Filtering out short sessions...

Original number of sessions: 9,244,421
Number of sessions after cleaning: 4,619,239

Here are the first 5 cleaned sessions:
user_session
00000056-a206-40dd-b174-a072550fa38c    [1005115, 1005105, 5100816, 1004858, 1005104, ...
00000083-8816-4d58-a9b8-f52f54186edc    [1004768, 1005098, 1005073, 1004871, 1004751, ...
000001fd-1f89-45e8-a3ce-fe3218cabfad                          [1004856, 1004863, 1004834]
0000047e-bdcc-4854-9e8d-9da7f84010ae                                   [2701673, 2701773]
00000809-9101-4e4b-9795-e6cbafccfe19                 [2900090, 2900802, 2900803, 2900802]
Name: product_id, dtype: object


In [7]:
# Persist the cleaned sessions so later cells can reload them from disk.
# Each session list is written to the CSV as its text representation.
sessions_csv_path = 'october_sessions_cleaned.csv'
final_sessions.to_csv(sessions_csv_path)

print(f"\nCleaned sessions have been saved to '{sessions_csv_path}'")


Cleaned sessions have been saved to 'october_sessions_cleaned.csv'


In [11]:
import pandas as pd
from collections import defaultdict
import ast
import pickle

# --- Rebuild the co-occurrence model from the saved sessions ---

# Read the cleaned sessions back. Each list was written to the CSV as text,
# so the first column is parsed into real Python lists again.
print("Loading cleaned sessions...")
df_sessions_df = pd.read_csv('october_sessions_cleaned.csv', index_col=0)
df_sessions = df_sessions_df.iloc[:, 0].apply(ast.literal_eval)


def nested_dd():
    """Return a fresh ``defaultdict(int)`` holding one item's follower counts."""
    return defaultdict(int)


# A named function (not a lambda) is used as the default factory.
# NOTE(review): pickle stores functions by reference, so code that loads the
# saved model presumably needs `nested_dd` defined under the same name - confirm.
co_occurrence_matrix = defaultdict(nested_dd)

print("Building co-occurrence matrix...")
for session in df_sessions:
    # Count every transition from an item to the item directly after it.
    for current_item, next_item in zip(session, session[1:]):
        co_occurrence_matrix[current_item][next_item] += 1

print("Co-occurrence matrix built successfully!")

# --- Write the model to disk ---
print("Saving the model...")
with open('co_occurrence_model.pkl', 'wb') as model_file:
    pickle.dump(co_occurrence_matrix, model_file)

print("Model has been saved successfully to 'co_occurrence_model.pkl'")

Loading cleaned sessions...
Building co-occurrence matrix...
Co-occurrence matrix built successfully!
Saving the model...
Model has been saved successfully to 'co_occurrence_model.pkl'


In [12]:
import pickle

# Serialize the co-occurrence matrix to disk with pickle.
# NOTE(review): the co-occurrence cell before this one already dumps the same
# object to the same path, so this cell looks redundant - confirm before removing.
model_path = 'co_occurrence_model.pkl'
with open(model_path, 'wb') as model_file:
    pickle.dump(co_occurrence_matrix, model_file)

print(f"Model (co-occurrence matrix) has been saved to '{model_path}'")

Model (co-occurrence matrix) has been saved to 'co_occurrence_model.pkl'


In [6]:
from gensim.models import Word2Vec
import pandas as pd
import ast

# Load your cleaned sessions again
print("Loading cleaned sessions...")

# Load the data into a DataFrame (the session id column becomes the index)
df_sessions_df = pd.read_csv('october_sessions_cleaned.csv', index_col=0)

# Select the first (and only) column to create the Series
df_sessions = df_sessions_df.iloc[:, 0]

# Each session was stored as the text of a Python list; parse it back into a list
sessions_as_lists = df_sessions.apply(ast.literal_eval)

# The model needs the product_ids to be strings
print("Converting product IDs to strings...")
sessions_as_strings = [[str(item) for item in session] for session in sessions_as_lists]

# --- Train the Word2Vec Model ---
print("Training Word2Vec model... (This will take a few minutes)")
w2v_model = Word2Vec(
    sentences=sessions_as_strings,
    vector_size=100,  # The dimensionality of the product vectors
    window=5,         # Max distance between current and predicted product within a session
    min_count=5,      # Ignores all products with total frequency lower than this
    workers=4         # Use 4 worker threads to train the model
)
print("Model training complete.")

# --- Save the new model ---
w2v_model.save("product_w2v.model")
print("Word2Vec model saved to 'product_w2v.model'")

# --- Let's Test It ---
# Products seen fewer than `min_count` times are left out of the vocabulary, and
# most_similar() raises KeyError for a key it does not know. So instead of
# blindly taking the very first product, take the first one the model contains.
example_item_id = next(
    (
        item
        for session in sessions_as_strings
        for item in session
        if item in w2v_model.wv
    ),
    None,
)

if example_item_id is None:
    # Nothing survived the min_count cut-off, so there is nothing to query.
    print("\nNo product from the sessions is in the model vocabulary; skipping the similarity check.")
else:
    # Find the top 5 most similar items
    print(f"\nTop 5 most similar items to {example_item_id}:")
    similar_items = w2v_model.wv.most_similar(example_item_id, topn=5)
    print(similar_items)

Loading cleaned sessions...
Converting product IDs to strings...
Training Word2Vec model... (This will take a few minutes)
Model training complete.
Word2Vec model saved to 'product_w2v.model'

Top 5 most similar items to 1005115:
[('1005135', 0.8431757092475891), ('1005104', 0.8268886208534241), ('1004249', 0.81839519739151), ('1003317', 0.8180369138717651), ('1004258', 0.8162089586257935)]
