In [None]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Read only the first 200,000 rows of each monthly file (nrows takes the
# leading rows, not a random sample) so the notebook stays fast to re-run.
df_oct, df_nov = (
    pd.read_csv(csv_name, parse_dates=['event_time'], nrows=200000)
    for csv_name in ("2019-Oct.csv", "2019-Nov.csv")
)

# Stack both months into one frame with a fresh 0..n-1 index
df_all = pd.concat([df_oct, df_nov], ignore_index=True)

# Report the combined dimensions, then preview the first rows as the cell output
print(f"Shape of combined data: {df_all.shape}")
df_all.head()


: 

In [None]:
# Basic information about the dataset (info() writes its report directly to stdout)
df_all.info()

# Summary statistics for numerical columns.
# A notebook cell renders only its final expression, so this intermediate
# result has to be printed explicitly or it is silently discarded.
print(df_all.describe())

# List all columns (shown as the cell's output because it is the last expression)
df_all.columns


In [None]:
# Count null entries per column
missing = df_all.isna().sum()

# Show only the columns that actually contain nulls, worst offenders first
has_nulls = missing.gt(0)
missing.loc[has_nulls].sort_values(ascending=False)


In [None]:
# Tally how often each event type occurs
event_counts = df_all['event_type'].value_counts()

# Bar chart of the tallies, styled through the Axes that pandas returns
ax = event_counts.plot.bar(figsize=(8, 5))
ax.set_title("Distribution of Event Types")
ax.set_xlabel("Event Type")
ax.set_ylabel("Count")
ax.grid(axis='y', linestyle='--', alpha=0.5)
plt.show()

# Share of each event type, expressed as a percentage of all events
total_events = event_counts.sum()
print(event_counts / total_events * 100)


In [None]:
# Derive time-based features from event_time via its datetime accessor.
# NOTE(review): values follow whatever timezone event_time carries — confirm.
event_dt = df_all['event_time'].dt
df_all['hour'] = event_dt.hour                # hour of day
df_all['day_of_week'] = event_dt.dayofweek    # day-of-week number


In [None]:
# Count plot of events for each hour of the day, drawn on an explicit Axes.
# NOTE(review): recent seaborn releases warn when `palette` is passed without
# `hue` — confirm against the installed version before changing the call.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='hour', data=df_all, palette='viridis', ax=ax)
ax.set(
    title="Events per Hour",
    xlabel="Hour of the Day",
    ylabel="Count of Events",
)
ax.grid(axis='y', linestyle='--', alpha=0.5)
plt.show()


In [None]:
# Count plot of events for each day of the week, drawn on an explicit Axes.
# NOTE(review): recent seaborn releases warn when `palette` is passed without
# `hue` — confirm against the installed version before changing the call.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='day_of_week', data=df_all, palette='magma', ax=ax)
ax.set(
    title="Events per Day of Week",
    xlabel="Day of Week (0=Monday)",
    ylabel="Count of Events",
)
ax.grid(axis='y', linestyle='--', alpha=0.5)
plt.show()


In [None]:
# Ten most frequent category codes (value_counts is already sorted descending)
top_categories = df_all['category_code'].value_counts().head(10)

# Bar chart of those categories, styled through the Axes that pandas returns
ax = top_categories.plot.bar(figsize=(10, 5))
ax.set_title("Top 10 Most Frequent Categories")
ax.set_xlabel("Category")
ax.set_ylabel("Count")
ax.grid(axis='y', linestyle='--', alpha=0.5)

# Tilt the category labels so long names are less likely to collide
plt.xticks(rotation=45)
plt.show()
