In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
shoppers_data = pd.read_csv("online_shoppers_intention.csv")

In [2]:
# Display basic information about the dataset
print("\nFirst 5 rows:")
print(shoppers_data.head())


First 5 rows:
   Administrative  Administrative_Duration  Informational  \
0               0                      0.0              0   
1               0                      0.0              0   
2               0                      0.0              0   
3               0                      0.0              0   
4               0                      0.0              0   

   Informational_Duration  ProductRelated  ProductRelated_Duration  \
0                     0.0               1                 0.000000   
1                     0.0               2                64.000000   
2                     0.0               1                 0.000000   
3                     0.0               2                 2.666667   
4                     0.0              10               627.500000   

   BounceRates  ExitRates  PageValues  SpecialDay Month  OperatingSystems  \
0         0.20       0.20         0.0         0.0   Feb                 1   
1         0.00       0.10         0.0      

In [3]:
print(shoppers_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12330 entries, 0 to 12329
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Administrative           12330 non-null  int64  
 1   Administrative_Duration  12330 non-null  float64
 2   Informational            12330 non-null  int64  
 3   Informational_Duration   12330 non-null  float64
 4   ProductRelated           12330 non-null  int64  
 5   ProductRelated_Duration  12330 non-null  float64
 6   BounceRates              12330 non-null  float64
 7   ExitRates                12330 non-null  float64
 8   PageValues               12330 non-null  float64
 9   SpecialDay               12330 non-null  float64
 10  Month                    12330 non-null  object 
 11  OperatingSystems         12330 non-null  int64  
 12  Browser                  12330 non-null  int64  
 13  Region                   12330 non-null  int64  
 14  TrafficType           

In [4]:
print(shoppers_data.describe())

       Administrative  Administrative_Duration  Informational  \
count    12330.000000             12330.000000   12330.000000   
mean         2.315166                80.818611       0.503569   
std          3.321784               176.779107       1.270156   
min          0.000000                 0.000000       0.000000   
25%          0.000000                 0.000000       0.000000   
50%          1.000000                 7.500000       0.000000   
75%          4.000000                93.256250       0.000000   
max         27.000000              3398.750000      24.000000   

       Informational_Duration  ProductRelated  ProductRelated_Duration  \
count            12330.000000    12330.000000             12330.000000   
mean                34.472398       31.731468              1194.746220   
std                140.749294       44.475503              1913.669288   
min                  0.000000        0.000000                 0.000000   
25%                  0.000000        7.00000

In [5]:
print(shoppers_data.isnull().sum())

Administrative             0
Administrative_Duration    0
Informational              0
Informational_Duration     0
ProductRelated             0
ProductRelated_Duration    0
BounceRates                0
ExitRates                  0
PageValues                 0
SpecialDay                 0
Month                      0
OperatingSystems           0
Browser                    0
Region                     0
TrafficType                0
VisitorType                0
Weekend                    0
Revenue                    0
dtype: int64


In [6]:
print("\nDataset Shape:", shoppers_data.shape)


Dataset Shape: (12330, 18)


In [7]:
print("\nData types:")
print(shoppers_data.dtypes)


Data types:
Administrative               int64
Administrative_Duration    float64
Informational                int64
Informational_Duration     float64
ProductRelated               int64
ProductRelated_Duration    float64
BounceRates                float64
ExitRates                  float64
PageValues                 float64
SpecialDay                 float64
Month                       object
OperatingSystems             int64
Browser                      int64
Region                       int64
TrafficType                  int64
VisitorType                 object
Weekend                       bool
Revenue                       bool
dtype: object


In [None]:
# 2. Check for missing values , there is no missing values in the dataset since the sum of missing values is 0 and as seen in the output of the info() method
print("\nMissing values per column:")
missing_values = shoppers_data.isnull().sum()
print(missing_values)
print(f"Total missing values: {missing_values.sum()}")



Missing values per column:
Administrative             0
Administrative_Duration    0
Informational              0
Informational_Duration     0
ProductRelated             0
ProductRelated_Duration    0
BounceRates                0
ExitRates                  0
PageValues                 0
SpecialDay                 0
Month                      0
OperatingSystems           0
Browser                    0
Region                     0
TrafficType                0
VisitorType                0
Weekend                    0
Revenue                    0
dtype: int64
Total missing values: 0


In [9]:
# 3. Check for duplicates
duplicates = shoppers_data.duplicated().sum()
print(f"\nNumber of duplicate rows: {duplicates}")

# Remove duplicates if any
if duplicates > 0:
    shoppers_data = shoppers_data.drop_duplicates().reset_index(drop=True)
    print(f"Removed {duplicates} duplicate rows. New shape: {shoppers_data.shape}")




Number of duplicate rows: 125
Removed 125 duplicate rows. New shape: (12205, 18)


In [None]:
# 4. Handle missing values (if any) , since the dataset has no missing values, this step is not necessary. However, if there were missing values, we could handle them as follows:
# For numeric columns - replace with median
numeric_cols = shoppers_data.select_dtypes(include=['float64', 'int64']).columns
for col in numeric_cols:
    if shoppers_data[col].isnull().sum() > 0:
        median_value = shoppers_data[col].median()
        shoppers_data[col] = shoppers_data[col].fillna(median_value)
        print(f"Filled missing values in {col} with median: {median_value}")

# For categorical columns - replace with mode , since the dataset has no missing values, this step is not necessary. However, if there were missing values, we could handle them as follows:
categorical_cols = shoppers_data.select_dtypes(include=['object']).columns
for col in categorical_cols:
    if shoppers_data[col].isnull().sum() > 0:
        mode_value = shoppers_data[col].mode()[0]
        shoppers_data[col] = shoppers_data[col].fillna(mode_value)
        print(f"Filled missing values in {col} with mode: {mode_value}")

In [None]:
# 5. Data Transformations and Feature Engineering

# 5.1 Convert 'Month' to numerical for analysis
month_map = {
    'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'June': 6,
    'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12
}
shoppers_data['Month_Num'] = shoppers_data['Month'].map(month_map)

# 5.2 Create a season feature
def get_season(month):
    if month in [12, 1, 2]:
        return 'Winter'
    elif month in [3, 4, 5]:
        return 'Spring'
    elif month in [6, 7, 8]:
        return 'Summer'
    else:
        return 'Fall'

shoppers_data['Season'] = shoppers_data['Month_Num'].apply(get_season)

# 5.3 Calculate total time on site
shoppers_data['Total_Duration'] = (
    shoppers_data['Administrative_Duration'] + 
    shoppers_data['Informational_Duration'] + 
    shoppers_data['ProductRelated_Duration']
)

# 5.4 Calculate time per page for each page type
shoppers_data['Admin_Time_Per_Page'] = shoppers_data.apply(
    lambda x: x['Administrative_Duration'] / x['Administrative'] if x['Administrative'] > 0 else 0, 
    axis=1
)
shoppers_data['Info_Time_Per_Page'] = shoppers_data.apply(
    lambda x: x['Informational_Duration'] / x['Informational'] if x['Informational'] > 0 else 0, 
    axis=1
)
shoppers_data['Product_Time_Per_Page'] = shoppers_data.apply(
    lambda x: x['ProductRelated_Duration'] / x['ProductRelated'] if x['ProductRelated'] > 0 else 0, 
    axis=1
)

# 5.5 Calculate total pages viewed
shoppers_data['Total_Pages_Viewed'] = (
    shoppers_data['Administrative'] + 
    shoppers_data['Informational'] + 
    shoppers_data['ProductRelated']
)

# 5.6 Create an engagement score
shoppers_data['Engagement_Score'] = (
    shoppers_data['Total_Duration'] * 0.4 + 
    shoppers_data['Total_Pages_Viewed'] * 0.4 + 
    (1 - shoppers_data['BounceRates']) * 0.1 + 
    (1 - shoppers_data['ExitRates']) * 0.1
)

# 5.7 Calculate pages per minute (browsing intensity)
shoppers_data['Pages_Per_Minute'] = shoppers_data.apply(
    lambda x: x['Total_Pages_Viewed'] / (x['Total_Duration']/60) if x['Total_Duration'] > 0 else 0,
    axis=1
)

# 5.8 Create bounce rate categories
def categorize_bounce(rate):
    if rate < 0.2:
        return 'Low'
    elif rate < 0.6:
        return 'Medium'
    else:
        return 'High'
        
shoppers_data['Bounce_Category'] = shoppers_data['BounceRates'].apply(categorize_bounce)

# 5.9 Create exit rate categories
def categorize_exit(rate):
    if rate < 0.2:
        return 'Low'
    elif rate < 0.6:
        return 'Medium'
    else:
        return 'High'
        
shoppers_data['Exit_Category'] = shoppers_data['ExitRates'].apply(categorize_exit)

# 5.10 Encode visitor type as binary features
shoppers_data['Is_Returning'] = shoppers_data['VisitorType'].apply(
    lambda x: 1 if x == 'Returning_Visitor' else 0
)
shoppers_data['Is_New'] = shoppers_data['VisitorType'].apply(
    lambda x: 1 if x == 'New_Visitor' else 0
)

# 5.11 Normalize numeric features using min-max scaling
# Define columns to normalize
numeric_features = [
    'Administrative', 'Administrative_Duration', 
    'Informational', 'Informational_Duration',
    'ProductRelated', 'ProductRelated_Duration', 
    'BounceRates', 'ExitRates', 'PageValues',
    'Total_Duration', 'Total_Pages_Viewed',
    'Engagement_Score', 'Pages_Per_Minute'
]

# Apply min-max normalization
for col in numeric_features:
    col_min = shoppers_data[col].min()
    col_max = shoppers_data[col].max()
    # Avoid division by zero
    if col_max - col_min > 0:
        shoppers_data[f'{col}_Norm'] = (shoppers_data[col] - col_min) / (col_max - col_min)
    else:
        shoppers_data[f'{col}_Norm'] = 0
        print(f"Warning: {col} has min=max, normalization set to 0")

# 5.12 Create a binary indicator for weekend
shoppers_data['Is_Weekend'] = shoppers_data['Weekend'].astype(int)

In [12]:
# 6. Check the transformed data
print("\nAfter transformations, dataset shape:", shoppers_data.shape)
print("\nNew features added:")
new_features = [
    'Month_Num', 'Season', 'Total_Duration', 'Admin_Time_Per_Page', 
    'Info_Time_Per_Page', 'Product_Time_Per_Page', 'Total_Pages_Viewed',
    'Engagement_Score', 'Pages_Per_Minute', 'Bounce_Category', 'Exit_Category',
    'Is_Returning', 'Is_New', 'Is_Weekend'
]
print(", ".join(new_features))


After transformations, dataset shape: (12205, 45)

New features added:
Month_Num, Season, Total_Duration, Admin_Time_Per_Page, Info_Time_Per_Page, Product_Time_Per_Page, Total_Pages_Viewed, Engagement_Score, Pages_Per_Minute, Bounce_Category, Exit_Category, Is_Returning, Is_New, Is_Weekend


In [13]:
# 7. Look for outliers in key metrics
print("\nChecking for outliers in key metrics:")
key_metrics = ['Total_Duration', 'Total_Pages_Viewed', 'Engagement_Score']
for col in key_metrics:
    Q1 = shoppers_data[col].quantile(0.25)
    Q3 = shoppers_data[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = shoppers_data[(shoppers_data[col] < lower_bound) | (shoppers_data[col] > upper_bound)]
    print(f"{col}: {len(outliers)} outliers detected ({len(outliers)/len(shoppers_data)*100:.2f}%)")


Checking for outliers in key metrics:
Total_Duration: 922 outliers detected (7.55%)
Total_Pages_Viewed: 971 outliers detected (7.96%)
Engagement_Score: 918 outliers detected (7.52%)


In [None]:
# 8. Saving the processed dataset
shoppers_data.to_csv('processed_online_shoppers_intention.csv', index=False)
print("\nProcessed data saved to 'processed_online_shoppers_intention.csv'")


Processed data saved to 'processed_online_shoppers_intention.csv'


In [15]:
# 9. Show summary statistics of the processed data
print("\nSummary statistics of key features:")
print(shoppers_data[key_metrics].describe())


Summary statistics of key features:
       Total_Duration  Total_Pages_Viewed  Engagement_Score
count    12205.000000        12205.000000      12205.000000
mean      1323.454242           34.893240        543.532809
std       2043.871589           46.627336        833.585514
min          0.000000            0.000000          0.160000
25%        231.666667            9.000000         97.385714
50%        690.958333           20.000000        285.665128
75%       1643.958333           42.000000        675.981531
max      69921.647230          746.000000      28152.856045


In [None]:
# 10. Summary of the preparation steps of the dataset 
print("\nData Preparation Summary:")
print(f"- Initial dataset: {shoppers_data.shape[0]} rows, {shoppers_data.shape[1] - len(new_features)} columns")
print(f"- Missing values handled: {missing_values.sum()}")
print(f"- Duplicates removed: {duplicates}")
print(f"- New features created: {len(new_features)}")
print(f"- Final dataset: {shoppers_data.shape[0]} rows, {shoppers_data.shape[1]} columns")


Data Preparation Summary:
- Initial dataset: 12205 rows, 31 columns
- Missing values handled: 0
- Duplicates removed: 125
- New features created: 14
- Final dataset: 12205 rows, 45 columns
