In [4]:
import pandas as pd

# Load dataset
df = pd.read_csv('online_shoppers_intention.csv')

print("Shape:", df.shape)
print("\nColumns:", df.columns.tolist())
print("\nInfo:")
print(df.info())

print("\nFirst 5 rows:\n", df.head())

print("\nRevenue value counts:")
print(df['Revenue'].value_counts())
print("\nPercentage buyers: {:.2f}%".format(df['Revenue'].mean() * 100))

print("\nMissing values per column:\n", df.isnull().sum())

# Numeric stats
print("\nDescribe numeric columns:\n", df.select_dtypes(include=['float64','int64']).describe())


Shape: (12330, 18)

Columns: ['Administrative', 'Administrative_Duration', 'Informational', 'Informational_Duration', 'ProductRelated', 'ProductRelated_Duration', 'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay', 'Month', 'OperatingSystems', 'Browser', 'Region', 'TrafficType', 'VisitorType', 'Weekend', 'Revenue']

Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12330 entries, 0 to 12329
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Administrative           12330 non-null  int64  
 1   Administrative_Duration  12330 non-null  float64
 2   Informational            12330 non-null  int64  
 3   Informational_Duration   12330 non-null  float64
 4   ProductRelated           12330 non-null  int64  
 5   ProductRelated_Duration  12330 non-null  float64
 6   BounceRates              12330 non-null  float64
 7   ExitRates                12330 non-null  float64
 8   PageValues        

In [5]:
zero_activity = (
    (df['Administrative'] == 0) &
    (df['Informational'] == 0) &
    (df['ProductRelated'] == 0)
)
zero_duration = (
    (df['Administrative_Duration'] == 0) &
    (df['Informational_Duration'] == 0) &
    (df['ProductRelated_Duration'] == 0)
)
df = df[~(zero_activity | zero_duration)].copy()
print("Shape after noise removal:", df.shape)

num_cols = [
    'Administrative', 'Administrative_Duration', 'Informational', 'Informational_Duration',
    'ProductRelated', 'ProductRelated_Duration', 'BounceRates', 'ExitRates', 'PageValues'
]
for col in num_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    df = df[(df[col] >= lower) & (df[col] <= upper)]
print("Shape after outlier removal:", df.shape)

df.to_csv('online_shoppers_cleaned_V4.csv', index=False)
print("Cleaned data saved.")

Shape after noise removal: (11610, 18)
Shape after outlier removal: (5201, 18)
Cleaned data saved.
