In [1]:
import pandas as pd

In [2]:
data = pd.read_csv("online_shoppers_intention.csv")
data.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


# Check for missing values


In [3]:
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12330 entries, 0 to 12329
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Administrative           12330 non-null  int64  
 1   Administrative_Duration  12330 non-null  float64
 2   Informational            12330 non-null  int64  
 3   Informational_Duration   12330 non-null  float64
 4   ProductRelated           12330 non-null  int64  
 5   ProductRelated_Duration  12330 non-null  float64
 6   BounceRates              12330 non-null  float64
 7   ExitRates                12330 non-null  float64
 8   PageValues               12330 non-null  float64
 9   SpecialDay               12330 non-null  float64
 10  Month                    12330 non-null  object 
 11  OperatingSystems         12330 non-null  int64  
 12  Browser                  12330 non-null  int64  
 13  Region                   12330 non-null  int64  
 14  TrafficType           

# Check for unique values


In [4]:
categorical_columns = ['Month', 'VisitorType', 'Weekend', 'Revenue']

for column in categorical_columns:
    print(f"{column}: {data[column].value_counts()}")
    print("")

Month: Month
May     3364
Nov     2998
Mar     1907
Dec     1727
Oct      549
Sep      448
Aug      433
Jul      432
June     288
Feb      184
Name: count, dtype: int64

VisitorType: VisitorType
Returning_Visitor    10551
New_Visitor           1694
Other                   85
Name: count, dtype: int64

Weekend: Weekend
False    9462
True     2868
Name: count, dtype: int64

Revenue: Revenue
False    10422
True      1908
Name: count, dtype: int64



# Apply oridinal encoding


In [5]:
from sklearn.preprocessing import OrdinalEncoder
ordinal_encoder = OrdinalEncoder()

categorical_columns = ['Month', 'VisitorType', 'Weekend', 'Revenue']
data_encoded = ordinal_encoder.fit_transform(data[categorical_columns])
X_encoded = pd.DataFrame(data_encoded, columns=categorical_columns)
X_encoded.head()

Unnamed: 0,Month,VisitorType,Weekend,Revenue
0,2.0,2.0,0.0,0.0
1,2.0,2.0,0.0,0.0
2,2.0,2.0,0.0,0.0
3,2.0,2.0,0.0,0.0
4,2.0,2.0,1.0,0.0


## Apply scaling

In [6]:
from sklearn.preprocessing import RobustScaler

numerical_columns = ['Administrative', 'Administrative_Duration', 'Informational', 'Informational_Duration', 'ProductRelated', 'ProductRelated_Duration', 'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay']
scaler = RobustScaler()

data_scaled = scaler.fit_transform(data[numerical_columns])
X_scaled = pd.DataFrame(data_scaled, columns=numerical_columns)
data_scaled_encoded = pd.concat([X_scaled, X_encoded], axis=1)
data_scaled_encoded.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,VisitorType,Weekend,Revenue
0,-0.25,-0.080424,0.0,0.0,-0.548387,-0.467912,11.710742,4.895621,0.0,0.0,2.0,2.0,0.0,0.0
1,-0.25,-0.080424,0.0,0.0,-0.516129,-0.417913,-0.185128,2.095621,0.0,0.0,2.0,2.0,0.0,0.0
2,-0.25,-0.080424,0.0,0.0,-0.548387,-0.467912,11.710742,4.895621,0.0,0.0,2.0,2.0,0.0,0.0
3,-0.25,-0.080424,0.0,0.0,-0.516129,-0.465829,2.78884,3.215621,0.0,0.0,2.0,2.0,0.0,0.0
4,-0.25,-0.080424,0.0,0.0,-0.258065,0.022315,1.004459,0.695621,0.0,0.0,2.0,2.0,1.0,0.0


## Feature Engineering using Wrapper Method

In [7]:
from sklearn.linear_model import LogisticRegression as LGR
from sklearn.svm import SVC
from mlxtend.feature_selection import ExhaustiveFeatureSelector
from mlxtend.plotting import plot_sequential_feature_selection
from joblib import parallel_backend
X = data_scaled_encoded.iloc[:,0:-1]
y = data_scaled_encoded["Revenue"]

### Exhaustive Feature Selection

In [11]:
efs = ExhaustiveFeatureSelector(
            SVC(kernel='linear'),
            # LGR(max_iter=4000),
            min_features=1, 
            max_features=13, 
            scoring='accuracy',
            n_jobs=15,
            cv=3)

In [None]:
efs = efs.fit(X, y)

In [47]:
efs.best_feature_names_

('BounceRates', 'PageValues', 'Month', 'VisitorType')

### Recursive Feature Elimination

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE, RFECV

In [14]:
X_train, X_test, y_train, y_test = train_test_split(data_scaled_encoded.drop('Revenue',axis=1), data_scaled_encoded.Revenue, test_size=0.33, random_state=42)

In [26]:
estimator = SVC(kernel='linear')
rfe_select = RFE(estimator, step=1).fit(X_train,y_train)

In [21]:
important_features = pd.DataFrame(list(zip(X_train.columns,rfe_select.support_)),columns=['features','important'])
important_features.sort_values(by='important',ascending=False)

Unnamed: 0,features,important
0,Administrative,True
1,Administrative_Duration,True
6,BounceRates,True
8,PageValues,True
9,SpecialDay,True
11,VisitorType,True
2,Informational,False
3,Informational_Duration,False
4,ProductRelated,False
5,ProductRelated_Duration,False


In [14]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

In [15]:
X = data_scaled_encoded.iloc[:,0:-1]
y = data_scaled_encoded["Revenue"]

### Forward Feature Selection

In [16]:
sfs1 = SFS(SVC(kernel='linear'),
        #   LGR(max_iter=4000), 
           forward=True, 
           floating=False, 
           verbose=2,
           scoring='accuracy', 
           cv=0)

sfs1 = sfs1.fit(X, y)


[2025-01-02 02:45:10] Features: 1/13 -- score: 0.883617193836172
[2025-01-02 02:45:10] Features: 2/13 -- score: 0.8850770478507705
[2025-01-02 02:45:11] Features: 3/13 -- score: 0.8850770478507705
[2025-01-02 02:45:11] Features: 4/13 -- score: 0.8855636658556366
[2025-01-02 02:45:11] Features: 5/13 -- score: 0.8861313868613139
[2025-01-02 02:45:12] Features: 6/13 -- score: 0.8862124898621249
[2025-01-02 02:45:12] Features: 7/13 -- score: 0.8862124898621249
[2025-01-02 02:45:13] Features: 8/13 -- score: 0.8854825628548256
[2025-01-02 02:45:13] Features: 9/13 -- score: 0.8854825628548256
[2025-01-02 02:45:14] Features: 10/13 -- score: 0.8851581508515816
[2025-01-02 02:45:15] Features: 11/13 -- score: 0.8848337388483374
[2025-01-02 02:45:16] Features: 12/13 -- score: 0.8843471208434712
[2025-01-02 02:45:16] Features: 13/13 -- score: 0.8841038118410381

In [17]:
sfs1.k_feature_names_, sfs1.k_feature_idx_

(('BounceRates', 'ExitRates', 'PageValues', 'Month', 'VisitorType', 'Weekend'),
 (6, 7, 8, 10, 11, 12))

### Backward Feature Selection

In [20]:
sbs = SFS(SVC(kernel='linear'),
        #   LGR(max_iter=4000),
          k_features='best', 
          forward=False, 
          floating=False, 
          verbose=2,
          scoring='accuracy',  
          cv=3)
sbs = sbs.fit(X, y)

In [19]:
sbs.k_feature_names_, sbs.k_feature_idx_

(('Administrative_Duration',
  'Informational',
  'ExitRates',
  'PageValues',
  'Month',
  'VisitorType'),
 (1, 2, 7, 8, 10, 11))