In [1]:
import pandas as pd

In [2]:
data = pd.read_csv("online_shoppers_intention.csv")
data.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


# Check for missing values


In [3]:
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12330 entries, 0 to 12329
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Administrative           12330 non-null  int64  
 1   Administrative_Duration  12330 non-null  float64
 2   Informational            12330 non-null  int64  
 3   Informational_Duration   12330 non-null  float64
 4   ProductRelated           12330 non-null  int64  
 5   ProductRelated_Duration  12330 non-null  float64
 6   BounceRates              12330 non-null  float64
 7   ExitRates                12330 non-null  float64
 8   PageValues               12330 non-null  float64
 9   SpecialDay               12330 non-null  float64
 10  Month                    12330 non-null  object 
 11  OperatingSystems         12330 non-null  int64  
 12  Browser                  12330 non-null  int64  
 13  Region                   12330 non-null  int64  
 14  TrafficType           

# Check for unique values


In [4]:
categorical_columns = ['Month', 'VisitorType', 'Weekend', 'Revenue']

for column in categorical_columns:
    print(f"{column}: {data[column].value_counts()}")
    print("")

Month: Month
May     3364
Nov     2998
Mar     1907
Dec     1727
Oct      549
Sep      448
Aug      433
Jul      432
June     288
Feb      184
Name: count, dtype: int64

VisitorType: VisitorType
Returning_Visitor    10551
New_Visitor           1694
Other                   85
Name: count, dtype: int64

Weekend: Weekend
False    9462
True     2868
Name: count, dtype: int64

Revenue: Revenue
False    10422
True      1908
Name: count, dtype: int64



# Apply oridinal encoding


In [5]:
from sklearn.preprocessing import OrdinalEncoder
ordinal_encoder = OrdinalEncoder()

categorical_columns = ['Month', 'VisitorType', 'Weekend', 'Revenue']
data_encoded = ordinal_encoder.fit_transform(data[categorical_columns])
X_encoded = pd.DataFrame(data_encoded, columns=categorical_columns)
X_encoded.head()

Unnamed: 0,Month,VisitorType,Weekend,Revenue
0,2.0,2.0,0.0,0.0
1,2.0,2.0,0.0,0.0
2,2.0,2.0,0.0,0.0
3,2.0,2.0,0.0,0.0
4,2.0,2.0,1.0,0.0


## Apply scaling

In [6]:
from sklearn.preprocessing import RobustScaler

numerical_columns = ['Administrative', 'Administrative_Duration', 'Informational', 'Informational_Duration', 'ProductRelated', 'ProductRelated_Duration', 'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay']
scaler = RobustScaler()

data_scaled = scaler.fit_transform(data[numerical_columns])
X_scaled = pd.DataFrame(data_scaled, columns=numerical_columns)
data_scaled_encoded = pd.concat([X_scaled, X_encoded], axis=1)
data_scaled_encoded.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,VisitorType,Weekend,Revenue
0,-0.25,-0.080424,0.0,0.0,-0.548387,-0.467912,11.710742,4.895621,0.0,0.0,2.0,2.0,0.0,0.0
1,-0.25,-0.080424,0.0,0.0,-0.516129,-0.417913,-0.185128,2.095621,0.0,0.0,2.0,2.0,0.0,0.0
2,-0.25,-0.080424,0.0,0.0,-0.548387,-0.467912,11.710742,4.895621,0.0,0.0,2.0,2.0,0.0,0.0
3,-0.25,-0.080424,0.0,0.0,-0.516129,-0.465829,2.78884,3.215621,0.0,0.0,2.0,2.0,0.0,0.0
4,-0.25,-0.080424,0.0,0.0,-0.258065,0.022315,1.004459,0.695621,0.0,0.0,2.0,2.0,1.0,0.0


## Feature Engineering using Wrapper Method

In [7]:
# from sklearn.neighbors import KNeighborsClassifier as knn
# from sklearn.linear_model import LogisticRegression as LGR
from sklearn.svm import SVC
from mlxtend.feature_selection import ExhaustiveFeatureSelector
from mlxtend.plotting import plot_sequential_feature_selection
X = data_scaled_encoded.iloc[:,0:-1]
y = data_scaled_encoded["Revenue"]

In [8]:
efs = ExhaustiveFeatureSelector(
           SVC(kernel='linear'),
           min_features=1, 
           max_features=13, 
           scoring='accuracy',  # sklearn classifiers,
           n_jobs=15,
           cv=3)

In [9]:
efs = efs.fit(X, y)

KeyboardInterrupt: 

In [47]:
efs.best_feature_names_

('BounceRates', 'PageValues', 'Month', 'VisitorType')

In [48]:
pd.DataFrame.from_dict(efs.get_metric_dict()).T
# df = df[["feature_idx","avg_score"]]
# df

Unnamed: 0,feature_idx,cv_scores,avg_score,feature_names,ci_bound,std_dev,std_err
0,"(0,)","[0.8447688564476885, 0.8413625304136253, 0.845...",0.843715,"(Administrative,)",0.003749,0.001666,0.001178
1,"(1,)","[0.8454987834549879, 0.8437956204379562, 0.844...",0.844444,"(Administrative_Duration,)",0.001693,0.000752,0.000532
2,"(2,)","[0.8447688564476885, 0.8447688564476885, 0.844...",0.844688,"(Informational,)",0.000258,0.000115,0.000081
3,"(3,)","[0.8452554744525548, 0.8435523114355231, 0.844...",0.844363,"(Informational_Duration,)",0.00157,0.000698,0.000493
4,"(4,)","[0.8442822384428224, 0.8425790754257908, 0.844...",0.843796,"(ProductRelated,)",0.001949,0.000866,0.000612
...,...,...,...,...,...,...,...
8186,"(0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12)","[0.8907542579075426, 0.8819951338199513, 0.872...",0.881914,"(Administrative, Administrative_Duration, Info...",0.016318,0.007251,0.005127
8187,"(0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)","[0.8907542579075426, 0.8812652068126521, 0.873...",0.881752,"(Administrative, Administrative_Duration, Info...",0.016113,0.00716,0.005063
8188,"(0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)","[0.8888077858880779, 0.881021897810219, 0.8727...",0.88086,"(Administrative, Informational, Informational_...",0.014755,0.006557,0.004636
8189,"(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)","[0.8897810218978102, 0.8807785888077859, 0.872...",0.880941,"(Administrative_Duration, Informational, Infor...",0.016096,0.007153,0.005058


In [13]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE, RFECV

In [11]:
X_train, X_test, y_train, y_test = train_test_split(data_scaled_encoded.drop('Revenue',axis=1), data_scaled_encoded.Revenue, test_size=0.33, random_state=42)

In [16]:
estimator = SVC(kernel='linear')
rfe_select = RFE(estimator, step=1).fit(X_train,y_train)

In [25]:
important_features = pd.DataFrame(list(zip(X_train.columns,rfe_select.support_)),columns=['features','important'])
important_features.sort_values(by='important',ascending=False)

Unnamed: 0,features,important
0,Administrative,True
1,Administrative_Duration,True
7,ExitRates,True
8,PageValues,True
9,SpecialDay,True
11,VisitorType,True
2,Informational,False
3,Informational_Duration,False
4,ProductRelated,False
5,ProductRelated_Duration,False


In [14]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

In [15]:
X = data_scaled_encoded.iloc[:,0:-1]
y = data_scaled_encoded["Revenue"]

In [16]:
sfs1 = SFS(LGR(max_iter=4000),
           k_features='best', 
           forward=True, 
           floating=False, 
           verbose=2,
           scoring='accuracy', 
           cv=0)

sfs1 = sfs1.fit(X, y)

: 

: 

In [29]:
sfs1.k_feature_names_, sfs1.k_feature_idx_

(('BounceRates', 'ExitRates', 'PageValues', 'Month', 'VisitorType', 'Weekend'),
 (6, 7, 8, 10, 11, 12))

In [34]:
sbs = SFS(LGR(max_iter=4000),
           k_features='best', 
           forward=False, 
           floating=False, 
           verbose=2,
           scoring='accuracy',  # sklearn classifiers
           cv=3)
sbs = sbs.fit(X, y)


[2024-12-28 14:28:27] Features: 12/1 -- score: 0.8826439578264397
[2024-12-28 14:28:47] Features: 11/1 -- score: 0.8828061638280617
[2024-12-28 14:29:08] Features: 10/1 -- score: 0.8827250608272506
[2024-12-28 14:29:10] Features: 9/1 -- score: 0.8829683698296837
[2024-12-28 14:29:13] Features: 8/1 -- score: 0.88345498783455
[2024-12-28 14:29:14] Features: 7/1 -- score: 0.8835360908353609
[2024-12-28 14:29:16] Features: 6/1 -- score: 0.883779399837794
[2024-12-28 14:29:17] Features: 5/1 -- score: 0.883698296836983
[2024-12-28 14:29:17] Features: 4/1 -- score: 0.883779399837794
[2024-12-28 14:29:18] Features: 3/1 -- score: 0.8833738848337388
[2024-12-28 14:29:18] Features: 2/1 -- score: 0.8824006488240065
[2024-12-28 14:29:18] Features: 1/1 -- score: 0.8801297648012976

In [None]:
sbs.k_feature_names_, sbs.k_feature_idx_