# **Feature Engineering**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [2]:
df = pd.read_csv('..\Data\cleaned_session_data.csv')

In [3]:
df.head()

Unnamed: 0,SessionID,AdClicks,InfoSectionCount,InfoSectionTime,HelpPageVisits,HelpPageTime,ItemBrowseCount,ItemBrowseTime,ExitRateFirstPage,SessionExitRatio,PageEngagementScore,HolidayProximityIndex,VisitMonth,UserPlatformID,WebClientCode,MarketZone,TrafficSourceCode,UserCategory,IsWeekendVisit,MonetaryConversion
0,jv3uDyqYltpoXreO,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.2,0.2,0.0,0.0,February,Windows,Safari,North America,1.0,Returning,0,0
1,eBHG5YGyco8TRlac,4.0,0.0,0.0,0.0,0.0,2.0,64.0,0.0,0.1,0.0,0.0,February,Android,Chrome,North America,2.0,Returning,0,0
2,Lz5fuqr91SKGHb5N,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.2,0.2,0.0,0.0,February,MacOS,Safari,Other,3.0,Returning,0,0
3,cyOsumxkpNEvKIdc,4.0,0.0,0.0,0.0,0.0,2.0,2.666667,0.05,0.14,0.0,0.0,February,iOS,Chrome,Europe,4.0,Returning,0,0
4,F4rTMKOqtoPKzxw9,2.0,0.0,0.0,0.0,0.0,10.0,627.5,0.02,0.05,0.0,0.061806,February,iOS,Internet Explorer,North America,4.0,Returning,1,0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12330 entries, 0 to 12329
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   SessionID              12330 non-null  object 
 1   AdClicks               12330 non-null  float64
 2   InfoSectionCount       12330 non-null  float64
 3   InfoSectionTime        12330 non-null  float64
 4   HelpPageVisits         12330 non-null  float64
 5   HelpPageTime           12330 non-null  float64
 6   ItemBrowseCount        12330 non-null  float64
 7   ItemBrowseTime         12330 non-null  float64
 8   ExitRateFirstPage      12330 non-null  float64
 9   SessionExitRatio       12330 non-null  float64
 10  PageEngagementScore    12330 non-null  float64
 11  HolidayProximityIndex  12330 non-null  float64
 12  VisitMonth             12330 non-null  object 
 13  UserPlatformID         12330 non-null  object 
 14  WebClientCode          12330 non-null  object 
 15  Ma

### **Target variable for classification**

In [5]:
TARGET = 'MonetaryConversion'

### **Ratio Features**
#### **These can capture engagement intensity or efficiency**

In [6]:
df['AdClicksPerItemBrowse'] = df['AdClicks'] / (df['ItemBrowseCount'] + 1e-6)
df['InfoTimePerInfoSection'] = df['InfoSectionTime'] / (df['InfoSectionCount'] + 1e-6)
df['HelpTimePerHelpVisit'] = df['HelpPageTime'] / (df['HelpPageVisits'] + 1e-6)
df['EngagementPerItemBrowse'] = df['PageEngagementScore'] / (df['ItemBrowseCount'] + 1e-6)
df['AvgItemBrowseTime'] = df['ItemBrowseTime'] / (df['ItemBrowseCount'] + 1e-6)

### **Interaction Features**
#### **E.g., combine elements of platform and client to see common pairs**

In [7]:
df['Platform_Client'] = df['UserPlatformID'].astype(str) + '_' + df['WebClientCode'].astype(str)
df['Market_TrafficSource'] = df['MarketZone'].astype(str) + '_' + df['TrafficSourceCode'].astype(str)

### **Engineered features head**

In [8]:
df.head()

Unnamed: 0,SessionID,AdClicks,InfoSectionCount,InfoSectionTime,HelpPageVisits,HelpPageTime,ItemBrowseCount,ItemBrowseTime,ExitRateFirstPage,SessionExitRatio,...,UserCategory,IsWeekendVisit,MonetaryConversion,AdClicksPerItemBrowse,InfoTimePerInfoSection,HelpTimePerHelpVisit,EngagementPerItemBrowse,AvgItemBrowseTime,Platform_Client,Market_TrafficSource
0,jv3uDyqYltpoXreO,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.2,0.2,...,Returning,0,0,0.0,0.0,0.0,0.0,0.0,Windows_Safari,North America_1.0
1,eBHG5YGyco8TRlac,4.0,0.0,0.0,0.0,0.0,2.0,64.0,0.0,0.1,...,Returning,0,0,1.999999,0.0,0.0,0.0,31.999984,Android_Chrome,North America_2.0
2,Lz5fuqr91SKGHb5N,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.2,0.2,...,Returning,0,0,0.0,0.0,0.0,0.0,0.0,MacOS_Safari,Other_3.0
3,cyOsumxkpNEvKIdc,4.0,0.0,0.0,0.0,0.0,2.0,2.666667,0.05,0.14,...,Returning,0,0,1.999999,0.0,0.0,0.0,1.333333,iOS_Chrome,Europe_4.0
4,F4rTMKOqtoPKzxw9,2.0,0.0,0.0,0.0,0.0,10.0,627.5,0.02,0.05,...,Returning,1,0,0.2,0.0,0.0,0.0,62.749994,iOS_Internet Explorer,North America_4.0


### **Preprocessing for Model Training**

In [9]:
X = df.drop(TARGET, axis=1)
y = df[TARGET]

In [10]:
numerical_features = X.select_dtypes(include=np.number).columns.tolist()
categorical_features = X.select_dtypes(include='object').columns.tolist()

print(f"\nNumerical Features: {numerical_features}")
print(f"Categorical Features: {categorical_features}")


Numerical Features: ['AdClicks', 'InfoSectionCount', 'InfoSectionTime', 'HelpPageVisits', 'HelpPageTime', 'ItemBrowseCount', 'ItemBrowseTime', 'ExitRateFirstPage', 'SessionExitRatio', 'PageEngagementScore', 'HolidayProximityIndex', 'TrafficSourceCode', 'IsWeekendVisit', 'AdClicksPerItemBrowse', 'InfoTimePerInfoSection', 'HelpTimePerHelpVisit', 'EngagementPerItemBrowse', 'AvgItemBrowseTime']
Categorical Features: ['SessionID', 'VisitMonth', 'UserPlatformID', 'WebClientCode', 'MarketZone', 'UserCategory', 'Platform_Client', 'Market_TrafficSource']


In [11]:
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')), 
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')), 
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [12]:
preprocessor = ColumnTransformer(transformers=[('num', numerical_transformer, numerical_features),('cat', categorical_transformer, categorical_features)],remainder='passthrough' )

print("\nPreprocessing pipeline created (scaling numerical, one-hot encoding categorical).")


Preprocessing pipeline created (scaling numerical, one-hot encoding categorical).


In [13]:
X_processed = preprocessor.fit_transform(X)

In [14]:
try:
    feature_names = preprocessor.get_feature_names_out()

    print(f"\nDIAGNOSIS: Length of feature_names from preprocessor.get_feature_names_out(): {len(feature_names)}")
    print(f"DIAGNOSIS: Type of feature_names: {type(feature_names)}")
    if len(feature_names) > 0:
        print(f"DIAGNOSIS: First 5 feature_names: {feature_names[:5]}")
    else:
        print("DIAGNOSIS: feature_names is empty.")
    print(f"DIAGNOSIS: Shape of X_processed (right before DataFrame conversion): {X_processed.shape}")

    if hasattr(X_processed, 'toarray'):
        X_processed_dense = X_processed.toarray()
    else:
        X_processed_dense = X_processed

    X_processed_df = pd.DataFrame(X_processed_dense, columns=feature_names)
    print("\nProcessed data with feature names:")
    display(X_processed_df.head())
    print("\nShape of processed data:", X_processed_df.shape)
except Exception as e:
    print(f"\nCould not get feature names or convert to DataFrame: {e}")
    print("Processed data is in sparse matrix or numpy array format. Shape:", X_processed.shape)


DIAGNOSIS: Length of feature_names from preprocessor.get_feature_names_out(): 12576
DIAGNOSIS: Type of feature_names: <class 'numpy.ndarray'>
DIAGNOSIS: First 5 feature_names: ['num__AdClicks' 'num__InfoSectionCount' 'num__InfoSectionTime'
 'num__HelpPageVisits' 'num__HelpPageTime']
DIAGNOSIS: Shape of X_processed (right before DataFrame conversion): (12330, 12576)

Processed data with feature names:


Unnamed: 0,num__AdClicks,num__InfoSectionCount,num__InfoSectionTime,num__HelpPageVisits,num__HelpPageTime,num__ItemBrowseCount,num__ItemBrowseTime,num__ExitRateFirstPage,num__SessionExitRatio,num__PageEngagementScore,...,cat__Market_TrafficSource_South America_14.0,cat__Market_TrafficSource_South America_15.0,cat__Market_TrafficSource_South America_2.0,cat__Market_TrafficSource_South America_20.0,cat__Market_TrafficSource_South America_3.0,cat__Market_TrafficSource_South America_4.0,cat__Market_TrafficSource_South America_5.0,cat__Market_TrafficSource_South America_6.0,cat__Market_TrafficSource_South America_8.0,cat__Market_TrafficSource_South America_9.0
0,-1.455321,-0.707489,-0.449735,-0.399292,-0.241179,-0.692355,-0.622817,3.67999,3.259038,-0.322017,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.464871,-0.707489,-0.449735,-0.399292,-0.241179,-0.669406,-0.588828,-0.460268,1.181495,-0.322017,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-1.455321,-0.707489,-0.449735,-0.399292,-0.241179,-0.692355,-0.622817,3.67999,3.259038,-0.322017,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.464871,-0.707489,-0.449735,-0.399292,-0.241179,-0.669406,-0.6214,0.574796,2.012512,-0.322017,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.004775,-0.707489,-0.449735,-0.399292,-0.241179,-0.485814,-0.289569,-0.046243,0.142724,-0.322017,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0



Shape of processed data: (12330, 12576)


In [15]:
import joblib

In [16]:
joblib.dump(preprocessor, '..\Models\preprocessor.pkl')
print("\nPreprocessor saved to 'preprocessor.pkl'.")


Preprocessor saved to 'preprocessor.pkl'.


In [17]:
if hasattr(X_processed, 'toarray'):
    X_final = X_processed.toarray()
else:
    X_final = X_processed


In [18]:
if 'X_processed_df' in locals(): 
    final_df_for_training = X_processed_df.copy()
    final_df_for_training[TARGET] = y.values 
    final_df_for_training.to_csv('..\Data\data_final_dataset_for_training.csv', index=False)
    print("Final processed dataset (including target) saved to 'data_final_dataset_for_training.csv'.")

else: 
    feature_names_list = [f'feature_{i}' for i in range(X_final.shape[1])]
    final_df_for_training = pd.DataFrame(X_final, columns=feature_names_list)
    final_df_for_training[TARGET] = y.values
    final_df_for_training.to_csv('..\Data\data_final_dataset_for_training.csv', index=False)
    print("Final processed dataset (including target) saved to 'data_final_dataset_for_training.csv' (features named generically).")

Final processed dataset (including target) saved to 'data_final_dataset_for_training.csv'.
