In [1]:
import numpy as np
import pandas as pd
import os

In [4]:
os.chdir(r'c:\\Users\\iiven\\Ai-Cursor\\Neural Network Foundation')

In [5]:
os.getcwd()

'c:\\Users\\iiven\\Ai-Cursor\\Neural Network Foundation'

In [6]:
df = pd.read_csv('Data/before_preprocess_ecommerce.csv', encoding='ISO-8859-1')

In [7]:
df.head(10)

Unnamed: 0,Description,Quantity,UnitPrice,Country
0,WHITE HANGING HEART T-LIGHT HOLDER,6,2.55,United Kingdom
1,WHITE METAL LANTERN,6,3.39,United Kingdom
2,CREAM CUPID HEARTS COAT HANGER,8,2.75,United Kingdom
3,KNITTED UNION FLAG HOT WATER BOTTLE,6,3.39,United Kingdom
4,RED WOOLLY HOTTIE WHITE HEART.,6,3.39,United Kingdom
5,SET 7 BABUSHKA NESTING BOXES,2,7.65,United Kingdom
6,GLASS STAR FROSTED T-LIGHT HOLDER,6,4.25,United Kingdom
7,HAND WARMER UNION JACK,6,1.85,United Kingdom
8,HAND WARMER RED POLKA DOT,6,1.85,United Kingdom
9,POPPY'S PLAYHOUSE BEDROOM,6,2.1,United Kingdom


In [8]:
df.shape

(444990, 4)

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 444990 entries, 0 to 444989
Data columns (total 4 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   Description  444990 non-null  object 
 1   Quantity     444990 non-null  int64  
 2   UnitPrice    444990 non-null  float64
 3   Country      444990 non-null  object 
dtypes: float64(1), int64(1), object(2)
memory usage: 13.6+ MB


**For the preprocess we will encode Country column with LabelEncoder**<br>
**Description column will be encoded with bert and then use PCA to reduce dimensionality**<br>
**We will use StandardScaler for numeric columns**

In [10]:
# LE
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

df['Country'] = le.fit_transform(df['Country'])

In [11]:
df.head(10)

Unnamed: 0,Description,Quantity,UnitPrice,Country
0,WHITE HANGING HEART T-LIGHT HOLDER,6,2.55,36
1,WHITE METAL LANTERN,6,3.39,36
2,CREAM CUPID HEARTS COAT HANGER,8,2.75,36
3,KNITTED UNION FLAG HOT WATER BOTTLE,6,3.39,36
4,RED WOOLLY HOTTIE WHITE HEART.,6,3.39,36
5,SET 7 BABUSHKA NESTING BOXES,2,7.65,36
6,GLASS STAR FROSTED T-LIGHT HOLDER,6,4.25,36
7,HAND WARMER UNION JACK,6,1.85,36
8,HAND WARMER RED POLKA DOT,6,1.85,36
9,POPPY'S PLAYHOUSE BEDROOM,6,2.1,36


In [13]:
# Bert model
from transformers import BertTokenizer, BertModel
import torch

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')


  from .autonotebook import tqdm as notebook_tqdm


In [14]:
description = df['Description'].astype(str).to_list()

In [15]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [16]:
model.eval()
model.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [17]:
from tqdm import tqdm

def encode_batch_with_progress(texts, batch_size=64):
    all_embeddings = []

    for i in tqdm(range(0, len(texts), batch_size)):
        batch = texts[i:i+batch_size]
        inputs = tokenizer(batch, return_tensors='pt', truncation=True, padding=True, max_length=50)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)

        embeddings = outputs.last_hidden_state.mean(dim=1)
        all_embeddings.append(embeddings.cpu())

    return torch.cat(all_embeddings).numpy()



In [18]:
embedded_list = encode_batch_with_progress(description, batch_size=64)

100%|██████████| 6953/6953 [06:08<00:00, 18.88it/s]


In [19]:
embedded_list

array([[ 0.51657015, -0.22835104, -0.21700491, ...,  0.0260112 ,
        -0.22481956, -0.40088144],
       [ 0.11888622,  0.08073001, -0.3610544 , ...,  0.04127744,
         0.06188593,  0.01513435],
       [-0.01617578, -0.37784174,  0.3249077 , ..., -0.08035561,
        -0.1364067 , -0.15564895],
       ...,
       [ 0.11265852, -0.04419464,  0.04499143, ..., -0.10868813,
         0.09193849, -0.44006786],
       [ 0.25742558,  0.00796375,  0.04746373, ...,  0.05476691,
         0.09787294, -0.24127717],
       [ 0.10225587,  0.11473785,  0.04573229, ...,  0.01495004,
        -0.08081717, -0.26455718]], dtype=float32)

In [20]:
embedded_list.shape

(444990, 768)

In [21]:
# Use PCA
from sklearn.decomposition import PCA

pca = PCA(n_components= 20) # we get 20 features for more accuracy
embedded_pca = pca.fit_transform(embedded_list)

In [23]:
embedded_pca.shape

(444990, 20)

In [24]:
df

Unnamed: 0,Description,Quantity,UnitPrice,Country
0,WHITE HANGING HEART T-LIGHT HOLDER,6,2.55,36
1,WHITE METAL LANTERN,6,3.39,36
2,CREAM CUPID HEARTS COAT HANGER,8,2.75,36
3,KNITTED UNION FLAG HOT WATER BOTTLE,6,3.39,36
4,RED WOOLLY HOTTIE WHITE HEART.,6,3.39,36
...,...,...,...,...
444985,PACK OF 20 SPACEBOY NAPKINS,12,0.85,13
444986,CHILDREN'S APRON DOLLY GIRL,6,2.10,13
444987,CHILDRENS CUTLERY DOLLY GIRL,4,4.15,13
444988,CHILDRENS CUTLERY CIRCUS PARADE,4,4.15,13


In [26]:
dummy_df = df.iloc[::, 1::]

In [27]:
dummy_df.head()

Unnamed: 0,Quantity,UnitPrice,Country
0,6,2.55,36
1,6,3.39,36
2,8,2.75,36
3,6,3.39,36
4,6,3.39,36


In [28]:
dummy_df.shape

(444990, 3)

In [39]:
features = np.hstack([dummy_df, embedded_pca])

In [40]:
features.shape

(444990, 23)

In [41]:
features[0]

array([ 6.        ,  2.55      , 36.        ,  2.44625473, -1.26888585,
        2.9021225 , -1.01759648, -1.02824211, -0.35826913,  0.32811242,
        0.33394557,  1.06334341,  0.56716418,  0.11135551, -0.28405449,
       -0.43938202,  0.9702214 , -1.08495069, -0.17730421, -0.63923335,
       -0.10495362,  0.19303323, -1.25579536])

In [35]:
cols = ['Quantity', 'UnitPrice', 'Country']
pca_cols = [f'emb{i}' for i in range(embedded_pca.shape[1])]

In [36]:
pca_cols

['emb0',
 'emb1',
 'emb2',
 'emb3',
 'emb4',
 'emb5',
 'emb6',
 'emb7',
 'emb8',
 'emb9',
 'emb10',
 'emb11',
 'emb12',
 'emb13',
 'emb14',
 'emb15',
 'emb16',
 'emb17',
 'emb18',
 'emb19']

In [37]:
all_cols = cols + pca_cols

In [38]:
all_cols

['Quantity',
 'UnitPrice',
 'Country',
 'emb0',
 'emb1',
 'emb2',
 'emb3',
 'emb4',
 'emb5',
 'emb6',
 'emb7',
 'emb8',
 'emb9',
 'emb10',
 'emb11',
 'emb12',
 'emb13',
 'emb14',
 'emb15',
 'emb16',
 'emb17',
 'emb18',
 'emb19']

In [42]:
final_df =pd.DataFrame(features, columns= all_cols)

In [43]:
final_df.head()

Unnamed: 0,Quantity,UnitPrice,Country,emb0,emb1,emb2,emb3,emb4,emb5,emb6,...,emb10,emb11,emb12,emb13,emb14,emb15,emb16,emb17,emb18,emb19
0,6.0,2.55,36.0,2.446255,-1.268886,2.902122,-1.017596,-1.028242,-0.358269,0.328112,...,0.111356,-0.284054,-0.439382,0.970221,-1.084951,-0.177304,-0.639233,-0.104954,0.193033,-1.255795
1,6.0,3.39,36.0,-1.711715,-1.459729,-0.00048,-0.629437,-0.645542,-0.659305,0.647161,...,0.60476,0.847761,-0.47679,0.027666,-0.347168,-0.353656,0.145458,-0.229895,-0.062777,-0.021427
2,8.0,2.75,36.0,2.191493,-0.679481,1.445967,-1.098439,0.388798,0.647733,-0.711572,...,0.364188,-0.206854,0.665611,0.757856,0.084455,-0.038153,-0.616376,-0.491745,-0.238482,-0.551534
3,6.0,3.39,36.0,2.693123,-0.126394,-0.224744,-1.083747,0.551736,-1.058309,-0.068043,...,0.249736,-0.731614,0.043028,0.464421,-0.324716,0.071041,-0.761492,0.100827,-0.960503,0.379053
4,6.0,3.39,36.0,1.918139,1.047459,2.0025,-2.167035,-1.087993,-1.082379,-0.037862,...,-0.260188,0.09634,0.714616,0.437729,-1.08128,1.026494,-0.277326,-1.162508,0.07261,-0.20004


In [47]:
# Scaling
# Bert model output is numeric so it should be scaled
numeric_cols = final_df.columns.to_list()

In [49]:
numeric_cols.remove('Country')

In [50]:
numeric_cols

['Quantity',
 'UnitPrice',
 'emb0',
 'emb1',
 'emb2',
 'emb3',
 'emb4',
 'emb5',
 'emb6',
 'emb7',
 'emb8',
 'emb9',
 'emb10',
 'emb11',
 'emb12',
 'emb13',
 'emb14',
 'emb15',
 'emb16',
 'emb17',
 'emb18',
 'emb19']

In [51]:
categorical_cols = ['Country']

In [53]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 444990 entries, 0 to 444989
Data columns (total 23 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   Quantity   444990 non-null  float64
 1   UnitPrice  444990 non-null  float64
 2   Country    444990 non-null  float64
 3   emb0       444990 non-null  float64
 4   emb1       444990 non-null  float64
 5   emb2       444990 non-null  float64
 6   emb3       444990 non-null  float64
 7   emb4       444990 non-null  float64
 8   emb5       444990 non-null  float64
 9   emb6       444990 non-null  float64
 10  emb7       444990 non-null  float64
 11  emb8       444990 non-null  float64
 12  emb9       444990 non-null  float64
 13  emb10      444990 non-null  float64
 14  emb11      444990 non-null  float64
 15  emb12      444990 non-null  float64
 16  emb13      444990 non-null  float64
 17  emb14      444990 non-null  float64
 18  emb15      444990 non-null  float64
 19  emb16      444990 non-n

In [54]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

scaled_data = scaler.fit_transform(final_df[numeric_cols])

In [57]:
scaled_data[0]

array([ 0.24272483, -0.11862311,  1.71280969, -1.06700755,  2.89169766,
       -1.05542581, -1.12850387, -0.41920934,  0.4083983 ,  0.44041187,
        1.43850097,  0.80693322,  0.16606525, -0.42886681, -0.69067936,
        1.61031307, -1.84191682, -0.30563838, -1.12374179, -0.18997978,
        0.35185478, -2.35155316])

In [58]:
df_tosave = pd.DataFrame(scaled_data, columns= numeric_cols)

In [59]:
final_df['Country']

0         36.0
1         36.0
2         36.0
3         36.0
4         36.0
          ... 
444985    13.0
444986    13.0
444987    13.0
444988    13.0
444989    13.0
Name: Country, Length: 444990, dtype: float64

In [60]:
df_tosave['Country'] = final_df['Country']

In [61]:
df_tosave.head(10)

Unnamed: 0,Quantity,UnitPrice,emb0,emb1,emb2,emb3,emb4,emb5,emb6,emb7,...,emb11,emb12,emb13,emb14,emb15,emb16,emb17,emb18,emb19,Country
0,0.242725,-0.118623,1.71281,-1.067008,2.891698,-1.055426,-1.128504,-0.419209,0.408398,0.440412,...,-0.428867,-0.690679,1.610313,-1.841917,-0.305638,-1.123742,-0.18998,0.351855,-2.351553,36.0
1,0.242725,0.292864,-1.198519,-1.227486,-0.000488,-0.652827,-0.708485,-0.771444,0.805523,0.032484,...,1.279992,-0.749484,0.045945,-0.589374,-0.609605,0.25571,-0.416097,-0.114442,-0.040132,36.0
2,0.688081,-0.02065,1.534431,-0.57138,1.440769,-1.139276,0.426717,0.757892,-0.885712,0.16242,...,-0.312307,1.046334,1.257849,0.143399,-0.065791,-1.08356,-0.889989,-0.434721,-1.032786,36.0
3,0.242725,0.292864,1.885662,-0.106293,-0.223947,-1.124037,0.605544,-1.238311,-0.084702,1.891655,...,-1.104609,0.067654,0.770832,-0.551257,0.122419,-1.338668,0.182438,-1.750836,0.709788,36.0
4,0.242725,0.292864,1.343033,0.880794,1.995304,-2.247623,-1.194081,-1.266475,-0.047135,0.325198,...,0.145468,1.123368,0.72653,-1.835686,1.769276,-0.487526,-2.103928,0.132345,-0.374595,36.0
5,-0.647988,2.379692,0.186485,0.477215,-1.387327,-0.242445,-0.89524,1.500597,0.76994,-0.375102,...,-0.49034,-0.322395,-0.153238,0.247256,-0.065661,0.168769,-1.128352,1.706881,1.062559,36.0
6,0.242725,0.714149,2.13222,-0.919169,1.136328,-0.45129,-0.727663,-1.12617,0.219682,-0.317141,...,-0.252433,-1.23627,2.170033,-0.859691,-1.087482,-0.56992,-0.030914,0.566545,-1.630281,36.0
7,0.242725,-0.461529,-0.812353,-0.667874,0.244909,0.303006,-0.586969,-0.915503,0.379603,-0.97525,...,-1.212506,1.586158,0.58197,-0.953412,-0.330302,-0.784985,1.889364,-0.289768,0.981482,36.0
8,0.242725,-0.461529,0.235456,-0.413604,1.816073,-0.08079,-0.746808,-0.41767,1.068172,0.169493,...,-0.808625,0.946225,-1.169244,-1.023302,-0.06329,0.298043,2.916536,0.306784,0.035101,36.0
9,0.242725,-0.339063,-0.260821,0.6107,-1.01171,-0.907832,1.137559,-1.468032,-0.418001,-0.204932,...,0.621028,-2.378349,-1.797456,-2.561571,2.541256,-1.393126,-0.638115,0.590511,-1.895433,36.0


In [62]:
df_tosave.to_csv('Data/Scaled_Data.csv', index= False)

In [65]:
# Saving scaler and encoder
import pickle

with open('Data/Scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

with open('Data/encoder.pkl', 'wb') as f:
    pickle.dump(le, f)

with open('Data/pca.pkl', 'wb') as f:
    pickle.dump(pca, f)