### Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Loading and describing data

In [2]:
# Read the raw user profile table and normalise the header in one step:
# every column label has its leading/trailing whitespace removed so later
# cells can refer to columns by their clean names.
user_profile_df = (
    pd.read_csv('user_profile.csv')
    .rename(columns=str.strip)
)

In [3]:
# Summary statistics for the numeric columns; a `count` lower than the row
# total flags a column that contains missing values.
user_profile_df.describe()

Unnamed: 0,userid,cms_segid,cms_group_id,final_gender_code,age_level,pvalue_level,shopping_level,occupation,new_user_class_level
count,1061768.0,1061768.0,1061768.0,1061768.0,1061768.0,485851.0,1061768.0,1061768.0,716848.0
mean,571310.4,15.12998,5.542412,1.644445,3.410641,1.75985,2.716854,0.05623074,2.516379
std,329502.1,25.35503,3.178516,0.4786814,1.227458,0.581303,0.5861036,0.2303669,0.928334
min,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0
25%,286245.8,0.0,3.0,1.0,3.0,1.0,3.0,0.0,2.0
50%,571455.5,0.0,4.0,2.0,3.0,2.0,3.0,0.0,2.0
75%,854666.2,20.0,9.0,2.0,4.0,2.0,3.0,0.0,3.0
max,1141729.0,96.0,12.0,2.0,6.0,3.0,3.0,1.0,4.0


In [4]:
# Preview the first five rows to sanity-check how the CSV was parsed.
user_profile_df.head()

Unnamed: 0,userid,cms_segid,cms_group_id,final_gender_code,age_level,pvalue_level,shopping_level,occupation,new_user_class_level
0,234,0,5,2,5,,3,0,3.0
1,523,5,2,2,2,1.0,3,1,2.0
2,612,0,8,1,2,2.0,3,0,
3,1670,0,4,2,4,,1,0,
4,2545,0,10,1,4,,3,0,


In [5]:
# Column dtypes, non-null counts and memory footprint of the frame as loaded.
user_profile_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1061768 entries, 0 to 1061767
Data columns (total 9 columns):
 #   Column                Non-Null Count    Dtype  
---  ------                --------------    -----  
 0   userid                1061768 non-null  int64  
 1   cms_segid             1061768 non-null  int64  
 2   cms_group_id          1061768 non-null  int64  
 3   final_gender_code     1061768 non-null  int64  
 4   age_level             1061768 non-null  int64  
 5   pvalue_level          485851 non-null   float64
 6   shopping_level        1061768 non-null  int64  
 7   occupation            1061768 non-null  int64  
 8   new_user_class_level  716848 non-null   float64
dtypes: float64(2), int64(7)
memory usage: 72.9 MB


In [6]:
# Count the missing entries in each column and print the resulting Series.
null_counts = user_profile_df.isnull().sum()
print(null_counts)

userid                       0
cms_segid                    0
cms_group_id                 0
final_gender_code            0
age_level                    0
pvalue_level            575917
shopping_level               0
occupation                   0
new_user_class_level    344920
dtype: int64


In [7]:
# First work out which columns contain at least one NaN, then list the
# distinct values (NaN included) found in each of them.
columns_with_nulls = [
    name
    for name in user_profile_df.columns
    if user_profile_df[name].isnull().any()
]

for name in columns_with_nulls:
    distinct = user_profile_df[name].unique()
    print(f"Column: {name}")
    print(f"Unique Values (including NaN): {distinct}\n")

Column: pvalue_level
Unique Values (including NaN): [nan  1.  2.  3.]

Column: new_user_class_level
Unique Values (including NaN): [ 3.  2. nan  4.  1.]



In [8]:
import faiss
import numpy as np
import pandas as pd

# Columns used to measure similarity between users, and the columns whose
# missing values are imputed from the most similar fully-observed user.
features = ['cms_segid', 'cms_group_id', 'final_gender_code', 'age_level', 'shopping_level', 'occupation']
columns_to_fill = ['pvalue_level', 'new_user_class_level']

# Number of query rows sent to FAISS per search call; keeps each result
# array small instead of searching every missing row at once.
batch_size = 10000


def _fill_from_nearest_neighbor(df, column, feature_cols, batch_size):
    """Fill the NaNs of ``df[column]`` in place from the nearest complete row.

    Rows where ``column`` is present act as donors. For every row where it is
    missing, the single nearest donor (FAISS ``IndexFlatL2`` over
    ``feature_cols``) is looked up and its ``column`` value is copied over.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    column : str
        Name of the column whose missing values are filled.
    feature_cols : list of str
        Numeric columns used as the coordinates for the distance search.
    batch_size : int
        Maximum number of missing rows queried per ``index.search`` call.

    Raises
    ------
    ValueError
        If ``column`` has no non-null value at all, i.e. there is no donor
        row to copy from.
    """
    missing_mask = df[column].isnull()
    if not missing_mask.any():
        # Nothing to impute (e.g. the cell is being re-run after a fill).
        return
    if missing_mask.all():
        raise ValueError(
            f"Cannot impute column {column!r}: it has no non-null values to copy from."
        )

    # Only the feature matrix is materialised (not a copy of the whole frame).
    # The explicit C-contiguous float32 layout is what the FAISS Python
    # bindings are documented to expect -- NOTE(review): recent versions
    # convert automatically, older ones may reject other layouts.
    X = np.ascontiguousarray(df.loc[~missing_mask, feature_cols].to_numpy(dtype=np.float32))
    y = df.loc[~missing_mask, column].to_numpy()
    X_missing = np.ascontiguousarray(df.loc[missing_mask, feature_cols].to_numpy(dtype=np.float32))
    missing_indices = df.index[missing_mask].to_numpy()

    index = faiss.IndexFlatL2(X.shape[1])
    index.add(X)

    num_batches = int(np.ceil(len(X_missing) / batch_size))
    for i in range(num_batches):
        start = i * batch_size
        end = min(start + batch_size, len(X_missing))

        # k=1: take only the closest donor.
        # NOTE(review): the features are low-cardinality codes, so many donors
        # are likely tied at the same distance; which one FAISS returns then
        # decides the imputed value -- confirm this is acceptable.
        _, indices = index.search(X_missing[start:end], k=1)
        predicted_vals = y[indices.flatten()]

        df.loc[missing_indices[start:end], column] = predicted_vals

        print(f"Filled batch {i+1}/{num_batches} for column: {column}")


# Shrink the feature columns to the smallest unsigned integer dtype that
# holds their values; this reduces the frame's memory footprint.
for col in features:
    user_profile_df[col] = pd.to_numeric(user_profile_df[col], downcast='unsigned')

for column in columns_to_fill:
    _fill_from_nearest_neighbor(user_profile_df, column, features, batch_size)

print(f"Missing values for {', '.join(columns_to_fill)} filled using FAISS nearest-neighbour search (L2 distance).")

Filled batch 1/58 for column: pvalue_level
Filled batch 2/58 for column: pvalue_level
Filled batch 3/58 for column: pvalue_level
Filled batch 4/58 for column: pvalue_level
Filled batch 5/58 for column: pvalue_level
Filled batch 6/58 for column: pvalue_level
Filled batch 7/58 for column: pvalue_level
Filled batch 8/58 for column: pvalue_level
Filled batch 9/58 for column: pvalue_level
Filled batch 10/58 for column: pvalue_level
Filled batch 11/58 for column: pvalue_level
Filled batch 12/58 for column: pvalue_level
Filled batch 13/58 for column: pvalue_level
Filled batch 14/58 for column: pvalue_level
Filled batch 15/58 for column: pvalue_level
Filled batch 16/58 for column: pvalue_level
Filled batch 17/58 for column: pvalue_level
Filled batch 18/58 for column: pvalue_level
Filled batch 19/58 for column: pvalue_level
Filled batch 20/58 for column: pvalue_level
Filled batch 21/58 for column: pvalue_level
Filled batch 22/58 for column: pvalue_level
Filled batch 23/58 for column: pvalue_lev

In [9]:
# Flag every row whose userid already appeared in an earlier row, then
# count the flagged rows.
repeated_userid_mask = user_profile_df.duplicated(subset='userid')
duplicate_count = repeated_userid_mask.sum()
print(f"Number of duplicate user IDs: {duplicate_count}")

Number of duplicate user IDs: 0


In [11]:
# Re-inspect dtypes, non-null counts and memory usage now that the
# imputation cell has run.
user_profile_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1061768 entries, 0 to 1061767
Data columns (total 9 columns):
 #   Column                Non-Null Count    Dtype  
---  ------                --------------    -----  
 0   userid                1061768 non-null  int64  
 1   cms_segid             1061768 non-null  uint8  
 2   cms_group_id          1061768 non-null  uint8  
 3   final_gender_code     1061768 non-null  uint8  
 4   age_level             1061768 non-null  uint8  
 5   pvalue_level          1061768 non-null  float64
 6   shopping_level        1061768 non-null  uint8  
 7   occupation            1061768 non-null  uint8  
 8   new_user_class_level  1061768 non-null  float64
dtypes: float64(2), int64(1), uint8(6)
memory usage: 30.4 MB


In [12]:
import os

# Make sure the output folder exists (no error if it is already there),
# then write the cleaned table without the pandas row index.
output_dir = 'preprocessed'
os.makedirs(output_dir, exist_ok=True)

user_profile_df.to_csv(
    'preprocessed/user_profile_final.csv',
    index=False,
)