In [1]:
import pandas as pd
import numpy as np
from scipy import stats

# Location of the raw customers table, relative to this notebook.
customers = '../data/customers/customers.csv'

# Read the full table into a DataFrame.
customers_df = pd.read_csv(filepath_or_buffer=customers)

# Peek at three rows (positions 10, 11 and 12) as a quick sanity check.
customers_df.iloc[10:13]

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
10,0000945f66de1a11d9447609b8b41b1bc987ba185a5496...,,,ACTIVE,NONE,29.0,d93e1aaecdebf9f71ab42cc0c5fdbb378514c94971ad1c...
11,000097d91384a0c14893c09ed047a963c4fc6a5c021044...,,,ACTIVE,NONE,31.0,2c29ae653a9282cce4151bd87643c907644e09541abc28...
12,00009c2aeae8761f738e4f937d9be6b49861a66339c2b1...,,,ACTIVE,NONE,49.0,7e2caa18837edc6a7ad542c4c45f5a3a59da2a6433f540...


Two feature-engineering steps:
    - Label-encode FN / Active / club_member_status / fashion_news_frequency.
    - Replace missing (and outlier) age values with the median age.

In [2]:
# List the distinct values of each categorical column before any encoding.
for column in ('club_member_status', 'fashion_news_frequency', 'Active', 'FN'):
    print(f"{column} unique values : {customers_df[column].unique()!s}")

club_member_status unique values : ['ACTIVE' nan 'PRE-CREATE' 'LEFT CLUB']
fashion_news_frequency unique values : ['NONE' 'Regularly' nan 'Monthly']
Active unique values : [nan  1.]
FN unique values : [nan  1.]


In [3]:
# Active and FN hold a value where the flag is set and NaN otherwise, so encode
# "has a value" as 1 and "missing" as 0 (integer dtype, not boolean).
# NOTE: this is not idempotent. Once the column is 0/1 it has no NaN left, so
# re-running this cell would turn every row into 1. Re-run from the load cell.
customers_df['Active'] = customers_df['Active'].notna().astype(int)
customers_df['FN'] = customers_df['FN'].notna().astype(int)

# Replace missing categories with the sentinel -1.
# The result is assigned back instead of calling fillna(..., inplace=True) on the
# column selection: that chained in-place form acts on a temporary object, emits
# a warning on recent pandas versions and leaves the DataFrame unchanged when
# Copy-on-Write is enabled.
customers_df['club_member_status'] = customers_df['club_member_status'].fillna(-1)
customers_df['fashion_news_frequency'] = customers_df['fashion_news_frequency'].fillna(-1)


In [4]:
# Show the dtype of every column at this point in the notebook.
customers_df.dtypes

customer_id                object
FN                          int32
Active                      int32
club_member_status         object
fashion_news_frequency     object
age                       float64
postal_code                object
dtype: object

In [5]:
# Re-list the distinct values of the four categorical columns.
categorical_columns = ['club_member_status', 'fashion_news_frequency', 'Active', 'FN']
for name in categorical_columns:
    distinct_values = customers_df[name].unique()
    print(name + " unique values : " + str(distinct_values))

club_member_status unique values : ['ACTIVE' -1 'PRE-CREATE' 'LEFT CLUB']
fashion_news_frequency unique values : ['NONE' 'Regularly' -1 'Monthly']
Active unique values : [0 1]
FN unique values : [0 1]


In [6]:
#step2 : encode every remaining category as an integer so no strings are left.
# One mapping per column replaces the previous run of copy-pasted .replace()
# calls; the code assigned to each value is unchanged.
# The -1 key is the placeholder used for a missing value; it becomes code 0.
CLUB_MEMBER_STATUS_CODES = {-1: 0, 'ACTIVE': 1, 'PRE-CREATE': 2, 'LEFT CLUB': 3}
FASHION_NEWS_FREQUENCY_CODES = {-1: 0, 'NONE': 1, 'Regularly': 2, 'Monthly': 3}

# Values that are not keys of the mapping are left untouched by replace().
customers_df['club_member_status'] = customers_df['club_member_status'].replace(CLUB_MEMBER_STATUS_CODES)
customers_df['fashion_news_frequency'] = customers_df['fashion_news_frequency'].replace(FASHION_NEWS_FREQUENCY_CODES)


# Confirm that only integer codes remain in the four categorical columns.
for column in ('club_member_status', 'fashion_news_frequency', 'Active', 'FN'):
    print(f"{column} unique values : {customers_df[column].unique()!s}")

club_member_status unique values : [1 0 2 3]
fashion_news_frequency unique values : [1 2 0 3]
Active unique values : [0 1]
FN unique values : [0 1]


In [7]:
#step 3. Combine the four encoded columns into a single string column called
# "INFO-CODE", formatted as
# "FN - fashion_news_frequency - Active - club_member_status",
# then delete the four source columns.
columns_to_delete = ['FN', 'fashion_news_frequency', 'Active', 'club_member_status']

# Build the code left to right, separating the parts with ' - '.
info_code = customers_df[columns_to_delete[0]].astype(str)
for column in columns_to_delete[1:]:
    info_code = info_code + ' - ' + customers_df[column].astype(str)
customers_df['INFO-CODE'] = info_code

# The drop had been commented out, although this step's description and the
# frames displayed afterwards both show the source columns removed. It is
# restored so a fresh "Run All" reproduces those outputs and the exported CSV.
# NOTE: re-running this cell after the drop raises KeyError, because the source
# columns no longer exist; re-run from the load cell instead.
customers_df = customers_df.drop(columns=columns_to_delete)

In [8]:
# Display the frame after the INFO-CODE step.
customers_df

Unnamed: 0,customer_id,age,postal_code,INFO-CODE
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,49.0,52043ee2162cf5aa7ee79974281641c6f11a68d276429a...,0 - 1 - 0 - 1
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,25.0,2973abc54daa8a5f8ccfe9362140c63247c5eee03f1d93...,0 - 1 - 0 - 1
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,24.0,64f17e6a330a85798e4998f62d0930d14db8db1c054af6...,0 - 1 - 0 - 1
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,54.0,5d36574f52495e81f019b680c843c443bd343d5ca5b1c2...,0 - 1 - 0 - 1
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,52.0,25fa5ddee9aac01b35208d01736e57942317d756b32ddd...,1 - 2 - 1 - 1
...,...,...,...,...
1371975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,24.0,7aa399f7e669990daba2d92c577b52237380662f36480b...,0 - 1 - 0 - 1
1371976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,21.0,3f47f1279beb72215f4de557d950e0bfa73789d24acb5e...,0 - 1 - 0 - 1
1371977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,21.0,4563fc79215672cd6a863f2b4bf56b8f898f2d96ed590e...,1 - 2 - 1 - 1
1371978,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,18.0,8892c18e9bc3dca6aa4000cb8094fc4b51ee8db2ed14d7...,1 - 2 - 1 - 1


In [9]:
# Median of the known ages (Series.median skips NaN values by default).
median_age = customers_df['age'].median()

# Fill missing ages with that median.
# The result is assigned back instead of calling fillna(..., inplace=True) on the
# column selection: that chained in-place form acts on a temporary object, emits
# a warning on recent pandas versions and leaves the DataFrame unchanged when
# Copy-on-Write is enabled.
customers_df['age'] = customers_df['age'].fillna(median_age)

In [10]:
# Absolute z-score of every age.
# nan_policy='omit' keeps a stray NaN age from turning every z-score into NaN
# (the default 'propagate' does exactly that, which would make this whole step
# a silent no-op). When the column has no NaN the scores are unchanged.
z_scores = np.abs(stats.zscore(customers_df['age'], nan_policy='omit'))

# Ages more than this many standard deviations from the mean count as outliers.
threshold = 3  # You can adjust this threshold as needed

# Replace outlier ages with the column median; all other rows keep their value.
# A NaN z-score compares as False, so such a row is left as it is.
customers_df['age'] = np.where(z_scores > threshold, customers_df['age'].median(), customers_df['age'])

In [11]:
# Display the frame again after the age clean-up.
customers_df

Unnamed: 0,customer_id,age,postal_code,INFO-CODE
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,49.0,52043ee2162cf5aa7ee79974281641c6f11a68d276429a...,0 - 1 - 0 - 1
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,25.0,2973abc54daa8a5f8ccfe9362140c63247c5eee03f1d93...,0 - 1 - 0 - 1
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,24.0,64f17e6a330a85798e4998f62d0930d14db8db1c054af6...,0 - 1 - 0 - 1
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,54.0,5d36574f52495e81f019b680c843c443bd343d5ca5b1c2...,0 - 1 - 0 - 1
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,52.0,25fa5ddee9aac01b35208d01736e57942317d756b32ddd...,1 - 2 - 1 - 1
...,...,...,...,...
1371975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,24.0,7aa399f7e669990daba2d92c577b52237380662f36480b...,0 - 1 - 0 - 1
1371976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,21.0,3f47f1279beb72215f4de557d950e0bfa73789d24acb5e...,0 - 1 - 0 - 1
1371977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,21.0,4563fc79215672cd6a863f2b4bf56b8f898f2d96ed590e...,1 - 2 - 1 - 1
1371978,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,18.0,8892c18e9bc3dca6aa4000cb8094fc4b51ee8db2ed14d7...,1 - 2 - 1 - 1


In [12]:
# Destination of the engineered customer features, relative to this notebook.
file_path = '../data/customers/customers_features.csv'

# Persist the frame as CSV; index=False leaves the row index out of the file.
customers_df.to_csv(path_or_buf=file_path, index=False)