# EI ST4

## Imports

In [None]:
import math
from datetime import datetime, timezone
from typing import Optional

import pandas as pd

In [None]:
# Load the train (tf) and test (rf) splits; both frames are cleaned in parallel below.
tf, rf = (
    pd.read_csv(csv_path)
    for csv_path in (
        r"DS_CentraleSupelec_ST42021/DS_CentraleSupelec_train.csv",
        r"DS_CentraleSupelec_ST42021/DS_CentraleSupelec_test.csv",
    )
)

## Cleaning up the dataframe

In [None]:
# Display the training frame as loaded, before any cleaning.
tf

In [None]:
# Display the test frame as loaded, before any cleaning.
rf

We can remove the `languageCode` and `countryCode` columns as they are constant.

In [None]:
# True only when every training row has a 'fr'/'FR' language code and the 'FRA' country code
bool(tf.languageCode.isin(['fr', 'FR']).all() and tf.countryCode.eq('FRA').all())

In [None]:
# Drop the two constant columns from both splits.
tf, rf = (
    frame.drop(columns=['languageCode', 'countryCode'])
    for frame in (tf, rf)
)

We will also remove `registrationDate`, `reactivationValue`, `emailContactable` and `postalContactable` as they are irrelevant, and `postalCode` as it would be too hard to analyse. Note that `userId` is likewise irrelevant as a feature, but it is not dropped by the cell below.

In [None]:
# Drop the columns that are not used as features, from both splits.
tf, rf = (
    frame.drop(columns=['registrationDate', 'reactivationValue', 'emailContactable', 'postalContactable', 'postalCode'])
    for frame in (tf, rf)
)

We are going to replace the `washDishes` `STRING` column with an `INT` column

In [None]:
# Encode the column as integers: 1 for 'Auto', 0 for any other value (missing included).
tf["washDishes"] = tf["washDishes"].eq('Auto').astype('int64')

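In the `MrPropreTrier`, `AntikalTrier`, `ArielTrier` and `DashTrier` columns we will replace `Known Trier` with `1` and every other value (including `NaN`) with `0`. The resulting columns are named without the `Trier` suffix.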

In [None]:
# Turn each '<Brand>Trier' column into a 0.0/1.0 '<Brand>' flag in both splits.
# Only the column itself is needed, so it is compared directly rather than
# materialising every row through DataFrame.apply(axis=1).
for c in ['MrPropreTrier', 'AntikalTrier', 'ArielTrier', 'DashTrier']:
    flag = c[:-len('Trier')]  # e.g. 'ArielTrier' -> 'Ariel'

    # 1.0 for 'Known Trier', 0.0 for anything else (missing values included)
    tf[flag] = tf[c].eq('Known Trier').astype(float)
    tf = tf.drop(columns=[c])

    rf[flag] = rf[c].eq('Known Trier').astype(float)
    rf = rf.drop(columns=[c])

We will replace `detergentType` by `liquid`, `pods` and `powder` 

In [None]:
def _mentions(keyword):
    """Build a mapper giving 1.0 when `keyword` occurs in a non-missing value, else 0.0."""
    return lambda e: 1.0 if pd.notna(e) and keyword in e else 0.0

# One indicator column per detergent form; a row is flagged when the form's
# name appears anywhere in its `detergentType` text.
for form_column, form_keyword in (('pods', 'Pods'), ('powder', 'Powder'), ('liquid', 'Liquid')):
    tf[form_column] = tf['detergentType'].apply(_mentions(form_keyword))
    rf[form_column] = rf['detergentType'].apply(_mentions(form_keyword))

# the source column is no longer needed
tf, rf = (frame.drop(columns=['detergentType']) for frame in (tf, rf))

Replacing `toothBrushType` with `electricToothbrush`

In [None]:
# 1.0 when the toothbrush type is exactly 'Electric', 0.0 otherwise (missing included)
tf['electricToothbrush'] = tf['toothBrushType'].eq('Electric').astype(float)
rf['electricToothbrush'] = rf['toothBrushType'].eq('Electric').astype(float)

# the source column is no longer needed
tf, rf = (frame.drop(columns=['toothBrushType']) for frame in (tf, rf))

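Replacing `petOwner` with two columns: `likesPets`, a score between `0` and `1` derived from the answer text, and `hasPet`, which is `1` if the first 3 characters of the answer are 'Yes' or 'Oui' and `0` otherwise.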

In [None]:
def likesPets(s:str) -> float:
    """Score a `petOwner` answer on a 0-1 scale of fondness for pets.

    Each known answer maps to a fixed score; any other value (missing
    values included) falls back to 0.3.
    """
    # (accepted answers, score) pairs, checked from top to bottom
    answer_scores = (
        (('Yes, we love our furry friends', "Oui, j'adore nos petites boules de poils !"), 1),
        (('Yes',), 0.9),
        (("Non, j'aime les animaux, mais je n'en ai pas pour le moment.", 'No, we love animals but no pets here for now'), 0.8),
        (('No - maybe future',), 0.5),
        (('No',), 0.3),
        (('Des animaux dans la maison ? Non merci !', "No, we'd never have pets in the house"), 0),
    )
    for answers, score in answer_scores:
        if s in answers:
            return score

    # unknown or missing answer
    return 0.3

for frame in (tf, rf):
    # fondness for animals, scored from the answer text
    frame['likesPets'] = frame['petOwner'].map(likesPets)

    # pet ownership: answers starting with 'Yes' or 'Oui' count as owning a pet
    frame['hasPet'] = frame['petOwner'].map(lambda e: 1.0 if str(e)[:3] in ('Yes', 'Oui') else 0.0)

# the source column is no longer needed
tf, rf = (frame.drop(columns=['petOwner']) for frame in (tf, rf))

Transforms an `ISO 8601` time string into the number of days elapsed since that date

In [None]:
def timeStringToDelta(timeString: str, now: Optional[datetime] = None) -> Optional[int]:
    """Return the number of whole days elapsed since an ISO 8601 time string.

    Only the leading ``YYYY-MM-DD`` part of `timeString` is used, read as
    midnight UTC.

    Args:
        timeString: ISO 8601 date or datetime string, or a missing value.
        now: Instant to measure from. Defaults to the current UTC time; pass
            a fixed value to get reproducible output. A naive value is taken
            to be UTC.

    Returns:
        The elapsed days truncated toward zero, or None when `timeString`
        is missing.
    """
    if pd.isna(timeString):
        return None

    if now is None:
        now = datetime.now(timezone.utc)
    elif now.tzinfo is None:
        now = now.replace(tzinfo=timezone.utc)

    # Both instants are timezone-aware on purpose: .timestamp() on a naive
    # datetime converts through local time, so a DST change between the two
    # dates would shift the difference by an hour.
    start = datetime.strptime(timeString[:10], "%Y-%m-%d").replace(tzinfo=timezone.utc)
    return int((now - start).total_seconds() / 3600 / 24)

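Replaces `lastActivityDate` with `daysSinceActivity`: the number of elapsed days since the last activity, min-max normalized over both datasets. Missing values are then filled with the average, using the `fillAvg` helper defined below.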

In [None]:
def fillAvg(c_name: str, df1: pd.DataFrame, df2: pd.DataFrame):
    """Fill the missing values of column `c_name` in both frames with their pooled mean.

    The mean is taken over the non-missing values of the two columns
    combined, so each frame contributes in proportion to the values it
    actually holds. Both frames are modified in place. When neither frame
    holds a value, nothing is filled.

    Args:
        c_name: Name of the column to fill; must exist in both frames.
        df1: First frame.
        df2: Second frame.
    """
    # count() ignores missing values. Weighting by total row count instead
    # would bias the mean whenever the frames have different shares of
    # missing values, and would give NaN if one column were entirely missing.
    known = df1[c_name].count() + df2[c_name].count()
    if known == 0:
        return

    mean = (df1[c_name].sum(skipna=True) + df2[c_name].sum(skipna=True)) / known
    df1[c_name] = df1[c_name].fillna(mean)
    df2[c_name] = df2[c_name].fillna(mean)

In [None]:
# days elapsed since the last recorded activity (missing dates stay missing)
tf['daysSinceActivity'] = tf['lastActivityDate'].map(timeStringToDelta)
rf['daysSinceActivity'] = rf['lastActivityDate'].map(timeStringToDelta)

# min-max scaling, with bounds taken over both splits together
minV = min(tf['daysSinceActivity'].min(), rf['daysSinceActivity'].min())
maxV = max(tf['daysSinceActivity'].max(), rf['daysSinceActivity'].max())

tf['daysSinceActivity'] = (tf['daysSinceActivity'] - minV) / (maxV - minV)
rf['daysSinceActivity'] = (rf['daysSinceActivity'] - minV) / (maxV - minV)

# missing values take the average of the scaled column
fillAvg('daysSinceActivity', tf, rf)

# the source column is no longer needed
tf, rf = (frame.drop(columns=['lastActivityDate']) for frame in (tf, rf))

Replaces `age` with a normalized column

In [None]:
# min-max scaling of `age`, with bounds taken over both splits together
minV = min(tf['age'].min(), rf['age'].min())
maxV = max(tf['age'].max(), rf['age'].max())

tf['age'] = (tf['age'] - minV) / (maxV - minV)
rf['age'] = (rf['age'] - minV) / (maxV - minV)

# missing values take the average of the scaled column
fillAvg('age', tf, rf)

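The possible values for `numberChildren` are `'1'`, `'2'`, `'3'`, `'4'`, `'4+'` and `'NaN'`. We will be creating a column `nbChildren` of type `float` on a scale from `0` to `1` (`'1'` maps to `1/5`, ..., `'4'` to `4/5` and `'4+'` to `1`), where `'NaN'` is first mapped to `None` and then replaced with the average.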

In [None]:
def childrenMagik(children: str) -> float:
    """Map a `numberChildren` label onto a 0-1 scale.

    '0' to '4' give 0, 1/5, 2/5, 3/5 and 4/5, and '4+' gives 1. Any other
    value (missing values included) gives None.
    """
    scale = {
        '0': 0,
        '1': 1 / 5,
        '2': 2 / 5,
        '3': 3 / 5,
        '4': 4 / 5,
        '4+': 1,
    }
    # .get returns None for labels outside the table
    return scale.get(children)

# scaled number of children; labels outside the known set become missing
for frame in (tf, rf):
    frame['nbChildren'] = frame['numberChildren'].map(childrenMagik)

# missing values take the average
fillAvg('nbChildren', tf, rf)

# the source column is no longer needed
tf, rf = (frame.drop(columns=['numberChildren']) for frame in (tf, rf))

Replaces `F` with `1` and every other value (including `M`) with `0` in the `gender` column

In [None]:
# 1.0 for "F", 0.0 for any other value (missing included)
tf['gender'] = tf['gender'].eq("F").astype(float)
rf['gender'] = rf['gender'].eq("F").astype(float)

Changes `ecoInterest`, replacing `Low`, `Medium`, `High` and `Very high` with a scale going from `0` to `1`. We then replace `nan` with the average.

In [None]:
def textScaleToFloat(s:str) -> float:
    """Map an `ecoInterest` label onto a 0-1 scale.

    'Low', 'Medium', 'High' and 'Very high' give 0, 1/3, 2/3 and 1.
    Any other value (missing values included) gives None.
    """
    scale = {
        'Very high': 1,
        'High': 2 / 3,
        'Medium': 1 / 3,
        'Low': 0,
    }
    # .get returns None for labels outside the table
    return scale.get(s)

# replaces the 'normal' values with floats
# turn the known labels into floats; anything else becomes missing
for frame in (tf, rf):
    frame['ecoInterest'] = frame['ecoInterest'].map(textScaleToFloat)

# missing values take the average
fillAvg('ecoInterest', tf, rf)

Changes `scentLover`, replacing `NonUser`, `Low`, `Medium` and `High` with a scale going from `0` to `1`. We then replace `nan` with the average.

In [None]:
def textScaleToFloat(s:str) -> float:
    """Map a `scentLover` label onto a 0-1 scale.

    'NonUser', 'Low', 'Medium' and 'High' give 0, 1/3, 2/3 and 1.
    Any other value (missing values included) gives None.

    NOTE: this redefines the `textScaleToFloat` used earlier for
    `ecoInterest`, which maps 'High', 'Medium' and 'Low' to different
    values. Re-running that earlier cell's calls after this definition
    would silently use this scale.
    """
    scale = {
        'High': 1,
        'Medium': 2 / 3,
        'Low': 1 / 3,
        'NonUser': 0,
    }
    # .get returns None for labels outside the table
    return scale.get(s)

# replaces the 'normal' values with floats
# turn the known labels into floats; anything else becomes missing
for frame in (tf, rf):
    frame['scentLover'] = frame['scentLover'].map(textScaleToFloat)

# missing values take the average
fillAvg('scentLover', tf, rf)

Handles the closest shop

In [None]:
def shops(df):
    """Add one 0.0/1.0 indicator column per `closestShop` category to `df`, in place.

    Each new column is 1.0 on the rows whose `closestShop` equals the
    matching label and 0.0 elsewhere (missing values included).
    """
    # NOTE(review): the first label contains the U+FFFD replacement
    # character; presumably it mirrors how the value appears in the CSV.
    # Confirm against the data.
    shop_columns = {
        'magasin': '1. Magasin de Proximit�',
        'moyenneSurface': '2. Moyenne Surface',
        'superMarket': '3. SuperMarket',
        'hyperMarket': '4. HyperMarket',
        'drive': '5. Drive',
        'hardDiscount': '6. Hard Discounter',
    }
    for column, label in shop_columns.items():
        df[column] = df['closestShop'].eq(label).astype(float)

# add the shop indicator columns to both splits
for frame in (tf, rf):
    shops(frame)

# the source column is no longer needed
tf, rf = (frame.drop(columns=['closestShop']) for frame in (tf, rf))

In [None]:
# Move `washDishes` to the last position of the training frame, renamed to 'labels'.
labels = tf['washDishes']
tf = tf.drop(columns=['washDishes']).assign(labels=labels)

tf

In [None]:
# Display the test frame after cleaning.
rf

In [None]:
# Write the cleaned splits to disk, leaving out the index column.
for frame, destination in ((tf, './data/train.csv'), (rf, './data/test.csv')):
    frame.to_csv(destination, index=False)

In [None]:
# List the columns of the cleaned training frame.
tf.columns

In [None]:
# List the columns of the cleaned test frame.
rf.columns