# Feature Engineering

---

1. Import packages
2. Load data
3. Feature engineering

---

## 1. Import packages

In [127]:
import pandas as pd
import numpy as np
from datetime import datetime

---
## 2. Load data

In [45]:
df = pd.read_csv('./clean_data_after_eda.csv')
df["date_activ"] = pd.to_datetime(df["date_activ"], format='%Y-%m-%d')
df["date_end"] = pd.to_datetime(df["date_end"], format='%Y-%m-%d')
df["date_modif_prod"] = pd.to_datetime(df["date_modif_prod"], format='%Y-%m-%d')
df["date_renewal"] = pd.to_datetime(df["date_renewal"], format='%Y-%m-%d')

In [46]:
df.head(3)

Unnamed: 0,id,channel_sales,cons_12m,cons_gas_12m,cons_last_month,date_activ,date_end,date_modif_prod,date_renewal,forecast_cons_12m,...,mean_3m_price_p1_var,mean_3m_price_p2_var,mean_3m_price_p3_var,mean_3m_price_p1_fix,mean_3m_price_p2_fix,mean_3m_price_p3_fix,mean_3m_price_p1,mean_3m_price_p2,mean_3m_price_p3,churn
0,24011ae4ebbe3035111d65fa7c15bc57,foosdfpfkusacimwkcsosbicdxkicaua,0,54946,0,2013-06-15,2016-06-15,2015-11-01,2015-06-23,0.0,...,0.131756,0.092638,0.036909,42.497907,12.218665,8.145777,42.629663,12.311304,8.182687,1
1,d29c2c54acc38ff3c0614d0a653813dd,MISSING,4660,0,0,2009-08-21,2016-08-30,2009-08-21,2015-08-31,189.95,...,0.1476,0.0,0.0,44.44471,0.0,0.0,44.59231,0.0,0.0,0
2,764c75f661154dac3a6c254cd082ea7d,foosdfpfkusacimwkcsosbicdxkicaua,544,0,0,2010-04-16,2016-04-16,2010-04-16,2015-04-17,47.96,...,0.167798,0.088409,0.0,44.44471,0.0,0.0,44.612508,0.088409,0.0,0


---

## 3. Feature engineering

### Difference between prices in December and preceding January

We will calculate the difference between the different prices in December and January. 

In [47]:
price_df = pd.read_csv('price_data.csv')
price_df["price_date"] = pd.to_datetime(price_df["price_date"], format='%Y-%m-%d')
price_df.head()

Unnamed: 0,id,price_date,price_off_peak_var,price_peak_var,price_mid_peak_var,price_off_peak_fix,price_peak_fix,price_mid_peak_fix
0,038af19179925da21a25619c5a24b745,2015-01-01,0.151367,0.0,0.0,44.266931,0.0,0.0
1,038af19179925da21a25619c5a24b745,2015-02-01,0.151367,0.0,0.0,44.266931,0.0,0.0
2,038af19179925da21a25619c5a24b745,2015-03-01,0.151367,0.0,0.0,44.266931,0.0,0.0
3,038af19179925da21a25619c5a24b745,2015-04-01,0.149626,0.0,0.0,44.266931,0.0,0.0
4,038af19179925da21a25619c5a24b745,2015-05-01,0.149626,0.0,0.0,44.266931,0.0,0.0


In [50]:
# Group off-peak prices by companies and month
# Get january and december prices
jan_prices = price_df.groupby('id').first().reset_index()
dec_prices = price_df.groupby('id').last().reset_index()

rename_cols = {
    'price_off_peak_var' : 'dec_off_peak_var',
    'price_off_peak_fix' : 'dec_off_peak_fix',
    'price_peak_var' : 'dec_peak_var',
    'price_peak_fix' : 'dec_peak_fix',
    'price_mid_peak_var' : 'dec_mid_peak_var',
    'price_mid_peak_fix' : 'dec_mid_peak_fix',
}
diff = pd.merge(dec_prices.rename(columns=rename_cols), jan_prices.drop(columns='price_date'), on='id')
# Calculate the difference for off_peak
diff['offpeak_diff_dec_january_energy'] = diff['dec_off_peak_var'] - diff['price_off_peak_var']
diff['offpeak_diff_dec_january_power'] = diff['dec_off_peak_fix'] - diff['price_off_peak_fix']
# Calculate the difference for peak
diff['peak_diff_dec_january_energy'] = diff['dec_peak_var'] - diff['price_peak_var']
diff['peak_diff_dec_january_power'] = diff['dec_peak_fix'] - diff['price_peak_fix']
# Calculate the difference for mid peak
diff['midpeak_diff_dec_january_energy'] = diff['dec_mid_peak_var'] - diff['price_mid_peak_var']
diff['midpeak_diff_dec_january_power'] = diff['dec_mid_peak_fix'] - diff['price_mid_peak_fix']

diff = diff[['id', 'offpeak_diff_dec_january_energy','offpeak_diff_dec_january_power','peak_diff_dec_january_energy','peak_diff_dec_january_power','midpeak_diff_dec_january_energy',
             'midpeak_diff_dec_january_power']]
diff.head()

Unnamed: 0,id,offpeak_diff_dec_january_energy,offpeak_diff_dec_january_power,peak_diff_dec_january_energy,peak_diff_dec_january_power,midpeak_diff_dec_january_energy,midpeak_diff_dec_january_power
0,0002203ffbb812588b632b9e628cc38d,-0.006192,0.162916,-0.002302,0.097749,0.003487,0.065166
1,0004351ebdd665e6ee664792efc4fd13,-0.004104,0.177779,0.0,0.0,0.0,0.0
2,0010bcc39e42b3c2131ed2ce55246e3c,0.050443,1.5,0.0,0.0,0.0,0.0
3,0010ee3855fdea87602a5b7aba8e42de,-0.010018,0.162916,-0.00512,0.097749,0.000763,0.065166
4,00114d74e963e47177db89bc70108537,-0.003994,-1e-06,0.0,0.0,0.0,0.0


In [99]:
feature_df = pd.merge(df, diff, on='id')
feature_df.head()

Unnamed: 0,id,channel_sales,cons_12m,cons_gas_12m,cons_last_month,date_activ,date_end,date_modif_prod,date_renewal,forecast_cons_12m,...,mean_3m_price_p3,churn,modif_after_activ,days_to_end,offpeak_diff_dec_january_energy,offpeak_diff_dec_january_power,peak_diff_dec_january_energy,peak_diff_dec_january_power,midpeak_diff_dec_january_energy,midpeak_diff_dec_january_power
0,24011ae4ebbe3035111d65fa7c15bc57,foosdfpfkusacimwkcsosbicdxkicaua,0,54946,0,2013-06-15,2016-06-15,2015-11-01,2015-06-23,0.0,...,8.182687,1,False,358,0.020057,3.700961,-0.017912,-24.339581,-0.071536,-16.226389
1,d29c2c54acc38ff3c0614d0a653813dd,MISSING,4660,0,0,2009-08-21,2016-08-30,2009-08-21,2015-08-31,189.95,...,0.0,0,True,365,-0.003767,0.177779,0.0,0.0,0.0,0.0
2,764c75f661154dac3a6c254cd082ea7d,foosdfpfkusacimwkcsosbicdxkicaua,544,0,0,2010-04-16,2016-04-16,2010-04-16,2015-04-17,47.96,...,0.0,0,True,365,-0.00467,0.177779,0.000528,0.0,0.0,0.0
3,bba03439a292a1e166f80264c16191cb,lmkebamcaaclubfxadlmueccxoimlema,1584,0,0,2010-03-30,2016-03-30,2010-03-30,2015-03-31,240.04,...,0.0,0,True,365,-0.004547,0.177779,0.0,0.0,0.0,0.0
4,149d57cf92fc41cf94415803a877cb4b,MISSING,4425,0,526,2010-01-13,2016-03-07,2010-01-13,2015-03-09,445.75,...,16.365274,0,True,364,-0.006192,0.162916,-0.002302,0.097749,0.003487,0.065166


### Mean prices

In [100]:
mean_prices = price_df.groupby(['id']).agg({
    'price_off_peak_var': 'mean', 
    'price_peak_var': 'mean', 
    'price_mid_peak_var': 'mean',
    'price_off_peak_fix': 'mean',
    'price_peak_fix': 'mean',
    'price_mid_peak_fix': 'mean'    
}).reset_index()
# Calculate the mean difference between consecutive periods
mean_prices['off_peak_peak_var_mean_diff'] = mean_prices['price_off_peak_var'] - mean_prices['price_peak_var']
mean_prices['peak_mid_peak_var_mean_diff'] = mean_prices['price_peak_var'] - mean_prices['price_mid_peak_var']
mean_prices['off_peak_mid_peak_var_mean_diff'] = mean_prices['price_off_peak_var'] - mean_prices['price_mid_peak_var']
mean_prices['off_peak_peak_fix_mean_diff'] = mean_prices['price_off_peak_fix'] - mean_prices['price_peak_fix']
mean_prices['peak_mid_peak_fix_mean_diff'] = mean_prices['price_peak_fix'] - mean_prices['price_mid_peak_fix']
mean_prices['off_peak_mid_peak_fix_mean_diff'] = mean_prices['price_off_peak_fix'] - mean_prices['price_mid_peak_fix']
cols = [
    'id', 
    'off_peak_peak_var_mean_diff',
    'peak_mid_peak_var_mean_diff', 
    'off_peak_mid_peak_var_mean_diff',
    'off_peak_peak_fix_mean_diff', 
    'peak_mid_peak_fix_mean_diff', 
    'off_peak_mid_peak_fix_mean_diff'
]
feature_df = pd.merge(feature_df, mean_prices[cols], on='id')
feature_df.head()

Unnamed: 0,id,channel_sales,cons_12m,cons_gas_12m,cons_last_month,date_activ,date_end,date_modif_prod,date_renewal,forecast_cons_12m,...,peak_diff_dec_january_energy,peak_diff_dec_january_power,midpeak_diff_dec_january_energy,midpeak_diff_dec_january_power,off_peak_peak_var_mean_diff,peak_mid_peak_var_mean_diff,off_peak_mid_peak_var_mean_diff,off_peak_peak_fix_mean_diff,peak_mid_peak_fix_mean_diff,off_peak_mid_peak_fix_mean_diff
0,24011ae4ebbe3035111d65fa7c15bc57,foosdfpfkusacimwkcsosbicdxkicaua,0,54946,0,2013-06-15,2016-06-15,2015-11-01,2015-06-23,0.0,...,-0.017912,-24.339581,-0.071536,-16.226389,0.024038,0.034219,0.058257,18.590255,7.45067,26.040925
1,d29c2c54acc38ff3c0614d0a653813dd,MISSING,4660,0,0,2009-08-21,2016-08-30,2009-08-21,2015-08-31,189.95,...,0.0,0.0,0.0,0.0,0.142485,0.007124,0.149609,44.311375,0.0,44.311375
2,764c75f661154dac3a6c254cd082ea7d,foosdfpfkusacimwkcsosbicdxkicaua,544,0,0,2010-04-16,2016-04-16,2010-04-16,2015-04-17,47.96,...,0.000528,0.0,0.0,0.0,0.08209,0.088421,0.170512,44.38545,0.0,44.38545
3,bba03439a292a1e166f80264c16191cb,lmkebamcaaclubfxadlmueccxoimlema,1584,0,0,2010-03-30,2016-03-30,2010-03-30,2015-03-31,240.04,...,0.0,0.0,0.0,0.0,0.15121,0.0,0.15121,44.400265,0.0,44.400265
4,149d57cf92fc41cf94415803a877cb4b,MISSING,4425,0,526,2010-01-13,2016-03-07,2010-01-13,2015-03-09,445.75,...,-0.002302,0.097749,0.003487,0.065166,0.020536,0.030773,0.051309,16.275263,8.137629,24.412893


This feature may be useful because it adds more granularity to the existing feature that my colleague found to be useful. Instead of looking at differences across an entire year, we have now created features that look at mean average price differences across different time periods (`off_peak`, `peak`, `mid_peak`). The dec-jan feature may reveal macro patterns that occur over an entire year, whereas inter-time-period features may reveal patterns on a micro scale between months.

### Max Price difference across periods

In [113]:
max_diff_across_periods_months = mean_prices.groupby(['id']).agg({
    'off_peak_peak_var_mean_diff': 'max',
    'peak_mid_peak_var_mean_diff': 'max',
    'off_peak_mid_peak_var_mean_diff': 'max',
    'off_peak_peak_fix_mean_diff': 'max',
    'peak_mid_peak_fix_mean_diff': 'max',
    'off_peak_mid_peak_fix_mean_diff': 'max'
}).reset_index().rename(
    columns={
        'off_peak_peak_var_mean_diff': 'off_peak_peak_var_max_monthly_diff',
        'peak_mid_peak_var_mean_diff': 'peak_mid_peak_var_max_monthly_diff',
        'off_peak_mid_peak_var_mean_diff': 'off_peak_mid_peak_var_max_monthly_diff',
        'off_peak_peak_fix_mean_diff': 'off_peak_peak_fix_max_monthly_diff',
        'peak_mid_peak_fix_mean_diff': 'peak_mid_peak_fix_max_monthly_diff',
        'off_peak_mid_peak_fix_mean_diff': 'off_peak_mid_peak_fix_max_monthly_diff'
    }
)
max_diff_across_periods_months.head()

Unnamed: 0,id,off_peak_peak_var_max_monthly_diff,peak_mid_peak_var_max_monthly_diff,off_peak_mid_peak_var_max_monthly_diff,off_peak_peak_fix_max_monthly_diff,peak_mid_peak_fix_max_monthly_diff,off_peak_mid_peak_fix_max_monthly_diff
0,0002203ffbb812588b632b9e628cc38d,0.020545,0.030633,0.051178,16.280694,8.140345,24.421038
1,0004351ebdd665e6ee664792efc4fd13,0.146426,0.0,0.146426,44.38545,0.0,44.38545
2,0010bcc39e42b3c2131ed2ce55246e3c,0.181558,0.0,0.181558,45.31971,0.0,45.31971
3,0010ee3855fdea87602a5b7aba8e42de,0.020465,0.02926,0.049725,16.258972,8.129484,24.388456
4,00114d74e963e47177db89bc70108537,0.147926,0.0,0.147926,44.26693,0.0,44.26693


In [114]:
columns = [
    'id',
    'off_peak_peak_var_max_monthly_diff',
    'peak_mid_peak_var_max_monthly_diff',
    'off_peak_mid_peak_var_max_monthly_diff',
    'off_peak_peak_fix_max_monthly_diff',
    'peak_mid_peak_fix_max_monthly_diff',
    'off_peak_mid_peak_fix_max_monthly_diff'
]

feature_df = pd.merge(feature_df, max_diff_across_periods_months[columns], on='id')
feature_df.head()

Unnamed: 0,id,channel_sales,cons_12m,cons_gas_12m,cons_last_month,date_activ,date_end,date_modif_prod,date_renewal,forecast_cons_12m,...,off_peak_mid_peak_var_mean_diff,off_peak_peak_fix_mean_diff,peak_mid_peak_fix_mean_diff,off_peak_mid_peak_fix_mean_diff,off_peak_peak_var_max_monthly_diff,peak_mid_peak_var_max_monthly_diff,off_peak_mid_peak_var_max_monthly_diff,off_peak_peak_fix_max_monthly_diff,peak_mid_peak_fix_max_monthly_diff,off_peak_mid_peak_fix_max_monthly_diff
0,24011ae4ebbe3035111d65fa7c15bc57,foosdfpfkusacimwkcsosbicdxkicaua,0,54946,0,2013-06-15,2016-06-15,2015-11-01,2015-06-23,0.0,...,0.058257,18.590255,7.45067,26.040925,0.024038,0.034219,0.058257,18.590255,7.45067,26.040925
1,d29c2c54acc38ff3c0614d0a653813dd,MISSING,4660,0,0,2009-08-21,2016-08-30,2009-08-21,2015-08-31,189.95,...,0.149609,44.311375,0.0,44.311375,0.142485,0.007124,0.149609,44.311375,0.0,44.311375
2,764c75f661154dac3a6c254cd082ea7d,foosdfpfkusacimwkcsosbicdxkicaua,544,0,0,2010-04-16,2016-04-16,2010-04-16,2015-04-17,47.96,...,0.170512,44.38545,0.0,44.38545,0.08209,0.088421,0.170512,44.38545,0.0,44.38545
3,bba03439a292a1e166f80264c16191cb,lmkebamcaaclubfxadlmueccxoimlema,1584,0,0,2010-03-30,2016-03-30,2010-03-30,2015-03-31,240.04,...,0.15121,44.400265,0.0,44.400265,0.15121,0.0,0.15121,44.400265,0.0,44.400265
4,149d57cf92fc41cf94415803a877cb4b,MISSING,4425,0,526,2010-01-13,2016-03-07,2010-01-13,2015-03-09,445.75,...,0.051309,16.275263,8.137629,24.412893,0.020536,0.030773,0.051309,16.275263,8.137629,24.412893


I thought that calculating the maximum price change between months and time periods would be a good feature to create because I was trying to think from the perspective of a PowerCo client. As a Utilities customer, there is nothing more annoying than sudden price changes between months, and a large increase in prices within a short time span would be an influencing factor in causing me to look at other utilities providers for a better deal. Since we are trying to predict churn for this use case, I thought this would be an interesting feature to include.

### Did the client change their product after they activated the contract

In [115]:

feature_df['modif_after_activ'] = feature_df['date_activ'] == feature_df['date_modif_prod']

### Tenure
How long a company has been a client of PowerCo.

In [125]:
feature_df['tenure'] = ((feature_df['date_end'] - feature_df['date_activ'])/ np.timedelta64(1, 'Y')).astype(int)

### Transforming the dates into month
- months_activ = Number of months active until reference date (Jan 2016)
- months_to_end = Number of months of the contract left until reference date (Jan 2016)
- months_modif_prod = Number of months since last modification until reference date (Jan 2016)
- months_renewal = Number of months since last renewal until reference date (Jan 2016)

In [126]:
def convert_months(reference_date, df, column):
    """
    Input a column with timedeltas and return months
    """
    time_delta = reference_date - df[column]
    months = (time_delta / np.timedelta64(1, 'M')).astype(int)
    return months

In [129]:
# Create reference date
reference_date = datetime(2016, 1, 1)

# Create columns
feature_df['months_activ'] = convert_months(reference_date, feature_df, 'date_activ')
feature_df['months_to_end'] = -convert_months(reference_date, feature_df, 'date_end')
feature_df['months_modif_prod'] = convert_months(reference_date, feature_df, 'date_modif_prod')
feature_df['months_renewal'] = convert_months(reference_date, feature_df, 'date_renewal')

Using intuition, you could assume that a client who has been an active client of PowerCo for a longer amount of time may have more loyalty to the brand and is more likely to stay. Whereas a newer client may be more volatile. Hence the addition of the `months_activ` feature.

As well as this, if we think from the perspective of a client with PowerCo, if you're coming toward the end of your contract with PowerCo your thoughts could go a few ways. You could be looking for better deals for when your contract ends, or you might want to see out your contract and sign another one. One the other hand if you've only just joined, you may have a period where you're allowed to leave if you're not satisfied. Furthermore, if you're in the middle of your contract, their may be charges if you wanted to leave, deterring clients from churning mid-way through their agreement. So, I think `months_to_end` will be an interesting feature because it may reveal patterns and behaviours about timing of churn.

My belief is that if a client has made recent updates to their contract, they are more likely to be satisfied or at least they have received a level of customer service to update or change their existing services. I believe this to be a positive sign, they are an engaged customer, and so I believe `months_modif_prod` will be an interesting feature to include because it shows the degree of how 'engaged' a client is with PowerCo.

Finally the number of months since a client last renewed a contract I believe will be an interesting feature because once again, it shows the degree to which that client is engaged. It also goes a step further than just engagement, it shows a level of commitment if a client renews their contract. For this reason, I believe `months_renewal` will be a good feature to include.

### The difference between the day the contract is supposed to be renewed and the registered date of contract end

In [137]:
feature_df['days_to_end'] = convert_months(feature_df['date_end'], feature_df, 'date_renewal')

### Transforming the skewed data

In [130]:
skewed = [
    'cons_12m', 
    'cons_gas_12m', 
    'cons_last_month',
    'forecast_cons_12m', 
    'forecast_cons_year', 
    'forecast_discount_energy',
    'forecast_meter_rent_12m', 
    'forecast_price_energy_off_peak',
    'forecast_price_energy_peak', 
    'forecast_price_pow_off_peak'
]

feature_df[skewed].describe()

Unnamed: 0,cons_12m,cons_gas_12m,cons_last_month,forecast_cons_12m,forecast_cons_year,forecast_discount_energy,forecast_meter_rent_12m,forecast_price_energy_off_peak,forecast_price_energy_peak,forecast_price_pow_off_peak
count,14605.0,14605.0,14605.0,14605.0,14605.0,14605.0,14605.0,14605.0,14605.0,14605.0
mean,159230.3,28091.08,16091.371448,1868.638618,1399.858747,0.96645,63.090448,0.137282,0.050488,43.130085
std,573483.6,162978.6,64366.262314,2387.651549,3247.876793,5.108355,66.166636,0.024623,0.049037,4.48614
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,5674.0,0.0,0.0,494.98,0.0,0.0,16.18,0.11634,0.0,40.606701
50%,14116.0,0.0,793.0,1112.61,314.0,0.0,18.8,0.143166,0.084138,44.311378
75%,40764.0,0.0,3383.0,2402.27,1746.0,0.0,131.03,0.146348,0.098837,44.311378
max,6207104.0,4154590.0,771203.0,82902.83,175375.0,30.0,599.31,0.273963,0.195975,59.266378


We can see that the standard deviation for most of these features is quite high.

In [131]:
feature_df["cons_12m"] = np.log10(feature_df["cons_12m"] + 1)
feature_df["cons_gas_12m"] = np.log10(feature_df["cons_gas_12m"] + 1)
feature_df["cons_last_month"] = np.log10(feature_df["cons_last_month"] + 1)
feature_df["forecast_cons_12m"] = np.log10(feature_df["forecast_cons_12m"] + 1)
feature_df["forecast_cons_year"] = np.log10(feature_df["forecast_cons_year"] + 1)
feature_df["forecast_meter_rent_12m"] = np.log10(feature_df["forecast_meter_rent_12m"] + 1)
feature_df["imp_cons"] = np.log10(feature_df["imp_cons"] + 1)
feature_df[skewed].describe()

Unnamed: 0,cons_12m,cons_gas_12m,cons_last_month,forecast_cons_12m,forecast_cons_year,forecast_discount_energy,forecast_meter_rent_12m,forecast_price_energy_off_peak,forecast_price_energy_peak,forecast_price_pow_off_peak
count,14605.0,14605.0,14605.0,14605.0,14605.0,14605.0,14605.0,14605.0,14605.0,14605.0
mean,4.223945,0.778978,2.264801,2.962162,1.784733,0.96645,1.517233,0.137282,0.050488,43.130085
std,0.884545,1.716828,1.769266,0.683612,1.584972,5.108355,0.571489,0.024623,0.049037,4.48614
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3.753966,0.0,0.0,2.695464,0.0,0.0,1.235023,0.11634,0.0,40.606701
50%,4.149742,0.0,2.899821,3.046733,2.498311,0.0,1.296665,0.143166,0.084138,44.311378
75%,4.610287,0.0,3.52943,3.380803,3.242293,0.0,2.120673,0.146348,0.098837,44.311378
max,6.792889,6.618528,5.887169,4.918575,5.24397,30.0,2.778376,0.273963,0.195975,59.266378


In [95]:
cols = list(df.columns[:25])
cols.append('modif_after_activ')
cols.append('days_to_end')
cols.append('churn')
feature_df = pd.merge(diff, df[cols], on='id')
feature_df.head()

Unnamed: 0,id,offpeak_diff_dec_january_energy,offpeak_diff_dec_january_power,peak_diff_dec_january_energy,peak_diff_dec_january_power,midpeak_diff_dec_january_energy,midpeak_diff_dec_january_power,channel_sales,cons_12m,cons_gas_12m,...,margin_gross_pow_ele,margin_net_pow_ele,nb_prod_act,net_margin,num_years_antig,origin_up,pow_max,modif_after_activ,days_to_end,churn
0,0002203ffbb812588b632b9e628cc38d,-0.006192,0.162916,-0.002302,0.097749,0.003487,0.065166,foosdfpfkusacimwkcsosbicdxkicaua,22034,0,...,43.08,43.08,1,81.42,6,kamkkxfxxuwbdslkwifmmcsiusiuosws,17.25,True,361,0
1,0004351ebdd665e6ee664792efc4fd13,-0.004104,0.177779,0.0,0.0,0.0,0.0,MISSING,4060,0,...,24.42,24.42,1,61.58,6,kamkkxfxxuwbdslkwifmmcsiusiuosws,13.2,False,364,0
2,0010bcc39e42b3c2131ed2ce55246e3c,0.050443,1.5,0.0,0.0,0.0,0.0,usilxuppasemubllopkaafesmlibmsdf,7440,0,...,38.58,38.58,2,81.61,3,lxidpiddsbxsbosboudacockeimpuepw,13.856,False,434,0
3,00114d74e963e47177db89bc70108537,-0.003994,-1e-06,0.0,0.0,0.0,0.0,ewpakwlliwisiwduibdlfmalxowmwpci,11272,0,...,29.76,29.76,1,157.99,6,kamkkxfxxuwbdslkwifmmcsiusiuosws,13.2,True,359,0
4,0013f326a839a2f6ad87a1859952d227,-0.006171,0.0,-0.002351,0.0,0.003371,0.0,foosdfpfkusacimwkcsosbicdxkicaua,267414,0,...,30.0,30.0,1,341.58,3,lxidpiddsbxsbosboudacockeimpuepw,20.0,False,361,0


In [96]:
feature_df.to_csv('cleaned_and_engineered_data.csv',index=False)