# Imports

In [131]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import missingno as msno
import seaborn as sns

from sklearn.model_selection import train_test_split 
from imblearn.over_sampling import RandomOverSampler


pd.set_option('display.float_format', lambda x: '%.2f' % x)
RSEED = 42

# Import Data

In [132]:
invoice_train = pd.read_csv('data/train/invoice_train.csv')
client_train = pd.read_csv('data/train/client_train.csv')


  invoice_train = pd.read_csv('data/train/invoice_train.csv')


In [133]:
client_train.head()

Unnamed: 0,disrict,client_id,client_catg,region,creation_date,target
0,60,train_Client_0,11,101,31/12/1994,0.0
1,69,train_Client_1,11,107,29/05/2002,0.0
2,62,train_Client_10,11,301,13/03/1986,0.0
3,69,train_Client_100,11,105,11/07/1996,0.0
4,62,train_Client_1000,11,303,14/10/2014,0.0


In [134]:
invoice_train.head()

Unnamed: 0,client_id,invoice_date,tarif_type,counter_number,counter_statue,counter_code,reading_remarque,counter_coefficient,consommation_level_1,consommation_level_2,consommation_level_3,consommation_level_4,old_index,new_index,months_number,counter_type
0,train_Client_0,2014-03-24,11,1335667,0,203,8,1,82,0,0,0,14302,14384,4,ELEC
1,train_Client_0,2013-03-29,11,1335667,0,203,6,1,1200,184,0,0,12294,13678,4,ELEC
2,train_Client_0,2015-03-23,11,1335667,0,203,8,1,123,0,0,0,14624,14747,4,ELEC
3,train_Client_0,2015-07-13,11,1335667,0,207,8,1,102,0,0,0,14747,14849,4,ELEC
4,train_Client_0,2016-11-17,11,1335667,0,207,9,1,572,0,0,0,15066,15638,12,ELEC


## EDA


### Drop Outliers

In [135]:
# Drop outliers high month
invoice_train= invoice_train[invoice_train['months_number']<=36]

### Calculate over Columns

In [136]:
# encode counter_type
d={"ELEC":0,"GAZ":1}
invoice_train['counter_type']=invoice_train['counter_type'].map(d)

## Merge tables

In [137]:
merged_df = client_train.merge(invoice_train, how= "left", on='client_id')
merged_df.head()

Unnamed: 0,disrict,client_id,client_catg,region,creation_date,target,invoice_date,tarif_type,counter_number,counter_statue,...,reading_remarque,counter_coefficient,consommation_level_1,consommation_level_2,consommation_level_3,consommation_level_4,old_index,new_index,months_number,counter_type
0,60,train_Client_0,11,101,31/12/1994,0.0,2014-03-24,11.0,1335667.0,0,...,8.0,1.0,82.0,0.0,0.0,0.0,14302.0,14384.0,4.0,0.0
1,60,train_Client_0,11,101,31/12/1994,0.0,2013-03-29,11.0,1335667.0,0,...,6.0,1.0,1200.0,184.0,0.0,0.0,12294.0,13678.0,4.0,0.0
2,60,train_Client_0,11,101,31/12/1994,0.0,2015-03-23,11.0,1335667.0,0,...,8.0,1.0,123.0,0.0,0.0,0.0,14624.0,14747.0,4.0,0.0
3,60,train_Client_0,11,101,31/12/1994,0.0,2015-07-13,11.0,1335667.0,0,...,8.0,1.0,102.0,0.0,0.0,0.0,14747.0,14849.0,4.0,0.0
4,60,train_Client_0,11,101,31/12/1994,0.0,2016-11-17,11.0,1335667.0,0,...,9.0,1.0,572.0,0.0,0.0,0.0,15066.0,15638.0,12.0,0.0


## Train-Test-Split

In [138]:
y= merged_df['target']
X = merged_df.drop('target', axis = 1)

In [139]:
# test train split only with client-data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=RSEED, stratify=y)

In [140]:
X_train.head()

Unnamed: 0,disrict,client_id,client_catg,region,creation_date,invoice_date,tarif_type,counter_number,counter_statue,counter_code,reading_remarque,counter_coefficient,consommation_level_1,consommation_level_2,consommation_level_3,consommation_level_4,old_index,new_index,months_number,counter_type
1090543,63,train_Client_129653,11,311,31/03/2008,2013-11-02,11.0,1203752.0,0,203.0,6.0,1.0,1200.0,10.0,0.0,0.0,17346.0,18556.0,4.0,0.0
923866,62,train_Client_125100,11,303,03/10/1991,2007-01-18,40.0,29069.0,0,5.0,6.0,1.0,470.0,0.0,0.0,0.0,13262.0,13732.0,4.0,1.0
3577751,62,train_Client_75652,11,307,25/09/2002,2009-08-06,11.0,646915.0,0,203.0,6.0,1.0,460.0,0.0,0.0,0.0,14329.0,14789.0,4.0,0.0
617377,69,train_Client_116688,11,103,05/12/2006,2007-01-17,11.0,1067929.0,1,207.0,6.0,1.0,2.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0
3117610,60,train_Client_63112,11,101,12/12/1978,2019-01-31,11.0,299341.0,0,410.0,9.0,1.0,2400.0,176.0,0.0,0.0,14223.0,16799.0,12.0,0.0


### Calculate Over Rows

In [141]:
# Transactions Counts
transaction_counts_df = X_train.groupby('client_id').size().reset_index(name='transactions_count'.format('1'))
transaction_counts_df
X_train = X_train.merge(transaction_counts_df, how='left', on='client_id' )
X_train.sort_values('client_id').head()

Unnamed: 0,disrict,client_id,client_catg,region,creation_date,invoice_date,tarif_type,counter_number,counter_statue,counter_code,...,counter_coefficient,consommation_level_1,consommation_level_2,consommation_level_3,consommation_level_4,old_index,new_index,months_number,counter_type,transactions_count
2338913,60,train_Client_0,11,101,31/12/1994,2013-07-22,11.0,1335667.0,0,203.0,...,1.0,147.0,0.0,0.0,0.0,13983.0,14130.0,4.0,0.0,24
202079,60,train_Client_0,11,101,31/12/1994,2009-07-27,11.0,1335667.0,0,203.0,...,1.0,312.0,0.0,0.0,0.0,5906.0,6218.0,4.0,0.0,24
1728348,60,train_Client_0,11,101,31/12/1994,2008-01-04,11.0,1335667.0,0,203.0,...,1.0,277.0,0.0,0.0,0.0,4969.0,5246.0,4.0,0.0,24
369318,60,train_Client_0,11,101,31/12/1994,2013-03-29,11.0,1335667.0,0,203.0,...,1.0,1200.0,184.0,0.0,0.0,12294.0,13678.0,4.0,0.0,24
535189,60,train_Client_0,11,101,31/12/1994,2012-11-29,11.0,1335667.0,0,203.0,...,1.0,886.0,0.0,0.0,0.0,11408.0,12294.0,4.0,0.0,24


In [None]:
# Cosummation per Time  

# invoice_train['invoice_date'] = pd.to_datetime(invoice_train['invoice_date'])
# # Calculate the Consumption Amount:
# invoice_train['consommation_amount'] = invoice_train['consommation_level_1'] + invoice_train['consommation_level_2'] + invoice_train['consommation_level_3'] + invoice_train['consommation_level_4']
# # Calculate the Time Period Between Invoices:
# invoice_train = invoice_train.sort_values(by=['client_id', 'counter_type', 'invoice_date'])
# invoice_train['invoice_period'] = invoice_train.groupby(['client_id', 'counter_type'])['invoice_date'].diff()



In [None]:
# most frequent reading_remarque (Maria)

In [None]:
# mean counter_coefficient (Maria)

In [None]:
# mean month_numbers by client (Jesus)
# 1. for each client: calculate mean month number
# 2. create a new column "month_number_mean"


Columns which we can't aggregate to one single client:
- invoice_date
- tarif
- counter_number
- counter_statue
- counter_code

In [None]:
# Check how many values, if we could aggregate specific features to one single client
# grouped = invoice_train.groupby('client_id')
# test = grouped['counter_type'].nunique()
# test.unique()
# counter_counts_df = invoice_train.groupby('client_id').size().reset_index(name='transactions_count'.format('1'))


In [None]:
# drop columns we dont aggregate

In [None]:
# delete duplicates

### Random Oversampler


In [None]:
# handling the imbalanced
ros = RandomOverSampler(random_state=RSEED)
X_train, y_train = ros.fit_resample(X_train, y_train)

In [None]:
sns.countplot(x=y_train)