# Imports

## Import de mes librairies

In [56]:
import pandas as pd
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt

# preprocessing
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.preprocessing import StandardScaler

# Algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

# Pipelines
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Evaluate
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

# Deploy 
import pickle


from sklearn.base import TransformerMixin, BaseEstimator

## Import de mes données

In [37]:
df = pd.read_csv("data/data.csv")

# EDA

## Data cleaning

In [38]:
df.head()

Unnamed: 0,country,year,uniqueid,bank_account,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,Kenya,2018,uniqueid_1,Yes,Rural,Yes,3,24,Female,Spouse,Married/Living together,Secondary education,Self employed
1,Kenya,2018,uniqueid_2,No,Rural,No,5,70,Female,Head of Household,Widowed,No formal education,Government Dependent
2,Kenya,2018,uniqueid_3,Yes,Urban,Yes,5,26,Male,Other relative,Single/Never Married,Vocational/Specialised training,Self employed
3,Kenya,2018,uniqueid_4,No,Rural,Yes,5,34,Female,Head of Household,Married/Living together,Primary education,Formally employed Private
4,Kenya,2018,uniqueid_5,No,Urban,No,8,26,Male,Child,Single/Never Married,Primary education,Informally employed


In [39]:
df.columns

Index(['country', 'year', 'uniqueid', 'bank_account', 'location_type',
       'cellphone_access', 'household_size', 'age_of_respondent',
       'gender_of_respondent', 'relationship_with_head', 'marital_status',
       'education_level', 'job_type'],
      dtype='object')

In [40]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23524 entries, 0 to 23523
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   country                 23524 non-null  object
 1   year                    23524 non-null  int64 
 2   uniqueid                23524 non-null  object
 3   bank_account            23524 non-null  object
 4   location_type           23524 non-null  object
 5   cellphone_access        23524 non-null  object
 6   household_size          23524 non-null  int64 
 7   age_of_respondent       23524 non-null  int64 
 8   gender_of_respondent    23524 non-null  object
 9   relationship_with_head  23524 non-null  object
 10  marital_status          23524 non-null  object
 11  education_level         23524 non-null  object
 12  job_type                23524 non-null  object
dtypes: int64(3), object(10)
memory usage: 2.3+ MB


principal info : 
- 23 524 observations
- 12 features and 1 target
- 3 numerical features (3 int) and 10 object type (9 categorical)

# Preprocessing

Fonctions

In [82]:
def one_hot_encode(data, cols_to_encode):
    # create and fit one hot encoder
    encoder = OneHotEncoder(dtype = int)
    features_1hot = encoder.fit_transform(data[cols_to_encode])

    # encode data and convert to dataframe
    features_1hot_df = pd.DataFrame(features_1hot.toarray())

    # set column names
    features_1hot_df.columns = encoder.get_feature_names_out(cols_to_encode)

    # drop the original columns
    df_enc = pd.concat([data.drop(cols_to_encode, axis = 1), 
                        features_1hot_df], axis = 1)
    return df_enc

In [91]:
def binary_encode(data, cols_to_encode):
    # make a copy of the original data
    df_enc = data.copy()

    # create a mapping for binary encoding
    binary_map = {"Yes": 1, "No": 0, "Female": 0, "Male": 1, "Rural": 0, "Urban": 1}

    # iterate through columns to encode and apply mapping
    for col in cols_to_encode:
        if col in binary_map:
            df_enc[col] = df_enc[col].map(binary_map)

    return df_enc

Split the data into train and test 

In [83]:
X = df.drop(["bank_account"], axis = 1)
y = df["bank_account"]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

Liste of my columns by type 

In [92]:
column_to_drop = []

num_columns = ["year", "household_size", "age_of_respondent"]
cat_columns = ["country","relationship_with_head", "marital_status", "education_level", "job_type"]
binary_columns = ["bank_account", "location_type", "cellphone_access", "gender_of_respondent"]

Numerical pipeline

In [105]:
scaler = StandardScaler()
binary_col = LabelEncoder()
cat_col = OneHotEncoder()

In [115]:
preprocessor = ColumnTransformer([
    ('binary_transformer', binary_col, ["bank_account", "location_type", "cellphone_access", "gender_of_respondent"]),
    ('cat_transformer', cat_col, ["country","relationship_with_head", "marital_status", "education_level", "job_type"])
#     ('num_transormer', scaler, ["household_size", "age_of_respondent"])
])

preprocessor

In [117]:
pd.DataFrame(preprocessor.fit_transform(df)).head(3)

TypeError: fit_transform() takes 2 positional arguments but 3 were given

# Modeling 

In [113]:
logistic_pipe = Pipeline([
    ('preprocessing', preprocessor),
    ('logistic_regression', LogisticRegression())
])

Train test Pipeline with holdout method

In [114]:
X = df.drop(["bank_account"], axis = 1)
y = df["bank_account"]

# split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Train Pipeline
logistic_pipe_trained = logistic_pipe.fit(X_train, y_train)

ValueError: A given column is not a column of the dataframe