In [1]:
import numpy as np
import pandas as pd

### Load the data

In [4]:
# Read the churn CSV into a DataFrame and print a per-column summary
# (non-null counts and dtypes) as a quick sanity check.
csv_path = "../data/autoinsurance_churn_clean.csv"
data = pd.read_csv(csv_path)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1041065 entries, 0 to 1041064
Data columns (total 16 columns):
 #   Column               Non-Null Count    Dtype  
---  ------               --------------    -----  
 0   curr_ann_amt         1041065 non-null  float64
 1   days_tenure          1041065 non-null  float64
 2   cust_orig_date       1041065 non-null  object 
 3   age_in_years         1041065 non-null  int64  
 4   date_of_birth        1041065 non-null  object 
 5   city                 1041065 non-null  object 
 6   county               1041065 non-null  object 
 7   income               1041065 non-null  float64
 8   has_children         1041065 non-null  float64
 9   length_of_residence  1041065 non-null  float64
 10  marital_status       1041065 non-null  object 
 11  home_market_value    1041065 non-null  object 
 12  home_owner           1041065 non-null  float64
 13  college_degree       1041065 non-null  float64
 14  good_credit          1041065 non-null  float64
 15

In [11]:
# Separate the target ("Churn") from the feature columns.
y = data["Churn"]
# Dropping the target leaves every other column, in its original order.
X = data.drop(columns="Churn")
X.shape

(1041065, 15)

### Encoding

In [8]:
from sklearn.preprocessing import OrdinalEncoder

# Ordinal-encode ONLY the string (object-dtype) columns.
# Feeding the whole frame to OrdinalEncoder would treat every distinct numeric
# value as its own category and replace it with a rank index, throwing away the
# real magnitudes of columns such as curr_ann_amt, income or days_tenure.
categorical_cols = X.select_dtypes(include="object").columns.tolist()

encoder = OrdinalEncoder()
X_encoded = X.copy()
if categorical_cols:  # guard: fit_transform rejects a zero-column input
    X_encoded[categorical_cols] = encoder.fit_transform(X[categorical_cols])

# Keep the same container type the previous cell produced (float ndarray) so
# downstream cells see an unchanged interface.
X_encoded = X_encoded.to_numpy(dtype="float64")



### Scaling

In [9]:
from sklearn.preprocessing import StandardScaler

# Standardise every encoded feature column.
# NOTE(review): the scaler is fitted on the full dataset, before the
# train/validation/test split performed further down, so its statistics
# include rows that later end up in the evaluation splits.
scaler = StandardScaler()
scaler.fit(X_encoded)
X_scaled = scaler.transform(X_encoded)

### PCA

In [10]:
from sklearn.decomposition import PCA

# A float n_components tells PCA to keep the smallest number of components
# whose cumulative explained variance exceeds that fraction.
VARIANCE_TO_KEEP = 0.95

pca = PCA(n_components=VARIANCE_TO_KEEP)
X_compressed = pca.fit_transform(X_scaled)
X_compressed.shape

(1041065, 12)

### SMOTE

In [14]:
from imblearn.over_sampling import SMOTE

# Balance the two Churn classes by synthesising minority-class samples.
# NOTE(review): this resamples the complete dataset, i.e. before any
# train/validation/test split has been made.
sm = SMOTE(random_state=42)
resampled = sm.fit_resample(X_compressed, y)
X_res, y_res = resampled

# y is a pandas Series, so the resampled target is one as well; show the
# per-class counts after resampling.
y_res.value_counts()

Churn
0    921086
1    921086
Name: count, dtype: int64

### Splitting

In [16]:
from sklearn.model_selection import train_test_split

# Both splits share the same settings: halve the input with a fixed seed.
# First call: 50 % train / 50 % hold-out. Second call: the hold-out is halved
# again into validation and test.
split_options = {"random_state": 42, "train_size": 0.5}

X_train, X_val_test, y_train, y_val_test = train_test_split(
    X_compressed, y, **split_options
)
X_val, X_test, y_val, y_test = train_test_split(
    X_val_test, y_val_test, **split_options
)

In [20]:
# Oversample ONLY the training split.
# Splitting an already-resampled dataset puts synthetic rows (interpolated from
# neighbours across the whole dataset) into validation and test, which leaks
# information and evaluates on an artificially balanced class distribution.
X_train_SMOTE, y_train_SMOTE = SMOTE(random_state=42).fit_resample(X_train, y_train)

# The evaluation splits must keep the real class distribution, so the *_SMOTE
# validation/test names simply alias the untouched splits. The names are kept
# so the saving cells below (and any external consumers of the files) still work.
X_val_test_SMOTE, y_val_test_SMOTE = X_val_test, y_val_test
X_val_SMOTE, y_val_SMOTE = X_val, y_val
X_test_SMOTE, y_test_SMOTE = X_test, y_test

### Save the data

In [19]:
# Inspect the container type of the training features before saving
# (presumably to confirm they can be written with np.save — TODO confirm intent).
type(X_train)

numpy.ndarray

In [21]:
# Each split is saved under "<stem>.npy"; the SMOTE variants use
# "<stem>_SMOTE.npy". The order of the stems must match the order of the
# arrays in the two lists below, because they are zipped together when saving.
split_stems = ["X_train", "y_train", "X_val", "y_val", "X_test", "y_test"]

regular_ndarrays = [X_train, y_train, X_val, y_val, X_test, y_test]
regular_names = [f"{stem}.npy" for stem in split_stems]

SMOTE_ndarrays = [
    X_train_SMOTE, y_train_SMOTE,
    X_val_SMOTE, y_val_SMOTE,
    X_test_SMOTE, y_test_SMOTE,
]
SMOTE_names = [f"{stem}_SMOTE.npy" for stem in split_stems]

In [23]:
# Persist each non-resampled split as a .npy file in ../data/.
for file_name, array in zip(regular_names, regular_ndarrays):
    np.save(f"../data/{file_name}", array)

In [24]:
# Persist each SMOTE-variant split as a .npy file in ../data/.
for file_name, array in zip(SMOTE_names, SMOTE_ndarrays):
    np.save(f"../data/{file_name}", array)