In [1]:
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
from pandas.api.types import is_categorical_dtype

In [2]:
# Load the train and test splits from the shared data directory.
train = pd.read_csv(filepath_or_buffer="../data/train.csv")
test = pd.read_csv(filepath_or_buffer="../data/test.csv")

In [3]:
# Display the training frame exactly as loaded from ../data/train.csv.
train

Unnamed: 0,state,account_length,area_code,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,...,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,customer_service_calls,churn,state_code,region
0,KS,128,415,No,Yes,25,265.1,110,45.07,197.4,...,244.7,91,11.01,10.0,3,2.70,1,False,KS,Midwest
1,OH,107,415,No,Yes,26,161.6,123,27.47,195.5,...,254.4,103,11.45,13.7,3,3.70,1,False,OH,Midwest
2,NJ,137,415,No,No,0,243.4,114,41.38,121.2,...,162.6,104,7.32,12.2,5,3.29,0,False,NJ,Northeast
3,OH,84,408,Yes,No,0,299.4,71,50.90,61.9,...,196.9,89,8.86,6.6,7,1.78,2,False,OH,Midwest
4,OK,75,415,Yes,No,0,166.7,113,28.34,148.3,...,186.9,121,8.41,10.1,3,2.73,3,False,OK,South
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2661,SC,79,415,No,No,0,134.7,98,22.90,189.7,...,221.4,128,9.96,11.8,5,3.19,2,False,SC,South
2662,AZ,192,415,No,Yes,36,156.2,77,26.55,215.5,...,279.1,83,12.56,9.9,6,2.67,2,False,AZ,West
2663,WV,68,415,No,No,0,231.1,57,39.29,153.4,...,191.3,123,8.61,9.6,4,2.59,3,False,WV,South
2664,RI,28,510,No,No,0,180.8,109,30.74,288.8,...,191.9,91,8.64,14.1,6,3.81,2,False,RI,Northeast


In [4]:
# Remove the two state identifier columns from both splits; every other
# column is left in place.
train = train.drop(columns=["state", "state_code"])
test = test.drop(columns=["state", "state_code"])

In [5]:
# Mark the discrete columns of the training frame as pandas categoricals so
# the dtype-based column split can tell them apart from numeric features.
# Only `train` is cast here; `test` keeps the dtypes it was loaded with.
train = train.astype(
    {
        "international_plan": "category",
        "voice_mail_plan": "category",
        "area_code": "category",
        "region": "category",
    }
)

In [6]:
# Partition the training columns by dtype:
#   cat_col -> columns whose dtype is a pandas categorical
#   num_col -> every other column with a numeric dtype (bool counts as numeric)
# Columns that are neither categorical nor numeric are left out of both lists.
#
# `isinstance(dtype, pd.CategoricalDtype)` replaces the deprecated
# `pandas.api.types.is_categorical_dtype`, which emits a deprecation warning
# on pandas 2.x.
cat_col = []
num_col = []
for col in train.columns:
    if isinstance(train[col].dtype, pd.CategoricalDtype):
        cat_col.append(col)
    elif is_numeric_dtype(train[col]):
        num_col.append(col)

In [7]:
# Inspect the columns detected as numeric, together with their count.
num_col, len(num_col)

(['account_length',
  'number_vmail_messages',
  'total_day_minutes',
  'total_day_calls',
  'total_day_charge',
  'total_eve_minutes',
  'total_eve_calls',
  'total_eve_charge',
  'total_night_minutes',
  'total_night_calls',
  'total_night_charge',
  'total_intl_minutes',
  'total_intl_calls',
  'total_intl_charge',
  'customer_service_calls',
  'churn'],
 16)

In [8]:
# "churn" is picked up as numeric by the dtype check, but it must not be in
# the list of columns handed to the scaler. Filtering (instead of
# `list.remove`) keeps this cell safe to re-run: a second execution is a
# no-op rather than a ValueError.
num_col = [name for name in num_col if name != "churn"]

In [9]:
# Inspect the columns detected as categorical.
cat_col

['area_code', 'international_plan', 'voice_mail_plan', 'region']

In [10]:
# List the distinct area codes present in the training data.
train["area_code"].unique()

[415, 408, 510]
Categories (3, int64): [408, 415, 510]

In [11]:
# One-hot encoder and min-max scaler with library defaults.
onehot = OneHotEncoder()
minmax = MinMaxScaler()

# Ordinal encoder for two columns; each column gets the explicit category
# order ["No", "Yes"], so "No" encodes to 0 and "Yes" to 1.
ordi = OrdinalEncoder(categories=[["No", "Yes"]] * 2)

In [12]:
# Route each group of columns to its encoder/scaler. Columns that are not
# listed in any group are passed through rather than dropped.
ct = ColumnTransformer(
    transformers=[
        ("ord", ordi, ["international_plan", "voice_mail_plan"]),
        ("onehot", onehot, ["area_code", "region"]),
        ("minmax", minmax, num_col),
    ],
    remainder="passthrough",
)

In [13]:
# Fit the column transformer on the training frame only; the fitted `ct` is
# then used to transform both train and test.
ct.fit(train)

In [14]:
# Apply the fitted transformer to the training frame.
train_use = ct.transform(train)

# Output names look like "<transformer>__<feature>". Strip only the leading
# transformer prefix: splitting once (maxsplit=1) keeps any "__" that is part
# of the feature name itself, where a plain split("__")[1] would truncate it.
new_features = [
    name.split("__", 1)[1] for name in ct.get_feature_names_out()
]

# Rebuild a labelled DataFrame from the transformed array and display it.
train_df = pd.DataFrame(train_use, columns=new_features)
train_df

Unnamed: 0,international_plan,voice_mail_plan,area_code_408,area_code_415,area_code_510,region_Midwest,region_Northeast,region_South,region_West,account_length,...,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,customer_service_calls,churn
0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.524793,...,0.582353,0.542866,0.572161,0.436090,0.572152,0.500,0.15,0.500000,0.111111,0.0
1,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.438017,...,0.605882,0.537690,0.599772,0.526316,0.600000,0.685,0.15,0.685185,0.111111,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.561983,...,0.647059,0.333225,0.338457,0.533835,0.338608,0.610,0.25,0.609259,0.000000,0.0
3,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.342975,...,0.517647,0.170171,0.436095,0.421053,0.436076,0.330,0.35,0.329630,0.222222,0.0
4,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.305785,...,0.717647,0.407959,0.407629,0.661654,0.407595,0.505,0.15,0.505556,0.333333,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2661,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.322314,...,0.400000,0.521514,0.505835,0.714286,0.505696,0.590,0.25,0.590741,0.222222,0.0
2662,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.789256,...,0.741176,0.592688,0.670083,0.375940,0.670253,0.495,0.30,0.494444,0.222222,0.0
2663,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.276860,...,0.323529,0.421870,0.420154,0.676692,0.420253,0.480,0.20,0.479630,0.333333,0.0
2664,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.111570,...,0.341176,0.794241,0.421862,0.436090,0.422152,0.705,0.30,0.705556,0.222222,0.0


In [15]:
# Transform the test frame with the transformer that was fitted on train,
# and label the result with the feature names derived for the training frame
# (`new_features`), so both frames share the same column layout.
test_use = ct.transform(test)
test_df = pd.DataFrame(data=test_use, columns=new_features)

In [16]:
# Display the transformed test frame.
test_df

Unnamed: 0,international_plan,voice_mail_plan,area_code_408,area_code_415,area_code_510,region_Midwest,region_Northeast,region_South,region_West,account_length,...,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,customer_service_calls,churn
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.479339,...,0.470588,0.967001,0.489895,0.428571,0.489873,0.435,0.20,0.435185,0.111111,0.0
1,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.264463,...,0.488235,0.628276,0.469969,0.586466,0.470253,0.635,0.30,0.635185,0.444444,1.0
2,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.661157,...,0.570588,0.873827,0.332764,0.714286,0.332911,0.270,0.45,0.270370,0.444444,1.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.454545,...,0.600000,0.377548,0.415315,0.541353,0.415190,0.385,0.30,0.385185,0.222222,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.198347,...,0.641176,0.591394,0.384287,0.428571,0.384177,0.555,0.05,0.555556,0.111111,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
662,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.466942,...,0.735294,0.428017,0.580416,0.458647,0.580380,0.575,0.35,0.575926,0.222222,0.0
663,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.433884,...,0.770588,0.560660,0.528893,0.300752,0.529114,0.405,0.15,0.405556,0.111111,0.0
664,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.243802,...,0.647059,0.233905,0.473669,0.759398,0.473418,0.660,0.40,0.659259,0.333333,0.0
665,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.652893,...,0.617647,0.543513,0.426985,0.368421,0.427215,0.580,0.20,0.579630,0.111111,0.0


In [17]:
# Persist the engineered feature frames next to the raw data, without
# writing the row index as a column.
train_df.to_csv(path_or_buf="../data/train_fe.csv", index=False)
test_df.to_csv(path_or_buf="../data/test_fe.csv", index=False)