In [61]:
import pandas as pd
import seaborn as sns
import csv
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
path="dataset.csv"

## DATA CLEANING

To detect automatically the delimiter or separator, it is specified that separator is equal to None to make python find it. See the explanation of sep parameter on:
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html

In [9]:
dataset=pd.read_csv(path, sep=None, engine="python")

In [10]:
dataset.head()

Unnamed: 0,rev_Mean,mou_Mean,totmrc_Mean,da_Mean,ovrmou_Mean,ovrrev_Mean,vceovr_Mean,datovr_Mean,roam_Mean,change_mou,...,forgntvl,ethnic,kid0_2,kid3_5,kid6_10,kid11_15,kid16_17,creditcd,eqpdays,Customer_ID
0,239975,21925,225,2475,0,0,0,0,0,-15725,...,0.0,N,U,U,U,U,U,Y,361.0,1000001
1,574925,48275,37425,2475,2275,91,91,0,0,53225,...,0.0,Z,U,U,U,U,U,Y,240.0,1000002
2,1699,1025,1699,0,0,0,0,0,0,-425,...,0.0,N,U,Y,U,U,U,Y,1504.0,1000003
3,38,75,38,0,0,0,0,0,0,-15,...,0.0,U,Y,U,U,U,U,Y,1812.0,1000004
4,5523,5705,7198,0,0,0,0,0,0,385,...,0.0,I,U,U,U,U,U,Y,434.0,1000005


Now, It is necessary to analyse if there is any NaN value. For see the number of NaN values for each attribute is necessary to extend the maximum number of rows that pandas let to show. It is used "with" to do it only for this code block.

In [11]:
print("Dataset size:", dataset.shape)

#Modifying the maximum number of rows that pandas let to show
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
   print(dataset.isnull().sum())

Dataset size: (100000, 100)
rev_Mean              357
mou_Mean              357
totmrc_Mean           357
da_Mean               357
ovrmou_Mean           357
ovrrev_Mean           357
vceovr_Mean           357
datovr_Mean           357
roam_Mean             357
change_mou            891
change_rev            891
drop_vce_Mean           0
drop_dat_Mean           0
blck_vce_Mean           0
blck_dat_Mean           0
unan_vce_Mean           0
unan_dat_Mean           0
plcd_vce_Mean           0
plcd_dat_Mean           0
recv_vce_Mean           0
recv_sms_Mean           0
comp_vce_Mean           0
comp_dat_Mean           0
custcare_Mean           0
ccrndmou_Mean           0
cc_mou_Mean             0
inonemin_Mean           0
threeway_Mean           0
mou_cvce_Mean           0
mou_cdat_Mean           0
mou_rvce_Mean           0
owylis_vce_Mean         0
mouowylisv_Mean         0
iwylis_vce_Mean         0
mouiwylisv_Mean         0
peak_vce_Mean           0
peak_dat_Mean           0
mou_peav_M

### DROPS

Attributes dropped:
- ethnic: is not ethical to keep this attribute
- numbcars: almost half of the dataset doesn't have this attribute
- Customer_ID: it has not variance because is unique for each customer
- ownrent, lor, dwlltype, HHstatin, dwllsize: They are NaN on 30% of the dataset

In [12]:
datasetClean=dataset.drop(columns=["ethnic", "numbcars", "Customer_ID", "ownrent", "lor", "dwlltype", "HHstatin", "dwllsize"])
print("Dataset size:", datasetClean.shape)

Dataset size: (100000, 92)


2 test cases:
- Dropping all the samples with NaN values.
- Data insertion to substitute NaN values. Categorical attributes will use mode value and numerical median value.

### DROPPING NAN'S


In [13]:
#dropping all the samples with NaN
dfDropNan=datasetClean.dropna()
dfDropNan.shape

(60385, 92)

### INSERTION OF NUMBERS

In order to code categorical attributes, object attributes need to be transform.

In [14]:
datasetClean.select_dtypes(include='object').columns

Index(['rev_Mean', 'mou_Mean', 'totmrc_Mean', 'da_Mean', 'ovrmou_Mean',
       'ovrrev_Mean', 'vceovr_Mean', 'datovr_Mean', 'roam_Mean', 'change_mou',
       'change_rev', 'drop_vce_Mean', 'drop_dat_Mean', 'blck_vce_Mean',
       'blck_dat_Mean', 'unan_vce_Mean', 'unan_dat_Mean', 'plcd_vce_Mean',
       'plcd_dat_Mean', 'recv_vce_Mean', 'recv_sms_Mean', 'comp_vce_Mean',
       'comp_dat_Mean', 'custcare_Mean', 'ccrndmou_Mean', 'cc_mou_Mean',
       'inonemin_Mean', 'threeway_Mean', 'mou_cvce_Mean', 'mou_cdat_Mean',
       'mou_rvce_Mean', 'owylis_vce_Mean', 'mouowylisv_Mean',
       'iwylis_vce_Mean', 'mouiwylisv_Mean', 'peak_vce_Mean', 'peak_dat_Mean',
       'mou_peav_Mean', 'mou_pead_Mean', 'opk_vce_Mean', 'opk_dat_Mean',
       'mou_opkv_Mean', 'mou_opkd_Mean', 'drop_blk_Mean', 'attempt_Mean',
       'complete_Mean', 'callfwdv_Mean', 'callwait_Mean', 'new_cell',
       'crclscod', 'asl_flag', 'totmou', 'totrev', 'adjrev', 'adjmou',
       'avgrev', 'avgmou', 'avgqty', 'prizm_social

Object attributes which are numbers with "," are pass to floats and object with only numbers and more than 2 different numbers are convert to integer.

In [15]:
objectAttributesList = datasetClean.select_dtypes(include='object').columns
#print(objectAttributesList)
for column in objectAttributesList:
   # Looking through all the column to see if there are one string with a number

   if datasetClean[column].str.contains('\d,').any():
      # Casting column from string with "," to float with "."
      datasetClean[column] = datasetClean[column].str.replace(',', '.').astype(float)

   elif datasetClean[column].str.contains('\d+').all() and len(datasetClean[column].unique()) > 2:
      #If the values in the column are numbers and there are not binary, they are not categorical
      datasetClean[column] = datasetClean[column].astype(int)

Now, only remains attributes which are categorical, binaries and labels

In [16]:
datasetClean.select_dtypes(include='object').columns

Index(['new_cell', 'crclscod', 'asl_flag', 'prizm_social_one', 'area',
       'dualband', 'refurb_new', 'hnd_webcap', 'marital', 'infobase', 'kid0_2',
       'kid3_5', 'kid6_10', 'kid11_15', 'kid16_17', 'creditcd'],
      dtype='object')

In [57]:
dfNanSubstituted=datasetClean.copy()

#filling NaN in object categorical attributes
columnCategorical=dfNanSubstituted.select_dtypes(include='object').columns
for column in columnCategorical:
   #Filling with the most common categorical value, the mode
   dfNanSubstituted[column].fillna(dfNanSubstituted[column].mode()[0], inplace=True)

#fillin NaN in not numerical object attributes
notNumericalColumn=dfNanSubstituted.select_dtypes(exclude='object').columns
for column in notNumericalColumn:
   #Filling with the most common categorical value, the mode
   dfNanSubstituted[column].fillna(dfNanSubstituted[column].median(), inplace=True)

#Modifying the maximum number of rows that pandas let to show
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
   print(dfNanSubstituted.isnull().sum())

rev_Mean            0
mou_Mean            0
totmrc_Mean         0
da_Mean             0
ovrmou_Mean         0
ovrrev_Mean         0
vceovr_Mean         0
datovr_Mean         0
roam_Mean           0
change_mou          0
change_rev          0
drop_vce_Mean       0
drop_dat_Mean       0
blck_vce_Mean       0
blck_dat_Mean       0
unan_vce_Mean       0
unan_dat_Mean       0
plcd_vce_Mean       0
plcd_dat_Mean       0
recv_vce_Mean       0
recv_sms_Mean       0
comp_vce_Mean       0
comp_dat_Mean       0
custcare_Mean       0
ccrndmou_Mean       0
cc_mou_Mean         0
inonemin_Mean       0
threeway_Mean       0
mou_cvce_Mean       0
mou_cdat_Mean       0
mou_rvce_Mean       0
owylis_vce_Mean     0
mouowylisv_Mean     0
iwylis_vce_Mean     0
mouiwylisv_Mean     0
peak_vce_Mean       0
peak_dat_Mean       0
mou_peav_Mean       0
mou_pead_Mean       0
opk_vce_Mean        0
opk_dat_Mean        0
mou_opkv_Mean       0
mou_opkd_Mean       0
drop_blk_Mean       0
attempt_Mean        0
complete_M

Attributes object that remains are categorical. It is printed the name of the attribute and the number of unique values in each attribute. Also, categorical attributes are encoded.

In [58]:
objectAttributesList = dfNanSubstituted.select_dtypes(include='object').columns

for column in objectAttributesList:

   print(column, len(dfNanSubstituted[column].unique()))

   #categorical
   if len(dfNanSubstituted[column].unique()) > 2:
      oneshot_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
      cadena=dfNanSubstituted[column].to_numpy().reshape(-1, 1)
      dataNewColumns=oneshot_encoder.fit_transform(cadena)

      #creation of a list with new names for new coded column
      #The for loop is iterating inside the categories created by the encoder
      categories=oneshot_encoder.categories_[0]
      newColumnNameList=[f'{column}_{cat}' for cat in categories]

      newColumns=pd.DataFrame(dataNewColumns, columns=newColumnNameList)

      #adding new columns and dropping categorical column that has been encoded
      dfNanSubstituted=dfNanSubstituted.join(newColumns)
      dfNanSubstituted.drop(columns=column, inplace=True)


   else:
      #binary categorical encode
      encoder = LabelEncoder()
      dfNanSubstituted[column]=encoder.fit_transform(dfNanSubstituted[column])


new_cell U 3
crclscod A 54
asl_flag N 2
prizm_social_one S 5
area NORTHWEST/ROCKY MOUNTAIN AREA 19
dualband Y 4
refurb_new N 2
hnd_webcap WCMB 3
marital S 5
infobase M 2
kid0_2 U 2
kid3_5 U 2
kid6_10 U 2
kid11_15 U 2
kid16_17 U 2
creditcd Y 2


## ATTRIBUTES SELECTION

### PCA

It is necessary to standardize the dataset to use PCA.

In [60]:
#getting the objective attribute from the datasets of the 2 test cases
nanSubs_X=dfNanSubstituted.drop(columns='churn')
nanSubs_Y=dfNanSubstituted['churn']

nanDrop_X=dfDropNan.drop(columns='churn')
nanDrop_Y=dfDropNan['churn']


In [62]:
scaler = StandardScaler()

nanSubs_X_std=scaler.fit_transform(nanSubs_X)

nanDrop_X_std=scaler.fit_transform(nanDrop_X)

pca = PCA(n_components=2)

nanSubs_X_pca=pca.fit_transform(nanSubs_X)

#Printing results of PCA
print(pca.explained_variance_ratio_)

nanDrop_X_pca=pca.fit_transform(nanDrop_X)

#Printing results of PCA
print(pca.explained_variance_ratio_)


ValueError: could not convert string to float: '23,9975'