# Lab | Cleaning numerical data

For this lab, we will be using the dataset in the Customer Analysis Business Case. This dataset can be found in files_for_lab folder.

### 1. Import the necessary libraries.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from statsmodels.formula.api import ols
from scipy import stats
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

### 2. Load the we_fn_use_c_marketing_customer_value_analysis.csv into the variable customer_df.

In [2]:
customer_df = pd.read_csv('/Users/elissadejong/lab-cleaning-numerical-data/files_for_lab/we_fn_use_c_marketing_customer_value_analysis.csv')
customer_df.head()

Unnamed: 0,Customer,State,Customer Lifetime Value,Response,Coverage,Education,Effective To Date,EmploymentStatus,Gender,Income,...,Months Since Policy Inception,Number of Open Complaints,Number of Policies,Policy Type,Policy,Renew Offer Type,Sales Channel,Total Claim Amount,Vehicle Class,Vehicle Size
0,BU79786,Washington,2763.519279,No,Basic,Bachelor,2/24/11,Employed,F,56274,...,5,0,1,Corporate Auto,Corporate L3,Offer1,Agent,384.811147,Two-Door Car,Medsize
1,QZ44356,Arizona,6979.535903,No,Extended,Bachelor,1/31/11,Unemployed,F,0,...,42,0,8,Personal Auto,Personal L3,Offer3,Agent,1131.464935,Four-Door Car,Medsize
2,AI49188,Nevada,12887.43165,No,Premium,Bachelor,2/19/11,Employed,F,48767,...,38,0,2,Personal Auto,Personal L3,Offer1,Agent,566.472247,Two-Door Car,Medsize
3,WW63253,California,7645.861827,No,Basic,Bachelor,1/20/11,Unemployed,M,0,...,65,0,7,Corporate Auto,Corporate L2,Offer1,Call Center,529.881344,SUV,Medsize
4,HB64268,Washington,2813.692575,No,Basic,Bachelor,2/3/11,Employed,M,43836,...,44,0,1,Personal Auto,Personal L1,Offer1,Agent,138.130879,Four-Door Car,Medsize


### 3. First look at its main features (head, shape, info).

In [3]:
customer_df.shape

(9134, 24)

In [4]:
customer_df.info

<bound method DataFrame.info of      Customer       State  Customer Lifetime Value Response  Coverage  \
0     BU79786  Washington              2763.519279       No     Basic   
1     QZ44356     Arizona              6979.535903       No  Extended   
2     AI49188      Nevada             12887.431650       No   Premium   
3     WW63253  California              7645.861827       No     Basic   
4     HB64268  Washington              2813.692575       No     Basic   
...       ...         ...                      ...      ...       ...   
9129  LA72316  California             23405.987980       No     Basic   
9130  PK87824  California              3096.511217      Yes  Extended   
9131  TD14365  California              8163.890428       No  Extended   
9132  UP19263  California              7524.442436       No  Extended   
9133  Y167826  California              2611.836866       No  Extended   

     Education Effective To Date EmploymentStatus Gender  Income  ...  \
0     Bachelor    

In [5]:
customer_df.dtypes

Customer                          object
State                             object
Customer Lifetime Value          float64
Response                          object
Coverage                          object
Education                         object
Effective To Date                 object
EmploymentStatus                  object
Gender                            object
Income                             int64
Location Code                     object
Marital Status                    object
Monthly Premium Auto               int64
Months Since Last Claim            int64
Months Since Policy Inception      int64
Number of Open Complaints          int64
Number of Policies                 int64
Policy Type                       object
Policy                            object
Renew Offer Type                  object
Sales Channel                     object
Total Claim Amount               float64
Vehicle Class                     object
Vehicle Size                      object
dtype: object

In [6]:
customer_df.describe()

Unnamed: 0,Customer Lifetime Value,Income,Monthly Premium Auto,Months Since Last Claim,Months Since Policy Inception,Number of Open Complaints,Number of Policies,Total Claim Amount
count,9134.0,9134.0,9134.0,9134.0,9134.0,9134.0,9134.0,9134.0
mean,8004.940475,37657.380009,93.219291,15.097,48.064594,0.384388,2.96617,434.088794
std,6870.967608,30379.904734,34.407967,10.073257,27.905991,0.910384,2.390182,290.500092
min,1898.007675,0.0,61.0,0.0,0.0,0.0,1.0,0.099007
25%,3994.251794,0.0,68.0,6.0,24.0,0.0,1.0,272.258244
50%,5780.182197,33889.5,83.0,14.0,48.0,0.0,2.0,383.945434
75%,8962.167041,62320.0,109.0,23.0,71.0,0.0,4.0,547.514839
max,83325.38119,99981.0,298.0,35.0,99.0,5.0,9.0,2893.239678


In [7]:
customer_df = customer_df.set_index('Customer')

### 4. Rename the columns so they follow the PE8 (snake case).

In [8]:
cols = []

for i in range(len(customer_df.columns)):
    cols.append(customer_df.columns[i].lower().replace(' ','_'))

customer_df.columns = cols

### 5. Change effective to date column to datetime format.

In [9]:
customer_df['effective_to_date'] = pd.to_datetime(customer_df['effective_to_date'])

### 6. Check NaN values per column.

In [10]:
customer_df.isna().sum()/len(customer_df)*100

state                            0.0
customer_lifetime_value          0.0
response                         0.0
coverage                         0.0
education                        0.0
effective_to_date                0.0
employmentstatus                 0.0
gender                           0.0
income                           0.0
location_code                    0.0
marital_status                   0.0
monthly_premium_auto             0.0
months_since_last_claim          0.0
months_since_policy_inception    0.0
number_of_open_complaints        0.0
number_of_policies               0.0
policy_type                      0.0
policy                           0.0
renew_offer_type                 0.0
sales_channel                    0.0
total_claim_amount               0.0
vehicle_class                    0.0
vehicle_size                     0.0
dtype: float64

In [11]:
# or like in class
nulls = pd.DataFrame(customer_df.isna().sum()*100/len(customer_df), columns=['percentage'])
nulls.sort_values('percentage', ascending = False)

Unnamed: 0,percentage
state,0.0
months_since_last_claim,0.0
vehicle_class,0.0
total_claim_amount,0.0
sales_channel,0.0
renew_offer_type,0.0
policy,0.0
policy_type,0.0
number_of_policies,0.0
number_of_open_complaints,0.0


### 7. Define a function that differentiates between continuous and discrete variables. Hint: A number of unique values might be useful. Store continuous data into a continuous variable and do the same for discrete and categorical.

In [12]:
for col in customer_df:
    print(customer_df[col].value_counts().count(), '\n')

5 

8041 

2 

3 

5 

59 

5 

2 

5694 

3 

3 

202 

36 

100 

6 

9 

3 

9 

4 

4 

5106 

6 

3 



In [13]:
# def type_variables(customer_df):
continuous = pd.DataFrame()
discrete = pd.DataFrame()
categorical = pd.DataFrame()
cols = customer_df.columns
for col in cols:
    if customer_df[col].value_counts().count() <= 10:
        categorical = categorical.append(customer_df[col])
    elif customer_df[col].value_counts().count() < 10 < 1000:
        discrete = discrete.append(customer_df[col])
    elif customer_df[col].value_counts().count() >= 1000:
        continuous = continuous.append(customer_df[col])
        
# not sure if its a very good function, but either way;
# how would i be able to apply the function 'type_variables'? 
# now im just not using the function because i cannot figure out to what to apply it

In [14]:
continuous.head()

Unnamed: 0,AA10041,AA11235,AA16582,AA30683,AA34092,AA35519,AA56476,AA69265,AA71604,AA93585,...,ZZ49347,ZZ54454,ZZ62245,ZZ68210,ZZ69693,ZZ77357,ZZ83340,ZZ89380,ZZ91716,ZZ97035
customer_lifetime_value,7901.744238,2568.843396,24127.50402,6595.101921,28799.95415,8002.308333,5595.389905,3964.730745,11986.59212,7083.642205,...,2592.437797,4589.940596,6612.220871,30745.96199,2449.580048,4468.2209,5508.315536,3196.578455,3256.766388,20714.94043
income,0.0,11167.0,14072.0,0.0,33635.0,0.0,74454.0,60817.0,87560.0,97024.0,...,72421.0,32510.0,0.0,0.0,0.0,0.0,96021.0,62815.0,0.0,0.0
total_claim_amount,1258.327804,350.4,511.2,847.717402,1152.0,513.6,340.8,494.4,470.4,686.815162,...,312.0,571.2,669.399277,1004.755092,496.8,468.0,315.004602,47.061469,491.755368,2027.724442


In [15]:
# how do i swap columns and rows again?
# preferably, how do i do this inside my for loop so it doesnt get mixed at all?
# axis=1? or index=True? or both?

### 8. For the categorical data, check if there is some kind of text in a variable so we would need to clean it. Hint: Use the same method you used in step 7. Depending on the implementation, decide what to do with the variables you get.

In [16]:
# categorical.value_counts()

### 9. Get categorical features.

In [17]:
categorical_feat = pd.DataFrame(customer_df.select_dtypes(np.object))
categorical_feat.describe()

Unnamed: 0,state,response,coverage,education,employmentstatus,gender,location_code,marital_status,policy_type,policy,renew_offer_type,sales_channel,vehicle_class,vehicle_size
count,9134,9134,9134,9134,9134,9134,9134,9134,9134,9134,9134,9134,9134,9134
unique,5,2,3,5,5,2,3,3,3,9,4,4,6,3
top,California,No,Basic,Bachelor,Employed,F,Suburban,Married,Personal Auto,Personal L3,Offer1,Agent,Four-Door Car,Medsize
freq,3150,7826,5568,2748,5698,4658,5779,5298,6788,3426,3752,3477,4621,6424


### 10. What should we do with the customer_id column?

In [18]:
# The column customer_id should be set as an index (already done under question 3)