## Importing modules

In [1]:
import pandas as pd
import numpy as np

## Importing Data

In [2]:
file1 = pd.read_csv("Data/file1.csv")
file2 = pd.read_csv("Data/file2.csv")
file3 = pd.read_csv("Data/file3.csv")

# Before concatenaiting the data I will change the heading of file3 to be the same as in file1 and 2
file3.rename(columns={"State":"ST", "Gender": "GENDER"}, inplace=True)

ca_df = pd.concat([file1,file2,file3]).reset_index(drop=True)

In [3]:
ca_df

Unnamed: 0,Customer,ST,GENDER,Education,Customer Lifetime Value,Income,Monthly Premium Auto,Number of Open Complaints,Policy Type,Vehicle Class,Total Claim Amount
0,RB50392,Washington,,Master,,0.0,1000.0,1/0/00,Personal Auto,Four-Door Car,2.704934
1,QZ44356,Arizona,F,Bachelor,697953.59%,0.0,94.0,1/0/00,Personal Auto,Four-Door Car,1131.464935
2,AI49188,Nevada,F,Bachelor,1288743.17%,48767.0,108.0,1/0/00,Personal Auto,Two-Door Car,566.472247
3,WW63253,California,M,Bachelor,764586.18%,0.0,106.0,1/0/00,Corporate Auto,SUV,529.881344
4,GA49547,Washington,M,High School or Below,536307.65%,36357.0,68.0,1/0/00,Personal Auto,Four-Door Car,17.269323
...,...,...,...,...,...,...,...,...,...,...,...
12069,LA72316,California,M,Bachelor,23405.98798,71941.0,73.0,0,Personal Auto,Four-Door Car,198.234764
12070,PK87824,California,F,College,3096.511217,21604.0,79.0,0,Corporate Auto,Four-Door Car,379.200000
12071,TD14365,California,M,Bachelor,8163.890428,0.0,85.0,3,Corporate Auto,Four-Door Car,790.784983
12072,UP19263,California,M,College,7524.442436,21941.0,96.0,0,Personal Auto,Four-Door Car,691.200000


## Standardizing Headings

In [4]:
headings = ["customer", "state", "gender", "education", "customer_lifetime_value", "income", "monthly_premium_auto", "number_open_complaints", 
            "policy_type", "vehicle_class", "total_claim_amount"]

ca_df.columns = headings

## Deleting and rearranging columns

In [5]:
ca_df.drop(columns="customer", inplace=True)

## Correcting data types

In [6]:
#If the value is a string
## strip the symbol %, convert into float and round to 0 decimals
## Else check if the valu is Nan
### if yes return value
### if not round to 0 decimals
ca_df.customer_lifetime_value = ca_df.customer_lifetime_value.apply(lambda x: np.round(float(x.strip("%"))) if type(x)==str else np.round(x) if x==x else x)

In [7]:
# If the value is a string of the form n/n/n
## Split it into a list, convert it into array of int and apply sum()
## Else return the value
ca_df.number_open_complaints = ca_df.number_open_complaints.apply(lambda x: np.array(x.split("/"), dtype=int)[1] if type(x)==str else x)

In [8]:
#check if all the numeric values are type float64

ca_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12074 entries, 0 to 12073
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   state                    9137 non-null   object 
 1   gender                   9015 non-null   object 
 2   education                9137 non-null   object 
 3   customer_lifetime_value  9130 non-null   float64
 4   income                   9137 non-null   float64
 5   monthly_premium_auto     9137 non-null   float64
 6   number_open_complaints   9137 non-null   float64
 7   policy_type              9137 non-null   object 
 8   vehicle_class            9137 non-null   object 
 9   total_claim_amount       9137 non-null   float64
dtypes: float64(5), object(5)
memory usage: 943.4+ KB


## Filtering data and Correcting typos

In [9]:
# Cleaning of the column state. First we Define function clean_state() and then we map it to the column

def clean_state(x):

    return "California" if x=="Cali" else "Arizona" if x=="AZ" else "Washington" if x=="WA" else x

# map it to column ca_df.state
ca_df.state = list(map(clean_state, ca_df.state))


In [10]:
ca_df.state.value_counts()

California    3152
Oregon        2601
Arizona       1704
Nevada         882
Washington     798
Name: state, dtype: int64

In [11]:
# Cleaning of column gender, this time with map and a lambda function
ca_df.gender = list(map(lambda x: "F" if x in ["female", "Femal"] else "M" if x=="Male" else x, ca_df.gender))

In [12]:
ca_df.gender.value_counts()

F    4607
M    4408
Name: gender, dtype: int64

## Removing Duplicates

In [13]:
ca_df.drop_duplicates(inplace=True)
ca_df

Unnamed: 0,state,gender,education,customer_lifetime_value,income,monthly_premium_auto,number_open_complaints,policy_type,vehicle_class,total_claim_amount
0,Washington,,Master,,0.0,1000.0,0.0,Personal Auto,Four-Door Car,2.704934
1,Arizona,F,Bachelor,697954.0,0.0,94.0,0.0,Personal Auto,Four-Door Car,1131.464935
2,Nevada,F,Bachelor,1288743.0,48767.0,108.0,0.0,Personal Auto,Two-Door Car,566.472247
3,California,M,Bachelor,764586.0,0.0,106.0,0.0,Corporate Auto,SUV,529.881344
4,Washington,M,High School or Below,536308.0,36357.0,68.0,0.0,Personal Auto,Four-Door Car,17.269323
...,...,...,...,...,...,...,...,...,...,...
12069,California,M,Bachelor,23406.0,71941.0,73.0,0.0,Personal Auto,Four-Door Car,198.234764
12070,California,F,College,3097.0,21604.0,79.0,0.0,Corporate Auto,Four-Door Car,379.200000
12071,California,M,Bachelor,8164.0,0.0,85.0,3.0,Corporate Auto,Four-Door Car,790.784983
12072,California,M,College,7524.0,21941.0,96.0,0.0,Personal Auto,Four-Door Car,691.200000


## Replacing null values with mean

In [14]:
# Function to calculate the mean of a column and replace the Nan values with it

def replace_by_mean(df, column_list):

    remaining_nan = {}

    for column in column_list:

        mean = np.mean(df[column])
        df[column] = df[column].fillna(mean)

# Function to store in dictionary the number of nan values per column

def nan_counter(df):

    remaining_nan = {}

    for column in df.columns:

        remaining_nan[column] = df[column][df[column].isna() == True].size

    return remaining_nan


column_list = ["customer_lifetime_value", "income", "monthly_premium_auto", "number_open_complaints", "total_claim_amount"]

# nan values in numeric columns replaced by the mean in numeric columns
replace_by_mean(ca_df, column_list)

# convert float to int where appropiate

for column in column_list:

    ca_df[column] = list(map(lambda x: int(round(x)), ca_df[column]))


# For the rest. Nan value rows dropped in those column with only one occurrence. In the gender column Nan values replace by "U" (for Unknown)
ca_df.gender = list(map(lambda x: "U" if x != x else x, ca_df.gender))
ca_df.dropna(inplace=True)

print(nan_counter(ca_df))

{'state': 0, 'gender': 0, 'education': 0, 'customer_lifetime_value': 0, 'income': 0, 'monthly_premium_auto': 0, 'number_open_complaints': 0, 'policy_type': 0, 'vehicle_class': 0, 'total_claim_amount': 0}


### The income columns has more than 2,000 values=0. Does that make sense?

In [15]:
ca_df.income[ca_df.income == 0].size

2256

### It coluld make sense. It is young people that doesn´t have a job yet. The insurance is paid by the parents.

In [16]:
ca_df.education.value_counts()

Bachelor                2652
College                 2593
High School or Below    2549
Master                   727
Doctor                   330
Bachelors                 24
Name: education, dtype: int64

### 561 people with education = Doctor have income = 0. The total of people with Doctor education is 3630. Not sure if the zeros are intendes or if it is missing data

In [17]:
print(ca_df[(ca_df.income == 0) & (ca_df.education == "Doctor")].size)
print(ca_df[(ca_df.education == "Doctor")].size)

510
3300


## Bucketing the data

In [18]:
ca_df.insert(1, "region", list(map(lambda x: "west" if x=="California" else "east" if x=="Washington" else "north" if x=="Oregon" else "central", ca_df.state)))

## Standardizing the data

In [19]:
ca_df_standarized = pd.DataFrame(list(map(lambda x: list(map(lambda y: y.lower() if type(y)==str else y, x)), ca_df.values)))
ca_df_standarized.columns = ca_df.columns


In [20]:
ca_df_standarized

Unnamed: 0,state,region,gender,education,customer_lifetime_value,income,monthly_premium_auto,number_open_complaints,policy_type,vehicle_class,total_claim_amount
0,washington,east,u,master,185590,0,1000,0,personal auto,four-door car,3
1,arizona,central,f,bachelor,697954,0,94,0,personal auto,four-door car,1131
2,nevada,central,f,bachelor,1288743,48767,108,0,personal auto,two-door car,566
3,california,west,m,bachelor,764586,0,106,0,corporate auto,suv,530
4,washington,east,m,high school or below,536308,36357,68,0,personal auto,four-door car,17
...,...,...,...,...,...,...,...,...,...,...,...
8870,california,west,m,bachelor,23406,71941,73,0,personal auto,four-door car,198
8871,california,west,f,college,3097,21604,79,0,corporate auto,four-door car,379
8872,california,west,m,bachelor,8164,0,85,3,corporate auto,four-door car,791
8873,california,west,m,college,7524,21941,96,0,personal auto,four-door car,691
