In [23]:
# - Aggregate data into one Data Frame using Pandas.
# - Standardizing header names
# - Deleting and rearranging columns – delete the column customer as it is only a unique identifier for each row of data
# - Working with data types – Check the data types of all the columns and fix the incorrect ones (for ex. customer lifetime value and number of complaints )
# - Filtering data and Correcting typos – Filter the data in state and gender column to standardize the texts in those columns
# - Removing duplicates
# - Replacing null values – Replace missing values with means of the column (for numerical columns)

In [24]:
import pandas as pd
import numpy as np

In [25]:
df1 = pd.read_csv('Data/file1.csv')
df2 = pd.read_csv('Data/file2.csv')
df3 = pd.read_csv('Data/file3.csv')


In [26]:
def column_name_capitalize (df):
    df.columns = df.columns.str.upper()
    return df

In [27]:
column_name_capitalize(df1)
column_name_capitalize(df2)
column_name_capitalize(df3)

Unnamed: 0,CUSTOMER,STATE,CUSTOMER LIFETIME VALUE,EDUCATION,GENDER,INCOME,MONTHLY PREMIUM AUTO,NUMBER OF OPEN COMPLAINTS,POLICY TYPE,TOTAL CLAIM AMOUNT,VEHICLE CLASS
0,SA25987,Washington,3479.137523,High School or Below,M,0,104,0,Personal Auto,499.200000,Two-Door Car
1,TB86706,Arizona,2502.637401,Master,M,0,66,0,Personal Auto,3.468912,Two-Door Car
2,ZL73902,Nevada,3265.156348,Bachelor,F,25820,82,0,Personal Auto,393.600000,Four-Door Car
3,KX23516,California,4455.843406,High School or Below,F,0,121,0,Personal Auto,699.615192,SUV
4,FN77294,California,7704.958480,High School or Below,M,30366,101,2,Personal Auto,484.800000,SUV
...,...,...,...,...,...,...,...,...,...,...,...
7065,LA72316,California,23405.987980,Bachelor,M,71941,73,0,Personal Auto,198.234764,Four-Door Car
7066,PK87824,California,3096.511217,College,F,21604,79,0,Corporate Auto,379.200000,Four-Door Car
7067,TD14365,California,8163.890428,Bachelor,M,0,85,3,Corporate Auto,790.784983,Four-Door Car
7068,UP19263,California,7524.442436,College,M,21941,96,0,Personal Auto,691.200000,Four-Door Car


In [28]:
df1

Unnamed: 0,CUSTOMER,ST,GENDER,EDUCATION,CUSTOMER LIFETIME VALUE,INCOME,MONTHLY PREMIUM AUTO,NUMBER OF OPEN COMPLAINTS,POLICY TYPE,VEHICLE CLASS,TOTAL CLAIM AMOUNT
0,RB50392,Washington,,Master,,0.0,1000.0,1/0/00,Personal Auto,Four-Door Car,2.704934
1,QZ44356,Arizona,F,Bachelor,697953.59%,0.0,94.0,1/0/00,Personal Auto,Four-Door Car,1131.464935
2,AI49188,Nevada,F,Bachelor,1288743.17%,48767.0,108.0,1/0/00,Personal Auto,Two-Door Car,566.472247
3,WW63253,California,M,Bachelor,764586.18%,0.0,106.0,1/0/00,Corporate Auto,SUV,529.881344
4,GA49547,Washington,M,High School or Below,536307.65%,36357.0,68.0,1/0/00,Personal Auto,Four-Door Car,17.269323
...,...,...,...,...,...,...,...,...,...,...,...
4003,,,,,,,,,,,
4004,,,,,,,,,,,
4005,,,,,,,,,,,
4006,,,,,,,,,,,


In [29]:
df2

Unnamed: 0,CUSTOMER,ST,GENDER,EDUCATION,CUSTOMER LIFETIME VALUE,INCOME,MONTHLY PREMIUM AUTO,NUMBER OF OPEN COMPLAINTS,TOTAL CLAIM AMOUNT,POLICY TYPE,VEHICLE CLASS
0,GS98873,Arizona,F,Bachelor,323912.47%,16061,88,1/0/00,633.600000,Personal Auto,Four-Door Car
1,CW49887,California,F,Master,462680.11%,79487,114,1/0/00,547.200000,Special Auto,SUV
2,MY31220,California,F,College,899704.02%,54230,112,1/0/00,537.600000,Personal Auto,Two-Door Car
3,UH35128,Oregon,F,College,2580706.30%,71210,214,1/1/00,1027.200000,Personal Auto,Luxury Car
4,WH52799,Arizona,F,College,380812.21%,94903,94,1/0/00,451.200000,Corporate Auto,Two-Door Car
...,...,...,...,...,...,...,...,...,...,...,...
991,HV85198,Arizona,M,Master,847141.75%,63513,70,1/0/00,185.667213,Personal Auto,Four-Door Car
992,BS91566,Arizona,F,College,543121.91%,58161,68,1/0/00,140.747286,Corporate Auto,Four-Door Car
993,IL40123,Nevada,F,College,568964.41%,83640,70,1/0/00,471.050488,Corporate Auto,Two-Door Car
994,MY32149,California,F,Master,368672.38%,0,96,1/0/00,28.460568,Personal Auto,Two-Door Car


In [30]:
df3

Unnamed: 0,CUSTOMER,STATE,CUSTOMER LIFETIME VALUE,EDUCATION,GENDER,INCOME,MONTHLY PREMIUM AUTO,NUMBER OF OPEN COMPLAINTS,POLICY TYPE,TOTAL CLAIM AMOUNT,VEHICLE CLASS
0,SA25987,Washington,3479.137523,High School or Below,M,0,104,0,Personal Auto,499.200000,Two-Door Car
1,TB86706,Arizona,2502.637401,Master,M,0,66,0,Personal Auto,3.468912,Two-Door Car
2,ZL73902,Nevada,3265.156348,Bachelor,F,25820,82,0,Personal Auto,393.600000,Four-Door Car
3,KX23516,California,4455.843406,High School or Below,F,0,121,0,Personal Auto,699.615192,SUV
4,FN77294,California,7704.958480,High School or Below,M,30366,101,2,Personal Auto,484.800000,SUV
...,...,...,...,...,...,...,...,...,...,...,...
7065,LA72316,California,23405.987980,Bachelor,M,71941,73,0,Personal Auto,198.234764,Four-Door Car
7066,PK87824,California,3096.511217,College,F,21604,79,0,Corporate Auto,379.200000,Four-Door Car
7067,TD14365,California,8163.890428,Bachelor,M,0,85,3,Corporate Auto,790.784983,Four-Door Car
7068,UP19263,California,7524.442436,College,M,21941,96,0,Personal Auto,691.200000,Four-Door Car


In [31]:
def rearrange_columns(df, oldn, newn):
    return df.rename(columns = {oldn:newn}, inplace=True)

In [32]:
rearrange_columns(df1,'ST', 'STATE')

In [33]:
rearrange_columns(df2,'ST', 'STATE')

In [34]:
rearrange_columns(df3,'ST', 'STATE')

In [35]:
def rearrange_col(df, list_of_new_col_names):
    df = df[list_of_new_col_names]
    return df

In [36]:
new_column_names = list(df1.columns)

In [37]:
df2 = rearrange_col(df2, new_column_names)


In [38]:
df3 = rearrange_col(df3, new_column_names)


In [39]:
df2

Unnamed: 0,CUSTOMER,STATE,GENDER,EDUCATION,CUSTOMER LIFETIME VALUE,INCOME,MONTHLY PREMIUM AUTO,NUMBER OF OPEN COMPLAINTS,POLICY TYPE,VEHICLE CLASS,TOTAL CLAIM AMOUNT
0,GS98873,Arizona,F,Bachelor,323912.47%,16061,88,1/0/00,Personal Auto,Four-Door Car,633.600000
1,CW49887,California,F,Master,462680.11%,79487,114,1/0/00,Special Auto,SUV,547.200000
2,MY31220,California,F,College,899704.02%,54230,112,1/0/00,Personal Auto,Two-Door Car,537.600000
3,UH35128,Oregon,F,College,2580706.30%,71210,214,1/1/00,Personal Auto,Luxury Car,1027.200000
4,WH52799,Arizona,F,College,380812.21%,94903,94,1/0/00,Corporate Auto,Two-Door Car,451.200000
...,...,...,...,...,...,...,...,...,...,...,...
991,HV85198,Arizona,M,Master,847141.75%,63513,70,1/0/00,Personal Auto,Four-Door Car,185.667213
992,BS91566,Arizona,F,College,543121.91%,58161,68,1/0/00,Corporate Auto,Four-Door Car,140.747286
993,IL40123,Nevada,F,College,568964.41%,83640,70,1/0/00,Corporate Auto,Two-Door Car,471.050488
994,MY32149,California,F,Master,368672.38%,0,96,1/0/00,Personal Auto,Two-Door Car,28.460568


In [40]:
df2.head()

Unnamed: 0,CUSTOMER,STATE,GENDER,EDUCATION,CUSTOMER LIFETIME VALUE,INCOME,MONTHLY PREMIUM AUTO,NUMBER OF OPEN COMPLAINTS,POLICY TYPE,VEHICLE CLASS,TOTAL CLAIM AMOUNT
0,GS98873,Arizona,F,Bachelor,323912.47%,16061,88,1/0/00,Personal Auto,Four-Door Car,633.6
1,CW49887,California,F,Master,462680.11%,79487,114,1/0/00,Special Auto,SUV,547.2
2,MY31220,California,F,College,899704.02%,54230,112,1/0/00,Personal Auto,Two-Door Car,537.6
3,UH35128,Oregon,F,College,2580706.30%,71210,214,1/1/00,Personal Auto,Luxury Car,1027.2
4,WH52799,Arizona,F,College,380812.21%,94903,94,1/0/00,Corporate Auto,Two-Door Car,451.2


In [41]:
Mu = pd.concat([df1, df2, df3])

In [42]:
Mu

Unnamed: 0,CUSTOMER,STATE,GENDER,EDUCATION,CUSTOMER LIFETIME VALUE,INCOME,MONTHLY PREMIUM AUTO,NUMBER OF OPEN COMPLAINTS,POLICY TYPE,VEHICLE CLASS,TOTAL CLAIM AMOUNT
0,RB50392,Washington,,Master,,0.0,1000.0,1/0/00,Personal Auto,Four-Door Car,2.704934
1,QZ44356,Arizona,F,Bachelor,697953.59%,0.0,94.0,1/0/00,Personal Auto,Four-Door Car,1131.464935
2,AI49188,Nevada,F,Bachelor,1288743.17%,48767.0,108.0,1/0/00,Personal Auto,Two-Door Car,566.472247
3,WW63253,California,M,Bachelor,764586.18%,0.0,106.0,1/0/00,Corporate Auto,SUV,529.881344
4,GA49547,Washington,M,High School or Below,536307.65%,36357.0,68.0,1/0/00,Personal Auto,Four-Door Car,17.269323
...,...,...,...,...,...,...,...,...,...,...,...
7065,LA72316,California,M,Bachelor,23406,71941.0,73.0,0,Personal Auto,Four-Door Car,198.234764
7066,PK87824,California,F,College,3096.51,21604.0,79.0,0,Corporate Auto,Four-Door Car,379.200000
7067,TD14365,California,M,Bachelor,8163.89,0.0,85.0,3,Corporate Auto,Four-Door Car,790.784983
7068,UP19263,California,M,College,7524.44,21941.0,96.0,0,Personal Auto,Four-Door Car,691.200000


In [43]:
def drop_col(df, column_name):
    Mu.drop(columns=[column_name], inplace=True)
    return df

In [44]:
drop_col(Mu, 'CUSTOMER')

Unnamed: 0,STATE,GENDER,EDUCATION,CUSTOMER LIFETIME VALUE,INCOME,MONTHLY PREMIUM AUTO,NUMBER OF OPEN COMPLAINTS,POLICY TYPE,VEHICLE CLASS,TOTAL CLAIM AMOUNT
0,Washington,,Master,,0.0,1000.0,1/0/00,Personal Auto,Four-Door Car,2.704934
1,Arizona,F,Bachelor,697953.59%,0.0,94.0,1/0/00,Personal Auto,Four-Door Car,1131.464935
2,Nevada,F,Bachelor,1288743.17%,48767.0,108.0,1/0/00,Personal Auto,Two-Door Car,566.472247
3,California,M,Bachelor,764586.18%,0.0,106.0,1/0/00,Corporate Auto,SUV,529.881344
4,Washington,M,High School or Below,536307.65%,36357.0,68.0,1/0/00,Personal Auto,Four-Door Car,17.269323
...,...,...,...,...,...,...,...,...,...,...
7065,California,M,Bachelor,23406,71941.0,73.0,0,Personal Auto,Four-Door Car,198.234764
7066,California,F,College,3096.51,21604.0,79.0,0,Corporate Auto,Four-Door Car,379.200000
7067,California,M,Bachelor,8163.89,0.0,85.0,3,Corporate Auto,Four-Door Car,790.784983
7068,California,M,College,7524.44,21941.0,96.0,0,Personal Auto,Four-Door Car,691.200000


In [45]:
Mu['CUSTOMER LIFETIME VALUE'] = Mu['CUSTOMER LIFETIME VALUE'].replace({'%':''}, regex = True)

In [46]:
Mu['CUSTOMER LIFETIME VALUE'] = pd.to_numeric(Mu['CUSTOMER LIFETIME VALUE'], errors = 'coerce')
Mu['CUSTOMER LIFETIME VALUE'] = Mu



In [47]:
Mu['CUSTOMER LIFETIME VALUE'].isna().sum()

2937

In [48]:
Mu['NUMBER OF OPEN COMPLAINTS'].str.contains('1/1/00').sum()

247

In [49]:
Mu.pivot_table(columns=['CUSTOMER LIFETIME VALUE'], aggfunc='size')

CUSTOMER LIFETIME VALUE
AZ              74
Arizona       1630
Cali           120
California    3032
Nevada         882
Oregon        2601
WA              30
Washington     768
dtype: int64

In [50]:
Mu['GENDER'].str.upper()
Mu['GENDER'].str.contains('NaN').sum()

0

In [51]:
Mu['GENDER'].unique()

array([nan, 'F', 'M', 'Femal', 'Male', 'female'], dtype=object)

In [52]:
# def replace_value(df_column,old_phrase, new_phrase ):
#     df_column.replace(old_phrase, new_phrase, regex = True)
#     return df_column

In [53]:
Mu['NUMBER OF OPEN COMPLAINTS'] = Mu['NUMBER OF OPEN COMPLAINTS'].replace('1/0/00', 0 , regex=True)
    

In [54]:
Mu['NUMBER OF OPEN COMPLAINTS'].unique()

array([0, '1/2/00', '1/1/00', '1/3/00', '1/5/00', '1/4/00', nan, 2, 3, 1,
       5, 4], dtype=object)

In [55]:
Mu['NUMBER OF OPEN COMPLAINTS'] = Mu['NUMBER OF OPEN COMPLAINTS'].replace('1/2/00', 2 , regex=True)

In [56]:
Mu['NUMBER OF OPEN COMPLAINTS'] = Mu['NUMBER OF OPEN COMPLAINTS'].replace('1/3/00', 3 , regex=True)

In [57]:
Mu['NUMBER OF OPEN COMPLAINTS'] = Mu['NUMBER OF OPEN COMPLAINTS'].replace('1/5/00', 5 , regex=True)

In [58]:
Mu['NUMBER OF OPEN COMPLAINTS'] = Mu['NUMBER OF OPEN COMPLAINTS'].replace('1/4/00', 4 , regex=True)

In [59]:
Mu['NUMBER OF OPEN COMPLAINTS'] = Mu['NUMBER OF OPEN COMPLAINTS'].replace('1/1/00', 1 , regex=True)

In [60]:
# Mu['GENDER'] = Mu['GENDER'].str.upper()

In [61]:
Mu['GENDER'].unique()

array([nan, 'F', 'M', 'Femal', 'Male', 'female'], dtype=object)

In [62]:
# Mu['GENDER']=Mu['GENDER'].fillna(value="N/A")

In [63]:
# def change_str_gender(df_column):
#     for i in Mu['GENDER']:
#         if i.startswith("N"):
#             continue
#         elif i.startswith('F'):
#             i='F'
#         else:
#             i='M'
#         return: df_column
            

In [64]:
# def change_str_gender(df_column):
#     for i in df_column:
#         if i.startswith("N"):
#             return 'N/A'
#         elif i.startswith('F'):
#             return 'F'
#         else:
#             return 'M'


In [65]:
Mu['GENDER'] = Mu['GENDER'].replace({'Femal':'F', 'female': 'F', 'Male': 'M'})

In [66]:
Mu['GENDER'].value_counts()

F    4607
M    4408
Name: GENDER, dtype: int64

In [67]:
# def change_str_country(df_column):
#     for i in df_column:
#             if i.startswith('A'):
#                 return 'Arizona'
#             elif i.startswith('W'):
#                 return 'Washington'
#             elif i.startswith('C'):
#                 return 'California'
#             elif i == "N":
#                 return 'Nevada'
#             elif i.startswith('O'):
#                 return 'Oregon'
#             else:
#                 return i
           

In [68]:
Mu['GENDER'].value_counts()

F    4607
M    4408
Name: GENDER, dtype: int64

In [69]:
#Mu['STATE'] =list(map(change_str_country, Mu['STATE'] ))

In [70]:
Mu.head()

Unnamed: 0,STATE,GENDER,EDUCATION,CUSTOMER LIFETIME VALUE,INCOME,MONTHLY PREMIUM AUTO,NUMBER OF OPEN COMPLAINTS,POLICY TYPE,VEHICLE CLASS,TOTAL CLAIM AMOUNT
0,Washington,,Master,Washington,0.0,1000.0,0.0,Personal Auto,Four-Door Car,2.704934
1,Arizona,F,Bachelor,Arizona,0.0,94.0,0.0,Personal Auto,Four-Door Car,1131.464935
2,Nevada,F,Bachelor,Nevada,48767.0,108.0,0.0,Personal Auto,Two-Door Car,566.472247
3,California,M,Bachelor,California,0.0,106.0,0.0,Corporate Auto,SUV,529.881344
4,Washington,M,High School or Below,Washington,36357.0,68.0,0.0,Personal Auto,Four-Door Car,17.269323


In [71]:
Mu['STATE'].value_counts()

California    3032
Oregon        2601
Arizona       1630
Nevada         882
Washington     768
Cali           120
AZ              74
WA              30
Name: STATE, dtype: int64

In [72]:
Mu['STATE'] = Mu['STATE'].replace({'Cali':'California', 'AZ': 'Arizona', 'WA': 'Washington'})

In [73]:
Mu['STATE'].value_counts()

California    3152
Oregon        2601
Arizona       1704
Nevada         882
Washington     798
Name: STATE, dtype: int64

In [74]:
Mu['STATE'].value_counts(dropna=False)

California    3152
NaN           2937
Oregon        2601
Arizona       1704
Nevada         882
Washington     798
Name: STATE, dtype: int64

In [75]:
Mu['EDUCATION'] = Mu['EDUCATION'].replace({'Bachelor':'Bachelors'})

In [76]:
Mu['TOTAL CLAIM AMOUNT'] = Mu['TOTAL CLAIM AMOUNT'].round(2)

In [77]:
Mu

Unnamed: 0,STATE,GENDER,EDUCATION,CUSTOMER LIFETIME VALUE,INCOME,MONTHLY PREMIUM AUTO,NUMBER OF OPEN COMPLAINTS,POLICY TYPE,VEHICLE CLASS,TOTAL CLAIM AMOUNT
0,Washington,,Master,Washington,0.0,1000.0,0.0,Personal Auto,Four-Door Car,2.70
1,Arizona,F,Bachelors,Arizona,0.0,94.0,0.0,Personal Auto,Four-Door Car,1131.46
2,Nevada,F,Bachelors,Nevada,48767.0,108.0,0.0,Personal Auto,Two-Door Car,566.47
3,California,M,Bachelors,California,0.0,106.0,0.0,Corporate Auto,SUV,529.88
4,Washington,M,High School or Below,Washington,36357.0,68.0,0.0,Personal Auto,Four-Door Car,17.27
...,...,...,...,...,...,...,...,...,...,...
7065,California,M,Bachelors,California,71941.0,73.0,0.0,Personal Auto,Four-Door Car,198.23
7066,California,F,College,California,21604.0,79.0,0.0,Corporate Auto,Four-Door Car,379.20
7067,California,M,Bachelors,California,0.0,85.0,3.0,Corporate Auto,Four-Door Car,790.78
7068,California,M,College,California,21941.0,96.0,0.0,Personal Auto,Four-Door Car,691.20


In [78]:
Mu.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12074 entries, 0 to 7069
Data columns (total 10 columns):
STATE                        9137 non-null object
GENDER                       9015 non-null object
EDUCATION                    9137 non-null object
CUSTOMER LIFETIME VALUE      9137 non-null object
INCOME                       9137 non-null float64
MONTHLY PREMIUM AUTO         9137 non-null float64
NUMBER OF OPEN COMPLAINTS    9137 non-null float64
POLICY TYPE                  9137 non-null object
VEHICLE CLASS                9137 non-null object
TOTAL CLAIM AMOUNT           9137 non-null float64
dtypes: float64(4), object(6)
memory usage: 1.0+ MB


In [84]:
Mu = Mu.drop_duplicates()

In [83]:
Mu['GENDER'].isnull().sum()

3059

Unnamed: 0,STATE,GENDER,EDUCATION,CUSTOMER LIFETIME VALUE,INCOME,MONTHLY PREMIUM AUTO,NUMBER OF OPEN COMPLAINTS,POLICY TYPE,VEHICLE CLASS,TOTAL CLAIM AMOUNT
0,Washington,,Master,Washington,0.0,1000.0,0.0,Personal Auto,Four-Door Car,2.70
1,Arizona,F,Bachelors,Arizona,0.0,94.0,0.0,Personal Auto,Four-Door Car,1131.46
2,Nevada,F,Bachelors,Nevada,48767.0,108.0,0.0,Personal Auto,Two-Door Car,566.47
3,California,M,Bachelors,California,0.0,106.0,0.0,Corporate Auto,SUV,529.88
4,Washington,M,High School or Below,Washington,36357.0,68.0,0.0,Personal Auto,Four-Door Car,17.27
...,...,...,...,...,...,...,...,...,...,...
7064,California,F,College,California,47761.0,104.0,0.0,Personal Auto,Four-Door Car,541.28
7065,California,M,Bachelors,California,71941.0,73.0,0.0,Personal Auto,Four-Door Car,198.23
7067,California,M,Bachelors,California,0.0,85.0,3.0,Corporate Auto,Four-Door Car,790.78
7068,California,M,College,California,21941.0,96.0,0.0,Personal Auto,Four-Door Car,691.20


In [87]:
Mu['GENDER'].value_counts(dropna=False)

F      4381
M      4163
NaN     123
Name: GENDER, dtype: int64

In [89]:
Mu['INCOME'].fillna(value=pd.mean(Mu['INCOME']), inplace=True)

NameError: name 'mean_value' is not defined

In [90]:
Mu['INCOME'].fillna(np.mean(Mu['INCOME']), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


In [94]:
Mu['MONTHLY PREMIUM AUTO'].fillna(np.mean(Mu['MONTHLY PREMIUM AUTO']), inplace=True)

In [None]:
Mu['MONTHLY PREMIUM AUTO']=round(Mu['MONTHLY PREMIUM AUTO']. inplace=True))

In [92]:
Mu['TOTAL CLAIM AMOUNT'].fillna(np.mean(Mu['TOTAL CLAIM AMOUNT']), inplace=True)

In [93]:
Mu['TOTAL CLAIM AMOUNT'].value_counts()

316.80    106
292.80    102
312.00    101
331.20     98
321.60     94
         ... 
99.09       1
357.64      1
810.07      1
454.36      1
933.33      1
Name: TOTAL CLAIM AMOUNT, Length: 4913, dtype: int64

In [95]:
Mu=Mu.applymap(lambda s: s.lower() if type(s)==str else s)

In [96]:
Mu


Unnamed: 0,STATE,GENDER,EDUCATION,CUSTOMER LIFETIME VALUE,INCOME,MONTHLY PREMIUM AUTO,NUMBER OF OPEN COMPLAINTS,POLICY TYPE,VEHICLE CLASS,TOTAL CLAIM AMOUNT
0,washington,,master,washington,0.0,1000.0,0.0,personal auto,four-door car,2.70
1,arizona,f,bachelors,arizona,0.0,94.0,0.0,personal auto,four-door car,1131.46
2,nevada,f,bachelors,nevada,48767.0,108.0,0.0,personal auto,two-door car,566.47
3,california,m,bachelors,california,0.0,106.0,0.0,corporate auto,suv,529.88
4,washington,m,high school or below,washington,36357.0,68.0,0.0,personal auto,four-door car,17.27
...,...,...,...,...,...,...,...,...,...,...
7064,california,f,college,california,47761.0,104.0,0.0,personal auto,four-door car,541.28
7065,california,m,bachelors,california,71941.0,73.0,0.0,personal auto,four-door car,198.23
7067,california,m,bachelors,california,0.0,85.0,3.0,corporate auto,four-door car,790.78
7068,california,m,college,california,21941.0,96.0,0.0,personal auto,four-door car,691.20


In [122]:
region_dict = {'california': 'west region', 'oregon' : 'north west', 'washington': 'east', 'arizona' : 'central', 'nevada': 'central'}
Mu['REGION'] = Mu['STATE'].map(region_dict)
Mu

Unnamed: 0,REGION,REGION.1,STATE,GENDER,EDUCATION,CUSTOMER LIFETIME VALUE,INCOME,MONTHLY PREMIUM AUTO,NUMBER OF OPEN COMPLAINTS,POLICY TYPE,VEHICLE CLASS,TOTAL CLAIM AMOUNT
0,east,east,washington,,master,washington,0.0,1000.0,0.0,personal auto,four-door car,2.70
1,central,central,arizona,f,bachelors,arizona,0.0,94.0,0.0,personal auto,four-door car,1131.46
2,central,central,nevada,f,bachelors,nevada,48767.0,108.0,0.0,personal auto,two-door car,566.47
3,west region,west region,california,m,bachelors,california,0.0,106.0,0.0,corporate auto,suv,529.88
4,east,east,washington,m,high school or below,washington,36357.0,68.0,0.0,personal auto,four-door car,17.27
...,...,...,...,...,...,...,...,...,...,...,...,...
7064,west region,west region,california,f,college,california,47761.0,104.0,0.0,personal auto,four-door car,541.28
7065,west region,west region,california,m,bachelors,california,71941.0,73.0,0.0,personal auto,four-door car,198.23
7067,west region,west region,california,m,bachelors,california,0.0,85.0,3.0,corporate auto,four-door car,790.78
7068,west region,west region,california,m,college,california,21941.0,96.0,0.0,personal auto,four-door car,691.20


In [123]:
Mu = Mu[['REGION', 'STATE', 'GENDER', 'EDUCATION', 'CUSTOMER LIFETIME VALUE', 'INCOME', 'MONTHLY PREMIUM AUTO','NUMBER OF OPEN COMPLAINTS', 'POLICY TYPE', 'VEHICLE CLASS', 'TOTAL CLAIM AMOUNT']]

In [124]:
Mu.columns

Index(['REGION', 'REGION', 'STATE', 'GENDER', 'EDUCATION',
       'CUSTOMER LIFETIME VALUE', 'INCOME', 'MONTHLY PREMIUM AUTO',
       'NUMBER OF OPEN COMPLAINTS', 'POLICY TYPE', 'VEHICLE CLASS',
       'TOTAL CLAIM AMOUNT'],
      dtype='object')

In [125]:
Mu = Mu[['REGION', 'STATE', 'GENDER', 'EDUCATION',
       'CUSTOMER LIFETIME VALUE', 'INCOME', 'MONTHLY PREMIUM AUTO',
       'NUMBER OF OPEN COMPLAINTS', 'POLICY TYPE', 'VEHICLE CLASS',
       'TOTAL CLAIM AMOUNT']]

In [127]:
Mu



Unnamed: 0,REGION,REGION.1,STATE,GENDER,EDUCATION,CUSTOMER LIFETIME VALUE,INCOME,MONTHLY PREMIUM AUTO,NUMBER OF OPEN COMPLAINTS,POLICY TYPE,VEHICLE CLASS,TOTAL CLAIM AMOUNT
0,east,east,washington,,master,washington,0.0,1000.0,0.0,personal auto,four-door car,2.70
1,central,central,arizona,f,bachelors,arizona,0.0,94.0,0.0,personal auto,four-door car,1131.46
2,central,central,nevada,f,bachelors,nevada,48767.0,108.0,0.0,personal auto,two-door car,566.47
3,west region,west region,california,m,bachelors,california,0.0,106.0,0.0,corporate auto,suv,529.88
4,east,east,washington,m,high school or below,washington,36357.0,68.0,0.0,personal auto,four-door car,17.27
...,...,...,...,...,...,...,...,...,...,...,...,...
7064,west region,west region,california,f,college,california,47761.0,104.0,0.0,personal auto,four-door car,541.28
7065,west region,west region,california,m,bachelors,california,71941.0,73.0,0.0,personal auto,four-door car,198.23
7067,west region,west region,california,m,bachelors,california,0.0,85.0,3.0,corporate auto,four-door car,790.78
7068,west region,west region,california,m,college,california,21941.0,96.0,0.0,personal auto,four-door car,691.20
