In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

**IMPORTING DATABASE**

In [2]:
# Load the raw insurance table; read_csv already defaults to a comma separator.
data = pd.read_csv('insurance.csv')

In [3]:
# Count the missing (NaN) entries in every column of the loaded frame.
data.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

**NO NANS, GOOD!**

**LET'S MOVE THE 'CHARGES' COLUMN TO THE FRONT OF THE DATASET SINCE IT IS THE TARGET (Y) VALUE**

In [4]:
# Detach the target column from the frame (pop mutates `data` in place) ...
charge = data.pop('charges')
# ... and re-attach it at position 0 so the target is the first column.
data.insert(loc=0, column=charge.name, value=charge)
# Show the reordered frame.
data

Unnamed: 0,charges,age,sex,bmi,children,smoker,region
0,16884.92400,19,female,27.900,0,yes,southwest
1,1725.55230,18,male,33.770,1,no,southeast
2,4449.46200,28,male,33.000,3,no,southeast
3,21984.47061,33,male,22.705,0,no,northwest
4,3866.85520,32,male,28.880,0,no,northwest
...,...,...,...,...,...,...,...
1333,10600.54830,50,male,30.970,3,no,northwest
1334,2205.98080,18,female,31.920,0,no,northeast
1335,1629.83350,18,female,36.850,0,no,southeast
1336,2007.94500,21,female,25.800,0,no,southwest


In [5]:
# Print the column dtypes and non-null counts of the frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   charges   1338 non-null   float64
 1   age       1338 non-null   int64  
 2   sex       1338 non-null   object 
 3   bmi       1338 non-null   float64
 4   children  1338 non-null   int64  
 5   smoker    1338 non-null   object 
 6   region    1338 non-null   object 
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


**LET'S CONVERT ALL STRING VALUES INTO NUMERIC VALUES**

**1. SEX**

In [6]:
# Encode sex as 0 (female) / 1 (male) using an exact-match lookup.
# Chained substring replacement is fragile: it only works when 'female' is
# replaced before 'male' (the former contains the latter), and any unexpected
# label would slip through untouched and only fail later at the int cast.
sex = data['sex']
binary_sex = sex.str.lower().map({'female': 0, 'male': 1})

# Fail fast, before mutating `data`, if a label is not covered by the mapping
# (unmapped labels come back from .map as NaN).
if binary_sex.isna().any():
    unexpected = sex[binary_sex.isna()].unique().tolist()
    raise ValueError(f"Unexpected values in 'sex' column: {unexpected}")

# Assigning to the existing column keeps its position in the frame, so no
# hard-coded insert index is needed.
data['sex'] = binary_sex
data

Unnamed: 0,charges,age,sex,bmi,children,smoker,region
0,16884.92400,19,0,27.900,0,yes,southwest
1,1725.55230,18,1,33.770,1,no,southeast
2,4449.46200,28,1,33.000,3,no,southeast
3,21984.47061,33,1,22.705,0,no,northwest
4,3866.85520,32,1,28.880,0,no,northwest
...,...,...,...,...,...,...,...
1333,10600.54830,50,1,30.970,3,no,northwest
1334,2205.98080,18,0,31.920,0,no,northeast
1335,1629.83350,18,0,36.850,0,no,southeast
1336,2007.94500,21,0,25.800,0,no,southwest


**2. SMOKER**

In [7]:
# Encode smoker as 0 (no) / 1 (yes) using an exact-match lookup rather than
# substring replacement, which would also rewrite 'no'/'yes' occurring inside
# any other label and lets unexpected labels pass through unnoticed.
smoker = data['smoker']
binary_smoker = smoker.str.lower().map({'no': 0, 'yes': 1})

# Fail fast, before mutating `data`, if a label is not covered by the mapping
# (unmapped labels come back from .map as NaN).
if binary_smoker.isna().any():
    unexpected = smoker[binary_smoker.isna()].unique().tolist()
    raise ValueError(f"Unexpected values in 'smoker' column: {unexpected}")

# Assigning to the existing column keeps its position in the frame, so no
# hard-coded insert index is needed.
data['smoker'] = binary_smoker
data

Unnamed: 0,charges,age,sex,bmi,children,smoker,region
0,16884.92400,19,0,27.900,0,1,southwest
1,1725.55230,18,1,33.770,1,0,southeast
2,4449.46200,28,1,33.000,3,0,southeast
3,21984.47061,33,1,22.705,0,0,northwest
4,3866.85520,32,1,28.880,0,0,northwest
...,...,...,...,...,...,...,...
1333,10600.54830,50,1,30.970,3,0,northwest
1334,2205.98080,18,0,31.920,0,0,northeast
1335,1629.83350,18,0,36.850,0,0,southeast
1336,2007.94500,21,0,25.800,0,0,southwest


**3. REGION**

In [8]:
# List the distinct values of the region column as a plain Python list.
data['region'].unique().tolist()

['southwest', 'southeast', 'northwest', 'northeast']

In [9]:
def encoder(instance: str, lst: list):
    """Translate a label into its 1-based position within a list of labels.

    Parameters
    ----------
    instance : str
        The label to look up.
    lst : list
        The labels to search; the first matching entry determines the code.

    Returns
    -------
    int
        Position of ``instance`` in ``lst``, counted from 1.

    Raises
    ------
    ValueError
        If ``instance`` does not occur in ``lst``.
    """
    zero_based = lst.index(instance)
    one_based = zero_based + 1
    return one_based

In [10]:
# Encode each region as a 1-based integer code, numbered in order of first
# appearance in the column. pd.factorize does this in one vectorised pass
# instead of a Python-level list lookup for every row.
# NOTE: factorize marks missing values with -1 (0 after the shift); the
# isna() check on this frame reported no missing values.
region_codes = pd.factorize(data['region'])[0]
data['region'] = region_codes + 1

In [11]:
# Display the frame to inspect the region column after encoding.
data

Unnamed: 0,charges,age,sex,bmi,children,smoker,region
0,16884.92400,19,0,27.900,0,1,1
1,1725.55230,18,1,33.770,1,0,2
2,4449.46200,28,1,33.000,3,0,2
3,21984.47061,33,1,22.705,0,0,3
4,3866.85520,32,1,28.880,0,0,3
...,...,...,...,...,...,...,...
1333,10600.54830,50,1,30.970,3,0,3
1334,2205.98080,18,0,31.920,0,0,4
1335,1629.83350,18,0,36.850,0,0,2
1336,2007.94500,21,0,25.800,0,0,1


In [12]:
# Cast the three encoded categorical columns to integers in one go,
# then print the dtype summary to confirm the result.
encoded_dtypes = dict.fromkeys(['sex', 'smoker', 'region'], int)
data = data.astype(encoded_dtypes)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   charges   1338 non-null   float64
 1   age       1338 non-null   int64  
 2   sex       1338 non-null   int32  
 3   bmi       1338 non-null   float64
 4   children  1338 non-null   int64  
 5   smoker    1338 non-null   int32  
 6   region    1338 non-null   int32  
dtypes: float64(2), int32(3), int64(2)
memory usage: 57.6 KB


In [13]:
# Write the fully numeric frame to disk, leaving out the row index.
data.to_csv(path_or_buf='database.csv', index=False)