# Data Wrangling

## Data Cleansing

#### Imports

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the raw water-potability measurements into a dataframe
df = pd.read_csv(filepath_or_buffer='water_potability.csv')

In [3]:
# Preview the first five rows to get a feel for the columns and values
df.head(n=5)

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.99097,2.963135,0
1,3.71608,129.422921,18630.057858,6.635246,,592.885359,15.180013,56.329076,4.500656,0
2,8.099124,224.236259,19909.541732,9.275884,,418.606213,16.868637,66.420093,3.055934,0
3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771,0
4,9.092223,181.101509,17978.986339,6.5466,310.135738,398.410813,11.558279,31.997993,4.075075,0


#### All columns are relevant to the scope of the project, so no columns need to be dropped
#### Search for duplicate rows in the data (none are found, so nothing needs to be removed)

In [36]:
# Tally rows that repeat an earlier row (True) versus rows that do not (False)
df.duplicated(keep='first').value_counts()

False    3276
dtype: int64

#### Check data types and convert attribute types if needed

In [38]:
# List the dtype pandas assigned to each column
df.dtypes

ph                 float64
Hardness           float64
Solids             float64
Chloramines        float64
Sulfate            float64
Conductivity       float64
Organic_carbon     float64
Trihalomethanes    float64
Turbidity          float64
Potability           int64
dtype: object

#### Check for missing data

In [41]:
# Does the dataframe contain at least one missing value anywhere?
df.isna().to_numpy().any()

True

In [42]:
# Flag each column that contains at least one missing value
df.isna().any(axis='index')

ph                  True
Hardness           False
Solids             False
Chloramines        False
Sulfate             True
Conductivity       False
Organic_carbon     False
Trihalomethanes     True
Turbidity          False
Potability         False
dtype: bool

In [43]:
# Count the missing values in each column
df.isna().sum(axis='index')

ph                 491
Hardness             0
Solids               0
Chloramines          0
Sulfate            781
Conductivity         0
Organic_carbon       0
Trihalomethanes    162
Turbidity            0
Potability           0
dtype: int64

In [46]:
# Count the rows that contain at least one missing value
df.isna().any(axis='columns').sum()

1265

#### 1265 of the 3276 rows (about 39%, more than 1/3 of the dataset) contain missing values, so dropping every row with missing data is not advisable.
#### Each row is an independent water source, so interpolating from neighboring rows would not be accurate.
#### TODO: Try cross-referencing other datasets

## Feature Engineering
#### Goal: Make explanatory variables (the physical and chemical characteristics) better suited to the outcome variable (water potability)

#### Check data types and separate numeric and non-numeric data

In [5]:
# Show each column's dtype to tell numeric attributes apart from non-numeric ones
df.dtypes

ph                 float64
Hardness           float64
Solids             float64
Chloramines        float64
Sulfate            float64
Conductivity       float64
Organic_carbon     float64
Trihalomethanes    float64
Turbidity          float64
Potability           int64
dtype: object

#### All attributes are numeric
#### Split numeric attributes into continuous and noncontinuous data by counting each attribute's unique values

In [11]:
df['ph'].agg(['nunique','count','size'])

nunique    2785
count      2785
size       3276
Name: ph, dtype: int64

In [12]:
df['Hardness'].agg(['nunique','count','size'])

nunique    3276
count      3276
size       3276
Name: Hardness, dtype: int64

In [13]:
df['Solids'].agg(['nunique','count','size'])

nunique    3276
count      3276
size       3276
Name: Solids, dtype: int64

In [14]:
df['Chloramines'].agg(['nunique','count','size'])

nunique    3276
count      3276
size       3276
Name: Chloramines, dtype: int64

In [15]:
df['Sulfate'].agg(['nunique','count','size'])

nunique    2495
count      2495
size       3276
Name: Sulfate, dtype: int64

In [16]:
df['Conductivity'].agg(['nunique','count','size'])

nunique    3276
count      3276
size       3276
Name: Conductivity, dtype: int64

In [17]:
df['Organic_carbon'].agg(['nunique','count','size'])

nunique    3276
count      3276
size       3276
Name: Organic_carbon, dtype: int64

In [18]:
df['Trihalomethanes'].agg(['nunique','count','size'])

nunique    3114
count      3114
size       3276
Name: Trihalomethanes, dtype: int64

In [19]:
df['Turbidity'].agg(['nunique','count','size'])

nunique    3276
count      3276
size       3276
Name: Turbidity, dtype: int64

In [20]:
df['Potability'].agg(['nunique','count','size'])

nunique       2
count      3276
size       3276
Name: Potability, dtype: int64

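#### Every attribute other than Potability has as many unique values as non-missing values, so those attributes are continuous. Potability has only 2 unique values, so it is binary.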

#### First, handle continuous variables

In [33]:
# Keep only the continuous explanatory variables by dropping the binary Potability column
cont_df = df.drop(labels='Potability', axis='columns')
cont_df.head(n=5)

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity
0,,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.99097,2.963135
1,3.71608,129.422921,18630.057858,6.635246,,592.885359,15.180013,56.329076,4.500656
2,8.099124,224.236259,19909.541732,9.275884,,418.606213,16.868637,66.420093,3.055934
3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771
4,9.092223,181.101509,17978.986339,6.5466,310.135738,398.410813,11.558279,31.997993,4.075075
