In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Data exploration and cleaning

The process of reviewing and cleaning data to derive insights and generate hypotheses.

## 1. Get started 

Gather some basic information about the dataset.

In [None]:
# Load the dataset into a DataFrame
df = pd.read_csv('file.csv')

# Which columns (metadata) do we have, and what do the records look like?
first_rows = df.head()
print(first_rows)

# Non-null counts per column reveal missing values; dtypes show the value types
df.info()
df.dtypes

### Explore numerical data

How is the data distributed? Some standard descriptive statistics for the whole dataset are given by `.describe()`.

In [None]:
# Summary statistics for the dataset in one table
df.describe()

In [None]:
# Histogram of one column, using bins that are 1 unit wide
sns.histplot(data=df, x='column', binwidth=1)

### Explore categorical data

How many data points do we have in each category?

In [None]:
# Count how often each distinct value occurs in the column
df['columnname'].value_counts()

## 2. Validate and update datatypes

Datatypes:
* str
* int
* float
* dict
* list
* bool

Identify the column types.

In [None]:
# Check data types: lists the dtype of every column in df
df.dtypes

Is there anything out of the ordinary? Identifying the column types gives a first glimpse of the data and of what needs cleaning.

### Validate numerical data

Identify the columns belonging to numerical data.

In [None]:
# Select columns that are numerical (returns a DataFrame with only those columns)
df.select_dtypes('number')

In [None]:
# Range: show the minimum and maximum of the column together.
# A notebook cell only displays its last expression, so separate
# .min() / .max() lines would silently drop the minimum.
df['column'].agg(['min', 'max'])

In [None]:
# Distribution: a boxplot shows the median, quartiles and potential outliers
sns.boxplot(data=df, x='column')

Is a column that should be numeric missing from the selection? Then you have to convert that column to a numeric type.

In [None]:
# Convert string to numbers (general)
# .str.replace must be called on a Series (a column), not on the pd.Series class.
# regex=False treats 'char2remove' as literal text rather than a regular expression.
df['column'].str.replace('char2remove', 'char2replacewith', regex=False)

In [None]:
# Remove every ',' by replacing it with an empty string (example),
# then write the cleaned strings back to the same column
df['column'] = df['column'].str.replace(',','')

Update data type

In [None]:
# Update dtype (general)
# NOTE: new_datatype is a placeholder that is not defined in this notebook;
# substitute the target type (see the example cell that uses int).
df['column'] = df['column'].astype(new_datatype)

In [None]:
# Update dtype (example): convert the column to integers and store the result
df['column'] = df['column'].astype(int)

### Validate categorical data

Identify the columns belonging to categorical data.

In [None]:
# Select columns that are categorical (stored with the 'object' dtype)
df.select_dtypes('object')

Using .isin() returns a Series of True and False values. This can be used for filtering.

In [None]:
# Validate categorical data
allowed_values = ["var1","var2"]
# True where the value is one of the allowed categories
is_allowed = df['column'].isin(allowed_values)
# Inverted mask: True where the value is NOT one of the allowed categories
~is_allowed

## 3. Explore subsets

Summarise the data per subgroup to see how values differ between groups.

### Numerical data

...

In [None]:
# grouped data: mean of 'column2' for each distinct value of 'column'
df.groupby('column')['column2'].mean()

In [None]:
# Add summary statistic as a column to the dataset (example)
# Group by 'column' and compute the standard deviation of 'column2' within each group.
# transform() broadcasts the group value back to every row, so the result aligns with df.
# (Taking the std of the grouping column itself would be meaningless: it is constant per group.)
df['new_column'] = df.groupby('column')['column2'].transform('std')

### Categorical data

Use groups of data to describe a subset of data using:
* .groupby()

with an aggregating function (only applies to numeric columns)
* .agg()
* .mean()
* .median()
* .min()
* .max()
* .sum()
* .count()
* .var()
* .std()

Aggregating functions can also be applied to ungrouped data. `.agg()` is often used when we want to apply more than one function.

In [None]:
# grouped data
# Mean of every numeric column per group
# (numeric_only=True skips text columns, which cannot be averaged)
df.groupby('column').mean(numeric_only=True)

# Named aggregation: each keyword is the name of an output column and its value
# is a (source column, aggregation function) tuple
df.groupby('column').agg(
    name_col=('column1', 'mean'),
    name_col2=('column1', 'std'),
    name_col3=('column2', 'median'),
)

In [None]:
# ungrouped data: apply several aggregation functions to the dataset at once
df.agg(['mean', 'std'])

# ungrouped data with different aggregation functions for different columns:
# the dict maps each column name to the list of functions to apply to it
df.agg({'column1': ['mean', 'var'], 'column2': ['median']})

In [None]:
# Visualization: bar height is the aggregated value of 'column2' for each category in 'column'
# (plotting a column against itself would carry no information)
sns.barplot(x='column', y='column2', data=df)

## 4. Cleaning and imputation

Missing data affects distributions, gives a less representative view of the data, and can result in drawing incorrect conclusions.

### Numerical data

Strategies for addressing missing data:
* Drop missing values (<5% of total)
* Impute the mean, median or mode — measures of central tendency (the choice depends on distribution and context)
* Impute by sub-group

In [None]:
# Check missing values for each column: isna() flags missing cells, sum() counts them per column
df.isna().sum()

In [None]:
# step 1: Drop rows with missing values, but only for columns where
# the number of missing values is at most 5% of all rows
threshold = 0.05 * len(df)
missing_per_column = df.isna().sum()
cols_to_drop = df.columns[missing_per_column <= threshold]
# Modifies df in place: rows with a missing value in any of these columns are removed
df.dropna(subset=cols_to_drop, inplace=True)

In [None]:
# step 2: Filter remaining columns with missing values (impute summary statistics)
cols_with_missing_values = df.columns[df.isna().sum() > 0]

# This example uses the mode (most frequent value) to fill missing values.
# The last column is skipped ([:-1]); presumably it is the one imputed per
# sub-group in step 3 -- adjust the slice to your data.
for col in cols_with_missing_values[:-1]:
    modes = df[col].mode()
    # An all-NaN column has no mode; leave it untouched instead of raising IndexError
    if modes.empty:
        continue
    # fillna returns a new Series, so assign it back to keep the result
    df[col] = df[col].fillna(modes.iloc[0])

In [None]:
# Intermediate step: can a whole column be removed because it does not help with the missing data?
# Inspect how its values are distributed before deciding.
df['columnname'].value_counts()

In [None]:
# Check remaining missing values for each column
df.isna().sum()

In [None]:
# Step 3: Impute missing values using the median of each sub-group
# Build a lookup {group value in 'columname1' -> median of 'columnname2' in that group}
df_dict = df.groupby('columname1')['columnname2'].median().to_dict()
# map() turns each row's group value into its group median; fillna() uses it only where the value is missing
df['columnname2'] = df['columnname2'].fillna(df['columname1'].map(df_dict))

# df['columnname2'].fillna(df['columname1'].map(df_dict)) --> fills NaN values using the dictionary:
# it looks up the key from [columname1] and uses the stored median as the value for [columnname2]

In [None]:
# Check missing values for each column again
df.isna().sum()

### Categorical data

Previewing only categorical data:

In [None]:
# Preview the first rows of the categorical columns only.
# The dtype name is "object" (singular); "objects" is not a valid dtype and raises a TypeError.
df.select_dtypes("object").head()

Examine the unique values of a column and how often each one occurs

In [None]:
# List the distinct values that occur in the column
df['columnname'].unique()

In [None]:
# Count how often each distinct value occurs in the column
df['columnname'].value_counts()

In [None]:
# Print the number of unique values, and their frequencies, for each object column
non_numeric = df.select_dtypes("object")
for col in non_numeric.columns:
    # nunique() already returns the count as an int; wrapping it in len() raises a TypeError
    print(f"Number of unique values in {col} column: ", non_numeric[col].nunique())
    # How many rows fall into each category
    print(f"Value counts in {col} column: ", non_numeric[col].value_counts())

A dataset can contain various names to describe a similar category. These names should be cleaned up into the desired category names.

Step 1: Make a list of new categories

In [None]:
# Target category labels, in the same order as the conditions they belong to
new_category = [
    "cat1",
    "cat2",
    "cat3",
]

step 2: Define phrases in strings

In [None]:
# Search patterns for each category; "|" means "or" in a regular expression.
# No spaces around "|": a space would become part of the alternative, so
# "a | b" would only match "a " (trailing space) or " b" (leading space).
description1 = "var_interest_1|var_interest_2"
description2 = "var_interest_1|var_interest_2|var_interest_3"
description3 = "var_interest_1"

Step 3: Create conditions
* Use the `.str.contains()` search method to look for specific strings
* Store the resulting conditions in a variable

In [None]:
# Search for a specific string: True for each row whose value contains the pattern
# (the method is .str.contains)
df['column'].str.contains('var_interest')

In [None]:
# Search for any of several strings: "|" means "or" in the regular expression.
# Do not put spaces around "|", otherwise the spaces become part of the pattern.
df['column'].str.contains('var_interest_1|var_interest_2')

In [None]:
# Store conditions in a list: one boolean mask per entry in new_category, in the same order.
# The list must be flat (not a list wrapping a single tuple) so that np.select
# can pair each condition with its category.
# na=False makes missing values count as "no match", keeping every mask strictly boolean.
conditions = [
    df['column'].str.contains(description1, na=False),
    df['column'].str.contains(description2, na=False),
    df['column'].str.contains(description3, na=False),
]

step 4: Create a new category column

In [None]:
# New column with condition applied: for each row np.select takes the category of the
# first condition that is True, and "Other" when none of the conditions match.
# Requires numpy to be imported as np.
df['new_columname'] = np.select(conditions, new_category, default="Other")

step 5: Verify the variety of values in a column

In [None]:
# Distribution: one bar per category, bar height = number of rows in that category
sns.countplot(data=df, x='column')