In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Data exploration and cleaning

The process of reviewing and cleaning data to derive insights and generate hypotheses.

## 1. Get started 

Gather some basic information about the dataset.

In [None]:
# Load the raw dataset from disk
df = pd.read_csv("file.csv")

# What metadata do we have? What does the dataset look like?
first_rows = df.head()
print(first_rows)

# Are there missing values? Which types of values are included?
df.info()
df.dtypes

### Explore numerical data

How is the data distributed? Some standard descriptive statistics for the whole dataset are given by `.describe()`.

In [None]:
# Summary statistics for the whole DataFrame
summary_stats = df.describe()
summary_stats

In [None]:
# Histogram of a single column, using bins that are one unit wide
sns.histplot(data=df, x='column', binwidth=1)

### Explore categorical data

How many data points do we have in each category?

In [None]:
# Number of rows per distinct value, most frequent first
category_counts = df['columnname'].value_counts()
category_counts

## 2. Validate datatypes

Datatypes:
* str
* int
* float
* dict
* list
* bool

### Update type

In [None]:
# Cast the column to integers and store the result back in the DataFrame
as_integers = df['column'].astype(int)
df['column'] = as_integers

# Confirm the dtype of every column
df.dtypes

### Validate numerical data

In [None]:
# Which columns are numeric? Preview them instead of dumping the whole frame.
# (Only the last expression of a cell is rendered, so earlier results must be
# displayed or printed explicitly.)
display(df.select_dtypes('number').head())

# Range of the column
print('min:', df['column'].min())
print('max:', df['column'].max())

# Distribution
sns.boxplot(x = 'column', data = df)

### Validate categorical data

Using `.isin()` returns a Series of True and False values. This can be used for filtering.

In [None]:
# Validate categorical data: flag rows whose value is in the accepted set
accepted_values = ["var1", "var2"]
is_accepted = df['column'].isin(accepted_values)

# Inverted mask: True for rows whose value is NOT in the accepted set
~is_accepted

## 3. Explore subsets

Summarise subsets of the data so that groups can be compared with each other.

### Numerical data

...

### Categorical data

Use groups of data to describe a subset of data using:
* .groupby()

with an aggregating function
* .agg()
* .mean()
* .median()
* .min()
* .max()
* .sum()
* .count()
* .var()
* .std()

Or use an aggregating function for ungrouped data. 
* .agg(['func1', 'func2'])

This one is often used if we want to apply more than one function. The agg function only applies to numeric columns.

Visualization:
* barplot()

In [None]:
# grouped data: mean of every numeric column per group
# (numeric_only=True skips text columns, which would otherwise raise in pandas >= 2.0)
display(df.groupby('column').mean(numeric_only=True))

# grouped data with named aggregations: new_column_name=(source_column, function)
df.groupby('column').agg(
    name_col=('column1', 'mean'),
    name_col2=('column1', 'std'),
    name_col3=('column2', 'median'),
)

In [None]:
# ungrouped data: the same statistics for every numeric column
# (text columns are excluded first because 'mean' and 'std' are undefined for them)
display(df.select_dtypes('number').agg(['mean', 'std']))

# ungrouped data with different aggregation functions for different columns
df.agg({'column1': ['mean', 'var'], 'column2': ['median']})

In [None]:
# Visualization: bar plot of the y column for each value of the x column
sns.barplot(data=df, x='columnname', y='columnname')

## 4. Cleaning and imputation

Missing data affects distributions, gives a less representative view of the data, and can result in drawing incorrect conclusions.

### Numerical data

Strategies for addressing missing data:
* Drop missing values (when they make up <5% of the total)
* Impute the mean, median, or mode (measures of central tendency; which one fits depends on the distribution and context)
* Impute by sub-group

In [None]:
# Count the missing values in each column
missing_per_column = df.isna().sum()
missing_per_column

In [None]:
# Step 1: drop the ROWS that have missing values in columns where only a small
# share of the values (at most 5% of all rows) is missing.
threshold = len(df) * 0.05

# Despite its name, cols_to_drop lists the columns whose missing rows are dropped;
# the columns themselves are kept.
cols_to_drop = df.columns[df.isna().sum() <= threshold]

# Reassign instead of using inplace=True: same result, but it keeps the step
# chainable and avoids mutating a frame that earlier cells may have displayed.
df = df.dropna(subset=cols_to_drop)

In [None]:
# Step 2: impute the columns that still contain missing values with a summary statistic
cols_with_missing_values = df.columns[df.isna().sum() > 0]

# This example uses the mode (most frequent value) to fill missing values.
# [:-1] skips the last column in the list; presumably it is left for the
# sub-group imputation in step 3 -- adjust the slice to your own dataset.
for col in cols_with_missing_values[:-1]:
    modes = df[col].mode()
    # mode() is empty when the whole column is missing; there is nothing to impute with
    if not modes.empty:
        # fillna returns a new Series, so the result must be assigned back
        df[col] = df[col].fillna(modes.iloc[0])

In [None]:
# Intermediate step: can a whole column be removed because it has no influence
# on the missing data? Inspect how its values are distributed first.
value_frequencies = df['columnname'].value_counts()
value_frequencies

In [None]:
# Count the missing values that remain in each column
remaining_missing = df.isna().sum()
remaining_missing

In [None]:
# Step 3: impute missing values with a statistic of the sub-group each row belongs to.
# df_dict maps every value of 'columnname1' to the median of 'columnname2' in that group.
df_dict = df.groupby('columnname1')['columnname2'].median().to_dict()

# map() looks up each row's 'columnname1' value in df_dict, giving the group median
# for that row; fillna() then uses it only where 'columnname2' is missing.
df['columnname2'] = df['columnname2'].fillna(df['columnname1'].map(df_dict))

In [None]:
# Count the missing values in each column once more
missing_after_imputation = df.isna().sum()
missing_after_imputation

### Categorical data

Previewing only categorical data:

In [None]:
# Preview only the text/categorical columns. The dtype name is "object"
# (singular); "objects" is not a valid dtype and raises a TypeError.
df.select_dtypes("object").head()

Examine the unique values of a column and how often each of them occurs.

In [None]:
# Distinct values that occur in the column
distinct_values = df['columnname'].unique()
distinct_values

In [None]:
# How often each distinct value occurs, most frequent first
occurrences = df['columnname'].value_counts()
occurrences

A dataset can contain various names that describe a similar category. These names should be cleaned up and mapped to the desired category names.

Step 1: Make a list of new categories

In [None]:
# Target category labels, in the same order as the conditions they belong to
new_category = [
    "cat1",
    "cat2",
    "cat3",
]

step 2: Define phrases in strings
* description1 = "var_interest_1 | var_interest_2"
* description2 = "var_interest_1 | var_interest_2 | var_interest_3"
* description3 = "var_interest_1"
* etc.

In [None]:
# Regular-expression patterns: '|' means "or". There must be no spaces around
# '|', because a space would become part of the alternative and
# "var_interest_1 | var_interest_2" would only match "var_interest_1 " or " var_interest_2".
description1 = "var_interest_1|var_interest_2"
description2 = "var_interest_1|var_interest_2|var_interest_3"
description3 = "var_interest_1"

step 3: Create conditions
* Use the search method `.str.contains()` to look for specific strings
* Store the resulting conditions in a variable

In [None]:
# Search for a specific string: True for every row whose text contains it
df['column'].str.contains('var_interest')

In [None]:
# Search for any of several strings: '|' means "or" in the regular expression.
# No spaces around '|', otherwise the spaces become part of the pattern.
df['column'].str.contains('var_interest_1|var_interest_2')

In [None]:
# Store the conditions in a flat list: one boolean Series per new category, in
# the same order as new_category. np.select needs exactly one condition per choice.
# na=False makes missing text count as "no match", so each Series is purely boolean.
conditions = [
    df['column'].str.contains(description1, na=False),
    df['column'].str.contains(description2, na=False),
    df['column'].str.contains(description3, na=False),
]

step 4: Create a new category column

In [None]:
# New column with the conditions applied: each row gets the category of the
# first condition that is True, or "Other" when no condition matches.
df['new_columname'] = np.select(conditions, new_category, default = "Other")