In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Data exploration

The process of reviewing and cleaning data to derive insights and generate hypotheses.

## 1. Get started 

Gather some basic information about the dataset.

In [None]:
# Import the raw data
df = pd.read_csv('file.csv')

# What metadata do we have? What does the dataset look like?
# display() renders the frame as a rich table (print() would only give plain text)
display(df.head())

# Are there missing values? What type of values are included?
# info() prints non-null counts and dtypes; dtypes (last expression) is shown as the cell output
df.info()
df.dtypes

Datetime columns are usually imported and read as objects (strings). Convert those columns back to datetime, as working with datetime objects has many advantages.

In [None]:
# Import data with the date column parsed as datetime (instead of string/object)
df = pd.read_csv('file.csv', parse_dates=['date_column'])

# Or convert afterwards
df['date_column'] = pd.to_datetime(df['date_column'])

# Or build the date afterwards by combining separate columns
# (if month, day, year were separate columns)
df['date_column'] = pd.to_datetime(df[['month', 'day', 'year']])

# Only extract month, day, or year.
# Each component goes into its own column: writing it back to 'date_column'
# would replace the dates, so the next extraction would parse plain integers.
dates = pd.to_datetime(df['date_column'])
df['date_month'] = dates.dt.month
df['date_day'] = dates.dt.day
df['date_year'] = dates.dt.year

## 2. Description of the data

### Explore numerical data

How is the data distributed? Some standard descriptive statistics are given by `.describe()`, for the whole dataset or for a single column.

In [None]:
# Summary statistics of a single column
column_values = df['columnname']
column_values.describe()

### Explore numerical data -- outliers

In [None]:
# Histogram of one column, using bins that are 1 unit wide
sns.histplot(data=df, x='column', binwidth=1)

Determine outliers:
* Upper outliers >75th percentile + (1.5*IQR)
* Lower outliers <25th percentile - (1.5*IQR)

In [None]:
column_values = df['column']

# 25th percentile (first quartile)
twenty_fifth = column_values.quantile(0.25)

# 75th percentile (third quartile)
seventy_fifth = column_values.quantile(0.75)

In [None]:
# IQR (interquartile range): 75th percentile minus 25th percentile
df_IQR = seventy_fifth - twenty_fifth

In [None]:
# Distance beyond the quartiles at which a value counts as an outlier
whisker = 1.5 * df_IQR

# Lower threshold
lower = twenty_fifth - whisker

# Upper threshold
upper = seventy_fifth + whisker

In [None]:
# Select outliers: rows below the lower or above the upper threshold.
# Each comparison needs its own parentheses because | binds tighter than < and >.
is_outlier = (df['column'] < lower) | (df['column'] > upper)

# Show the columns of interest for those rows (a list of names, not a tuple)
df.loc[is_outlier, ['column', 'column1', 'column2']]

### Explore categorical data

How many data points do we have in each category?

In [None]:
# Absolute numbers: count of rows per category
category_values = df['columnname']
category_values.value_counts()

In [None]:
# Relative frequencies: share of rows per category
category_values = df['columnname']
category_values.value_counts(normalize=True)

Cross tabulation makes it possible to examine the frequency of combinations of classes.

In [None]:
# Cross tabulation: frequency of every combination of the two columns
pd.crosstab(index=df['columnname_index'], columns=df['columnname1'])

In [None]:
# Cross tabulation with a calculation:
# aggregate a third column (here its median) for every combination
pd.crosstab(
    index=df['columnname_index'],
    columns=df['columnname1'],
    values=df['columnname2'],
    aggfunc='median',
)

## 3. Validate and update datatypes

Datatypes:
* str
* int
* float
* dict
* list
* bool

Identify the column types.

In [None]:
# Check data types: lists the dtype of every column
df.dtypes

Is there something out of the ordinary? Identifying the column types gives a first glimpse of the data and of what needs cleaning.

### Validate numerical data

Identify the columns belonging to numerical data.

In [None]:
# Keep only the numerical columns
df.select_dtypes(include='number')

In [None]:
# Range: minimum and maximum in a single result.
# Two separate expressions would only render the last one (the max) as cell output.
df['column'].agg(['min', 'max'])

In [None]:
# Distribution of one column as a box plot
sns.boxplot(data=df, x='column')

Is a column that should be numeric missing from the numeric columns? Then you have to clean its values and change the type of the column.

In [None]:
# Convert string to numbers (general)
# .str.replace must be called on a string column of the DataFrame, not on the pd.Series class.
# regex=False treats 'char2remove' as literal text; assign the result back to keep it.
df['column'].str.replace('char2remove', 'char2replacewith', regex=False)

In [None]:
# Example: strip the comma (e.g. a thousands separator) by replacing it with an empty string
df['column'] = df['column'].str.replace(',', '', regex=False)

Update data type

In [None]:
# Update dtype (general)
# new_datatype is a placeholder for the target dtype (e.g. int, float, 'category');
# it must be defined before the cast, otherwise the cell raises a NameError.
new_datatype = float
df['column'] = df['column'].astype(new_datatype)

In [None]:
# Update dtype (example): cast the column to integer and write it back
# NOTE(review): presumably the column has no missing values at this point — confirm, as they cannot be cast to int
df['column'] = df['column'].astype(int)

### Validate categorical data

Identify the columns belonging to categorical data.

In [None]:
# Keep only the object (string-like) columns
df.select_dtypes(include='object')

Using `.isin()` returns a Series of True and False values. This can be used for filtering.

In [None]:
# Validate categorical data against the list of expected values
expected_values = ["var1", "var2"]

# True where the value is one of the expected values ...
df['column'].isin(expected_values)
# ... and the inverse: True where the value is unexpected
~df['column'].isin(expected_values)

## 4. Explore subsets

Compare summary statistics between subsets (groups) of the data.

### Numerical data

...

In [None]:
# Grouped data: mean of 'column2' within each group of 'column'
grouped = df.groupby(by='column')
grouped['column2'].mean()

In [None]:
# Add a summary statistic as a column to the dataset (example):
# every row receives the standard deviation of 'column2' within its 'column' group.
# The statistic is taken over a different column than the grouping key, because
# the grouping key is constant within each group.
df['new_column'] = df.groupby('column')['column2'].transform(lambda x: x.std())

# or (similar result), using the name of the built-in aggregation
df['new_column'] = df.groupby('column')['column2'].transform('std')

### Categorical data

Use groups of data to describe a subset of data using:
* .groupby()

with an aggregating function (only applies to numeric columns)
* .agg()
* .mean()
* .median()
* .min()
* .max()
* .sum()
* .count()
* .var()
* .std()

Aggregating functions can also be applied to ungrouped data. `.agg()` is often used if we want to apply more than one function.

In [None]:
# Grouped data: mean of every numeric column per group.
# numeric_only=True skips non-numeric columns, which raise a TypeError in pandas >= 2.0.
# display() is needed because only the last expression of a cell is rendered automatically.
display(df.groupby('column').mean(numeric_only=True))

# Grouped data with named aggregations: new_column_name = (source_column, function)
df.groupby('column').agg(
    name_col=('column1', 'mean'),
    name_col2=('column1', 'std'),
    name_col3=('column2', 'median'),
)

In [None]:
# Ungrouped data: the same functions for every numeric column
# (mean and std are not defined for non-numeric columns, so select the numeric ones first).
# display() is needed because only the last expression of a cell is rendered automatically.
display(df.select_dtypes('number').agg(['mean', 'std']))

# Ungrouped data with different aggregation functions for different columns
df.agg({'column1': ['mean', 'var'], 'column2': ['median']})

In [None]:
# Visualization: bar height is the mean of 'column2' for each category of 'column'
# (plotting a column against itself carries no information)
sns.barplot(data=df, x='column', y='column2')