In [4]:
#1. Trend Analysis
Description:
This line chart illustrates the trends in dissolved oxygen levels over time at specific sampling locations. 
Each data point represents a recorded measurement, and the line connects these points to reveal temporal 
patterns in water quality.
import pandas as pd
import altair as alt
import numpy as np

# Five dissolved-oxygen readings for the 'Boonsri' location, dated January-February 1998.
data = pd.DataFrame(
    {
        'sample_date': pd.to_datetime(
            ['1998-01-01', '1998-01-10', '1998-01-20', '1998-01-30', '1998-02-10']
        ),
        'location': ['Boonsri'] * 5,
        'measure': ['Dissolved oxygen'] * 5,
        'value': [6.8, 6.5, 7.0, 6.9, 7.2],
    }
)

# Line with point markers: time on x, reading on y, one colour per location.
trend_chart = (
    alt.Chart(data, title='Trends in Dissolved Oxygen Over Time')
    .mark_line(point=True)
    .encode(
        x=alt.X('sample_date:T'),
        y=alt.Y('value:Q'),
        color=alt.Color('location:N'),
        tooltip=[
            alt.Tooltip('sample_date:T'),
            alt.Tooltip('value:Q'),
            alt.Tooltip('location:N'),
        ],
    )
)

trend_chart

Purpose:
The chart provides insights into how dissolved oxygen levels vary over time, helping to identify periods of stability, 
improvement, or potential deterioration in water quality.
It supports the analysis of ecological health and identifies temporal factors that may impact aquatic ecosystems.


In [1]:
#2. Time Series Heatmap for Chemical Trends
Description: This heatmap shows the trends in chemical concentration over time 
and across sensor locations. It allows you to quickly identify areas with high contamination.

import altair as alt
# Synthetic demo frame: 40 rows cycling through four sensors, four chemicals
# and ten consecutive days. It is kept under its own name so that the
# dissolved-oxygen frame `data` (columns 'sample_date' / 'value'), which the
# anomaly cells further down read, is not overwritten when the notebook is run
# top to bottom.
chemical_data = pd.DataFrame({
    'sensor_location': ['Boonsri', 'Chai', 'Kannika', 'Somchair'] * 10,
    'date': pd.date_range('2024-01-01', periods=10).tolist() * 4,
    'chemical': ['Nitrates', 'Ammonium', 'Potassium', 'Phosphorus'] * 10,
    'concentration': [0.5, 1.2, 0.7, 0.3] * 10
})

heatmap = alt.Chart(chemical_data).mark_rect().encode(
    # Ordinal day bins give every date its own discrete heatmap cell.
    x=alt.X('yearmonthdate(date):O', title='Date'),
    y=alt.Y('sensor_location:N', title='Sensor location'),
    color=alt.Color('concentration:Q', scale=alt.Scale(scheme='viridis')),
    # Explicit type codes so the tooltip does not rely on dtype inference.
    tooltip=['sensor_location:N', 'date:T', 'concentration:Q']
).facet(
    column='chemical:N'
).properties(
    title='Chemical Concentration Trends Over Time'
).resolve_scale(
    # Each chemical panel gets its own colour scale.
    color='independent'
)

heatmap

Purpose: Displays how contamination levels vary over time and across locations for different chemicals.
Features:
Facet by chemical type for clarity.
Color intensity indicates concentration levels.
Tooltip provides detailed values for exploration.


In [22]:
#3. Anomalies Analysis
Objective:
Detect outliers or unusual deviations from the expected water quality metrics.
Approach:
Use statistical methods like Z-scores or Interquartile Range (IQR) to identify anomalies.
    
# Standardise every reading: subtract the column mean, divide by the column
# standard deviation (pandas' default, i.e. the sample standard deviation).
data['z_score'] = data['value'].sub(data['value'].mean()).div(data['value'].std())

# A reading is flagged when it lies more than two standard deviations from the mean.
# NOTE(review): with the sample standard deviation, |z| for n rows is bounded by
# (n - 1) / sqrt(n). If `data` is the five-row dissolved-oxygen frame, that bound
# is about 1.79, so this threshold cannot flag any point - confirm the intended
# threshold or sample size.
data['is_anomaly'] = data['z_score'].abs().gt(2)

# Scatter plot: flagged readings in red, all others in blue.
anomaly_chart = (
    alt.Chart(data, title='Anomalies in Dissolved Oxygen Levels')
    .mark_circle(size=100)
    .encode(
        x=alt.X('sample_date:T'),
        y=alt.Y('value:Q'),
        color=alt.condition(
            alt.datum.is_anomaly,
            alt.value('red'),
            alt.value('blue'),
        ),
        tooltip=['sample_date:T', 'value:Q', 'is_anomaly'],
    )
)

anomaly_chart

Red points indicate anomalies where the Z-score is beyond ±2.
Useful for identifying unexpected spikes or drops in water quality metrics.

In [2]:
#Visual highlighting of anomalies based on a threshold value of 7

# Fixed-threshold view: readings strictly above 7 are drawn red, the rest blue.
scatter_plot = (
    alt.Chart(data, title='Scatter Plot of Dissolved Oxygen Levels with Anomalies')
    .mark_circle(size=100)
    .encode(
        x=alt.X('sample_date:T'),
        y=alt.Y('value:Q'),
        color=alt.condition(
            alt.datum.value > 7,
            alt.value('red'),
            alt.value('blue'),
        ),
        tooltip=[
            alt.Tooltip('sample_date:T'),
            alt.Tooltip('value:Q'),
            alt.Tooltip('location:N'),
        ],
    )
)

scatter_plot


In [24]:
#Overlaying Trend Lines and Anomalies
# Rebuild the five-row dissolved-oxygen frame so this cell does not depend on
# whatever `data` currently holds.
data = pd.DataFrame(
    {
        'sample_date': pd.to_datetime(
            ['1998-01-01', '1998-01-10', '1998-01-20', '1998-01-30', '1998-02-10']
        ),
        'location': ['Boonsri'] * 5,
        'measure': ['Dissolved oxygen'] * 5,
        'value': [6.8, 6.5, 7.0, 6.9, 7.2],
    }
)

# Z-score against the column mean and sample standard deviation, then flag |z| > 2.
# NOTE(review): for five rows |z| is bounded by (5 - 1) / sqrt(5), about 1.79,
# so no point in this frame can be flagged - confirm the intended threshold.
data['z_score'] = data['value'].sub(data['value'].mean()).div(data['value'].std())
data['is_anomaly'] = data['z_score'].abs().gt(2)

# Shared scatter layer; colour is added separately below.
base = (
    alt.Chart(data)
    .mark_circle(size=100)
    .encode(
        x=alt.X('sample_date:T'),
        y=alt.Y('value:Q'),
        tooltip=['sample_date:T', 'value:Q', 'is_anomaly'],
    )
)

# Flagged readings red, everything else blue.
anomalies = base.encode(
    color=alt.condition(
        alt.datum.is_anomaly,
        alt.value('red'),
        alt.value('blue'),
    )
)

# Green regression line of value on sample_date.
trend_line = (
    alt.Chart(data)
    .transform_regression(on='sample_date', regression='value')
    .mark_line(color='green')
    .encode(
        x=alt.X('sample_date:T'),
        y=alt.Y('value:Q'),
    )
)

# Scatter first, regression line drawn on top.
final_chart = alt.layer(anomalies, trend_line).properties(
    title='Overlay of Trends and Anomalies in Dissolved Oxygen'
)

final_chart

Scatter Plot:

Displays individual data points.
Red points represent anomalies (values whose absolute Z-score exceeds 2).
Blue points represent normal data.
Trend Line:

A regression line in green shows the overall trend in the data.
Overlay:

Combines anomalies and trends in a single visualization for clarity.

Visualization Features:
The chart highlights long-term patterns (trend line) and deviations (anomalies).
Tooltips allow for interaction and detailed exploration of each data point.

#Visualization Features:
The chart highlights long-term patterns (trend line) and deviations (anomalies).
Tooltips allow for interaction and detailed exploration of each data point.

In [9]:
#4. Missing Data Analysis
import pandas as pd
import altair as alt
# One record per location for a single sampling date; two of the eight
# measurements have no value.
data = pd.DataFrame({
    'location': ['Boonsri', 'Achara', 'Somchair', 'Becha', 'Tansanee', 'Chai', 'Sakda', 'Kannika'],
    'sample_date': pd.to_datetime(['1998-01-11'] * 8),
    'measure': [
        'Water temperature', 'Dissolved oxygen', 'Ammonium',
        'Nitrites', 'Nitrates', 'Orthophosphate-phosphorus',
        'Total phosphorus', 'Sodium'
    ],
    'value': [2, None, 0.33, 0.01, 1.47, 0.06, 0.09, None]
})

# Boolean flag: True where the measurement is absent.
data['missing'] = data['value'].isna()

missing_chart = alt.Chart(data).mark_rect().encode(
    x='location:N',
    y='measure:N',
    # `missing` is a two-state flag, so encode it as a nominal field with one
    # colour per state instead of a continuous green-to-red gradient.
    color=alt.Color(
        'missing:N',
        scale=alt.Scale(domain=[False, True], range=['green', 'red']),
        title='Missing'
    ),
    tooltip=['location:N', 'measure:N', 'missing:N']
).properties(
    title='Missing Data Matrix'
)

missing_chart


In [11]:

# Calendar of every day in the inspection window for one location. The scalar
# location is broadcast to the length of the date range, so the two cannot
# drift out of sync if the window changes.
time_data = pd.DataFrame({
    'date': pd.date_range('1998-01-01', '1998-01-15'),
    'location': 'Boonsri'
})

# Left join keeps every calendar day; days without a matching sample get NaN
# in `value`.
merged = time_data.merge(
    data,
    left_on=['date', 'location'],
    right_on=['sample_date', 'location'],
    how='left'
)

merged['missing'] = merged['value'].isna()

# Days with no sample have a null `value`, hence no y position: a single line
# mark coloured by `missing` can never draw them. Two layers are used instead.

# Layer 1: a red vertical rule on every day that has no recorded value.
gap_days = alt.Chart(merged).transform_filter(
    'datum.missing'
).mark_rule(color='red', opacity=0.4).encode(
    x=alt.X('date:T', title='Date'),
    tooltip=['date:T', 'missing:N']
)

# Layer 2: recorded values as a line with point markers, so an isolated
# reading is still visible even though it cannot form a line segment.
recorded_values = alt.Chart(merged).mark_line(point=True, color='blue').encode(
    x=alt.X('date:T', title='Date'),
    y=alt.Y('value:Q', scale=alt.Scale(domain=[0, 3])),
    tooltip=['date:T', 'value:Q', 'missing:N']
)

temporal_gap_chart = (gap_days + recorded_values).properties(
    title='Temporal Gaps in Data'
)

temporal_gap_chart


In [25]:

# Rows that share the same (location, sample_date, measure) key with at least
# one other row; keep=False marks every member of a duplicate group.
duplicates = data.loc[
    data.duplicated(subset=['location', 'sample_date', 'measure'], keep=False)
]
duplicates


Unnamed: 0,sample_date,location,measure,value,z_score,is_anomaly


In [13]:
# No earlier cell creates an 'outlier' column on `data`, so reading it directly
# raises KeyError on a fresh run. Use the column when it exists; otherwise
# derive the flag with the same |z-score| > 2 rule the anomaly cells apply.
# NOTE(review): the z-score here pools all rows of `data`; if the rows mix
# different measures, a per-measure rule may be more appropriate - confirm.
if 'outlier' in data.columns:
    outlier_flags = data['outlier']
else:
    value_z_scores = (data['value'] - data['value'].mean()) / data['value'].std()
    # NaN z-scores (missing values) compare False, so they are not counted.
    outlier_flags = value_z_scores.abs() > 2

# One-row data-quality overview of the current `data` frame.
summary = {
    'Total Records': len(data),
    'Missing Values': data['missing'].sum(),
    'Outliers': outlier_flags.sum(),
    'Duplicate Records': len(duplicates)
}

summary_df = pd.DataFrame(summary, index=[0])
summary_df


Unnamed: 0,Total Records,Missing Values,Outliers,Duplicate Records
0,8,2,0,0
