# Google Trends Data Analysis for Disease Monitoring

This notebook demonstrates how Google Trends data can enhance disease monitoring and prediction capabilities.

In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
import warnings
warnings.filterwarnings('ignore')

# Set style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

In [11]:
# Load the integrated data
try:
    integrated_data = pd.read_csv('../artifacts/integrated/integrated_disease_trends_data.csv')
    print("Loaded integrated data")
except FileNotFoundError:
    print("Integrated data not found. Please run the integration script first.")
    # Load raw data for demonstration
    trends_data = pd.read_csv('../artifacts/time_series/google_trends_time_series.csv')
    print("Loaded raw trends data")
    trends_data.head()

Loaded integrated data


## 1. Understanding Google Trends Data

Google Trends provides search interest data that can serve as a proxy for public awareness and concern about diseases.

In [12]:
# # Analyze trends data
# trends_data = pd.read_csv('../artifacts/time_series/google_trends_time_series.csv')
# trends_data['Month'] = pd.to_datetime(trends_data['Month'] + '-01')

# # Plot trends over time
# plt.figure(figsize=(12, 6))
# plt.plot(trends_data['Month'], trends_data['diabetes_trend'], 
#          marker='o', linewidth=2, label='Diabetes Search Interest')
# plt.plot(trends_data['Month'], trends_data['malaria_trend'], 
#          marker='s', linewidth=2, label='Malaria Search Interest')

# plt.title('Google Search Interest Trends for Diabetes vs Malaria', fontsize=14)
# plt.xlabel('Time Period')
# plt.ylabel('Search Interest (0-100)')
# plt.legend()
# plt.grid(True, alpha=0.3)
# plt.xticks(rotation=45)
# plt.tight_layout()
# plt.show()

# # Summary statistics
# print("\nTrends Summary Statistics:")
# print(trends_data[['diabetes_trend', 'malaria_trend']].describe())

import plotly.express as px
import pandas as pd
# If not already loaded
trends_data = pd.read_csv('../artifacts/time_series/google_trends_time_series.csv')
trends_data['Month'] = pd.to_datetime(trends_data['Month'] + '-01')

# Melt the dataframe for Plotly
df_melt = trends_data.melt(id_vars='Month', value_vars=['diabetes_trend', 'malaria_trend'],
                           var_name='Disease', value_name='Search Interest')

# Rename for better legend
df_melt['Disease'] = df_melt['Disease'].map({
    'diabetes_trend': 'Diabetes Search Interest',
    'malaria_trend': 'Malaria Search Interest'
})

fig = px.line(
    df_melt,
    x='Month',
    y='Search Interest',
    color='Disease',
    markers=True,
    title='Google Search Interest Trends for Diabetes vs Malaria',
    labels={'Month': 'Time Period', 'Search Interest': 'Search Interest (0-100)'}
)

fig.update_layout(
    xaxis=dict(
        tickformat='%Y-%m',
        tickangle=45
    ),
    legend=dict(
        x=0.7, y=0.95,
        bgcolor='rgba(0,0,0,0)'
    )
)

fig.show()

## 2. Key Applications of Google Trends Data

### A. Early Warning System
Google Trends can detect increased public interest before official case reports.

In [13]:
# Calculate trend changes and identify spikes
trends_data['diabetes_change'] = trends_data['diabetes_trend'].diff()
trends_data['malaria_change'] = trends_data['malaria_trend'].diff()

# Identify significant spikes (above 1 standard deviation)
diabetes_threshold = trends_data['diabetes_change'].std()
malaria_threshold = trends_data['malaria_change'].std()

diabetes_spikes = trends_data[trends_data['diabetes_change'] > diabetes_threshold]
malaria_spikes = trends_data[trends_data['malaria_change'] > malaria_threshold]

print("Significant spikes in search interest:")
print(f"\nDiabetes spikes:")
for _, row in diabetes_spikes.iterrows():
    print(f"  {row['Month'].strftime('%Y-%m')}: +{row['diabetes_change']:.2f}")
    
print(f"\nMalaria spikes:")
for _, row in malaria_spikes.iterrows():
    print(f"  {row['Month'].strftime('%Y-%m')}: +{row['malaria_change']:.2f}")

Significant spikes in search interest:

Diabetes spikes:
  2022-09: +4.30
  2024-06: +3.51

Malaria spikes:
  2022-09: +2.07
  2023-06: +2.38


### B. Seasonal Pattern Analysis
Understanding seasonal variations in disease awareness.

In [14]:
# # Add seasonal features
# trends_data['quarter'] = trends_data['Month'].dt.quarter
# trends_data['month'] = trends_data['Month'].dt.month

# # Analyze seasonal patterns
# quarterly_avg = trends_data.groupby('quarter')[['diabetes_trend', 'malaria_trend']].mean()

# plt.figure(figsize=(10, 5))
# quarterly_avg.plot(kind='bar', ax=plt.gca())
# plt.title('Average Search Interest by Quarter')
# plt.xlabel('Quarter')
# plt.ylabel('Average Search Interest')
# plt.legend()
# plt.tight_layout()
# plt.show()

# print("\nQuarterly averages:")
# print(quarterly_avg)

trends_data['quarter'] = trends_data['Month'].dt.quarter
quarterly_avg = trends_data.groupby('quarter')[['diabetes_trend', 'malaria_trend']].mean().reset_index()

quarterly_avg_long = quarterly_avg.melt(id_vars='quarter', 
                                        value_vars=['diabetes_trend', 'malaria_trend'],
                                        var_name='Disease', value_name='Average Search Interest')

# Rename for better legend
quarterly_avg_long['Disease'] = quarterly_avg_long['Disease'].map({
    'diabetes_trend': 'Diabetes Search Interest',
    'malaria_trend': 'Malaria Search Interest'
})

fig = px.bar(
    quarterly_avg_long,
    x='quarter',
    y='Average Search Interest',
    color='Disease',
    barmode='group',
    labels={'quarter': 'Quarter'},
    title='Average Search Interest by Quarter (Interactive)'
)

fig.show()

### C. Correlation with Health Outcomes
Analyzing the relationship between search interest and actual health data.

In [15]:
# Load health data for correlation analysis
try:
    diabetes_health = pd.read_csv('../artifacts/csv/diabetes.csv')
    
    # Create a simple correlation analysis
    # Note: This is a simplified example - real correlation would need proper time alignment
    
    print("Health Data Summary:")
    print(diabetes_health.head())
    
    # Calculate correlation between trends and health metrics
    # This would require proper time-series alignment in a real scenario
    
except FileNotFoundError:
    print("Health data not available for correlation analysis")

Health Data Summary:
   periodname  Diabetes Mellitus  Diabetes Mellitus +AC0- Female  \
0        2020             174159                          113947   
1        2021             203453                          133763   
2        2022             193130                          126411   
3        2023             182081                          118749   
4        2024             199336                          129280   

   Diabetes Mellitus +AC0- Male  Diabetes mellitus deaths  \
0                         60212                       983   
1                         69690                      1568   
2                         66719                      1653   
3                         63332                      2306   
4                         70056                       812   

   Diabetes mellitus lab confirmed cases  
0                                  30790  
1                                  34143  
2                                  43080  
3                              

## 3. Predictive Modeling with Google Trends

Google Trends data can be used to predict future disease patterns.

In [16]:
# Simple trend prediction model
def create_trend_prediction_model(trends_data, disease='diabetes'):
    """Create a simple linear trend prediction model"""
    
    # Prepare features
    X = np.arange(len(trends_data)).reshape(-1, 1)
    y = trends_data[f'{disease}_trend'].values
    
    # Fit linear model
    model = LinearRegression()
    model.fit(X, y)
    
    # Predict next 3 months
    future_X = np.arange(len(trends_data), len(trends_data) + 3).reshape(-1, 1)
    future_predictions = model.predict(future_X)
    
    return model, future_predictions, model.score(X, y)

# Create prediction models
diabetes_model, diabetes_pred, diabetes_r2 = create_trend_prediction_model(trends_data, 'diabetes')
malaria_model, malaria_pred, malaria_r2 = create_trend_prediction_model(trends_data, 'malaria')

print(f"Diabetes trend prediction R² score: {diabetes_r2:.3f}")
print(f"Malaria trend prediction R² score: {malaria_r2:.3f}")
print(f"\nPredicted diabetes trends (next 3 months): {diabetes_pred}")
print(f"Predicted malaria trends (next 3 months): {malaria_pred}")

Diabetes trend prediction R² score: 0.023
Malaria trend prediction R² score: 0.005

Predicted diabetes trends (next 3 months): [87.63416422 87.55492894 87.47569366]
Predicted malaria trends (next 3 months): [12.00571848 11.98645763 11.96719678]


## 4. Public Health Insights

### A. Awareness Campaign Effectiveness
Monitor the impact of public health campaigns on search behavior.

In [17]:
# # Analyze awareness patterns
# plt.figure(figsize=(12, 8))

# # Subplot 1: Raw trends
# plt.subplot(2, 1, 1)
# plt.plot(trends_data['Month'], trends_data['diabetes_trend'], 
#          label='Diabetes', linewidth=2)
# plt.plot(trends_data['Month'], trends_data['malaria_trend'], 
#          label='Malaria', linewidth=2)
# plt.title('Disease Awareness Trends Over Time')
# plt.ylabel('Search Interest')
# plt.legend()
# plt.grid(True, alpha=0.3)

# # Subplot 2: Trend changes
# plt.subplot(2, 1, 2)
# plt.plot(trends_data['Month'], trends_data['diabetes_change'], 
#          label='Diabetes Change', alpha=0.7)
# plt.plot(trends_data['Month'], trends_data['malaria_change'], 
#          label='Malaria Change', alpha=0.7)
# plt.axhline(y=0, color='black', linestyle='--', alpha=0.5)
# plt.title('Month-over-Month Changes in Search Interest')
# plt.ylabel('Change in Search Interest')
# plt.xlabel('Time Period')
# plt.legend()
# plt.grid(True, alpha=0.3)

# plt.tight_layout()
# plt.show()

import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Create subplots: 2 rows, 1 column
fig = make_subplots(
    rows=2, cols=1,
    shared_xaxes=True,
    vertical_spacing=0.12,
    subplot_titles=(
        "Disease Awareness Trends Over Time",
        "Month-over-Month Changes in Search Interest"
    )
)

# Subplot 1: Raw trends
fig.add_trace(
    go.Scatter(
        x=trends_data['Month'], y=trends_data['diabetes_trend'],
        mode='lines+markers', name='Diabetes', line=dict(width=2, color='pink')
    ),
    row=1, col=1
)
fig.add_trace(
    go.Scatter(
        x=trends_data['Month'], y=trends_data['malaria_trend'],
        mode='lines+markers', name='Malaria', line=dict(width=2, color='goldenrod')
    ),
    row=1, col=1
)

# Subplot 2: Trend changes
fig.add_trace(
    go.Scatter(
        x=trends_data['Month'], y=trends_data['diabetes_change'],
        mode='lines+markers', name='Diabetes Change', line=dict(dash='dot', color='hotpink'), opacity=0.7
    ),
    row=2, col=1
)
fig.add_trace(
    go.Scatter(
        x=trends_data['Month'], y=trends_data['malaria_change'],
        mode='lines+markers', name='Malaria Change', line=dict(dash='dot', color='olive'), opacity=0.7
    ),
    row=2, col=1
)
# Add horizontal line at y=0 in subplot 2
fig.add_hline(y=0, line_dash="dash", line_color="black", row=2, col=1, opacity=0.5)

# Update layout
fig.update_layout(
    height=700,
    title_text="Interactive Disease Awareness Trends and Changes",
    showlegend=True
)
fig.update_xaxes(title_text="Time Period", row=2, col=1, tickformat='%Y-%m', tickangle=45)
fig.update_yaxes(title_text="Search Interest", row=1, col=1)
fig.update_yaxes(title_text="Change in Search Interest", row=2, col=1)

fig.show()

### B. Comparative Analysis
Understanding relative public interest in different diseases.

In [18]:
# # Calculate relative interest
# trends_data['relative_interest'] = trends_data['diabetes_trend'] / trends_data['malaria_trend']

# plt.figure(figsize=(10, 6))
# plt.plot(trends_data['Month'], trends_data['relative_interest'], 
#          marker='o', linewidth=2, color='purple')
# plt.axhline(y=1, color='red', linestyle='--', alpha=0.7, label='Equal Interest')
# plt.title('Relative Public Interest: Diabetes vs Malaria')
# plt.ylabel('Diabetes Interest / Malaria Interest')
# plt.xlabel('Time Period')
# plt.legend()
# plt.grid(True, alpha=0.3)
# plt.tight_layout()
# plt.show()

# print(f"\nAverage relative interest: {trends_data['relative_interest'].mean():.2f}")
# print(f"Diabetes is on average {trends_data['relative_interest'].mean():.1f}x more searched than malaria")

import plotly.graph_objects as go

# Calculate relative interest if not already done
if 'relative_interest' not in trends_data.columns:
    trends_data['relative_interest'] = trends_data['diabetes_trend'] / trends_data['malaria_trend']

fig = go.Figure()

# Add the relative interest line
fig.add_trace(go.Scatter(
    x=trends_data['Month'],
    y=trends_data['relative_interest'],
    mode='lines+markers',
    name='Relative Interest (Diabetes/Malaria)',
    line=dict(color='purple', width=2)
))

# Add the horizontal line at y=1
fig.add_hline(
    y=1,
    line_dash="dash",
    line_color="red",
    opacity=0.7,
    annotation_text="Equal Interest",
    annotation_position="top left"
)

fig.update_layout(
    title='Relative Public Interest: Diabetes vs Malaria',
    xaxis_title='Time Period',
    yaxis_title='Diabetes Interest / Malaria Interest',
    xaxis=dict(tickformat='%Y-%m', tickangle=45),
    height=500,
    showlegend=True,
    template='plotly_white'
)

fig.show()

# Print summary statistics
print(f"\nAverage relative interest: {trends_data['relative_interest'].mean():.2f}")
print(f"Diabetes is on average {trends_data['relative_interest'].mean():.1f}x more searched than malaria")


Average relative interest: 7.31
Diabetes is on average 7.3x more searched than malaria


## 5. Recommendations for Using Google Trends Data

### Key Benefits:
1. **Early Warning System**: Detect increased public concern before official reports
2. **Campaign Monitoring**: Track effectiveness of public health campaigns
3. **Resource Planning**: Anticipate healthcare demand based on search patterns
4. **Comparative Analysis**: Understand relative disease awareness
5. **Seasonal Planning**: Identify peak awareness periods for targeted interventions

### Implementation Strategy:
1. **Real-time Monitoring**: Set up automated trend tracking
2. **Alert System**: Create thresholds for significant changes
3. **Integration**: Combine with traditional health data sources
4. **Validation**: Correlate trends with actual health outcomes
5. **Action Planning**: Use insights to guide public health interventions

In [19]:
# Summary statistics and insights
print("=== GOOGLE TRENDS DATA INSIGHTS ===\n")

print("1. TREND SUMMARY:")
print(f"   - Diabetes search interest: {trends_data['diabetes_trend'].mean():.1f} (avg), {trends_data['diabetes_trend'].std():.1f} (std)")
print(f"   - Malaria search interest: {trends_data['malaria_trend'].mean():.1f} (avg), {trends_data['malaria_trend'].std():.1f} (std)")

print("\n2. SEASONAL PATTERNS:")
print(f"   - Highest diabetes interest: Q{quarterly_avg['diabetes_trend'].idxmax()} ({quarterly_avg['diabetes_trend'].max():.1f})")
print(f"   - Highest malaria interest: Q{quarterly_avg['malaria_trend'].idxmax()} ({quarterly_avg['malaria_trend'].max():.1f})")

print("\n3. PREDICTIVE CAPABILITY:")
print(f"   - Diabetes trend predictability: {diabetes_r2:.1%}")
print(f"   - Malaria trend predictability: {malaria_r2:.1%}")

print("\n4. PUBLIC HEALTH IMPLICATIONS:")
print(f"   - Diabetes receives {trends_data['relative_interest'].mean():.1f}x more search attention than malaria")
print(f"   - {len(diabetes_spikes)} significant diabetes awareness spikes detected")
print(f"   - {len(malaria_spikes)} significant malaria awareness spikes detected")

=== GOOGLE TRENDS DATA INSIGHTS ===

1. TREND SUMMARY:
   - Diabetes search interest: 88.1 (avg), 1.9 (std)
   - Malaria search interest: 12.1 (avg), 1.0 (std)

2. SEASONAL PATTERNS:
   - Highest diabetes interest: Q2 (89.0)
   - Highest malaria interest: Q2 (12.9)

3. PREDICTIVE CAPABILITY:
   - Diabetes trend predictability: 2.3%
   - Malaria trend predictability: 0.5%

4. PUBLIC HEALTH IMPLICATIONS:
   - Diabetes receives 7.3x more search attention than malaria
   - 2 significant diabetes awareness spikes detected
   - 2 significant malaria awareness spikes detected
