## 1. Imports and Configuration

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import seaborn as sns
from pathlib import Path

# Style configuration
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (16, 8)
plt.rcParams['font.size'] = 10

print("‚úì Libraries successfully imported (including Plotly)")
print("  - pandas, numpy, matplotlib")
print("  - plotly.graph_objects for interactive charts")

‚úì Libraries successfully imported (including Plotly)
  - pandas, numpy, matplotlib
  - plotly.graph_objects for interactive charts


## 2. Loading and Parsing CSV Data

In [3]:
# File path
data_path = Path('../../data/usage/dataworld_bank_dataset.csv')
data_world_population = Path('../../data/usage/dataworld_bank_world_population.csv')

# Load CSV skipping metadata (first 4 rows)
df = pd.read_csv(data_path, skiprows=4)

print(f"‚úì File loaded: {data_path}")
print(f"\nDataFrame structure:")
print(f"  Shape: {df.shape}")
print(f"  Columns: {df.columns.tolist()[:10]}...")
print(f"\nFirst row:")
print(df.head(1))

# Extract the 'World' row
world_data = df[df['Country Name'] == 'World'].iloc[0]
print(f"\n‚úì 'World' data extracted")
print(f"  Indicator: {world_data['Indicator Name']}")
print(f"  Code: {world_data['Indicator Code']}")


#extract world population data
df1 = pd.read_csv(data_world_population)
world_population = df1[df1['Country Name'] == 'World'].iloc[0]
print(f"  Indicator: {world_population['Indicator Name']}")
print(f"  Code: {world_population['Indicator Code']}")

‚úì File loaded: ..\..\data\usage\dataworld_bank_dataset.csv

DataFrame structure:
  Shape: (266, 70)
  Columns: ['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code', '1960', '1961', '1962', '1963', '1964', '1965']...

First row:
  Country Name Country Code                                    Indicator Name  \
0        Aruba          ABW  Individuals using the Internet (% of population)   

   Indicator Code  1960  1961  1962  1963  1964  1965  ...     2016   2017  \
0  IT.NET.USER.ZS   NaN   NaN   NaN   NaN   NaN   NaN  ...  93.5425  97.17   

   2018  2019  2020  2021  2022  2023  2024  Unnamed: 69  
0   NaN   NaN   NaN   NaN   NaN   NaN   NaN          NaN  

[1 rows x 70 columns]

‚úì 'World' data extracted
  Indicator: Individuals using the Internet (% of population)
  Code: IT.NET.USER.ZS
  Indicator: Population, total
  Code: SP.POP.TOTL


## 3. Definition of Global Population Parameter

In [4]:
# CONFIGURABLE PARAMETER: Global population in billions
world_population.head(n=10)

Country Name                  World
Country Code                    WLD
Indicator Name    Population, total
Indicator Code          SP.POP.TOTL
1960                     3021512598
1961                     3062768116
1962                     3117372187
1963                     3184063049
1964                     3251253200
1965                     3318997522
Name: 0, dtype: object

## 4. Calculation of Internet Users in Millions

In [5]:
# Identify year columns (numeric)
year_columns = [col for col in df.columns 
                if col not in ['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code']]

# Convert to sorted years
year_columns = sorted([int(year) for year in year_columns if year.isdigit()])
year_columns = [str(year) for year in year_columns]

print(f"Available years: {year_columns[0]} to {year_columns[-1]}")
print(f"Number of years: {len(year_columns)}")

# Extract percentages and convert to millions of users
percentages = []
users_millions = []
populations_millions = []

for year in year_columns:
    value = world_data[year]
    year_world_population = world_population[year]
    # Convert to number, ignore empty or invalid values
    try:
        percentage = float(value)
        # Population from CSV is in individuals, convert to millions
        population_in_millions = float(year_world_population) / 1_000_000
        # Formula: (percentage / 100) √ó population (in millions) = millions of users
        million_users = (percentage / 100) * population_in_millions
        percentages.append(percentage)
        users_millions.append(million_users)
        populations_millions.append(population_in_millions)
    except (ValueError, TypeError):
        percentages.append(np.nan)
        users_millions.append(np.nan)
        populations_millions.append(np.nan)

# Create a DataFrame for results
results_df = pd.DataFrame({
    'Year': year_columns,
    'Percentage (%)': percentages,
    'Population (millions)': populations_millions,
    'Users (millions)': users_millions
})

# Remove NaN values
results_df = results_df.dropna()

print(f"\n‚úì Data converted")
print(f"\nData overview (first and last points):")
print(pd.concat([results_df.head(3), results_df.tail(3)]).to_string(index=False))
print(f"\nSummary:")
print(f"  Min: {results_df['Users (millions)'].min():.1f} million ({results_df['Percentage (%)'].min():.2f}%)")
print(f"  Max: {results_df['Users (millions)'].max():.1f} million ({results_df['Percentage (%)'].max():.2f}%)")

Available years: 1960 to 2024
Number of years: 65

‚úì Data converted

Data overview (first and last points):
Year  Percentage (%)  Population (millions)  Users (millions)
2005            15.6            6575.841506       1025.831275
2006            17.2            6659.977025       1145.516048
2007            20.2            6744.489399       1362.386859
2022            63.7            7990.399768       5089.884652
2023            65.4            8064.976601       5274.494697
2024            67.6            8142.056446       5504.030157

Summary:
  Min: 1025.8 million (15.60%)
  Max: 5504.0 million (67.60%)


## 4b. Loading Statistica Dataset for Comparison

In [6]:
# Load Statistica dataset
statistica_path = Path('../../data/usage/statistica_dataset.csv')
df_statistica = pd.read_csv(statistica_path, sep=';')

print(f"‚úì Statistica dataset loaded: {statistica_path}")
print(f"  Shape: {df_statistica.shape}")
print(f"  Columns: {df_statistica.columns.tolist()}")
print(f"\nFirst rows:")
print(df_statistica.head())

# Extract and convert data
statistica_years = df_statistica['Date'].astype(int).values
statistica_users = df_statistica['nb_milions'].values

print(f"\n‚úì Statistica data extracted")
print(f"  Years: {statistica_years[0]} to {statistica_years[-1]}")
print(f"  Users: {statistica_users.min():.0f}M to {statistica_users.max():.0f}M")

‚úì Statistica dataset loaded: ..\..\data\usage\statistica_dataset.csv
  Shape: (20, 2)
  Columns: ['Date', 'nb_milions']

First rows:
   Date  nb_milions
0  2005        1023
1  2006        1147
2  2007        1367
3  2008        1545
4  2009        1727

‚úì Statistica data extracted
  Years: 2005 to 2024
  Users: 1023M to 5500M


## 5. Visualization with Annotations

In [7]:
gen_gap = True


# Create Plotly figure - Professional and interactive design with comparison
years_int = results_df['Year'].astype(int).values
users_values = results_df['Users (millions)'].values
percentages_values = results_df['Percentage (%)'].values

# Filter to exclude 2004 and 2025 (if present) - keep only 2005-2024
mask = (years_int >= 2005) & (years_int <= 2024)
years_int_filtered = years_int[mask]
users_values_filtered = users_values[mask]
percentages_values_filtered = percentages_values[mask]

# Create custom hover texts - Dataset 1
hover_texts_1 = []
for year, users, pct in zip(years_int_filtered, users_values_filtered, percentages_values_filtered):
    hover_text = f"<b>Year: {year}</b><br><b>Dataset 1 (World Bank)</b><br>Users: {users:,.0f}<br>Percentage: {pct:.2f}%"
    hover_texts_1.append(hover_text)

# Create Plotly figure
fig = go.Figure()

# Add Dataset 1 curve (World Bank)
fig.add_trace(go.Scatter(
    x=years_int_filtered,
    y=users_values_filtered,
    mode='lines+markers+text',
    name='Dataset 1 (World Bank)',
    line=dict(
        color='#1f77b4',
        width=4
    ),
    marker=dict(
        size=11,
        color='#1f77b4',
        line=dict(color='white', width=2)
    ),
    text=[f'{users:,.0f}' for users in users_values_filtered],
    textposition='top center',
    textfont=dict(size=11, color='#1f77b4', family='Arial Black'),
    hovertext=hover_texts_1,
    hoverinfo='skip',
    fill=None
))

# Add Dataset 2 curve (Statistica)
statistica_years_filtered = statistica_years[(statistica_years >= 2005) & (statistica_years <= 2024)]
statistica_users_filtered = statistica_users[(statistica_years >= 2005) & (statistica_years <= 2024)]

hover_texts_2 = []
for year, users in zip(statistica_years_filtered, statistica_users_filtered):
    hover_text = f"<b>Year: {year}</b><br><b>Dataset 2 (Statistica)</b><br>Users: {users:,.0f}"
    hover_texts_2.append(hover_text)

fig.add_trace(go.Scatter(
    x=statistica_years_filtered,
    y=statistica_users_filtered,
    mode='lines+markers+text',
    name='Dataset 2 (Statistica)',
    line=dict(
        color='#ff7f0e',
        width=4,
        dash='dash'
    ),
    marker=dict(
        size=11,
        color='#ff7f0e',
        symbol='diamond',
        line=dict(color='white', width=2)
    ),
    text=[f'{users:,.0f}' for users in statistica_users_filtered],
    textposition='bottom center',
    textfont=dict(size=11, color='#ff7f0e', family='Arial Black'),
    hovertext=hover_texts_2,
    hoverinfo='skip',
    fill=None
))

# Calculate average gap between the two curves (on common years)
common_years_list = sorted(list(set(years_int_filtered) & set(statistica_years_filtered)))
gaps_by_year = {}
gaps = []

for year in common_years_list:
    idx1 = np.where(years_int_filtered == year)[0][0]
    idx2 = np.where(statistica_years_filtered == year)[0][0]
    val1 = users_values_filtered[idx1]
    val2 = statistica_users_filtered[idx2]
    gap_pct = abs((val1 - val2) / val2) * 100
    gaps_by_year[year] = {
        'val1': val1,
        'val2': val2,
        'gap_pct': gap_pct
    }
    gaps.append(gap_pct)

average_gap = np.mean(gaps) if gaps else 0
max_gap = np.max(gaps) if gaps else 0

# Add annotations for gaps (lines and labels)
annotations_list = []

for year in common_years_list:
    data = gaps_by_year[year]
    val1 = data['val1']
    val2 = data['val2']
    gap_pct = data['gap_pct']
    
    # Determine position of vertical line
    y_min = min(val1, val2)
    y_max = max(val1, val2)
    y_mid = (y_min + y_max) / 2
    
    # Add vertical line connecting the two curves
    fig.add_shape(
        type='line',
        x0=year, y0=y_min,
        x1=year, y1=y_max,
        line=dict(color='rgba(200, 100, 100, 0.5)', width=2, dash='dot')
    )
    
    if gen_gap:
        # Add gap percentage annotation
        annotations_list.append(
            dict(
                x=year,
                y=y_mid,
                text=f'<b>{gap_pct:.1f}%</b>',
                showarrow=False,
                xanchor='center',
                yanchor='middle',
                font=dict(size=9, color='#c86464', family='Arial Black'),
                bgcolor='rgba(255, 255, 255, 0.8)',
                borderpad=2
            )
    )

# Professional layout - CORRECTED for Plotly 5.x
fig.update_layout(
    title={
        'text': f'<b>Comparison: Global Internet Users Growth (in millions) 2005-2024</b><br><sub>Average gap between sources: {average_gap:.1f}% | Maximum gap: {max_gap:.1f}%</sub>',
        'x': 0.5,
        'xanchor': 'center',
        'font': {'size': 22, 'color': '#222222', 'family': 'Arial'}
    },
    xaxis=dict(
        title=dict(text='<b>Year</b>', font=dict(size=18, color='#333333', family='Arial')),
        tickfont=dict(size=14, color='#333333'),
        showgrid=True,
        gridwidth=1,
        gridcolor='#e8e8e8',
        zeroline=False,
        dtick=1,
        showline=True,
        linewidth=2,
        linecolor='#cccccc',
        range=[2004.5, 2024.5]
    ),
    yaxis=dict(
        title=dict(text='<b>Internet Users (Millions)</b>', font=dict(size=18, color='#333333', family='Arial')),
        tickfont=dict(size=14, color='#333333'),
        showgrid=True,
        gridwidth=1,
        gridcolor='#e8e8e8',
        zeroline=False,
        showline=True,
        linewidth=2,
        linecolor='#cccccc',
        tickformat=','
    ),
    plot_bgcolor='#ffffff',
    paper_bgcolor='#ffffff',
    hovermode='closest',
    showlegend=True,
    legend=dict(
        x=0.02,
        y=0.98,
        bgcolor='rgba(255, 255, 255, 0.95)',
        bordercolor='#cccccc',
        borderwidth=2,
        font=dict(size=14, family='Arial')
    ),
    annotations=annotations_list,
    width=1400,
    height=1000,
    margin=dict(l=100, r=100, t=180, b=100)
)

if gen_gap:
    # Save as interactive HTML
    fig.write_html('./outputs/internet_users_comparison_withgaps_interactive.html')
    print("\n‚úì Comparison chart generated and saved: internet_users_comparison_withgaps_interactive.html")

    # Save as static image
    fig.write_image('./outputs/internet_users_comparison_withgaps.png', width=1400, height=1000, scale=2)
    print("‚úì Comparison chart generated and saved: internet_users_comparison_withgaps.png")
    
else:
    # Save as interactive HTML
    fig.write_html('./outputs/internet_users_comparison_interactive.html')
    print("\n‚úì Comparison chart generated and saved: internet_users_comparison_interactive.html")

    # Save as static image
    fig.write_image('./outputs/internet_users_comparison.png', width=1400, height=1000, scale=2)
    print("‚úì Comparison chart generated and saved: internet_users_comparison.png")

print("\nüìä COMPARATIVE RESULTS:")
print(f"  ‚úì Interactive version (HTML): internet_users_comparison_interactive.html")
print(f"  ‚úì Static version (PNG): internet_users_comparison.png")
print(f"  ‚úì Average gap between sources: {average_gap:.2f}%")
print(f"  ‚úì Maximum gap observed: {max_gap:.2f}%")
print(f"  ‚úì Common years with gap annotations: {len(common_years_list)}")
print(f"  ‚úì Year range: 2005-2024")
print(f"  ‚úì Image width: 1400px (compact format)")
print("\nüìà Gap by year:")
for year in common_years_list:
    print(f"   {year}: {gaps_by_year[year]['gap_pct']:.1f}%")
print("\n‚úÖ Comparison charts generated successfully!")

fig.show()


‚úì Comparison chart generated and saved: internet_users_comparison_withgaps_interactive.html
‚úì Comparison chart generated and saved: internet_users_comparison_withgaps.png

üìä COMPARATIVE RESULTS:
  ‚úì Interactive version (HTML): internet_users_comparison_interactive.html
  ‚úì Static version (PNG): internet_users_comparison.png
  ‚úì Average gap between sources: 0.60%
  ‚úì Maximum gap observed: 3.96%
  ‚úì Common years with gap annotations: 20
  ‚úì Year range: 2005-2024
  ‚úì Image width: 1400px (compact format)

üìà Gap by year:
   2005: 0.3%
   2006: 0.1%
   2007: 0.3%
   2008: 0.8%
   2009: 1.3%
   2010: 0.4%
   2011: 0.7%
   2012: 0.1%
   2013: 0.1%
   2014: 0.0%
   2015: 0.3%
   2016: 0.2%
   2017: 0.1%
   2018: 0.1%
   2019: 0.1%
   2020: 0.4%
   2021: 0.3%
   2022: 4.0%
   2023: 2.3%
   2024: 0.1%

‚úÖ Comparison charts generated successfully!


## 6. Statistical Summary

In [8]:
print("="*70)
print("STATISTICAL SUMMARY - GLOBAL INTERNET USERS GROWTH")
print("="*70)

# Key years
first_year = results_df.iloc[0]
last_year = results_df.iloc[-1]
max_growth = results_df.loc[results_df['Users (millions)'].idxmax()]

print(f"\nüìä ANALYSIS PERIOD: {first_year['Year']} - {last_year['Year']}")
print(f"   Global population used: Dynamic (from World Bank data)")

print(f"\n{first_year['Year']} (Beginning):")
print(f"   Population: {first_year['Population (millions)']:,.0f} million")
print(f"   Percentage: {first_year['Percentage (%)']:.2f}%")
print(f"   Users: {first_year['Users (millions)']:,.1f} million")

print(f"\n{last_year['Year']} (End):")
print(f"   Population: {last_year['Population (millions)']:,.0f} million")
print(f"   Percentage: {last_year['Percentage (%)']:.2f}%")
print(f"   Users: {last_year['Users (millions)']:,.1f} million")

absolute_growth = last_year['Users (millions)'] - first_year['Users (millions)']
relative_growth = ((last_year['Users (millions)'] / first_year['Users (millions)']) - 1) * 100

print(f"\nüìà GROWTH ({first_year['Year']}-{last_year['Year']}):")
print(f"   Absolute increase: {absolute_growth:,.1f} million users")
print(f"   Relative increase: {relative_growth:.1f}%")
print(f"   Percentage increase: {last_year['Percentage (%)'] - first_year['Percentage (%)']:.2f} points")

print(f"\nüéØ RECORD:")
print(f"   Year: {int(max_growth['Year'])}")
print(f"   Maximum: {max_growth['Users (millions)']:,.1f} million ({max_growth['Percentage (%)']:.2f}%)")
print(f"   Population that year: {max_growth['Population (millions)']:,.0f} million")

# Average growth rate per year
annual_growth_rate = (last_year['Users (millions)'] - first_year['Users (millions)']) / len(results_df)
print(f"\n‚è±Ô∏è  AVERAGE GROWTH RATE:")
print(f"   +{annual_growth_rate:,.1f} million users per year")

print("\n" + "="*70)

STATISTICAL SUMMARY - GLOBAL INTERNET USERS GROWTH

üìä ANALYSIS PERIOD: 2005 - 2024
   Global population used: Dynamic (from World Bank data)

2005 (Beginning):
   Population: 6,576 million
   Percentage: 15.60%
   Users: 1,025.8 million

2024 (End):
   Population: 8,142 million
   Percentage: 67.60%
   Users: 5,504.0 million

üìà GROWTH (2005-2024):
   Absolute increase: 4,478.2 million users
   Relative increase: 436.5%
   Percentage increase: 52.00 points

üéØ RECORD:
   Year: 2024
   Maximum: 5,504.0 million (67.60%)
   Population that year: 8,142 million

‚è±Ô∏è  AVERAGE GROWTH RATE:
   +223.9 million users per year

