### 1. Importing necessary Libraries

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import warnings
warnings.filterwarnings("ignore")

### 2. Load the data

In [2]:
#If loading the dataset remotely
url = "https://data.un.org/_Docs/SYB/CSV/SYB66_314_202310_Internet%20Usage.csv"
internet_usage = pd.read_csv(url, skiprows=1, encoding='latin1')

#If loading the dataset locally
#internet_usage = pd.read_csv("internet_usage.csv", skiprows=1, encoding='latin1')
internet_usage.head()

Unnamed: 0,Region/Country/Area,Unnamed: 1,Year,Series,Value,Footnotes,Source
0,1,"Total, all countries or areas",2000,Percentage of individuals using the internet,5.3,,"International Telecommunication Union (ITU), G..."
1,1,"Total, all countries or areas",2005,Percentage of individuals using the internet,15.6,,"International Telecommunication Union (ITU), G..."
2,1,"Total, all countries or areas",2010,Percentage of individuals using the internet,28.5,,"International Telecommunication Union (ITU), G..."
3,1,"Total, all countries or areas",2015,Percentage of individuals using the internet,40.0,,"International Telecommunication Union (ITU), G..."
4,1,"Total, all countries or areas",2019,Percentage of individuals using the internet,53.7,,"International Telecommunication Union (ITU), G..."


In [3]:
internet_usage = internet_usage.drop(["Series", "Footnotes", "Source", "Region/Country/Area"], axis = 1)
internet_usage.rename(columns = {"Unnamed: 1": "Region_Country_Area", "Value" : "internet_usage(%)"}, inplace = True)
internet_usage.head()

Unnamed: 0,Region_Country_Area,Year,internet_usage(%)
0,"Total, all countries or areas",2000,5.3
1,"Total, all countries or areas",2005,15.6
2,"Total, all countries or areas",2010,28.5
3,"Total, all countries or areas",2015,40.0
4,"Total, all countries or areas",2019,53.7


In [4]:
#Splitting the dataset into 3 parts: 	Region, Country and Area

collective_internet_usage = pd.DataFrame(internet_usage.iloc[:139])
# display(collective_internet_usage.head(10))
country_internet_usage = pd.DataFrame(internet_usage.iloc[139:])
# ldc_internet_usage = pd.DataFrame(internet_usage.iloc[1521:])

# Displaying  the shape of each dataframe
print("Regional DataFrame's Shape: ",collective_internet_usage.shape)
print("\n=============================================================\n")

print("Countrywide DataFrame's Shape: ",country_internet_usage.shape)
# print("\n=============================================================\n")

# print("Area DataFrame's Shape: ",ldc_internet_usage.shape)



Regional DataFrame's Shape:  (139, 3)


Countrywide DataFrame's Shape:  (1389, 3)


In [5]:
# Checking if no row of one dataframe is in another
def check_no_common_rows(df1,df2):
    for element in df1['Region_Country_Area'].unique():
        if element in df2['Region_Country_Area'].unique():
            print("Common Element: ",element)
        else:
            return


check_no_common_rows(collective_internet_usage,country_internet_usage)


### 3. Answering the Questions

 1. Trend Analysis Over Time
   - Primary Question: How has internet usage changed globally and across regions over time?
     - Sub-questions:
       - How does internet usage in Africa compare to other continents over time?
       - What is the internet usage trend in Different African Regions?
       - Which regions show the highest and lowest internet usage growth rates?
       - Are there any notable inflection points or periods of accelerated growth in internet adoption across different regions?
       - How has global average internet usage evolved from 2000 to 2021?


In [6]:
# all regions in collective_internet_usage
regions = collective_internet_usage['Region_Country_Area'].unique()
print("Regions: ",regions)

Regions:  ['Total, all countries or areas' 'Northern Africa' 'Sub-Saharan Africa'
 'Eastern Africa' 'Middle Africa' 'Southern Africa' 'Western Africa'
 'Northern America' 'Latin America & the Caribbean' 'Caribbean'
 'Central Asia' 'Eastern Asia' 'South-central Asia' 'South-eastern Asia'
 'Southern Asia' 'Western Asia' 'Europe' 'Oceania'
 'Australia and New Zealand' 'Micronesia']


In [7]:
# How does internet usage in Africa compare to other continents over time?
african_internet = collective_internet_usage[
    collective_internet_usage['Region_Country_Area'].isin([
        'Sub-Saharan Africa','Northern Africa','Eastern Africa','Middle Africa','Southern Africa','Western Africa'
        ])]


north_america_internet = collective_internet_usage[
    collective_internet_usage['Region_Country_Area']=='Northern America'
    ]

latin_america_caribbean_internet = collective_internet_usage[
    collective_internet_usage['Region_Country_Area'].isin([['Latin America & the Caribbean','Caribbean']])
    ]



asia_internet = collective_internet_usage[
    collective_internet_usage['Region_Country_Area'].isin([
        'Eastern Asia','South-eastern Asia','South-central Asia','Western Asia','Central Asia','Southern Asia'
        ])]

europe_internet = collective_internet_usage[
    collective_internet_usage['Region_Country_Area'].isin([
       'Europe'
        ])]

Oceania_internet = collective_internet_usage[
    collective_internet_usage['Region_Country_Area'] == 'Oceania'
    ]


australia_newzealand_internet = collective_internet_usage[
    collective_internet_usage['Region_Country_Area'] == 'Australia and New Zealand'
    ]

Micronesia_internet = collective_internet_usage[
    collective_internet_usage['Region_Country_Area'] == 'Micronesia'
    ]

In [8]:
# Concatenating data from all continents
continent_data = pd.concat([african_internet, north_america_internet, asia_internet, europe_internet, Oceania_internet, australia_newzealand_internet, Micronesia_internet])

# Assigning a continent label based on the Region_Country_Area
continent_data['Continent'] = continent_data['Region_Country_Area'].apply(lambda x: 'Africa' if 'Africa' in x else
                                                                           'North America' if 'Northern America' in x else
                                                                           'Asia' if any(region in x for region in ['Central Asia', 'Eastern Asia', 'South-central Asia', 'South-eastern Asia', 'Southern Asia', 'Western Asia']) else
                                                                           'Europe' if 'Europe' in x else
                                                                           'Oceania' if any(region in x for region in ['Oceania', 'Australia and New Zealand', 'Micronesia']) else
                                                                           'Other')

# Grouping data by continent and year to calculate the mean internet usage for each year
mean_internet_usage = continent_data.groupby(['Continent', 'Year'])['internet_usage(%)'].mean().reset_index()

# global average for each year
global_avg = collective_internet_usage[collective_internet_usage['Region_Country_Area'] == 'Total, all countries or areas']



# Plotting the data using Plotly
fig = px.line(
    mean_internet_usage,
    x='Year',
    y='internet_usage(%)',
    color='Continent',
    markers=True,
    title='Mean Internet Usage Over Time by Continent and Global Average'
)
fig.add_trace(
    go.Line(
        x=global_avg['Year'],
        y=global_avg['internet_usage(%)'],
        mode='lines',
        name='Global Average',
        line=dict(color='black', width=3, dash='dash')
    )
)

fig.show()



plotly.graph_objs.Line is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.scatter.Line
  - plotly.graph_objs.layout.shape.Line
  - etc.




* **Different African Regions**

In [9]:
fig = px.line(
    african_internet,
    x='Year',
    y='internet_usage(%)',
    color='Region_Country_Area',
    markers=True,
    title='Internet Usage Over Time in Africa by Region'
)

fig.show()

2. Descriptive Statistics by Region and Country
   - Primary Question: What statistical measures describe internet usage distribution?
     - Sub-questions:
       - How do internet usage rates compare between different world regions?
       - What is the mean, median, and standard deviation of internet usage per region?
       - Which regions have the highest and lowest percentages of internet users as of the latest data?


In [10]:
continent_data

Unnamed: 0,Region_Country_Area,Year,internet_usage(%),Continent
7,Northern Africa,2000,0.6,Africa
8,Northern Africa,2005,9.7,Africa
9,Northern Africa,2010,23.2,Africa
10,Northern Africa,2015,36.6,Africa
11,Northern Africa,2019,56.0,Africa
...,...,...,...,...
134,Micronesia,2010,20.0,Oceania
135,Micronesia,2015,31.5,Oceania
136,Micronesia,2019,39.9,Oceania
137,Micronesia,2020,40.1,Oceania


In [11]:
# How do internet usage rates compare between different world regions?
px.bar(
    mean_internet_usage,
    x='internet_usage(%)',
    color='Continent',
    hover_data=['Year'],
    title='Bar chart: Internet Usage by Continent'
)

- Africa is the continent with the lowest internet usage rates, while North America has the highest rates.
- Europe is the second to North America in terms of internet usage rates.
- Oceania has the third highest internet usage rates.

In [12]:
# What is the mean, median, and standard deviation of internet usage per region?
internet_usage_stats = continent_data.groupby('Continent')['internet_usage(%)'].agg(['mean', 'median', 'std']).reset_index()
internet_usage_stats

Unnamed: 0,Continent,mean,median,std
0,Africa,23.454762,18.05,22.474161
1,Asia,33.990476,32.4,25.600113
2,Europe,71.966667,78.45,18.273223
3,North America,77.228571,76.1,14.891912
4,Oceania,56.090476,56.9,27.686638


- North America is the continent with low STD, which means that the internet usage is more consistent across the years.
- This also means that all nations In North America have a similar internet usage rate. There is no big gaps
- And its mean is the highest among all continents, which means that it has the highest internet usage rate.
#

- On the other hand, Asia is the continet with highest STD, which means that the internet usage is not equally distributed among its nations.
- Afirca has the lowest mean, which means that it has the lowest internet usage rate among all her continents.

In [13]:
# Which regions have the highest and lowest percentages of internet users as of the latest data?
latest_data = continent_data[continent_data['Year'] == 2021]
latest_data = latest_data.sort_values('internet_usage(%)', ascending=False)


latest_data_bar= px.bar(
    latest_data,
    x='internet_usage(%)',
    y='Region_Country_Area',
    color='Continent',
    title='Bar chart: Internet Usage by Region in 2021'
)

latest_data_bar.show()

### Question 3


 3. Outlier and Anomaly Detection
   - Primary Question: Are there any unusual patterns in internet usage data across countries and regions?
     - Sub-questions:
       - Which countries or regions have achieved near-universal internet access (90% and above) by 2021, and when did they reach this threshold?
       - Are there anomalies or outliers, such as countries with declining internet usage?


In [14]:
# country_internet_usage.head()

# countries with (>=90%) internet usage in 2021
high_internet_usage_2021 = country_internet_usage[(country_internet_usage['Year'] == 2021) & (country_internet_usage['internet_usage(%)'] >= 90)]
print("Number of countries with internet usage >= 90% in 2021: ", high_internet_usage_2021.shape[0])
print()
print("List of them:")
print('\n=============================================================\n')
for country in high_internet_usage_2021.sort_values('internet_usage(%)', ascending=False)['Region_Country_Area'].values:
    print(f'{country} : {high_internet_usage_2021[high_internet_usage_2021["Region_Country_Area"] == country]["internet_usage(%)"].values[0]}%')

Number of countries with internet usage >= 90% in 2021:  40

List of them:


Saudi Arabia : 100.0%
Qatar : 100.0%
Bahrain : 100.0%
United Arab Emirates : 100.0%
Kuwait : 99.7%
Iceland : 99.7%
Norway : 99.0%
Denmark : 98.9%
Luxembourg : 98.7%
Brunei Darussalam : 98.1%
Republic of Korea : 97.6%
Malaysia : 96.8%
United Kingdom : 96.7%
Oman : 96.4%
Australia : 96.2%
New Zealand : 95.9%
Antigua and Barbuda : 95.7%
Switzerland : 95.6%
Liechtenstein : 95.6%
Ireland : 95.2%
Bahamas : 94.3%
Spain : 93.9%
Andorra : 93.9%
China, Hong Kong SAR : 93.1%
Canada : 92.8%
Finland : 92.8%
Belgium : 92.8%
Austria : 92.5%
Netherlands (Kingdom of the) : 92.1%
United States of America : 91.8%
Germany : 91.4%
Latvia : 91.2%
Singapore : 91.1%
Estonia : 91.0%
Kazakhstan : 90.9%
Cyprus : 90.8%
Israel : 90.3%
Chile : 90.2%
Other non-specified areas : 90.1%
Uruguay : 90.1%


In [15]:

# Plotting the data using Plotly
fig_high_internet_2021 = px.choropleth(
    high_internet_usage_2021,
    locations='Region_Country_Area',
    locationmode='country names',
    color='internet_usage(%)',
    title='Countries with >=90% Internet Usage in 2021',
    width=1050, height=800
)

fig_high_internet_2021.show()

* **- Are there anomalies or outliers, such as countries with declining internet usage?**

In [16]:

def decline_in_internet(df):
    result_list = []
    
    # Iterate over each unique country
    for country in df['Region_Country_Area'].unique():
        # Filter the data for that specific country and sort by year
        country_data = df[df['Region_Country_Area'] == country].sort_values(by='Year').reset_index(drop=True)
        
        # Iterate over the rows and compare the current year with the previous year
        for i in range(1, len(country_data)):
            if country_data['internet_usage(%)'].iloc[i] < country_data['internet_usage(%)'].iloc[i - 1]:
                # Append both records (the high and the low)
                result_list.append({
                    'Region_Country_Area': country,
                    'Year_High': country_data['Year'].iloc[i - 1],
                    'internet_usage_High(%)': country_data['internet_usage(%)'].iloc[i - 1],
                    'Year_Low': country_data['Year'].iloc[i],
                    'internet_usage_Low(%)': country_data['internet_usage(%)'].iloc[i]
                })
                # Break to ensure only one instance of a decrease is added per country
                break

    # Convert result to a DataFrame
    return pd.DataFrame(result_list)

# Assuming 'country_internet_usage' is the DataFrame with your data
decline_df = decline_in_internet(country_internet_usage)

# Display the result
# 
decline_df


Unnamed: 0,Region_Country_Area,Year_High,internet_usage_High(%),Year_Low,internet_usage_Low(%)
0,Australia,2020,96.4,2021,96.2
1,Austria,2019,87.8,2020,87.5
2,Barbados,2019,83.2,2020,82.5
3,Brazil,2020,81.3,2021,80.7
4,Costa Rica,2019,81.2,2020,80.5
5,Croatia,2019,79.1,2020,78.3
6,Denmark,2019,98.0,2020,96.5
7,Estonia,2019,90.2,2020,89.1
8,Finland,2010,86.9,2015,86.4
9,Japan,2019,92.7,2020,90.2


In [17]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Create a color mapping for each unique country
unique_countries = decline_df['Region_Country_Area'].unique()
color_mapping = {country: f'rgba({hash(country) % 255}, {(hash(country) // 2) % 255}, {(hash(country) // 3) % 255}, 0.8)' for country in unique_countries}

# Apply the color mapping to the DataFrame
decline_df['Color'] = decline_df['Region_Country_Area'].map(color_mapping)

# Create subplots with 1 row and 2 columns
fig = make_subplots(rows=1, cols=2, subplot_titles=("Year High (Left)", "Year Low (Right)"))

# Add the bar plot for high internet usage
fig.add_trace(
    go.Bar(
        x=decline_df['Year_High'],
        y=decline_df['internet_usage_High(%)'],
        name='High Internet Usage',
        marker_color=decline_df['Color'],  # Assign color based on the mapping
        text=decline_df['Region_Country_Area'],
        hovertemplate='Country: %{text}<br>Year: %{x}<br>Internet Usage: %{y}%',
    ),
    row=1, col=1
)

# Add the bar plot for low internet usage
fig.add_trace(
    go.Bar(
        x=decline_df['Year_Low'],
        y=decline_df['internet_usage_Low(%)'],
        name='Low Internet Usage',
        marker_color=decline_df['Color'],  # Assign color based on the mapping
        text=decline_df['Region_Country_Area'],
        hovertemplate='Country: %{text}<br>Year: %{x}<br>Internet Usage: %{y}%',
    ),
    row=1, col=2
)

# Update layout for a shared color legend
fig.update_layout(
    title_text="Shift from High to Low Internet Usage by Country",
    showlegend=False,  # To avoid duplicate legends
    xaxis_title="Year",
    yaxis_title="Internet Usage (%)"
)
fig.update_yaxes(showticklabels=False) # Hide the y-axis labels

# Show the plot
fig.show()


*Possible causes of the shifts from 2019 to 2020*

- Probably due to COVID-19 economic disruptions, people were not able to pay for internet services due to job losses
- Governments and organizations may have shifted in priorities (to healthcare and emergencies)
- The increased demand for internet usage during lockdowns could have led to strained and overwhelmed networks, particularly in regions with less robust infrastructure

- The reported data for some regions might have been based on estimates rather than accurate measures, and those estimates could have varied significantly between 2019 and 2020


## Question 4

 4. Impact Analysis of COVID-19 on Internet Usage
   - Primary Question: How did the COVID-19 pandemic affect global internet usage?
     - Sub-questions:
       - What was the change in global internet usage rates between 2019 and 2021?
       - Are there any regions or countries where internet adoption accelerated due to the pandemic?


**What was the change in global internet usage rates between 2019 and 2021?**

In [18]:
collective_internet_usage.head(10)

global_rate_2019 = collective_internet_usage[(collective_internet_usage['Year'] == 2019) 
                                             & (collective_internet_usage['Region_Country_Area'] == 'Total, all countries or areas')]['internet_usage(%)'].values[0]



global_rate_2021 = collective_internet_usage[(collective_internet_usage['Year'] == 2021)
                                             & (collective_internet_usage['Region_Country_Area'] == 'Total, all countries or areas')]['internet_usage(%)'].values[0]



percentage_change = ((global_rate_2021 - global_rate_2019) / global_rate_2019) * 100

print(f'Global Internet Usage Rate in 2019: {global_rate_2019}%')
print(f'Global Internet Usage Rate in 2021: {global_rate_2021}%')
print(f'Percentage Change: {percentage_change:.2f}%')

Global Internet Usage Rate in 2019: 53.7%
Global Internet Usage Rate in 2021: 62.6%
Percentage Change: 16.57%


* **

**Possible reasons for this substantial change (2019 to 2021)**


- The COVID-19 pandemic in 2020 forced most of the businesses, education to shift toward the remote work/online education.
- People were stayign at home which led to use of online services
- Expansion of 5G networks
- Many countries could have launched startups to boost internet connectivity.
- Most of university students shifted to E-learning platforms.
- Increase in Online Transactions due to Lockdowns.

* **

**Are there any regions or countries where internet adoption accelerated due to the pandemic?**

Yes! , As it can be seen from above substantial global shift, countries have exprienced a big shift of 16.57% globally.

In [19]:
# Filter out groups that do not have both years
filtered_data = country_internet_usage[country_internet_usage['Year'].isin([2019, 2021])]
filtered_data = filtered_data.groupby('Region_Country_Area').filter(lambda x: len(x) == 2) # to ensure that only countries with data for both 2019 and 2021 are kept in the DataFrame

positive_increase_19_21 = filtered_data.groupby('Region_Country_Area')['internet_usage(%)'].apply(lambda x: x.iloc[1] > x.iloc[0]).sum()
decrease_19_21 = filtered_data.groupby('Region_Country_Area')['internet_usage(%)'].apply(lambda x: x.iloc[1] < x.iloc[0]).sum()

print(f'Number of countries with a positive increase in internet usage from 2019 to 2021: {positive_increase_19_21}')
print(f'Number of countries with a decrease internet usage from 2019 to 2021: {decrease_19_21}')


# Getting countries with negative increase in internet usage from 2019 to 2021
countries_with_decrease = filtered_data.groupby('Region_Country_Area')['internet_usage(%)'].apply(lambda x: x.iloc[1] < x.iloc[0])
print("Countries with decrease internet usage from 2019 to 2021:")
print(countries_with_decrease[countries_with_decrease].index.tolist())


Number of countries with a positive increase in internet usage from 2019 to 2021: 174
Number of countries with a decrease internet usage from 2019 to 2021: 4
Countries with decrease internet usage from 2019 to 2021:
['Japan', 'Netherlands (Kingdom of the)', 'Saint Kitts and Nevis', 'Sweden']


* **

* The exption for these four countries could be probably due to:
    - Economic factors (decline) due to COVID-19
    - Infrastructure limitations due to high congestions
    - Inacurracies in recorded data.

## Question 5

 5. Regional Growth Analysis and Forecasting
   - Primary Question: How has internet penetration evolved regionally, and what is the future forecast?
     - Sub-questions:
       - What are the fastest-growing regions in terms of internet adoption over the recorded ten years?
       - What is the expected internet penetration per Region by the end of 2024?


In [29]:

def get_fastest_countries(df):
    """
    Identifies countries with the fastest growth in internet adoption.
    
    Parameters:
    df (pandas.DataFrame): DataFrame with columns 'Region_Country_Area', 'Year', 'internet_usage(%)'
    
    Returns:
    list: Top 10 fastest-growing countries with their growth percentages
    """
    # First, let's check for and handle any missing values
    df = df.copy()
    
    # Ensure the data is sorted by Year for each country
    df = df.sort_values(['Region_Country_Area', 'Year'])
    
    # Get the number of unique years in the dataset
    n_years = df['Year'].nunique()
    
    # Filter for countries that have data for all years
    complete_data = df.groupby('Region_Country_Area').filter(
        lambda x: len(x) == n_years and x['internet_usage(%)'].notna().all()
    )
    
    if complete_data.empty:
        print("No countries with complete data found!")
        return []
    
    # Calculate the difference between last and first year
    results = []
    for country in complete_data['Region_Country_Area'].unique():
        country_data = complete_data[complete_data['Region_Country_Area'] == country]
        first_value = country_data['internet_usage(%)'].iloc[0]
        last_value = country_data['internet_usage(%)'].iloc[-1]
        
        if pd.notna(first_value) and pd.notna(last_value):
            difference = last_value - first_value
            results.append({
                'country': country,
                'growth': difference,
                'first_year': country_data['Year'].iloc[0],
                'last_year': country_data['Year'].iloc[-1],
                'start_value': first_value,
                'end_value': last_value
            })
    
    # Sort results by growth and get top 10
    results = sorted(results, key=lambda x: x['growth'], reverse=True)[:10]
    
    # Print detailed results
    for r in results:
        print(f"{r['country']}: {r['growth']:.2f}% increase "
              f"({r['first_year']} to {r['last_year']}: "
              f"{r['start_value']:.1f}% → {r['end_value']:.1f}%)")
    
    return results

# Example usage:
get_fastest_countries(country_internet_usage)

Saudi Arabia: 97.80% increase (2000 to 2021: 2.2% → 100.0%)
Qatar: 95.10% increase (2000 to 2021: 4.9% → 100.0%)
Bahrain: 93.80% increase (2000 to 2021: 6.2% → 100.0%)
Kuwait: 93.00% increase (2000 to 2021: 6.7% → 99.7%)
Oman: 92.90% increase (2000 to 2021: 3.5% → 96.4%)
Kazakhstan: 90.20% increase (2000 to 2021: 0.7% → 90.9%)
Antigua and Barbuda: 89.20% increase (2000 to 2021: 6.5% → 95.7%)
Brunei Darussalam: 89.10% increase (2000 to 2021: 9.0% → 98.1%)
Morocco: 87.40% increase (2000 to 2021: 0.7% → 88.1%)
Bahamas: 86.30% increase (2000 to 2021: 8.0% → 94.3%)


[{'country': 'Saudi Arabia',
  'growth': 97.8,
  'first_year': 2000,
  'last_year': 2021,
  'start_value': 2.2,
  'end_value': 100.0},
 {'country': 'Qatar',
  'growth': 95.1,
  'first_year': 2000,
  'last_year': 2021,
  'start_value': 4.9,
  'end_value': 100.0},
 {'country': 'Bahrain',
  'growth': 93.8,
  'first_year': 2000,
  'last_year': 2021,
  'start_value': 6.2,
  'end_value': 100.0},
 {'country': 'Kuwait',
  'growth': 93.0,
  'first_year': 2000,
  'last_year': 2021,
  'start_value': 6.7,
  'end_value': 99.7},
 {'country': 'Oman',
  'growth': 92.9,
  'first_year': 2000,
  'last_year': 2021,
  'start_value': 3.5,
  'end_value': 96.4},
 {'country': 'Kazakhstan',
  'growth': 90.2,
  'first_year': 2000,
  'last_year': 2021,
  'start_value': 0.7,
  'end_value': 90.9},
 {'country': 'Antigua and Barbuda',
  'growth': 89.2,
  'first_year': 2000,
  'last_year': 2021,
  'start_value': 6.5,
  'end_value': 95.7},
 {'country': 'Brunei Darussalam',
  'growth': 89.1,
  'first_year': 2000,
  'las

In [34]:
def plot_internet_usage_comparison(df):
    """
    Creates a horizontal bar chart comparing internet usage between first and last year for each country,
    sorted by total values (highest at bottom).
    """
    # Get the results from our previous function
    results = get_fastest_countries(df)
    
    # Sort results by end_value (total internet usage)
    results = sorted(results, key=lambda x: x['end_value'])
    
    # Create lists for plotting
    countries = [r['country'] for r in results]
    start_values = [r['start_value'] for r in results]
    growth_values = [r['growth'] for r in results]
    end_values = [r['end_value'] for r in results]
    
    # Create hover texts
    hover_text_start = [
        f"Initial Year ({r['first_year']}): {r['start_value']:.1f}%"
        for r in results
    ]
    hover_text_growth = [
        f"Growth: +{r['growth']:.1f}%<br>"
        f"Final Year ({r['last_year']}): {r['end_value']:.1f}%"
        for r in results
    ]
    
    # Create the figure
    fig = go.Figure()
    
    # Add initial value bars
    fig.add_trace(go.Bar(
        y=countries,
        x=start_values,
        orientation='h',
        name='Initial Usage',
        text=[f"{v:.1f}%" for v in start_values],
        textposition='auto',
        hovertext=hover_text_start,
        marker_color='rgb(158,202,225)',
        opacity=0.8
    ))
    
    # Add growth bars
    fig.add_trace(go.Bar(
        y=countries,
        x=growth_values,
        orientation='h',
        name='Growth',
        text=[f"+{v:.1f}%" for v in growth_values],
        textposition='auto',
        hovertext=hover_text_growth,
        marker_color='rgb(26,118,255)',
        opacity=0.8
    ))
    
    # Update the layout
    fig.update_layout(
        title={
            'text': 'Internet Usage Growth by Country (Sorted by Total Usage)',
            'y':0.95,
            'x':0.5,
            'xanchor': 'center',
            'yanchor': 'top'
        },
        xaxis_title="Internet Usage (%)",
        yaxis_title="Country",
        xaxis_ticksuffix="%",
        barmode='stack',
        template='plotly_white',
        height=600,
        margin=dict(t=100, l=200, r=40, b=80),
        showlegend=True,
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1
        )
    )
    
    # Add annotations for final values
    for i, r in enumerate(results):
        fig.add_annotation(
            x=r['end_value'],
            y=i,
            text=f"Total: {r['end_value']:.1f}%",
            showarrow=True,
            arrowhead=2,
            arrowsize=1,
            arrowwidth=2,
            arrowcolor="#636363",
            ax=40,
            ay=0,
            font=dict(size=12),
            xanchor="left"
        )
    
    return fig

# Use the function
fig = plot_internet_usage_comparison(country_internet_usage)
fig.show()

Saudi Arabia: 97.80% increase (2000 to 2021: 2.2% → 100.0%)
Qatar: 95.10% increase (2000 to 2021: 4.9% → 100.0%)
Bahrain: 93.80% increase (2000 to 2021: 6.2% → 100.0%)
Kuwait: 93.00% increase (2000 to 2021: 6.7% → 99.7%)
Oman: 92.90% increase (2000 to 2021: 3.5% → 96.4%)
Kazakhstan: 90.20% increase (2000 to 2021: 0.7% → 90.9%)
Antigua and Barbuda: 89.20% increase (2000 to 2021: 6.5% → 95.7%)
Brunei Darussalam: 89.10% increase (2000 to 2021: 9.0% → 98.1%)
Morocco: 87.40% increase (2000 to 2021: 0.7% → 88.1%)
Bahamas: 86.30% increase (2000 to 2021: 8.0% → 94.3%)


**What is the expected internet penetration per Region by the end of 2024?**

In [60]:
from sklearn.linear_model import LinearRegression

def forecast_internet_usage(df, region, end_year=2024):
    """
    Forecasts internet usage for a given region up to the specified end year using linear regression.
    
    Parameters:
    df (pandas.DataFrame): DataFrame containing historical internet usage data.
    region (str): The region for which to forecast internet usage.
    end_year (int): The year up to which to forecast internet usage.
    
    Returns:
    pandas.DataFrame: DataFrame containing the forecasted internet usage.
    """
    # Filter data for the specified region
    region_data = df[df['Region_Country_Area'] == region]
    
    # Prepare the data for linear regression
    X = region_data['Year'].values.reshape(-1, 1)
    y = region_data['internet_usage(%)'].values
    
    # Create and fit the linear regression model
    model = LinearRegression()
    model.fit(X, y)
    
    # Generate predictions for the specified end year
    future_years = np.arange(region_data['Year'].max() + 1, end_year + 1).reshape(-1, 1)
    future_predictions = model.predict(future_years)
    
    # Create a DataFrame with the forecasted data
    forecast_df = pd.DataFrame({
        'Region_Country_Area': region,
        'Year': future_years.flatten(),
        'internet_usage(%)': future_predictions
    })
    
    return forecast_df

# List of regions to forecast
regions_to_forecast = collective_internet_usage['Region_Country_Area'].unique()

# Forecast internet usage for each region
forecasted_data = pd.concat([forecast_internet_usage(collective_internet_usage, region) for region in regions_to_forecast])


# Plot the forecasted data
fig = px.bar(
    forecasted_data,
    x='internet_usage(%)',
    y='Region_Country_Area',
    color='Year',
    orientation='h',
    title='Forecasted Internet Usage by Region (2022-2024)',
    barmode='stack',
    color_continuous_scale = 'Turbo'
)

fig.update_layout(
    xaxis_title='Internet Usage (%)',
    yaxis_title='Region',
    legend_title='Year'
)

fig.show()


**How the world countries were increasing internet connectivity (2000-2021) on map**

In [72]:

# Create the choropleth map with animation  
fig = px.choropleth(  
    country_internet_usage,   
    locations="Region_Country_Area",   
    locationmode='country names',   
    color="internet_usage(%)",  
    hover_name="Region_Country_Area",   
    animation_frame="Year",     
    title='World Internet Usage across Regions over the Years',  
    color_continuous_scale=px.colors.sequential.Plasma,  
    width=1000,   
    height=600,  
    projection='natural earth'  
)  

# Update layout to set animation speed  
fig.update_layout(  
    updatemenus=[{  
        'buttons': [  
            {  
                'args': [None, {'frame': {'duration': 1000, 'redraw': True}, 'mode': 'immediate', 'transition': {'duration': 100}}],  
                'label': 'Play',  
                'method': 'animate'  
            },  
            {  
                'args': [[None], {'frame': {'duration': 0, 'redraw': True}, 'mode': 'immediate', 'transition': {'duration': 0}}],  
                'label': 'Pause',  
                'method': 'animate'  
            }  
        ],  
        'direction': 'left',  
        'pad': {'r': 10, 't': 87},  
        'showactive': False,  
        'type': 'buttons',  
        'x': 0.1,  
        'xanchor': 'right',  
        'y': 0,  
        'yanchor': 'top'  
    }]  
)  

# Show the figure  
fig.show()