# **World Population Data Analysis**


In [None]:
# import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, mean_squared_error, r2_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
import folium
from folium.plugins import MarkerCluster
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
import time
import joblib



In [None]:
# load data
df = pd.read_csv('/content/World Population by country 2024.csv')
df.head()

Unnamed: 0,Country,Population 2024,Population 2023,Area (km2),Density (/km2),Growth Rate,World %,World Rank
0,India,1441719852,1428627663,3M,485.0,0.0092,0.1801,1
1,China,1425178782,1425671352,9.4M,151.0,-0.0003,0.178,2
2,United States,341814420,339996563,9.1M,37.0,0.0053,0.0427,3
3,Indonesia,279798049,277534122,1.9M,149.0,0.0082,0.035,4
4,Pakistan,245209815,240485658,770.9K,318.0,0.0196,0.0306,5


In [None]:
# Check the data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 234 entries, 0 to 233
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Country          234 non-null    object 
 1   Population 2024  234 non-null    int64  
 2   Population 2023  234 non-null    int64  
 3   Area (km2)       234 non-null    object 
 4   Density (/km2)   234 non-null    float64
 5   Growth Rate      234 non-null    float64
 6   World %          228 non-null    float64
 7   World Rank       234 non-null    int64  
dtypes: float64(3), int64(3), object(2)
memory usage: 14.8+ KB


In [None]:
# describe the data
df.describe()

Unnamed: 0,Population 2024,Population 2023,Density (/km2),Growth Rate,World %,World Rank
count,234.0,234.0,234.0,234.0,228.0,234.0
mean,34688620.0,34374420.0,453.788248,0.0092,0.004446,117.5
std,138075000.0,137386400.0,1990.163274,0.011371,0.017459,67.694165
min,526.0,518.0,0.14,-0.0309,0.0,1.0
25%,426456.5,422598.2,39.5,0.001925,0.0001,59.25
50%,5626359.0,5643895.0,98.5,0.00795,0.00075,117.5
75%,23922720.0,23245370.0,248.25,0.015675,0.003,175.75
max,1441720000.0,1428628000.0,21674.0,0.0483,0.1801,234.0


In [None]:
def convert_area(area):
    if isinstance(area, str):
        area = area.strip().replace('<', '').strip()  # Remove any '<' characters and trim spaces
        area = area.replace(',', '')  # Remove any commas
        if 'M' in area:
            return float(area.replace('M', '')) * 1_000_000
        elif 'K' in area:
            return float(area.replace('K', '')) * 1_000
        else:
            return float(area)
    return area

df['Area (km2)'] = df['Area (km2)'].apply(convert_area)


In [None]:
df.head()

Unnamed: 0,Country,Population 2024,Population 2023,Area (km2),Density (/km2),Growth Rate,World %,World Rank
0,India,1441719852,1428627663,3000000.0,485.0,0.0092,0.1801,1
1,China,1425178782,1425671352,9400000.0,151.0,-0.0003,0.178,2
2,United States,341814420,339996563,9100000.0,37.0,0.0053,0.0427,3
3,Indonesia,279798049,277534122,1900000.0,149.0,0.0082,0.035,4
4,Pakistan,245209815,240485658,770900.0,318.0,0.0196,0.0306,5


In [None]:
# Initialize geolocator and rate limiter
geolocator = Nominatim(user_agent="world_population_analysis")
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

In [None]:
# Cache for geocoding results
geo_cache = {}

def get_geocode(country):
    if country in geo_cache:
        return geo_cache[country]
    else:
        location = geocode(country)
        geo_cache[country] = location
        time.sleep(1)  # Respect rate limit
        return location

In [None]:
# apply location as geocode
df['location'] = df['Country'].apply(get_geocode)



In [None]:
df['Latitude'] = df['location'].apply(lambda loc: loc.latitude if loc else None)
df['Longitude'] = df['location'].apply(lambda loc: loc.longitude if loc else None)

In [None]:
# Drop rows with missing coordinates
df = df.dropna(subset=['Latitude', 'Longitude'])

In [None]:
# check the missing values
df.isnull().sum()

Country            0
Population 2024    0
Population 2023    0
Area (km2)         0
Density (/km2)     0
Growth Rate        0
World %            6
World Rank         0
location           0
Latitude           0
Longitude          0
dtype: int64

In [None]:
# calculate the change of population
df['Population Change'] = (df['Population 2024'] - df['Population 2023']) / df['Population 2023'] * 100
print(df.head())

         Country  Population 2024  Population 2023  Area (km2)  \
0          India       1441719852       1428627663   3000000.0   
1          China       1425178782       1425671352   9400000.0   
2  United States        341814420        339996563   9100000.0   
3      Indonesia        279798049        277534122   1900000.0   
4       Pakistan        245209815        240485658    770900.0   

   Density (/km2)  Growth Rate  World %  World Rank  \
0           485.0       0.0092   0.1801           1   
1           151.0      -0.0003   0.1780           2   
2            37.0       0.0053   0.0427           3   
3           149.0       0.0082   0.0350           4   
4           318.0       0.0196   0.0306           5   

                                     location   Latitude   Longitude  \
0           (India, (22.3511148, 78.6677428))  22.351115   78.667743   
1               (中国, (35.000074, 104.999927))  35.000074  104.999927   
2  (United States, (39.7837304, -100.445882))  39.783730

In [None]:
# check top 15 country growth rate by using plot
plt.figure(figsize=(10, 6))
fig = px.bar(df.nlargest(15, 'Growth Rate').sort_values('Growth Rate'),
             x='Growth Rate', y='Country',
             title='Top 10 Countries by Population Growth Rate',
             labels={'Growth Rate': 'Growth Rate'},
             color='Growth Rate',
             color_continuous_scale=px.colors.sequential.Plasma,
             height=800)
fig.update_layout(xaxis_title='Growth Rate', yaxis_title='Country', template='plotly_dark')
fig.show()

<Figure size 1000x600 with 0 Axes>

In [None]:
# Calculate the population change
df['Population Change'] = df['Population 2024'] - df['Population 2023']

# Bar plot of the top 10 countries by population change
fig = px.bar(df.nlargest(15, 'Population Change').sort_values('Population Change'),
             x='Population Change', y='Country',
             title='Top 10 Countries by Population Change (2023 to 2024)',
             labels={'Population Change': 'Population Change'},
             color='Population Change',
             color_continuous_scale=px.colors.sequential.Greens,
             height=600)
fig.update_layout(xaxis_title='Population Change', yaxis_title='Country', template='plotly_dark')
fig.show()

In [None]:
# Filter the data for top country
df_india = df[df['Country'] == 'India']
df_china = df[df['Country'] == 'China']
df_pak = df[df['Country'] == 'Pakistan']
df_usa = df[df['Country'] == 'United States']

# Combine the data
df_combined = pd.concat([df_india, df_china, df_pak, df_usa])

# Create the plot
fig = px.bar(df_combined, x='Population 2024', y='Population 2024', color='Country', title='Population Comparison between India and China')

# Customize the plot
fig.update_layout(
    xaxis_title='Year',
    yaxis_title='Population',
    legend_title='Country',
    template='plotly_white'
)

# Show the plot
fig.show()

In [None]:
fig = px.choropleth(df,
                    locations='Country',
                    locationmode='country names',
                    color='Population 2024',
                    hover_name='Country',
                    hover_data=['Population 2023', 'Area (km2)', 'Density (/km2)', 'Growth Rate', 'World %', 'World Rank'],
                    color_continuous_scale=px.colors.sequential.Plasma_r,
                    title='World Population 2024')
fig.update_layout(template='plotly_dark')
fig.show()

In [None]:
fig = px.choropleth(df,
                    locations='Country',
                    locationmode='country names',
                    color='Density (/km2)',
                    hover_name='Country',
                    hover_data=['Population 2024', 'Population 2023', 'Area (km2)', 'Growth Rate', 'World %', 'World Rank'],
                    color_continuous_scale=px.colors.sequential.PuBuGn,
                    title='World Population Density')
fig.update_layout(template='plotly_dark')
fig.show()

In [None]:
fig = px.scatter(df, x='World Rank', y='Growth Rate',
                 title='Growth Rate vs. World Rank',
                 labels={'World Rank': 'World Rank', 'Growth Rate': 'Growth Rate'},
                 hover_name='Country',
                 hover_data=['Population 2024', 'Population 2023', 'Area (km2)', 'Density (/km2)', 'World %'],
                 color='Growth Rate',
                 color_continuous_scale=px.colors.sequential.Sunset,
                 height=600)
fig.update_layout(xaxis_title='World Rank', yaxis_title='Growth Rate', template='plotly_dark')
fig.show()

In [None]:
fig = px.choropleth(df,
                    locations='Country',
                    locationmode='country names',
                    color='World %',
                    hover_name='Country',
                    hover_data=['Population 2024', 'Population 2023', 'Area (km2)', 'Density (/km2)', 'Growth Rate', 'World Rank'],
                    color_continuous_scale=px.colors.sequential.Sunset,
                    title='Percentage of World Population by Country')
fig.update_layout(template='plotly_dark')
fig.show()

In [None]:
# Folium map
map_population = folium.Map(location=[20, 0], zoom_start=2, tiles='cartodb dark_matter')

marker_cluster = MarkerCluster().add_to(map_population)
for idx, row in df.iterrows():
    folium.CircleMarker(
        location=(row['Latitude'], row['Longitude']),
        radius=5 + (row['Population 2024'] / 1_000_000),  # Scale marker size by population
        color='blue',
        fill=True,
        fill_color='blue',
        fill_opacity=0.6,
        popup=f"{row['Country']}<br>Population 2024: {row['Population 2024']}<br>Density: {row['Density (/km2)']}<br>Growth Rate: {row['Growth Rate']}",
    ).add_to(marker_cluster)

# Save and display the map
map_population.save('world_population_map.html')

In [None]:
df.columns

Index(['Country', 'Population 2024', 'Population 2023', 'Area (km2)',
       'Density (/km2)', 'Growth Rate', 'World %', 'World Rank', 'location',
       'Latitude', 'Longitude', 'Population Change'],
      dtype='object')

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 234 entries, 0 to 233
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Country            234 non-null    object 
 1   Population 2024    234 non-null    int64  
 2   Population 2023    234 non-null    int64  
 3   Area (km2)         234 non-null    float64
 4   Density (/km2)     234 non-null    float64
 5   Growth Rate        234 non-null    float64
 6   World %            228 non-null    float64
 7   World Rank         234 non-null    int64  
 8   location           234 non-null    object 
 9   Latitude           234 non-null    float64
 10  Longitude          234 non-null    float64
 11  Population Change  234 non-null    int64  
dtypes: float64(6), int64(4), object(2)
memory usage: 22.1+ KB


In [None]:
# Select features and target
features = ['Population 2024', 'Area (km2)', 'Density (/km2)', 'Growth Rate', 'World %', 'Latitude', 'Longitude']
target = 'Growth Rate'

X = df[features]
y = df[target]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Data Preprocessing: Scaling the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Handle NaN values - Imputation
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='median') # Replace NaNs with the median of the column
X_train_scaled = imputer.fit_transform(X_train_scaled)
X_test_scaled = imputer.transform(X_test_scaled)

# Model Selection and Training
model = RandomForestRegressor(random_state=42)
model.fit(X_train_scaled, y_train)

# Evaluate the model
y_pred = model.predict(X_test_scaled)
print('Mean Squared Error:', mean_squared_error(y_test, y_pred))
print('R2 Score:', r2_score(y_test, y_pred))

# Make Predictions: Predict future growth rates
future_growth_rates = model.predict(X_test_scaled)

# Display predictions
df_predictions = X_test.copy()
df_predictions['Actual Growth Rate'] = y_test
df_predictions['Predicted Growth Rate'] = future_growth_rates

print(df_predictions.head())

Mean Squared Error: 3.1413442553191354e-07
R2 Score: 0.9967690611823383
     Population 2024  Area (km2)  Density (/km2)  Growth Rate  World %  \
69          18358430    107200.0           171.0       0.0147   0.0023   
206            63788        63.0          1013.0       0.0038   0.0000   
180           345996       374.0           925.0       0.0298   0.0000   
9          129719719   1100000.0           115.0       0.0252   0.0162   
127          4527961     74200.0            61.0       0.0134   0.0006   

      Latitude  Longitude  Actual Growth Rate  Predicted Growth Rate  
69   15.585555 -90.345759              0.0147               0.014629  
206  49.456623  -2.582235              0.0038               0.003831  
180 -12.823048  45.152076              0.0298               0.031241  
9    10.211670  38.652120              0.0252               0.025518  
127   8.559559 -81.130843              0.0134               0.013545  


In [28]:
# Make Predictions: Predict future growth rates
future_growth_rates = model.predict(X_test_scaled)

# Create a DataFrame with actual and predicted values
df_predictions = X_test.copy()
df_predictions['Actual Growth Rate'] = y_test.values
df_predictions['Predicted Growth Rate'] = future_growth_rates

# Plot actual vs predicted growth rates
fig = px.scatter(df_predictions, x='Actual Growth Rate', y='Predicted Growth Rate', trendline='ols',
                 title='Actual vs Predicted Growth Rates',
                 labels={'Actual Growth Rate': 'Actual Growth Rate', 'Predicted Growth Rate': 'Predicted Growth Rate'})

# Show the plot
fig.show()

In [None]:
# Save the model and the scaler
joblib.dump(model, 'population_growth_model.pkl')
joblib.dump(scaler, 'scaler.pkl')

['scaler.pkl']