# Population and density project

In [None]:
# Import all essential libraries
import numpy as np
import pandas as pd
import seaborn as sns
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import plotly.express as px
import missingno as msno
import plotly.offline as py
py.init_notebook_mode(connected=True)

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor



import warnings
warnings.filterwarnings('ignore')

## Data exploration:

In [None]:
# Load the data collected by the crawling step.

df = pd.read_csv('final_data.csv')
# Preview the first rows of the loaded table.
df.head()

In [None]:
# Total size of the data as (rows, columns) before cleaning.

df.shape

In [None]:
# Count the missing values in each column before cleaning.
df.isnull().sum()

In [None]:
# The dataset carries a single-year growth rate for 2009 only; no other
# per-year rates were available, so the project relies on the 5-year rates.
df.drop(columns=['2009 growth rate'], inplace=True)

# Discard rows that lack the fields the project cannot work without:
# the city's area and the current growth rate.
df.dropna(subset=['Area', 'growth rate'], inplace=True)

# Where a 5-year growth rate is missing, fall back to the current growth rate.
for period_column in (
    '2005-2010 growth rate',
    '2010-2015 growth rate',
    '2015-2020 growth rate',
):
    df[period_column] = df[period_column].fillna(df['growth rate'])

# Show how many nulls remain in each column after the cleaning steps.
df.isnull().sum()

In [None]:
# Print the number of distinct values held by every column, one per line.
features = df.columns
distinct_counts = df.nunique()
for feature in features:
    print(f"{feature} ---> {distinct_counts[feature]}")

#### Feature Description:
1. City - The city name
2. Country - The country name
3. Population - The city's population
4. Area - The city's area in sq km
5. Country population - The country's population
6. Country area - The country's area in sq km
7. Country density - The country's density in persons per sq km
8. Growth rate - The country's growth rate (2023)
9. 2005-2010 growth rate - The country's growth rate in the years 2005-2010
10. 2010-2015 growth rate - The country's growth rate in the years 2010-2015
11. 2015-2020 growth rate - The country's growth rate in the years 2015-2020

In [None]:
# World map shaded by each country's total population.
# Countries are matched to map regions by their names in the 'Country' column.
population_map_options = {
    'locations': 'Country',
    'locationmode': 'country names',
    'color': 'country population',  # column that drives the colour scale
    'hover_name': 'Country',
    'title': 'World population by country',
    'color_continuous_scale': 'Viridis',
}
fig = px.choropleth(df, **population_map_options)
fig.show()

In [None]:
# Linear regression: predict a city's population from its area and from its
# country's population, area and density.
X = df[['Area', 'country population', 'country area', 'country density']]
y = df['Population']

# Hold out 20% of the rows so the model is judged on data it was not fitted to;
# predicting the training rows themselves overstates how well it generalises.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = LinearRegression()
model.fit(X_train, y_train)

# train_test_split shuffles the rows; put the held-out rows back in index order
# so the prediction line runs left to right instead of zig-zagging.
order = np.argsort(X_test.index.to_numpy(), kind='stable')
X_test, y_test = X_test.iloc[order], y_test.iloc[order]

y_pred = model.predict(X_test)

# Root-mean-squared error on the held-out rows (same units as the target).
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Linear regression RMSE on held-out rows: {rmse:,.0f}")

# Actual values as points, predictions as a line, both for held-out rows only.
plt.scatter(X_test.index, y_test, color='blue', label='Actual Population')
plt.plot(X_test.index, y_pred, color='red', label='Predicted Population')
plt.xlabel('Index')
plt.ylabel('Population')
plt.title('Linear Regression Model')
plt.legend()
plt.show()

In [None]:
# Random forest: predict a city's population from its area and from its
# country's population, area and density.
X = df[['Area', 'country population', 'country area', 'country density']]
y = df['Population']

# Hold out 20% of the rows. A random forest can reproduce its training rows
# almost exactly, so scoring it on them says little about real accuracy.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# train_test_split shuffles the rows; put the held-out rows back in index order
# so the prediction line runs left to right instead of zig-zagging.
order = np.argsort(X_test.index.to_numpy(), kind='stable')
X_test, y_test = X_test.iloc[order], y_test.iloc[order]

y_pred = model.predict(X_test)

# Root-mean-squared error on the held-out rows (same units as the target).
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Random forest RMSE on held-out rows: {rmse:,.0f}")

# Actual values as points, predictions as a line, both for held-out rows only.
plt.scatter(X_test.index, y_test, color='blue', label='Actual Population')
plt.plot(X_test.index, y_pred, color='red', label='Predicted Population')
plt.xlabel('Index')
plt.ylabel('Population')
plt.title('Random Forest Regression Model')
plt.legend()
plt.show()

In [None]:
# World map shaded by each country's population density.
# Countries are matched to map regions by their names in the 'Country' column.
density_map_options = {
    'locations': 'Country',
    'locationmode': 'country names',
    'color': 'country density',
    'hover_name': 'Country',
    'title': 'World density',
    'color_continuous_scale': 'amp',
}
fig = px.choropleth(df, **density_map_options)
fig.show()

In [None]:
# Project every country's population and density to 2050.

# The feature description lists 'growth rate' as the 2023 rate, so the
# projection has to cover the years between 2023 and 2050.
BASE_YEAR = 2023
TARGET_YEAR = 2050
years_ahead = TARGET_YEAR - BASE_YEAR

# NOTE: from here on df holds country-level columns only, one row per country.
# Re-run the notebook from the data-loading cell to get the city rows back.
df = df[['Country', 'country population', 'country area', 'country density', 'growth rate']]
df = df.drop_duplicates(subset='Country')

predictions_df = pd.DataFrame(df['Country'])

# Model the growth rate from the country's population, area and density.
X = df[['country population', 'country area', 'country density']]
y = df['growth rate']

model = LinearRegression()
model.fit(X, y)

# Compound the modelled yearly rate over every year up to the target year;
# multiplying by (1 + rate) once only moves the population one year forward.
# NOTE(review): assumes 'growth rate' is a yearly fraction (0.01 == 1%), as the
# `1 + rate` form implies - confirm against final_data.csv.
growth_factor = (1 + model.predict(X)) ** years_ahead

predictions_df['Predicted Population 2050'] = df['country population'] * growth_factor
predictions_df['Predicted Density 2050'] = predictions_df['Predicted Population 2050'] / df['country area']

print(predictions_df)

In [None]:
# World map shaded by the density projected for 2050.
# Countries are matched to map regions by their names in the 'Country' column.
predicted_density_map_options = {
    'locations': 'Country',
    'locationmode': 'country names',
    'color': 'Predicted Density 2050',
    'hover_name': 'Country',
    'title': 'Predicted World density in 2050',
    'color_continuous_scale': 'amp',
}
fig = px.choropleth(predictions_df, **predicted_density_map_options)
fig.show()