In [1]:
import pandas as pd
import numpy as np
import random
from string import ascii_letters
import matplotlib.pyplot as plt
import seaborn as sns

# Configure how many rows/columns pandas prints and how wide a printed line may be.
for option_name, option_value in (
    ('display.max_rows', 50),
    ('display.max_columns', 50),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)
%matplotlib inline

In [2]:
# Load the raw yearly population tables for Iquitos (iq) and San Juan (sj).
pop_iq, pop_sj = (
    pd.read_csv(csv_path)
    for csv_path in ('data/iq_pop.csv', 'data/sj_pop.csv')
)

### San Juan

In [3]:
# Display the San Juan population table as loaded from the CSV.
pop_sj

Unnamed: 0,Year,Estimated_population
0,1990,2217968
1,1999,2453157
2,2000,2389397
3,2001,2395941
4,2002,2400500
5,2003,2403420
6,2004,2405376
7,2005,2403542
8,2006,2394920
9,2007,2382377


On ajuste une régression linéaire très simple sur les deux points connus (1990 et 1999) pour combler le manque de données entre ces deux années (1991 à 1998).

In [4]:
# Years strictly between the 1990 and 1999 observations, i.e. the gap to fill.
years = [year for year in range(1991, 1999)]
years

[1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998]

In [5]:
# Training data: every observation dated 1999 or earlier.
# scikit-learn expects a 2-D feature matrix, hence the (-1, 1) reshape.
known_rows = pop_sj.loc[pop_sj["Year"] <= 1999]
X_train = known_rows["Year"].to_numpy().reshape(-1, 1)
y_train = known_rows["Estimated_population"]

# Prediction inputs: the gap years, shaped as a single-feature column.
X_test = np.asarray(years).reshape(-1, 1)

In [6]:
from sklearn.linear_model import LinearRegression

# Fit population as a linear function of the year, then predict the gap years.
# `fit` returns the estimator itself, so `lg` is still the fitted model.
lg = LinearRegression().fit(X_train, y_train)
y_pred = lg.predict(X_test)

array([2244100.11111111, 2270232.22222222, 2296364.33333334,
       2322496.44444445, 2348628.55555556, 2374760.66666667,
       2400892.77777778, 2427024.8888889 ])

In [7]:
# Package the predicted populations using the same column names as pop_sj,
# so the two tables can be stacked directly.
df = pd.DataFrame({
    "Year": years,
    "Estimated_population": y_pred,
})

In [8]:
# Stack the interpolated rows under the observed ones, then restore
# chronological order. DataFrame.append was removed in pandas 2.0, so
# pd.concat is used; its defaults match the old append call: original index
# labels are kept and columns are not sorted.
pop_sj = pd.concat([pop_sj, df]).sort_values(by=["Year"])

On met à l'échelle la population de l'agglomération pour retrouver celle de la ville elle-même.  
En 2010, la ville intra-muros de San Juan compte 395 326 habitants.

In [9]:
# Scale factor from the agglomeration figures to the city proper:
# 395326 (city population in 2010) divided by the table's 2010 estimate.
# Raises IndexError if the table has no row for 2010.
reference_row = pop_sj.loc[pop_sj['Year'] == 2010].iloc[0]
ratio = 395326 / reference_row["Estimated_population"]
ratio

0.16837909680969643

In [10]:
# Rescale every yearly estimate by the city/agglomeration ratio.
# NOTE: not idempotent — re-running this cell applies the ratio a second time.
pop_sj["Estimated_population"] = pop_sj["Estimated_population"].mul(ratio)

In [11]:
# Finalise the San Juan table: fresh 0..n-1 index, city tag, density column,
# then cast both numeric columns to int32 (the fractional part is dropped).
pop_sj = (
    pop_sj
    .reset_index(drop=True)
    .assign(
        city="sj",
        # Density = population / 123.85, the surface area in km² applied to
        # San Juan. NOTE(review): the previous comment attributed this figure
        # to Iquitos — presumably a copy-paste slip; confirm the value.
        density=lambda frame: frame["Estimated_population"] / 123.85,
    )
    .astype({"density": "int32", "Estimated_population": "int32"})
)

Unnamed: 0,Year,Estimated_population,city,density
0,1990,373459,sj,3015
1,1991,377859,sj,3050
2,1992,382259,sj,3086
3,1993,386659,sj,3122
4,1994,391059,sj,3157
5,1995,395459,sj,3193
6,1996,399860,sj,3228
7,1997,404260,sj,3264
8,1998,408660,sj,3299
9,1999,413060,sj,3335


### Iquitos

In [12]:
# Finalise the Iquitos table: city tag, density column, then cast both
# numeric columns to int32 (the fractional part is dropped).
pop_iq = (
    pop_iq
    .assign(
        city="iq",
        # Density = population / 368.9, the surface area of Iquitos in km².
        density=lambda frame: frame["Estimated_population"] / 368.9,
    )
    .astype({"density": "int32", "Estimated_population": "int32"})
)

Unnamed: 0,Year,Estimated_population,city,density
0,2000,386666,iq,1048
1,2001,393355,iq,1066
2,2002,399770,iq,1083
3,2003,405988,iq,1100
4,2004,412095,iq,1117
5,2005,418168,iq,1133
6,2006,424247,iq,1150
7,2007,430268,iq,1166
8,2008,436185,iq,1182
9,2009,441939,iq,1197


### Pop General

In [13]:
# Stack both cities (Iquitos first, then San Juan) under a fresh 0..n-1 index
# and switch to the shorter lowercase column names.
pop_gen = (
    pd.concat([pop_iq, pop_sj], ignore_index=True)
    .rename(columns={"Estimated_population": "population", "Year": "year"})
)
pop_gen.head()

Unnamed: 0,year,population,city,density
0,2000,386666,iq,1048
1,2001,393355,iq,1066
2,2002,399770,iq,1083
3,2003,405988,iq,1100
4,2004,412095,iq,1117
5,2005,418168,iq,1133
6,2006,424247,iq,1150
7,2007,430268,iq,1166
8,2008,436185,iq,1182
9,2009,441939,iq,1197


### Sauvegarde en csv

In [14]:
# Write each processed table to its own CSV file, omitting the index column.
for frame, csv_path in (
    (pop_iq, 'data/pop_iq_processed.csv'),
    (pop_sj, 'data/pop_sj_processed.csv'),
    (pop_gen, 'data/pop_gen_processed.csv'),
):
    frame.to_csv(csv_path, index=False)