# The start of the new project

### Checking the data and cleaning if necessary

In [14]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [2]:
# Reading this file with pandas' default encoding raised an error (presumably a
# UnicodeDecodeError -- confirm). Latin-1 accepts every byte value and, unlike
# the 'unicode_escape' codec, does not reinterpret backslash sequences that may
# appear in the data.
country_status = pd.read_csv('GLOB.SES.csv', encoding='latin-1')

In [3]:
# Preview the first rows to confirm the file loaded with the expected columns.
country_status.head()

Unnamed: 0,unid,wbid,country,year,SES,gdppc,yrseduc,popshare
0,4,AFG,Afghanistan,1970,3.474212,709.0,,0.003097
1,4,AFG,Afghanistan,1920,26.968016,731.75677,,0.003245
2,4,AFG,Afghanistan,1990,1.26953,604.0,,0.002347
3,4,AFG,Afghanistan,1960,15.763076,739.0,,0.003039
4,4,AFG,Afghanistan,2000,2.061114,565.0,,0.003309


#### What does each column mean?

- UNID: ISO numeric country code (used by the United Nations)

- WBID: ISO alpha country code (used by the World Bank)

- SES: Socioeconomic status score (percentile) based on GDP per capita and
educational attainment (n=174)

- country: Short country name

- year: Survey year

- SES: Socioeconomic status score (1-99) for each of 174 countries

- gdppc: GDP per capita: Single time-series (imputed)

- yrseduc: Completed years of education in the adult (15+) population

- popshare: Total population shares

In [4]:
# (rows, columns) of the loaded frame.
country_status.shape

(2086, 8)

In [5]:
# Column dtypes, to see which columns were parsed as numeric and which as object.
country_status.dtypes

unid          int64
wbid         object
country      object
year          int64
SES         float64
gdppc       float64
yrseduc     float64
popshare    float64
dtype: object

In [6]:
# Count the missing values in each column.
country_status.isna().sum()
# About 50% of the yrseduc values are null (1050 of 2086 rows), which is a problem.

unid           0
wbid           0
country        0
year           0
SES            0
gdppc          0
yrseduc     1050
popshare       0
dtype: int64

This column reflects the average number of years of education completed by the adult (15+) population, for each country and year.

In [7]:
# Look at the yrseduc column on its own to see the missing values.
country_status['yrseduc']

0      NaN
1      NaN
2      NaN
3      NaN
4      NaN
        ..
2081   NaN
2082   NaN
2083   NaN
2084   NaN
2085   NaN
Name: yrseduc, Length: 2086, dtype: float64

How effective and realistic would it be to fill these NaNs with the mean value of the column?

In [8]:
# Replace the missing yrseduc values with the column-wide mean.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected Series: that chained in-place form is
# deprecated and does not update the DataFrame under pandas Copy-on-Write.
yrseduc_mean = country_status['yrseduc'].mean()
country_status['yrseduc'] = country_status['yrseduc'].fillna(yrseduc_mean)

In [9]:
# Frequency of each yrseduc value; after the fill above, the imputed mean is
# expected to be by far the most common value.
country_status['yrseduc'].value_counts()

3.778166    1050
0.070000      11
0.130000      10
0.040000       8
0.120000       6
            ... 
6.870000       1
6.490000       1
7.296700       1
8.213800       1
2.256400       1
Name: yrseduc, Length: 783, dtype: int64

It's actually a poor metric, because there is quite a lot of variance within this column, so we are going to mostly ignore it for now in the study and afterwards we will try to do something with it

### The data is cleaner now

## Now we shall start exploring each country and each year individually

In [10]:
# Re-check the first rows now that the yrseduc NaNs have been filled.
country_status.head()

Unnamed: 0,unid,wbid,country,year,SES,gdppc,yrseduc,popshare
0,4,AFG,Afghanistan,1970,3.474212,709.0,3.778166,0.003097
1,4,AFG,Afghanistan,1920,26.968016,731.75677,3.778166,0.003245
2,4,AFG,Afghanistan,1990,1.26953,604.0,3.778166,0.002347
3,4,AFG,Afghanistan,1960,15.763076,739.0,3.778166,0.003039
4,4,AFG,Afghanistan,2000,2.061114,565.0,3.778166,0.003309


In [12]:
# One row per country holding the sum of its gdppc values across all years.
# NOTE(review): this adds up a per-capita figure over several survey years --
# confirm a sum (rather than a mean) is the intended summary.
country_grouped = (
    country_status
    .groupby('country', as_index=False)
    .agg(gdppc=('gdppc', 'sum'))
)

In [13]:
# Display the per-country gdppc totals.
country_grouped

Unnamed: 0,country,gdppc
0,Afghanistan,10367.96817
1,Albania,32098.77122
2,Algeria,49871.57671
3,Angola,24737.74378
4,Argentina,92781.64700
...,...,...
144,Venezuela,95109.18200
145,Vietnam,16208.29056
146,Yemen,23752.67057
147,Zambia,15843.39048


In [17]:
# Order the countries from highest to lowest summed gdppc for the bar chart.
# The sorted frame is rebound to the same name instead of using inplace=True,
# which brings no performance benefit and prevents method chaining.
country_grouped = country_grouped.sort_values(by='gdppc', ascending=False)

In [18]:
# Bar chart of each country's summed gdppc, drawn in the row order of
# country_grouped.
fig1 = go.Figure([go.Bar(x=country_grouped['country'], y=country_grouped['gdppc'])])
fig1.update_layout(
    title='TOTAL GDPPC FOR EACH COUNTRY',
    # Axis titles so the figure can be read without the surrounding cells.
    xaxis_title='Country',
    yaxis_title='GDP per capita, summed over survey years',
    # Fixed figure size in pixels.
    autosize=False,
    width=1100,
    height=800,
)
fig1.update_xaxes(tickangle=45)  # rotate the country tick labels by 45 degrees

fig1.show()

### Now we want to check the evolution of each country within the time periods where we have records of the gdppc

In [23]:
# Afghanistan's records, ordered from the earliest year to the latest.
afg = (
    country_status
    .loc[country_status['country'].eq('Afghanistan')]
    .sort_values('year')
)

In [24]:
# Display Afghanistan's rows in chronological order.
afg

Unnamed: 0,unid,wbid,country,year,SES,gdppc,yrseduc,popshare
6,4,AFG,Afghanistan,1880,37.957447,585.46509,3.778166,0.003271
8,4,AFG,Afghanistan,1890,29.591391,635.93024,3.778166,0.003255
12,4,AFG,Afghanistan,1900,28.104797,686.39532,3.778166,0.003245
13,4,AFG,Afghanistan,1910,26.9876,736.86047,3.778166,0.003178
1,4,AFG,Afghanistan,1920,26.968016,731.75677,3.778166,0.003245
10,4,AFG,Afghanistan,1930,15.306766,702.83783,3.778166,0.003259
7,4,AFG,Afghanistan,1940,14.320977,673.91895,3.778166,0.003226
11,4,AFG,Afghanistan,1950,23.424145,645.0,3.778166,0.003302
3,4,AFG,Afghanistan,1960,15.763076,739.0,3.778166,0.003039
0,4,AFG,Afghanistan,1970,3.474212,709.0,3.778166,0.003097


In [31]:
# Distinct country names, in order of first appearance in the data.
countries = country_status.country.unique()

In [32]:
def create_df(country, data=None):
    """Return the rows for one country, ordered chronologically.

    Parameters
    ----------
    country : str
        Value matched exactly against the ``country`` column.
    data : pandas.DataFrame, optional
        Frame with ``country`` and ``year`` columns. Defaults to the
        notebook-level ``country_status`` frame, so existing calls that pass
        only ``country`` behave as before.

    Returns
    -------
    pandas.DataFrame
        The matching rows sorted by ``year`` in ascending order, keeping their
        original index labels. Empty if no row matches ``country``.
    """
    if data is None:
        # Fall back to the frame loaded earlier in the notebook.
        data = country_status
    return data[data['country'] == country].sort_values(by='year', ascending=True)

In [70]:
def graph(country):
    """Show a bar chart of gdppc per year for one country.

    Parameters
    ----------
    country : str
        Country name, passed to ``create_df`` to select that country's rows.

    Returns
    -------
    None
        The figure is rendered with ``fig.show()``.
    """
    country_df = create_df(country)
    # Local name differs from the notebook-level ``fig1`` to avoid shadowing it.
    fig = go.Figure([go.Bar(x=country_df['year'], y=country_df['gdppc'])])
    fig.update_layout(
        title=f' Gdppc for each year for {country}',
        # Axis titles so the chart can be read on its own.
        xaxis_title='Year',
        yaxis_title='GDP per capita',
        # Fixed figure size in pixels.
        autosize=False,
        width=900,
        height=600,
    )
    fig.update_xaxes(tickangle=45)  # rotate the year tick labels by 45 degrees

    fig.show()

In [71]:
# Example: draw the gdppc-per-year chart for a single country.
graph('Spain')

### Now we have the ability to print a graph of any country we want

We are now going to plot the evolution of every country over the years covered by this study