In [3]:
import requests
from selenium import webdriver 
from bs4 import BeautifulSoup 
import time
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import font_manager, rc
import numpy as np
import platform

if platform.system() == 'Windows':
    path = 'c:/Windows/Fonts/malgun.ttf'
    font_name = font_manager.FontProperties(fname = path).get_name()
    rc('font', family = font_name)
elif platform.system() == 'Darwin':
    rc('font', family = 'AppleGothic')
else:
    print('Check your OS system')

In [4]:
url='https://www.un.org/en/about-us/member-states'
res = requests.get(url)
html = res.text
soup = BeautifulSoup(html, 'html.parser')

In [5]:
print(soup)


<!DOCTYPE html>

<html dir="ltr" lang="en"><head profile="http://www.w3.org/1999/xhtml/vocab">
<meta charset="utf-8"/>
<meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<!-- The above 3 meta tags *must* come first in the head; any other head content must come *after* these tags -->
<meta content="" name="description"/>
<meta content="United Nations" name="author"/>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<meta content="Drupal 7 (http://drupal.org)" name="Generator"/>
<link href="/en/about-us/member-states" rel="canonical"/>
<link href="/en/node/119289" rel="shortlink"/>
<link href="https://www.un.org/sites/un2.un.org/themes/bootstrap_un2/favicon.ico" rel="shortcut icon" type="image/vnd.microsoft.icon"/>
<title>Member States | United Nations</title>
<meta content="Member States | United Nations" name="DC.Title"/>
<meta content="" name="DC.Description"/>
<meta content="Unit

In [6]:
country_names = soup.select('.mb-0')
print(country_names)

[<h2 class="mb-0">Afghanistan</h2>, <h2 class="mb-0">Albania</h2>, <h2 class="mb-0">Algeria</h2>, <h2 class="mb-0">Andorra</h2>, <h2 class="mb-0">Angola</h2>, <h2 class="mb-0">Antigua and Barbuda</h2>, <h2 class="mb-0">Argentina</h2>, <h2 class="mb-0">Armenia</h2>, <h2 class="mb-0">Australia</h2>, <h2 class="mb-0">Austria</h2>, <h2 class="mb-0">Azerbaijan</h2>, <h2 class="mb-0">Bahamas</h2>, <h2 class="mb-0">Bahrain</h2>, <h2 class="mb-0">Bangladesh</h2>, <h2 class="mb-0">Barbados</h2>, <h2 class="mb-0"><a href="https://www.un.org/en/about-us/member-states/belarus">Belarus</a></h2>, <h2 class="mb-0">Belgium</h2>, <h2 class="mb-0">Belize</h2>, <h2 class="mb-0"><a href="https://www.un.org/en/about-us/member-states/benin">Benin</a></h2>, <h2 class="mb-0">Bhutan</h2>, <h2 class="mb-0"><a href="https://www.un.org/en/about-us/member-states/bolivia">Bolivia (Plurinational State of)</a></h2>, <h2 class="mb-0"><a href="https://www.un.org/en/about-us/member-states/yugoslavia">Bosnia and Herzegov

In [7]:
country_list = []
for i in country_names:
    country_list.append(i.text)
    
country_list

['Afghanistan',
 'Albania',
 'Algeria',
 'Andorra',
 'Angola',
 'Antigua and Barbuda',
 'Argentina',
 'Armenia',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahamas',
 'Bahrain',
 'Bangladesh',
 'Barbados',
 'Belarus',
 'Belgium',
 'Belize',
 'Benin',
 'Bhutan',
 'Bolivia (Plurinational State of)',
 'Bosnia and Herzegovina',
 'Botswana',
 'Brazil',
 'Brunei Darussalam',
 'Bulgaria',
 'Burkina Faso',
 'Burundi',
 'Cabo Verde',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Central African Republic',
 'Chad',
 'Chile',
 'China',
 'Colombia',
 'Comoros',
 'Congo',
 'Costa Rica',
 "Côte D'Ivoire",
 'Croatia',
 'Cuba',
 'Cyprus',
 'Czech Republic',
 "Democratic People's Republic of Korea",
 'Democratic Republic of the Congo',
 'Denmark',
 'Djibouti',
 'Dominica',
 'Dominican Republic',
 'Ecuador',
 'Egypt',
 'El Salvador',
 'Equatorial Guinea',
 'Eritrea',
 'Estonia',
 'Eswatini',
 'Ethiopia',
 'Fiji',
 'Finland',
 'France',
 'Gabon',
 'Gambia (Republic of The)',
 'Georgia',
 'Germany',
 'Ghana',
 '

In [8]:
col = ['country']
country_df = pd.DataFrame(country_list, columns=col)
country_df

Unnamed: 0,country
0,Afghanistan
1,Albania
2,Algeria
3,Andorra
4,Angola
...,...
188,"Venezuela, Bolivarian Republic of"
189,Viet Nam
190,Yemen
191,Zambia


In [9]:
country_df.iloc[182, 0] = 'United Kingdom'
country_df.iloc[182]

country    United Kingdom
Name: 182, dtype: object

In [10]:
gdp_cap = pd.read_excel('gdp_per_capita.xls')
gdp_cap

Unnamed: 0,Country Name,2018,2019
0,Afghanistan,493.756581,507.103392
1,Africa Eastern and Southern,1530.161917,1481.425292
2,Africa Western and Central,1695.959215,1772.339155
3,Albania,5284.380184,5355.847795
4,Algeria,4153.956234,3975.508993
...,...,...,...
261,West Bank and Gaza,3562.330943,3656.858271
262,World,11372.727329,11417.174060
263,Yemen,824.117718,
264,Zambia,1516.371100,1305.002214


In [70]:
gdp_cap.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 266 entries, 0 to 265
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Country Name  266 non-null    object 
 1   2018          254 non-null    float64
 2   2019          247 non-null    float64
dtypes: float64(2), object(1)
memory usage: 6.4+ KB


In [11]:
gdp_df = pd.merge(left=country_df,
                 right=gdp_cap,
                 how='left',
                 left_on='country',
                 right_on='Country Name')
gdp_df

Unnamed: 0,country,Country Name,2018,2019
0,Afghanistan,Afghanistan,493.756581,507.103392
1,Albania,Albania,5284.380184,5355.847795
2,Algeria,Algeria,4153.956234,3975.508993
3,Andorra,Andorra,41791.969837,40897.330873
4,Angola,Angola,3289.643995,2809.626088
...,...,...,...,...
188,"Venezuela, Bolivarian Republic of","Venezuela, Bolivarian Republic of",,
189,Viet Nam,Viet Nam,2566.447487,2715.275980
190,Yemen,Yemen,824.117718,
191,Zambia,Zambia,1516.371100,1305.002214


In [72]:
gdp_df.info()
# un 가입국이지만 gdp 정보가 없는 국가 3개
# 2018년, 2019년 모두 gdp 정보가 없는 국가 drop

<class 'pandas.core.frame.DataFrame'>
Int64Index: 193 entries, 0 to 192
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   country       193 non-null    object 
 1   Country Name  190 non-null    object 
 2   2018          186 non-null    float64
 3   2019          184 non-null    float64
dtypes: float64(2), object(2)
memory usage: 7.5+ KB


In [12]:
df_nulls = gdp_df.isnull().sum()
df_nulls

country         0
Country Name    3
2018            7
2019            9
dtype: int64

In [13]:
# thresh: 해당 row에서 NaN이 아닌 값이 최소 3개 이상 나와야 한다는 설정
df_thresh = gdp_df.dropna(axis = 0, thresh = 3)
df_thresh

Unnamed: 0,country,Country Name,2018,2019
0,Afghanistan,Afghanistan,493.756581,507.103392
1,Albania,Albania,5284.380184,5355.847795
2,Algeria,Algeria,4153.956234,3975.508993
3,Andorra,Andorra,41791.969837,40897.330873
4,Angola,Angola,3289.643995,2809.626088
...,...,...,...,...
187,Vanuatu,Vanuatu,3125.404999,3102.346790
189,Viet Nam,Viet Nam,2566.447487,2715.275980
190,Yemen,Yemen,824.117718,
191,Zambia,Zambia,1516.371100,1305.002214


In [14]:
df_nulls2 = df_thresh.isnull().sum()
df_nulls2

country         0
Country Name    0
2018            0
2019            2
dtype: int64

In [15]:
gdp_pop = df_thresh
gdp_pop

Unnamed: 0,country,Country Name,2018,2019
0,Afghanistan,Afghanistan,493.756581,507.103392
1,Albania,Albania,5284.380184,5355.847795
2,Algeria,Algeria,4153.956234,3975.508993
3,Andorra,Andorra,41791.969837,40897.330873
4,Angola,Angola,3289.643995,2809.626088
...,...,...,...,...
187,Vanuatu,Vanuatu,3125.404999,3102.346790
189,Viet Nam,Viet Nam,2566.447487,2715.275980
190,Yemen,Yemen,824.117718,
191,Zambia,Zambia,1516.371100,1305.002214


In [16]:
del gdp_pop['Country Name']
gdp_pop

Unnamed: 0,country,2018,2019
0,Afghanistan,493.756581,507.103392
1,Albania,5284.380184,5355.847795
2,Algeria,4153.956234,3975.508993
3,Andorra,41791.969837,40897.330873
4,Angola,3289.643995,2809.626088
...,...,...,...
187,Vanuatu,3125.404999,3102.346790
189,Viet Nam,2566.447487,2715.275980
190,Yemen,824.117718,
191,Zambia,1516.371100,1305.002214


In [17]:
gdp_pop = gdp_pop.reset_index()


In [20]:
del gdp_pop['index']
gdp_pop

Unnamed: 0,country,2018,2019
0,Afghanistan,493.756581,507.103392
1,Albania,5284.380184,5355.847795
2,Algeria,4153.956234,3975.508993
3,Andorra,41791.969837,40897.330873
4,Angola,3289.643995,2809.626088
...,...,...,...
181,Vanuatu,3125.404999,3102.346790
182,Viet Nam,2566.447487,2715.275980
183,Yemen,824.117718,
184,Zambia,1516.371100,1305.002214


In [21]:
gdp_pop[gdp_pop['2019'].isnull()]

Unnamed: 0,country,2018,2019
94,Liechtenstein,180366.715198,
183,Yemen,824.117718,


In [22]:
# 2019 nan값을 2018 정보로 대체
gdp_pop.iloc[94, 2] = gdp_pop.iloc[94, 1]
gdp_pop.iloc[183, 2] = gdp_pop.iloc[183, 1]
gdp_pop.iloc[94]

country    Liechtenstein
2018              180367
2019              180367
Name: 94, dtype: object

In [23]:
gdp_pop.iloc[183]

country      Yemen
2018       824.118
2019       824.118
Name: 183, dtype: object

In [93]:
del gdp_pop['2018']

KeyError: '2018'

In [94]:
gdp_pop.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 186 entries, 0 to 185
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   country  186 non-null    object 
 1   2019     186 non-null    float64
dtypes: float64(1), object(1)
memory usage: 3.0+ KB


In [98]:
# 데이터 타입 변환
gdp_pop = gdp_pop.astype({'2019':'int64'})
gdp_pop.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 186 entries, 0 to 185
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   country  186 non-null    object
 1   2019     186 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 3.0+ KB


In [99]:
gdp_pop

Unnamed: 0,country,2019
0,Afghanistan,507
1,Albania,5355
2,Algeria,3975
3,Andorra,40897
4,Angola,2809
...,...,...
181,Vanuatu,3102
182,Viet Nam,2715
183,Yemen,824
184,Zambia,1305
