In [48]:
import pandas as pd
import numpy as np

# set up display area to show dataframe in jupyter qtconsole

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', -1)


# Filter all warnings. If you would like to see the warnings, please comment the two lines below.
import warnings
warnings.filterwarnings('ignore')

In [208]:
def answer_one():
    Energy = pd.read_excel("assets/Energy Indicators.xls", usecols=[2,3,4,5], skipfooter=1)
    GDP = pd.read_csv("assets/world_bank.csv", skiprows=4)
    ScimEn = pd.read_excel("assets/scimagojr-3.xlsx").set_index("Country")

    # set proper nans
    Energy = Energy.replace(['NaN','...'], np.nan)

    # set columns
    Energy.columns=['Country', 'Energy Supply', 'Energy Supply per Capita', '% Renewable']

    # remove dirty outliers
    Energy = Energy.drop([8,9,15,16])

    # remove dirty header text
    rows_to_omit_based_on_renewables = ['Last update: December 2015',
                                        'website: http://unstats.un.org/unsd/ENVIRONMENT/qindicators.htm']

    rows_to_omit_based_on_country = ['Environmental Indicators: Energy',
                                     'Energy Supply and Renewable Electricity Production in 2013',
                                     'Choose a country from the following drop-down list:',
                                     'Excludes the overseas territories.',
           'Data exclude Hong Kong and Macao Special Administrative Regions (Hong Kong SAR and Macao SAR) and Taiwan Province.',
           'Data on kerosene-type jet fuel include aviation gasoline and other kerosene.',
           'For confidentiality reasons, data on coal and coal products, jet fuel, petroleum coke and other petroleum products (2009-2012), stock changes of other kerosene (2010-12), exports of charcoal (2009-2012), and data on fuelwood (2010-2012) are not available.',
           'Data exclude Greenland and the Danish Faroes.',
           'Data include Monaco, and exclude the following overseas departments and territories: Guadeloupe, Guyana, Martinique, New Caledonia, French Polynesia, Reunion, and St. Pierre and Miquelon.',
           'Data for kerosene-type jet fuel include other kerosene.',
           'Data include Timor-Leste until 2001.',
           'Data include San Marino and the Holy See. ',
           'Data include Okinawa.',
           'The data for crude oil production include 50 per cent of the output of the Neutral Zone. ',
           'Data exclude Suriname and the Netherlands Antilles.',
           'Data include the Azores and Madeira.',
           'Data for crude oil production include 50 per cent of the output of the Neutral Zone.',
           'Data exclude Kosovo from 2000 onwards.',
           'Data include the Canary Islands.',
           'Data include Liechtenstein for oil statistics.',
           'For confidentiality reasons, data on the following products (mainly on production) may not be available or may be included with other products: lignite (included with peat), patent fuel (included with peat briquettes), aviation gasoline, kerosene-type jet fuel, other kerosene, petroleum coke, paraffin waxes and white spirit (the latter three included with other oil products).',
           'Shipments of coal and oil to Jersey, Guernsey and the Isle of Man from the United Kingdom are not classed as exports. Supplies of coal and oil to these islands are, therefore, included as part of UK supply. Exports of natural gas to the Isle of Man are included with the exports to Ireland. ',
           'Includes the 50 states and the District of Columbia. Oil statistics as well as coal trade statistics also include Puerto Rico, Guam, the U.S. Virgin Islands, American Samoa, Johnston Atoll, Midway Islands, Wake Island and the Northern Mariana Islands. ']
    Energy = Energy[~Energy.Country.isin(rows_to_omit_based_on_country)]
    Energy = Energy[~Energy["% Renewable"].isin(rows_to_omit_based_on_renewables)]

    # remove superscripts
    Energy.Country = Energy.Country.replace("\d+","",regex=True)

    # convert type to floats
    Energy["Energy Supply"] = pd.to_numeric(Energy["Energy Supply"])

    # convert Energy Supply Column to gigajoules
    Energy["Energy Supply"] = Energy["Energy Supply"]*1000000

    # Rename the following list of countries (for use in later questions):
    energy_country_subs = {"Republic of Korea": "South Korea",
    "United States of America": "United States",
    "United Kingdom of Great Britain and Northern Ireland": "United Kingdom",
    "China, Hong Kong Special Administrative Region": "Hong Kong"}
    Energy = Energy.replace({'Country': energy_country_subs}) 

    # replace strip parentheses strings from countries, remove empty rows
    Energy = (Energy.replace({'Country': {' \(.+\)': ""}}, regex=True)
             .dropna(how='all')
             .set_index("Country"))

    # Rename the following list of countries (for use in later questions):
    gdp_country_subs = {"Korea, Rep.": "South Korea", 
    "Iran, Islamic Rep.": "Iran",
    "Hong Kong SAR, China": "Hong Kong"}
    GDP = (GDP.replace({'Country Name': gdp_country_subs})
           .rename(columns={'Country Name':'Country'})
           .set_index("Country"))


    # Join the three datasets: GDP, Energy, and ScimEn into a new dataset (using the intersection of country names). 
    # Use only the last 10 years (2006-2015) of GDP data and only the top 15 countries by Scimagojr 'Rank' 
    # (Rank 1 through 15).

    final_cols = ['Rank', 'Documents', 'Citable documents', 'Citations', 'Self-citations', 'Citations per document', 'H index', 'Energy Supply', 'Energy Supply per Capita', '% Renewable', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015']

    drop_cols = ['Country Code','Indicator Name', 'Indicator Code', '1960', '1961', '1962', '1963',
           '1964', '1965', '1966', '1967', '1968', '1969', '1970', '1971', '1972',
           '1973', '1974', '1975', '1976', '1977', '1978', '1979', '1980', '1981',
           '1982', '1983', '1984', '1985', '1986', '1987', '1988', '1989', '1990',
           '1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999',
           '2000', '2001', '2002', '2003', '2004', '2005', ]


    SciEn_Energy = pd.merge(ScimEn, Energy, how='right', left_index=True, right_index=True)
    global full_df # set as a global variable so you can access it later
    full_df = (pd.merge(SciEn_Energy, GDP, how='left', left_index=True, right_index=True)
                  .sort_values("Rank"))

    answer_one = (full_df[(full_df["Rank"]<16)]
                  .drop(drop_cols, axis=1))
    return answer_one

In [209]:
assert type(answer_one()) == pd.DataFrame, "Q1: You should return a DataFrame!"

assert answer_one().shape == (15,20), "Q1: Your DataFrame should have 20 columns and 15 entries!"


In [206]:
def answer_two():
    answer_two = len(full_df) - len(answer_one())
    return answer_two
    #raise NotImplementedError()

In [207]:
assert type(answer_two()) == int, "Q2: You should return an int number!"

### Question 3
What are the top 15 countries for average GDP over the last 10 years?

*This function should return a Series named `avgGDP` with 15 countries and their average GDP sorted in descending order.*

In [204]:
def answer_three():  
    # get sub-df of just GDP scores
    just_GDPs = answer_one()[["2006", "2007", "2008", "2009","2010", "2011","2012", "2013","2014", "2015"]]

    # use apply(means)
    just_GDPs["avgGDP"] = just_GDPs.apply(np.mean, axis=1)

    just_GDPs.sort_values(by="avgGDP", ascending=False, inplace=True)

    avgGDP = just_GDPs["avgGDP"]
    return avgGDP

In [205]:
assert type(answer_three()) == pd.Series, "Q3: You should return a Series!"

### Question 4
By how much had the GDP changed over the 10 year span for the country with the 6th largest average GDP?

*This function should return a single number.*

In [210]:
def answer_four():
    # locate the 6th largest average GDP, using the series from Question 3
    sixth_country = answer_three().index[(6-1)] # 0-indexed

    # get range of GDPs from 2006-2015
    UK_GDPs = just_GDPs.loc[sixth_country]
    answer_four = max(UK_GDPs) - min(UK_GDPs)
    answer_four = round(answer_four, 2)
    #return answer_four
return(answer_four)

'United Kingdom'

In [197]:
answer_four()

299285784765.14

### Question 5
What is the mean energy supply per capita?

*This function should return a single number.*

In [166]:
df = answer_one()
df["Energy Supply per Capita"].dropna().mean()

157.6

### Question 6
What country has the maximum % Renewable and what is the percentage?

*This function should return a tuple with the name of the country and the percentage.*

In [186]:
def answer_six():
    df = answer_one()
    max_renew = df[(df["% Renewable"] == df["% Renewable"].max())]
    answer = tuple([max_renew.index[0], max_renew["% Renewable"]])
    return(answer)

In [188]:
assert type(answer_six()) == tuple, "Q6: You should return a tuple!"

assert type(answer_six()[0]) == str, "Q6: The first element in your result should be the name of the country!"


### Question 7
Create a new column that is the ratio of Self-Citations to Total Citations. 
What is the maximum value for this new column, and what country has the highest ratio?

*This function should return a tuple with the name of the country and the ratio.*