# Data wrangling with Pandas exercise
* For this exercise we will be using the `listings.csv` data file.

In [None]:
import numpy as np
import pandas as pd
from pandas import DataFrame
from pandas.core.groupby import GroupBy
from pandas import Series
from pandas.io.formats.style import Styler

# Load in the data file using `pd.read_csv()`

In [None]:
# Read the listings data set into the working DataFrame `df`.

df = pd.read_csv(filepath_or_buffer='data/listings.csv')
df  # Bare expression as the last line of the cell, so the frame is shown as the cell output.

In [None]:
from collections.abc import Iterable
from functools import partial, reduce

def flatten(list):
    """Recursively flatten nested iterables into a one-dimensional stream of items.

    Yields the leaves of `list` in order. Any element that is itself iterable is
    descended into, except `str` and `bytes`, which are iterable but are yielded
    whole as single items. Being a generator, nothing is consumed until iterated.
    """
    atomic_types = (str, bytes)  # Iterable, but must never be split into characters/bytes.
    for element in list:
        is_nested = isinstance(element, Iterable) and not isinstance(element, atomic_types)
        if not is_nested:
            # Leaf value: hand it out unchanged.
            yield element
            continue
        # Nested container: flatten it in place, to any depth.
        yield from flatten(element)

def pipe(input, *args):
    """Thread `input` through a sequence of single-argument callables.

    Mirrors the shape of the F# `|>` operator: the first callable receives
    `input`, and each later callable receives the previous callable's result.
    `args` may contain nested lists of callables; they are flattened first so
    the steps are applied in one flat left-to-right order.

    Returns the result of the last callable, or `input` itself if no callables
    are given.
    """
    result = input
    for step in flatten(args):
        result = step(result)
    return result

def format_dataframe(*args, **kwargs):
    """Return a one-argument function that applies `dataframe.style.format(...)`.

    The positional and keyword arguments given here are captured and forwarded
    unchanged to `style.format` when the returned function is called with a
    dataframe, which makes the formatting usable as a step in `pipe`.
    """
    def apply_format(dataframe):
        return dataframe.style.format(*args, **kwargs)

    return apply_format

# Value formatters: each one is the bound `str.format` method of a single-field
# format string, so calling it with one value returns the formatted text.

format_currency = '${:,.2f}'.format # Dollar sign, thousands separators, two decimals (pattern: $#,##0.00)

format_integer = '{:,.0f}'.format # Thousands separators, rounded to zero decimals, no currency sign (pattern: #,##0)

format_number = '{:,}'.format # Thousands separators only; no precision is given, so floats keep their default digits

format_percentage = '{:.2%}'.format # Value shown as a percentage with two decimals (pattern: 0.00%)

# `Series.to_string` with `float_format` pre-bound, so a Series can be rendered to text in a single `pipe` step.
currency_series_to_string = partial(Series.to_string, float_format=format_currency)

percentage_series_to_string = partial(Series.to_string, float_format=format_percentage)

## Exercise 2 - Filtering

Return the following subsets of the dataframe.

1. How many listings are there with a price less than 100? 


2. Find how many listings there are in just Brooklyn.


3. Find how many listings there are in Brooklyn with a price less than 100.


4. Using `.isin()` select anyone that has the host name of Michael, David, John, and Daniel.


5. Create a new column called `adjusted_price` that has $100 added to every listing in Williamsburg.  The prices for all other listings should be the same as they were before. 


6. What % of the rooms are private, and what % of the rooms are shared.  
    * Hint, use `.value_counts()`


In [None]:
# 1. How many listings are there with a price less than 100?

# Keep only the rows priced under 100, then report the row count with thousands separators.
cheap_listings = df[df.price < 100]
print(format_number(len(cheap_listings)))

In [None]:
# 2. Make a new DataFrame of listings in Brooklyn named `df_bk`
# and find how many listings there are in just Brooklyn.

# Bind the Brooklyn subset to `df_bk` as the exercise asks, instead of only counting it.
df_bk = df.query('neighbourhood_group == "Brooklyn"')

# Count the rows, add thousands separators and print the result.
pipe(df_bk,
     len,
     format_number,
     print)

In [None]:
# 3. Find how many listings there are in Brooklyn with a price less than 100.

# Build the two conditions separately, then keep rows where both hold.
in_brooklyn = df.neighbourhood_group == 'Brooklyn'
under_100 = df.price < 100
print(format_number(len(df[in_brooklyn & under_100])))

In [None]:
# 4. Using `.isin()` select anyone that has the host name of Michael, David, John, and Daniel.

# Rows whose host_name matches any of the wanted names.
wanted_hosts = ['Michael', 'David', 'John', 'Daniel']
df.loc[df.host_name.isin(wanted_hosts)]

In [None]:
# 5. Create a new column called `adjusted_price` that has $100 added to every listing in Williamsburg.  
# The prices for all other listings should be the same as they were before. 

def illustrate_adjusted_price(df):
    """Return a styled preview of `price` vs `adjusted_price` around Williamsburg.

    Reduces `df` to one entry per neighbourhood (via `GroupBy.first`), locates
    the Williamsburg entry, and returns the up-to-11-row window centred on it
    (five rows either side, clipped at the ends of the table) with both price
    columns rendered as currency.
    """
    columns = ['id', 'neighbourhood', 'price', 'adjusted_price']

    # One entry per neighbourhood; reset_index turns neighbourhood back into a
    # regular column and gives the table a 0..n-1 integer index.
    per_neighbourhood = df[columns].groupby(by='neighbourhood').first().reset_index()

    # idxmax on the boolean match gives the index label of the first True,
    # i.e. the Williamsburg row.
    centre = per_neighbourhood.neighbourhood.eq('Williamsburg').idxmax()

    # Five rows before and five rows after, without running off either end.
    start = max(centre - 5, 0)
    stop = min(centre + 6, len(per_neighbourhood))
    window = per_neighbourhood.get(slice(start, stop))

    return window.style.format({'price': format_currency,
                                'adjusted_price': format_currency})

# Williamsburg listings get $100 added; every other listing keeps its original price.
is_williamsburg = df.neighbourhood == 'Williamsburg'
df['adjusted_price'] = np.where(is_williamsburg, df.price + 100, df.price)
illustrate_adjusted_price(df)

In [None]:
# 6. What % of the rooms are private, and what % of the rooms are shared.

# normalize=True turns the per-type counts into fractions of the total.
room_type_shares = df.room_type.value_counts(normalize=True)
print(percentage_series_to_string(room_type_shares))

# Exercise 3 - Grouping

1. Using `groupby`, count how many listings are in each neighbourhood_group.


2. Using `groupby`, find the mean price for each of the neighbourhood_groups. 


3. Using `groupby` and `.agg()`, find the min and max price for each of the neighbourhood_groups. 


4. Using `groupby`, find the median price for each room type in each neighbourhood_group.


5. Using `groupby` and `.agg()`, find the count, min, max, mean, median, and std of the prices for each room type in each neighbourhood_group.

In [None]:
# 1. Using `groupby`, count how many listings are in each neighbourhood_group.

# The grouped object is kept in `df_byneighbourhoods` because later cells reuse it.
df_byneighbourhoods = df.groupby(by='neighbourhood_group')
listing_counts = df_byneighbourhoods.id.agg(['count'])
listing_counts.style.format({'count': format_number})

In [None]:
# 2. Using `groupby`, find the mean price for each of the neighbourhood_groups.

# Average price per group, rendered as currency text and printed.
mean_prices = df_byneighbourhoods.price.mean()
print(currency_series_to_string(mean_prices))

In [None]:
# 3. Using `groupby` and `.agg()`, find the min and max price for each of the neighbourhood_groups.

# Both extremes are prices, so both columns share the currency formatter.
price_range = df_byneighbourhoods.price.agg(['min', 'max'])
price_range.style.format({'min': format_currency,
                          'max': format_currency})

In [None]:
# 4. Using `groupby`, find the median price for each room type in each neighbourhood_group.

# Group on both keys; the grouped object is kept because the next cell reuses it.
df_byneighbourhood_byroom = df.groupby(by=['neighbourhood_group', 'room_type'])

# The exercise asks for the median (not the mean) price per group.
df_byneighbourhood_byroom.price \
                         .agg(['median']) \
                         .style.format({'median': format_currency})

In [None]:
# 5. Using `groupby` and `.agg()`, find the count, min, max, mean, median, and std of the prices
# for each room type in each neighbourhood_group.

# `count` is a plain tally; every other statistic is a dollar amount, including
# the standard deviation, so they all share the two-decimal currency formatter.
df_byneighbourhood_byroom.price \
                         .agg(['count', 'min', 'max', 'mean', 'median', 'std']) \
                         .style.format({'count': format_number,
                                        'min': format_currency,
                                        'max': format_currency,
                                        'mean': format_currency,
                                        'median': format_currency,
                                        'std': format_currency})

# Join and file saving.
1. Load the `prices.csv` and the `n_listings.csv`


2. Do a join that keeps all the records from each table.
    * Neighbourhood groups should include ['Bronx', 'Brooklyn', 'Manhattan', 'Queens', 'Staten Island',
       'LongIsland']
       
       
3. Save your joined csv as `joined.csv`


4. Load your saved table and see if it looks the same as, or different from, the DataFrame you used to create it. 

In [None]:
# 1. Load the `prices.csv` and the `n_listings.csv`

# prices.csv is read with a two-character separator (comma + space). Separators
# longer than one character are only supported by the python parser engine, so
# request it explicitly rather than relying on the silent fallback, which
# emits a ParserWarning.
df_prices = pd.read_csv('data/prices.csv',
                        sep=', ',
                        engine='python',
                        index_col='neighbourhood_group')

# n_listings.csv is read with a semicolon separator.
df_listings = pd.read_csv('data/n_listings.csv',
                          sep=';',
                          index_col='neighbourhood_group')
print(f'Loaded {len(df_prices)} rows from prices and {len(df_listings)} rows from listings.')

In [None]:
# 2. Do a join that keeps all the records from each table.

# Outer join on the shared neighbourhood_group index: groups present in only
# one table are kept, and their missing values are shown as blanks.
joined_formats = {'mean_price': format_currency,
                  'n_listings': format_integer}
df_prices_listings = df_prices.join(df_listings, how='outer')
df_prices_listings.style.format(joined_formats, na_rep='')

In [None]:
# 3. Save your joined csv as joined.csv

# Write the joined table (index included) to disk.
df_prices_listings.to_csv(path_or_buf='data/joined.csv')

# Remove the in-memory name; presumably so the next cell has to reload the table from the file.
del df_prices_listings

In [None]:
# 4. Load your saved table and see if it looks the same as, or different from, the DataFrame you used to create it.

# Restore neighbourhood_group as the index and apply the same display formats as before saving.
joined_formats = {'mean_price': format_currency,
                  'n_listings': format_integer}
df_prices_listings = pd.read_csv('data/joined.csv', index_col='neighbourhood_group')
df_prices_listings.style.format(joined_formats, na_rep='')

# Use the grammys.csv data for the next section of questions.

1. Who won Album of the Year in 2016?


2. Who won Best Rap Album in 2009?


3. How many awards was Kendrick Lamar nominated for, and how many did he win...?

In [None]:
# 1. Who won Album of the Year in 2016?

# From here on `df` refers to the Grammys data rather than the listings data.
df = pd.read_csv('data/grammys.csv')

# Name each condition, then keep the rows that satisfy all three.
is_2016 = df.year == 2016
is_winner = df.winner == True
is_album_of_the_year = df.category == 'Album of the Year'
df[is_2016 & is_winner & is_album_of_the_year]

In [None]:
# 2. Who won Best Rap Album in 2009?

# Name each condition, then keep the rows that satisfy all three.
is_2009 = df.year == 2009
is_winner = df.winner == True
is_best_rap_album = df.category == 'Best Rap Album'
df[is_2009 & is_winner & is_best_rap_album]

In [None]:
# 3. How many awards was Kendrick Lamar nominated for, and how many did he win...?

# A nomination counts if he is the nominee or is credited among the workers.
# na=False makes rows with a missing `workers` value count as "no match"
# instead of putting NaN into the boolean mask; regex=False treats the name as
# plain text rather than a regular expression.
df_Kendrick_Lamar_nominated = df[(df.nominee == 'Kendrick Lamar')
                               | (df.workers.str.contains('Kendrick Lamar', na=False, regex=False))]

# Of those nominations, keep the ones flagged as wins.
df_Kendrick_Lamar_won = df_Kendrick_Lamar_nominated.query('winner')
print(f'Kendrick Lamar was nominated {len(df_Kendrick_Lamar_nominated)} times and won {len(df_Kendrick_Lamar_won)} times.')