In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

plt.rc('figure', figsize=(11, 9))
plt.rc('font', size=13)

import requests
import os
from datetime import timedelta, datetime as dt

import warnings
warnings.filterwarnings("ignore")

from ts_acquire import get_store_data, opsd_germany_daily
from ts_prepare import hist_plot, numeric_hists

<hr style="border-top: 10px groove lightcoral; margin-top: 1px; margin-bottom: 1px"></hr>

## Acquire Store

Use my imported function to read my csv file into a pandas DataFrame.

<hr style="border-top: 10px groove lightcoral; margin-top: 1px; margin-bottom: 1px"></hr>

In [2]:
df = get_store_data()

In [3]:
df.head()

Unnamed: 0,sale_amount,sale_date,sale_id,store_address,store_city,store_id,store_state,store_zipcode,item_brand,item_id,item_name,item_price,item_upc12,item_upc14
0,13.0,"Tue, 01 Jan 2013 00:00:00 GMT",1,12125 Alamo Ranch Pkwy,San Antonio,1,TX,78253,Riceland,1,Riceland American Jazmine Rice,0.84,35200264013,35200264013
1,11.0,"Wed, 02 Jan 2013 00:00:00 GMT",2,12125 Alamo Ranch Pkwy,San Antonio,1,TX,78253,Riceland,1,Riceland American Jazmine Rice,0.84,35200264013,35200264013
2,14.0,"Thu, 03 Jan 2013 00:00:00 GMT",3,12125 Alamo Ranch Pkwy,San Antonio,1,TX,78253,Riceland,1,Riceland American Jazmine Rice,0.84,35200264013,35200264013
3,13.0,"Fri, 04 Jan 2013 00:00:00 GMT",4,12125 Alamo Ranch Pkwy,San Antonio,1,TX,78253,Riceland,1,Riceland American Jazmine Rice,0.84,35200264013,35200264013
4,10.0,"Sat, 05 Jan 2013 00:00:00 GMT",5,12125 Alamo Ranch Pkwy,San Antonio,1,TX,78253,Riceland,1,Riceland American Jazmine Rice,0.84,35200264013,35200264013


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 913000 entries, 0 to 912999
Data columns (total 14 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   sale_amount    913000 non-null  float64
 1   sale_date      913000 non-null  object 
 2   sale_id        913000 non-null  int64  
 3   store_address  913000 non-null  object 
 4   store_city     913000 non-null  object 
 5   store_id       913000 non-null  int64  
 6   store_state    913000 non-null  object 
 7   store_zipcode  913000 non-null  int64  
 8   item_brand     913000 non-null  object 
 9   item_id        913000 non-null  int64  
 10  item_name      913000 non-null  object 
 11  item_price     913000 non-null  float64
 12  item_upc12     913000 non-null  int64  
 13  item_upc14     913000 non-null  int64  
dtypes: float64(2), int64(6), object(6)
memory usage: 104.5+ MB


### 1. DateTime Format

- Convert date column to datetime format.

- I'm going to use the `format` parameter to speed up datetime conversion from `Tue, 01 Jan 2013 00:00:00 GMT` to `2013-01-01`.

[Here](https://zach.lol/strftime.html) is the link to Zach's awesome python datetime format specifier resource.

[Here](https://www.programiz.com/python-programming/datetime/strftime) is another nice resource for datetime formatting with a little bit of visual explanation I found useful.

In [5]:
# Parse strings like 'Tue, 01 Jan 2013 00:00:00 GMT' into real datetimes.
# The values are deliberately NOT formatted back to strings with strftime:
# a string column would give an object Index after set_index('sale_date'),
# and the .month / .day_name() accessors used further down only exist on a
# DatetimeIndex. The source is GMT, so the tz info is dropped to leave plain
# naive dates (displayed as 2013-01-01).
df['sale_date'] = (
    pd.to_datetime(df.sale_date, format='%a, %d %b %Y %H:%M:%S %Z', utc=True)
    .dt.tz_localize(None)
)
                                

In [None]:
df.head()

In [None]:
df.info()

In [None]:
# Just taking a look at item_price stats.

df.item_price.describe()

### 2. Plot Distributions

- Plot the distribution of sale_amount and item_price.

- I'm importing a function I created from my ts_prepare module called `hist_plot`.

In [None]:
hist_plot(df=df, col='item_price', unit_label='US $', bins=20)

In [None]:
hist_plot(df=df, col='sale_amount', unit_label='# of Units Sold in a Day at a Store', bins=20)

### 3. DateTimeIndex

- Set the index to be the datetime variable.

In [None]:
df = df.set_index('sale_date').sort_index()

In [None]:
# First day in df is Jan 1st, 2013

df.index.min()

In [None]:
# Last day in df is Dec 31, 2017

df.index.max()

### 4. Date Parts

Add a 'month' and 'day of week' column to your dataframe, derived from the index using the keywords for those date parts.


>**If you have upgraded your pandas, use `.day_name()`, if not, `.weekday_name`. To check your pandas version, `pd.__version__`**


In [None]:
df['month'] = df.index.month
df['weekday'] = df.index.day_name()
df.head(2)

### 5.  Using .assign()

Add a column to your dataframe, `sales_total`, which is derived from `sale_amount` (total items) and `item_price`. I'll use `.assign()`, but it's not the only way.

```python
df = df.assign(new_col_name = your calculation)
```

In [None]:
df = df.assign(sales_total = df.sale_amount * df.item_price)
df.head(3)

### 6. Build Functions

**Why use the category type instead of the object type? This is a personal preference here because I wanted the functionality of the category object for future use. If you're interested, I thought [this article](https://pbpython.com/pandas_dtypes_cat.html) did a great job at speaking to the topic.**

In [None]:
# Identifier-like columns are stored as object instead of numbers, and the
# two date-part columns become categoricals.

id_columns = ['sale_id', 'store_id', 'store_zipcode', 'item_id', 'item_upc12', 'item_upc14']
new_dtypes = {column: object for column in id_columns}
new_dtypes.update(month='category', weekday='category')

df = df.astype(new_dtypes)

In [None]:
df.info()

In [None]:
def numeric_hists(df, bins=20):
    """
    Plot one histogram per numeric column of a DataFrame.

    df:   DataFrame to plot; non-numeric columns are ignored.
    bins: number of histogram bins per column (default 20).

    Shows the figure and returns None.
    """
    # NOTE(review): this definition shadows the numeric_hists imported from
    # ts_prepare in the first cell -- confirm which one should be kept.
    numeric_only = df.select_dtypes(include=np.number)
    numeric_only.hist(color='thistle', bins=bins)
    plt.suptitle('Numeric Column Distributions')
    plt.show()

In [None]:
numeric_hists(df, bins=30)

In [None]:
def prepped_store_df():
    """
    Acquire and prepare the store sales DataFrame.

    Steps:
      1. Read the data with get_store_data().
      2. Parse sale_date into datetimes and use it as a sorted DatetimeIndex.
      3. Add 'month' and 'weekday' date-part columns from the index.
      4. Add the calculated columns 'sales_total' and 'sales_diff'.
      5. Recast identifier columns to object and date parts to category.
      6. Display histograms of the remaining numeric columns.

    Returns the prepared DataFrame.
    """
    # Acquire the df
    df = get_store_data()

    # Create DateTimeIndex. The parsed values must stay datetimes: formatting
    # them back to strings with strftime would leave an object Index, and the
    # .month / .day_name() accessors below only exist on a DatetimeIndex.
    # The source strings are GMT, so tz info is dropped to get naive dates.
    df['sale_date'] = (
        pd.to_datetime(df.sale_date, format='%a, %d %b %Y %H:%M:%S %Z', utc=True)
        .dt.tz_localize(None)
    )
    df = df.set_index('sale_date').sort_index()

    # Create date part columns
    df['month'] = df.index.month
    df['weekday'] = df.index.day_name()

    # Create calculated columns
    df = df.assign(sales_total = df.sale_amount * df.item_price)
    # Row-over-row change in sales_total, in the date-sorted row order
    df = df.assign(sales_diff = df.sales_total.diff(periods=1))

    # Change dtypes of identifier columns to object and date parts to category
    df = (df.astype({'sale_id': object,
                     'store_id': object,
                     'store_zipcode': object,
                     'item_id': object,
                     'item_upc12': object,
                     'item_upc14': object,
                     'month': 'category',
                     'weekday': 'category'}))

    # Display distributions of numeric columns
    numeric_hists(df)

    return df

In [None]:
df = prepped_store_df()

In [None]:
df.head(2)

In [None]:
df.info()

<hr style="border-top: 10px groove lightcoral; margin-top: 1px; margin-bottom: 1px"></hr>

## Acquire `opsd_df`

<hr style="border-top: 10px groove lightcoral; margin-top: 1px; margin-bottom: 1px"></hr>

In [None]:
gdf = opsd_germany_daily()
gdf.head(2)

In [None]:
gdf.info()

### 1. DateTime Format

- Convert date column to datetime format. 

In [None]:
gdf['Date'] = pd.to_datetime(gdf.Date)

In [None]:
gdf.info()

### 2. Plot Distributions

- Plot the distribution of each of your variables. 

### 3. DateTimeIndex

- Set the index to be the datetime variable. 

In [None]:
gdf = gdf.set_index('Date').sort_index()

### 4. Date Parts

- Add a month and a year column to your dataframe.

In [None]:
# Add date-part columns derived from the index. They are left as plain
# integers in this cell; prepped_energy() further down stores the same
# columns with the category dtype instead.
gdf['month'] = gdf.index.month
gdf['year'] = gdf.index.year

In [None]:
gdf.head(2)

### 5. Build Function

In [None]:
def prepped_energy():
    """
    Acquire and return a prepared DataFrame for the OPSD German energy data,
    and display histograms for its numeric columns.

    Steps:
      1. Read the data with opsd_germany_daily().
      2. Parse the 'Date' column and use it as a sorted DatetimeIndex.
      3. Add 'month' and 'year' columns (category dtype) from the index.
      4. Plot the numeric column distributions.

    Returns the prepared DataFrame.
    """
    # Acquire the raw df. The original body called german_energy_csv(), which
    # is neither imported nor defined in this notebook; opsd_germany_daily is
    # the loader imported from ts_acquire and used in the cells above.
    gdf = opsd_germany_daily()

    # Build the DatetimeIndex, mirroring the step-by-step cells above;
    # the index.month / index.year accessors below require it.
    gdf['Date'] = pd.to_datetime(gdf.Date)
    gdf = gdf.set_index('Date').sort_index()

    # Create new date part columns as category dtypes
    gdf['month'] = gdf.index.month.astype('category')
    gdf['year'] = gdf.index.year.astype('category')

    # Plot numeric column distributions
    numeric_hists(gdf)

    return gdf

In [None]:
gdf = prepped_energy()

In [None]:
gdf.head(2)

In [None]:
gdf.info()

<hr style="border-top: 10px groove lightcoral; margin-top: 1px; margin-bottom: 1px"></hr>

## Acquire `sf_temps`

<hr style="border-top: 10px groove lightcoral; margin-top: 1px; margin-bottom: 1px"></hr>

In [None]:
from vega_datasets import data

sfdf = data.sf_temps()
sfdf.head(2)

### DateTimeIndex

In [None]:
# Convert sfdf to a DateTime Series df

sfdf['date'] = pd.to_datetime(sfdf.date)
sfdf = sfdf.set_index('date').sort_index()
sfdf.head(2)

In [None]:
sfdf.info()

In [None]:
numeric_hists(sfdf, bins=15)

### 1. `sfdf.resample('D')`

Resample by the day and take the average temperature. Visualize the average temperature over time.

>One awesome feature of a DateTimeIndex is simplicity in plotting, as matplotlib will automatically treat it as the x axis, so we don’t need to explicitly specify anything.

```python
datetimeindex_df.col.plot(kind='bar', color='thistle')
```

In [None]:
# Average temperature per day; the DatetimeIndex is used as the x axis.
# (shift + option + 8 == degree symbol)

daily_mean = sfdf.resample('D').mean()
ax = daily_mean.plot(color='peru')

ax.set_title('San Francisco 2010 Average Daily Temperatures')
ax.set_ylabel('Temp in °F')
plt.show()

### 2. `.min()`

Write the code necessary to visualize the minimum temperature over time.

In [None]:
# Lowest temperature recorded each day.
# (shift + option + 8 == degree symbol)

daily_min = sfdf.resample('D').min()
ax = daily_min.plot(color='lightskyblue')

ax.set_title('San Francisco 2010 Minimum Daily Temperatures')
ax.set_ylabel('Temp in °F')
plt.show()

### 3.  `.max()`

Write the code necessary to visualize the maximum temperature over time.

In [None]:
# Highest temperature recorded each day.
# (shift + option + 8 == degree symbol)

daily_max = sfdf.resample('D').max()
ax = daily_max.plot(color='crimson')

ax.set_title('San Francisco 2010 Maximum Daily Temperatures')
ax.set_ylabel('Temp in °F')
plt.show()

### 4. `.idxmin()`

Which month is the coldest, on average?

In [None]:
# Show the coldest month together with its average temperature.
# idxmin() returns the month-end label of the monthly frame, so the lookup
# must be done on that monthly frame. Indexing the hourly sfdf with the label
# would return a single hourly reading instead of the monthly average.

monthly_avg = sfdf.resample('M').mean()
monthly_avg.loc[monthly_avg.idxmin()]

### 5. `.idxmax()`

Which month has the highest average temperature?

In [None]:
# Show the hottest month together with its average temperature.
# idxmax() returns the month-end label of the monthly frame, so the lookup
# must be done on that monthly frame. Indexing the hourly sfdf with the label
# would return a single hourly reading instead of the monthly average.

monthly_avg = sfdf.resample('M').mean()
monthly_avg.loc[monthly_avg.idxmax()]

### 6. `.agg(['min', 'max'])`

Resample by the day and calculate the min and max temp for the day. Use this resampled dataframe to calculate the change in temperature for the day. Which month has the highest daily temperature variability?

- Hint: `.agg(['min', 'max'])`

In [None]:
# resample by day and get min and max temps in df

min_max = sfdf.resample('D').agg(['min', 'max'])
min_max.head()

In [None]:
# take care of multi-index of df

min_max.columns = ['min_temp', 'max_temp']
min_max.head()

In [None]:
# create temp_range column

min_max['temp_range'] = min_max.max_temp - min_max.min_temp
min_max.head()

In [None]:
# The question asks which *month* has the highest daily temperature
# variability, so average the daily temp_range within each month and show
# the month with the largest average. Taking idxmax() of the daily frame
# would only identify the single most variable day.

monthly_range = min_max.temp_range.resample('M').mean()
monthly_range.loc[[monthly_range.idxmax()]]

### 7. Bonus: 

Visualize the daily min, average, and max temperature over time on a single line plot.

- i.e. the min, average, and maximum temperature should be 3 separate lines.

In [None]:
# Create agg df

temp_agg = sfdf.resample('D').agg(['min', 'mean', 'max'])
temp_agg.head(2)

In [None]:
# Handle multi-index

temp_agg.columns = ['min_temp', 'mean_temp', 'max_temp']
temp_agg.head(2)

In [None]:
temp_agg.plot(color=['lightskyblue', 'peru', 'crimson'])

plt.legend()
plt.title('San Francisco 2010 Daily Temperature Breakdown')
plt.xlabel('')
plt.ylabel('Temperature in °F')
plt.show()

<hr style="border-top: 10px groove lightcoral; margin-top: 1px; margin-bottom: 1px"></hr>

## Acquire `seattle_weather`

<hr style="border-top: 10px groove lightcoral; margin-top: 1px; margin-bottom: 1px"></hr>

<hr style="border-top: 10px groove lightcoral; margin-top: 1px; margin-bottom: 1px"></hr>

## Acquire `flights_20k`

<hr style="border-top: 10px groove lightcoral; margin-top: 1px; margin-bottom: 1px"></hr>

In [None]:
fdf = data.flights_20k()
fdf.head(2)

### Convert df to DateTime Index

In [None]:
# Use the `date` column as the sorted index of fdf.
# NOTE(review): no pd.to_datetime conversion happens here; later cells rely
# on index.hour / index.day_name(), so `date` is presumably already a
# datetime column when loaded -- confirm.

fdf = fdf.set_index('date').sort_index()
fdf.head(2)

In [None]:
fdf.info()

In [None]:
# First datetime in dataset is January 1 at 1 AM

fdf.index.min()

In [None]:
# Last datetime in dataset is March 31 at 9:30

fdf.index.max()

In [None]:
# This is some skewed data!

numeric_hists(fdf)

### 1. Convert any negative delays to 0.

In [None]:
# Use original for quick visual check .where() is working.

fdf.head()

In [None]:
 # df.col = np.where(this_is_true, do_this, else_do_that)

# Flag the negative delays, then replace just those with 0.
is_negative = fdf.delay < 0
fdf['delay'] = np.where(is_negative, 0, fdf.delay)
fdf.head()

In [None]:
# Here I validate that I have replaced negative numbers in delay

fdf[fdf.delay < 0]

### 2. Which hour of the day has the highest average delay?

In [None]:
# I'm going to create a column for hour, so I can groupby hour

fdf['hour'] = fdf.index.hour.astype('category')
fdf.head(2)

In [None]:
# I groupby hour and find the mean value for delay for each hour
# I sort the values to find that 3 PM has the highest value for average delay

fdf.groupby('hour')[['delay']].mean().sort_values(by='delay', ascending=False)

### 3. Does the day of the week make a difference in the delay amount?

In [None]:
# Add column for the day of the week, so I can groupby weekday

fdf['weekday'] = fdf.index.day_name()
fdf.head(2)

In [None]:
# I groupby the weekday and examine the average delay

fdf.groupby('weekday')[['delay']].mean().sort_values('delay', ascending=False)

In [None]:
cats = ['Sunday','Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']
cats.reverse()

fdf.groupby('weekday')[['delay']].mean().reindex(cats).plot(kind='barh', color='thistle')
plt.title('Average Delay by Day of the Week')
plt.ylabel('')
plt.xlabel('Delay in Minutes')
plt.show()

In [None]:
# Interesting. I wanted to see the weekday/hour combo with the highest avg delay
# Friday at 3 PM tops the charts; people are flying home and away from home.

fdf.groupby(['weekday', 'hour'])[['delay']].mean().idxmax()

### 4. Does the month make a difference in the delay amount?

In [None]:
# Create a month column to groupby month. This dataset contains Jan, Feb, March

fdf['month'] = fdf.index.month.astype('category')
fdf.month.value_counts(dropna=False)

In [None]:
# I groupby month to examine the average delay by month
# It looks like February, typically the coldest month of the year in the N. Hemisphere,
# has slightly more delay time on average than January

fdf.groupby('month')[['delay']].mean().sort_values(by='delay', ascending=False)

### Just Curious Stuff...

In [None]:
# February at 3 PM had the highest average delay

fdf.groupby(['month', 'hour'])[['delay']].mean().sort_values(by='delay', ascending=False)

In [None]:
fdf.delay.plot(color='thistle')

plt.title('There Are Our Outliers')
plt.show()

In [None]:
fdf[fdf.month == 2].sort_values(by='delay', ascending=False)

In [None]:
# Here is our longest delay. The 0 value for delay makes the delay average for this month, day, time combo drop substantially. 

fdf.loc[fdf.delay.idxmax()]

In [None]:
cats = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']
cats.reverse()

fdf.weekday.value_counts().reindex(cats).plot(kind='barh', color='thistle')

plt.ylabel('')
plt.xlabel('Number of Flights')
plt.xticks(rotation=0)
plt.title('Distribution of Flights by Weekday in Dataset')
plt.show()

In [None]:
fdf.weekday.value_counts(ascending=False)

In [None]:
# I was curious about store sales by location

df.groupby('store_address').sales_total.sum().plot(kind='barh', color='thistle')

plt.title('Total Sales by Store Address')
plt.ylabel('')
plt.xlabel('US Dollars')
plt.show()

In [None]:
# Largest sale_amount recorded at each store address, highest first.
# This only shows the size of the biggest value per address; it does not
# identify which item it belongs to.

df.groupby('store_address')[['sale_amount']].max().sort_values(by='sale_amount', ascending=False)

In [None]:
# Smallest sale_amount recorded at each store address, highest first.
# This only shows the size of the smallest value per address; it does not
# identify which item it belongs to.

df.groupby('store_address')[['sale_amount']].min().sort_values(by='sale_amount', ascending=False)