In [1]:
import pandas as pd
import numpy as np

# Text Methods

In [2]:
mystring = 'hello'
mystring.capitalize()

'Hello'

In [3]:
mystring.isdigit()

False

In [4]:
#help(str)

## Text Methods on Pandas String Column

Pandas can do a lot more than what we show here. Full online documentation on things like advanced string indexing and regular expressions with pandas can be found here: https://pandas.pydata.org/docs/user_guide/text.html

In [5]:
names = pd.Series(['andrew','bobo','claire','david','4'])
names

0    andrew
1      bobo
2    claire
3     david
4         4
dtype: object

In [6]:
names.str.capitalize()

0    Andrew
1      Bobo
2    Claire
3     David
4         4
dtype: object

In [7]:
names.str.isdigit()

0    False
1    False
2    False
3    False
4     True
dtype: bool

## Splitting , Grabbing, and Expanding

In [8]:
tech_finance = ['GOOG,APPL,AMZN','JPM,BAC,GS']
len(tech_finance)

2

In [9]:
tickers = pd.Series(tech_finance)
tickers

0    GOOG,APPL,AMZN
1        JPM,BAC,GS
dtype: object

In [10]:
tickers.str.split(',')

0    [GOOG, APPL, AMZN]
1        [JPM, BAC, GS]
dtype: object

In [11]:
tickers.str.split(',').str[0]

0    GOOG
1     JPM
dtype: object

In [12]:
tickers.str.split(',',expand=True)

Unnamed: 0,0,1,2
0,GOOG,APPL,AMZN
1,JPM,BAC,GS


## Cleaning or Editing Strings

In [13]:
messy_names = pd.Series(["andrew  ","bo;bo","  claire  "])
messy_names

0      andrew  
1         bo;bo
2      claire  
dtype: object

In [14]:
messy_names.str.replace(";","")

0      andrew  
1          bobo
2      claire  
dtype: object

In [15]:
messy_names.str.strip()

0    andrew
1     bo;bo
2    claire
dtype: object

In [16]:
messy_names.str.replace(";","").str.strip()

0    andrew
1      bobo
2    claire
dtype: object

In [17]:
messy_names.str.replace(";","").str.strip().str.capitalize()

0    Andrew
1      Bobo
2    Claire
dtype: object

## Alternative with Custom apply() call

In [18]:
def cleanup(name):
    name = name.replace(";","")
    name = name.strip()
    name = name.capitalize()
    return name

In [19]:
names = messy_names.apply(cleanup)
names

0    Andrew
1      Bobo
2    Claire
dtype: object

## Time Methods for Date and Time Data

In [20]:
from datetime import datetime

In [21]:
# Converting Series to datetime using pandas
myser = pd.Series(['Nov 3, 1990', '1998-02-13', None])
myser

0    Nov 3, 1990
1     1998-02-13
2           None
dtype: object

In [23]:
timeser = pd.to_datetime(myser)
timeser

0   1990-11-03
1   1998-02-13
2          NaT
dtype: datetime64[ns]

In [24]:
timeser[1].year

1998

In [25]:
euro_date = '31-12-2000'
pd.to_datetime(euro_date)

  pd.to_datetime(euro_date)


Timestamp('2000-12-31 00:00:00')

In [26]:
# 10th of Dec OR 12th of October?
# We may need to tell pandas

euro_date = '10-12-2000'

# to make sure the day comes first
pd.to_datetime(euro_date, dayfirst=True)

Timestamp('2000-10-12 00:00:00')

### Custom Time String Formatting

Sometimes dates can have a non standard format, luckily you can always specify to pandas the format. You should also note this could speed up the conversion, so it may be worth doing even if pandas can parse on its own. A full table of codes can be found here: https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes

In [27]:
style_date = '12--Dec--2000'
pd.to_datetime(style_date, format='%d--%b--%Y')

Timestamp('2000-12-12 00:00:00')

In [28]:
strange_date = '12th of Dec 2000'
pd.to_datetime(strange_date)

Timestamp('2000-12-12 00:00:00')

### Data

Retail Sales: Beer, Wine, and Liquor Stores

Units:  Millions of Dollars, Not Seasonally Adjusted

Frequency:  Monthly


U.S. Census Bureau, Retail Sales: Beer, Wine, and Liquor Stores [MRTSSM4453USN], retrieved from FRED, Federal Reserve Bank of St. Louis; https://fred.stlouisfed.org/series/MRTSSM4453USN, July 2, 2020.

In [29]:
sales = pd.read_csv('../Data/RetailSales_BeerWineLiquor.csv')
sales.head()

Unnamed: 0,DATE,MRTSSM4453USN
0,1992-01-01,1509
1,1992-02-01,1541
2,1992-03-01,1597
3,1992-04-01,1675
4,1992-05-01,1822


In [30]:
sales.iloc[0]['DATE']

'1992-01-01'

In [31]:
type(sales.iloc[0]['DATE'])

str

In [32]:
# Convert the date column to datetime
sales['DATE'] = pd.to_datetime(sales['DATE'])
sales.head()

Unnamed: 0,DATE,MRTSSM4453USN
0,1992-01-01,1509
1,1992-02-01,1541
2,1992-03-01,1597
3,1992-04-01,1675
4,1992-05-01,1822


In [33]:
sales.iloc[0]['DATE']

Timestamp('1992-01-01 00:00:00')

In [34]:
type(sales.iloc[0]['DATE'])

pandas._libs.tslibs.timestamps.Timestamp

#### Attempt to Parse Dates Automatically

In [35]:
sales = pd.read_csv('../Data/RetailSales_BeerWineLiquor.csv', parse_dates=[0])
sales.head()

Unnamed: 0,DATE,MRTSSM4453USN
0,1992-01-01,1509
1,1992-02-01,1541
2,1992-03-01,1597
3,1992-04-01,1675
4,1992-05-01,1822


In [37]:
type(sales.iloc[0]['DATE'])

pandas._libs.tslibs.timestamps.Timestamp

#### Resample


A common operation with time series data is resampling based on the time series index. Let's see how to use the resample() method. [[reference](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.resample.html)]

In [38]:
# Our index
sales.index

RangeIndex(start=0, stop=340, step=1)

In [39]:
# Reset DATE to index
sales = sales.set_index("DATE")
sales.head()

Unnamed: 0_level_0,MRTSSM4453USN
DATE,Unnamed: 1_level_1
1992-01-01,1509
1992-02-01,1541
1992-03-01,1597
1992-04-01,1675
1992-05-01,1822


In [40]:
# Yearly Means
sales.resample(rule='A').mean()

Unnamed: 0_level_0,MRTSSM4453USN
DATE,Unnamed: 1_level_1
1992-12-31,1807.25
1993-12-31,1794.833333
1994-12-31,1841.75
1995-12-31,1833.916667
1996-12-31,1929.75
1997-12-31,2006.75
1998-12-31,2115.166667
1999-12-31,2206.333333
2000-12-31,2375.583333
2001-12-31,2468.416667


Resampling rule 'A' takes all of the data points in a given year, applies the aggregation function (in this case we calculate the mean), and reports the result as the last day of that year. Note 2020 in this data set was not complete.

When calling `.resample()` you first need to pass in a **rule** parameter, then you need to call some sort of aggregation function.

The **rule** parameter describes the frequency with which to apply the aggregation function (daily, monthly, yearly, etc.)<br>
It is passed in using an "offset alias" - refer to the table below. [[reference](http://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases)]

The aggregation function is needed because, due to resampling, we need some sort of mathematical rule to join the rows (mean, sum, count, etc.)

<table style="display: inline-block">
    <caption style="text-align: center"><strong>TIME SERIES OFFSET ALIASES</strong></caption>
<tr><th>ALIAS</th><th>DESCRIPTION</th></tr>
<tr><td>B</td><td>business day frequency</td></tr>
<tr><td>C</td><td>custom business day frequency (experimental)</td></tr>
<tr><td>D</td><td>calendar day frequency</td></tr>
<tr><td>W</td><td>weekly frequency</td></tr>
<tr><td>M</td><td>month end frequency</td></tr>
<tr><td>SM</td><td>semi-month end frequency (15th and end of month)</td></tr>
<tr><td>BM</td><td>business month end frequency</td></tr>
<tr><td>CBM</td><td>custom business month end frequency</td></tr>
<tr><td>MS</td><td>month start frequency</td></tr>
<tr><td>SMS</td><td>semi-month start frequency (1st and 15th)</td></tr>
<tr><td>BMS</td><td>business month start frequency</td></tr>
<tr><td>CBMS</td><td>custom business month start frequency</td></tr>
<tr><td>Q</td><td>quarter end frequency</td></tr>
<tr><td></td><td><font color=white>intentionally left blank</font></td></tr></table>

<table style="display: inline-block; margin-left: 40px">
<caption style="text-align: center"></caption>
<tr><th>ALIAS</th><th>DESCRIPTION</th></tr>
<tr><td>BQ</td><td>business quarter endfrequency</td></tr>
<tr><td>QS</td><td>quarter start frequency</td></tr>
<tr><td>BQS</td><td>business quarter start frequency</td></tr>
<tr><td>A</td><td>year end frequency</td></tr>
<tr><td>BA</td><td>business year end frequency</td></tr>
<tr><td>AS</td><td>year start frequency</td></tr>
<tr><td>BAS</td><td>business year start frequency</td></tr>
<tr><td>BH</td><td>business hour frequency</td></tr>
<tr><td>H</td><td>hourly frequency</td></tr>
<tr><td>T, min</td><td>minutely frequency</td></tr>
<tr><td>S</td><td>secondly frequency</td></tr>
<tr><td>L, ms</td><td>milliseconds</td></tr>
<tr><td>U, us</td><td>microseconds</td></tr>
<tr><td>N</td><td>nanoseconds</td></tr></table>

### .dt Method Calls

Once a column or index is ina  datetime format, you can call a variety of methods off of the .dt library inside pandas:

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.dt.html

In [41]:
sales = sales.reset_index()
sales.head()

Unnamed: 0,DATE,MRTSSM4453USN
0,1992-01-01,1509
1,1992-02-01,1541
2,1992-03-01,1597
3,1992-04-01,1675
4,1992-05-01,1822


In [42]:
sales['DATE'].dt.month

0       1
1       2
2       3
3       4
4       5
       ..
335    12
336     1
337     2
338     3
339     4
Name: DATE, Length: 340, dtype: int64

In [43]:
sales['DATE'].dt.is_leap_year

0       True
1       True
2       True
3       True
4       True
       ...  
335    False
336     True
337     True
338     True
339     True
Name: DATE, Length: 340, dtype: bool