In [8]:
import pandas as pd

In [None]:
# file paths
pokemon_path = "PANDAS IN ACTION/Chapter 3 Series methods/Data/pokemon.csv"
google_stocks_path = "PANDAS IN ACTION/Chapter 3 Series methods/Data/google_stocks.csv"
revolutionary_war_path = "PANDAS IN ACTION/Chapter 3 Series methods/Data/revolutionary_war.csv"

#### Import csv files

In [10]:
# these two lines are equivalent
pd.read_csv(filepath_or_buffer = google_stocks_path)
pd.read_csv(google_stocks_path)

Unnamed: 0,Date,Close
0,2004-08-19,49.98
1,2004-08-20,53.95
2,2004-08-23,54.50
3,2004-08-24,52.24
4,2004-08-25,52.80
...,...,...
3819,2019-10-21,1246.15
3820,2019-10-22,1242.80
3821,2019-10-23,1259.13
3822,2019-10-24,1260.99


In [11]:
# let's set the index column to be "Pokemon"
pd.read_csv(filepath_or_buffer = pokemon_path,index_col = "Pokemon")

Unnamed: 0_level_0,Type
Pokemon,Unnamed: 1_level_1
Bulbasaur,Grass / Poison
Ivysaur,Grass / Poison
Venusaur,Grass / Poison
Charmander,Fire
Charmeleon,Fire
...,...
Stakataka,Rock / Steel
Blacephalon,Fire / Ghost
Zeraora,Electric
Meltan,Steel


In [12]:
pokemon = pd.read_csv(pokemon_path, index_col = "Pokemon")

In [13]:
pokemon.head()


Unnamed: 0_level_0,Type
Pokemon,Unnamed: 1_level_1
Bulbasaur,Grass / Poison
Ivysaur,Grass / Poison
Venusaur,Grass / Poison
Charmander,Fire
Charmeleon,Fire


In [14]:
google = pd.read_csv(google_stocks_path, parse_dates = ["Date"], index_col=["Date"])
google.head()

Unnamed: 0_level_0,Close
Date,Unnamed: 1_level_1
2004-08-19,49.98
2004-08-20,53.95
2004-08-23,54.5
2004-08-24,52.24
2004-08-25,52.8


In [15]:
battles = pd.read_csv(revolutionary_war_path, index_col=["Start Date"], parse_dates=["Start Date"], usecols=['State', 'Start Date'])
battles.head()

Unnamed: 0_level_0,State
Start Date,Unnamed: 1_level_1
1774-09-01,Massachusetts
1774-12-14,New Hampshire
1775-04-19,Massachusetts
1775-04-19,Massachusetts
1775-04-20,Virginia


In [16]:
# convert dataframes to series
google_close = google["Close"]
battles_state = battles["State"]
pokemon_type = pokemon["Type"]

In [17]:
type(pokemon_type)

pandas.core.series.Series

#### Sorting a Series

##### Sorting by Values

In [18]:
google_close.sort_values().head()

Date
2004-09-03    49.82
2004-09-01    49.94
2004-08-19    49.98
2004-09-02    50.57
2004-09-07    50.60
Name: Close, dtype: float64

In [19]:
google_close.sort_values(ascending = False).head()

Date
2019-04-29    1287.58
2019-04-26    1272.18
2018-07-26    1268.33
2019-10-25    1265.13
2019-04-23    1264.55
Name: Close, dtype: float64

In [20]:
# Sorting a Series with Upper and Lower Case Strings. First capital letters are sorted before lowercase letters.
pd.Series(data = ["Adam", "adam", "Ben"]).sort_values()

0    Adam
2     Ben
1    adam
dtype: object

In [21]:
# sort Series with na_position parameter: last  - nan will be at the bottom
battles_state.sort_values(na_position="last").tail()

Start Date
1782-08-08    NaN
1782-08-25    NaN
1782-09-13    NaN
1782-10-18    NaN
1782-12-06    NaN
Name: State, dtype: object

In [22]:
# sort Series with na_position parameter: first - nan will be on top
battles_state.sort_values(na_position="first").head()

Start Date
1775-09-17    NaN
1775-12-31    NaN
1776-03-03    NaN
1776-03-25    NaN
1776-05-18    NaN
Name: State, dtype: object

In [23]:
# remove nan values by dropna() method
battles_state.dropna().sort_values().head()

Start Date
1781-09-06    Connecticut
1779-07-05    Connecticut
1777-04-27    Connecticut
1777-09-03       Delaware
1777-05-17        Florida
Name: State, dtype: object

##### Sorting by Index

In [24]:
# sorting by index values
pokemon_type.sort_index().head()

Pokemon
Abomasnow      Grass / Ice
Abra               Psychic
Absol                 Dark
Accelgor               Bug
Aegislash    Steel / Ghost
Name: Type, dtype: object

In [25]:
# sorting date by index will sort it chronologically
battles_state.sort_index().head()

Start Date
1774-09-01    Massachusetts
1774-12-14    New Hampshire
1775-04-19    Massachusetts
1775-04-19    Massachusetts
1775-04-20         Virginia
Name: State, dtype: object

In [26]:
# sorting date by index with na_position parameter will return NaT - not a time - first
battles_state.sort_index(na_position='first').head()

Start Date
NaT              New Jersey
NaT                Virginia
NaT                     NaN
NaT                     NaN
1774-09-01    Massachusetts
Name: State, dtype: object

#### Retrieving the Smallest and Largest Values with the nsmallest and nlargest Methods

In [27]:
google_close.sort_values(ascending=False).head()

Date
2019-04-29    1287.58
2019-04-26    1272.18
2018-07-26    1268.33
2019-10-25    1265.13
2019-04-23    1264.55
Name: Close, dtype: float64

##### nlargest

In [28]:
# return n largest values two same ways
google_close.nlargest(n = 5)


Date
2019-04-29    1287.58
2019-04-26    1272.18
2018-07-26    1268.33
2019-10-25    1265.13
2019-04-23    1264.55
Name: Close, dtype: float64

In [29]:
# n has default value of 5
google_close.nlargest()

Date
2019-04-29    1287.58
2019-04-26    1272.18
2018-07-26    1268.33
2019-10-25    1265.13
2019-04-23    1264.55
Name: Close, dtype: float64

##### nsmallest

In [30]:
# return n largest value
google_close.nsmallest(n = 5)

Date
2004-09-03    49.82
2004-09-01    49.94
2004-08-19    49.98
2004-09-02    50.57
2004-09-07    50.60
Name: Close, dtype: float64

In [33]:
# n has default value of 5
google_close.nsmallest()

Date
2004-09-03    49.82
2004-09-01    49.94
2004-08-19    49.98
2004-09-02    50.57
2004-09-07    50.60
Name: Close, dtype: float64

#### Overwriting a Series with the inplace Parameter

In [59]:
battles_state.sort_values()

Start Date
1777-04-27    Connecticut
1779-07-05    Connecticut
1781-09-06    Connecticut
1777-09-03       Delaware
1778-06-30        Florida
                 ...     
1776-03-03            NaN
1775-12-31            NaN
1775-09-17            NaN
NaT                   NaN
NaT                   NaN
Name: State, Length: 232, dtype: object

In [65]:
battles_state.head()

Start Date
1783-01-22    Virginia
1782-12-06         NaN
1782-10-18         NaN
1782-09-13         NaN
1782-09-11    Virginia
Name: State, dtype: object

In [None]:
# To get a sorted Dataframe by values and save it - use sort_values() and inplace=True
battles.sort_values(by='State', ascending=True, inplace=True)
battles.head()

In [66]:
# To get a sorted Series by Index and save it - use sort_index() and inplace=True
battles_state.sort_index(ascending=False, inplace=True)
battles_state.head()

Start Date
1783-01-22    Virginia
1782-12-06         NaN
1782-10-18         NaN
1782-09-13         NaN
1782-09-11    Virginia
Name: State, dtype: object

**Memorize that if you change the column(Series) of Dataframe using inplace=true it will affect to the Dataframe itself**

##### Counting Values with the value_counts Method

In [73]:
# value_counts() method returns a number in a Series for each unique value
pokemon_type.value_counts().head()

Type
Normal     65
Water      61
Grass      38
Psychic    35
Fire       30
Name: count, dtype: int64

In [75]:
# let's see count the length of the value_counts result
len(pokemon_type.value_counts())


159

In [76]:
# check it with nunique() method
pokemon_type.nunique()

159

In [None]:
# ascending parameter has default value of False, so ascending=True will return the smallest counts first
pokemon_type.value_counts(ascending=True).head()

Type
Fire / Ghost        1
Fighting / Dark     1
Fighting / Steel    1
Normal / Ground     1
Fire / Psychic      1
Name: count, dtype: int64

In [79]:
# normalize parameter will return the relative frequencies of unique values - how often each unique value occurs divided by the total number of values
pokemon_type.value_counts(normalize=True).head()

Type
Normal     0.080346
Water      0.075402
Grass      0.046972
Psychic    0.043263
Fire       0.037083
Name: proportion, dtype: float64

In [None]:
# visualize the result of value_counts() in percents   
# Normal make up 8.03% of all Pokemon types 
(pokemon_type.value_counts(normalize=True)* 100).round(2)

Type
Normal                8.03
Water                 7.54
Grass                 4.70
Psychic               4.33
Fire                  3.71
                      ... 
Fire / Psychic        0.12
Normal / Ground       0.12
Psychic / Fighting    0.12
Dark / Ghost          0.12
Fire / Ghost          0.12
Name: proportion, Length: 159, dtype: float64

In [None]:
# Use bins parameter to sort continuous values into discrete intervals 
# 1568 values in the Series fall into the 200 - 400 range
buckets = [0, 200, 400, 600, 800, 1000, 1200, 1400]
google_close.value_counts(bins = buckets)

(200.0, 400.0]      1568
(-0.001, 200.0]      595
(400.0, 600.0]       575
(1000.0, 1200.0]     406
(600.0, 800.0]       380
(800.0, 1000.0]      207
(1200.0, 1400.0]      93
Name: count, dtype: int64

In [91]:
# retrieve values where 200 < value <= 400
google_close[(google_close > 200) & (google_close <= 400)].head()

Date
2005-11-17    200.97
2005-11-21    203.92
2005-11-22    207.46
2005-11-23    210.64
2005-11-25    213.51
Name: Close, dtype: float64

In [None]:
# return the same result order by index from min to max
google_close.value_counts(bins=buckets).sort_index()

(-0.001, 200.0]      595
(200.0, 400.0]      1568
(400.0, 600.0]       575
(600.0, 800.0]       380
(800.0, 1000.0]      207
(1000.0, 1200.0]     406
(1200.0, 1400.0]      93
Name: count, dtype: int64

In [94]:
# The same result can be achieved by using the value_counts() and sort=False parameters
google_close.value_counts(bins=buckets, sort=False)

(-0.001, 200.0]      595
(200.0, 400.0]      1568
(400.0, 600.0]       575
(600.0, 800.0]       380
(800.0, 1000.0]      207
(1000.0, 1200.0]     406
(1200.0, 1400.0]      93
Name: count, dtype: int64

In [96]:
# use integer value for bins parameter to create 4 equal-width bins
google_close.value_counts(bins=4, sort=False)

(48.581, 359.26]     2086
(359.26, 668.7]       729
(668.7, 978.14]       495
(978.14, 1287.58]     514
Name: count, dtype: int64

**Find how many battles were in USA**

In [None]:
# The result without Nans
battles_state.value_counts().head()

State
South Carolina    31
New York          28
New Jersey        24
Virginia          21
Massachusetts     11
Name: count, dtype: int64

In [None]:
# the result with Nans
battles_state.value_counts(dropna=False).head()

State
NaN               70
South Carolina    31
New York          28
New Jersey        24
Virginia          21
Name: count, dtype: int64

**Use the Index object to find out the max number of battles by date**

In [101]:
battles_state.index.value_counts().head

<bound method NDFrame.head of Start Date
1777-08-22    2
1781-04-15    2
1782-03-16    2
1778-09-07    2
1781-05-22    2
             ..
1780-07-21    1
1780-07-20    1
1780-07-12    1
1780-06-23    1
1774-09-01    1
Name: count, Length: 217, dtype: int64>

##### Apply method

The `apply` method allows you to apply custom functions to each element in the Series.

In [None]:
# each element is rounded to the nearest integer
google_close.apply(round).head()

Date
2004-08-19    50
2004-08-20    54
2004-08-23    54
2004-08-24    52
2004-08-25    53
Name: Close, dtype: int64

In [None]:
# we do not modify the original Series unless we assign it back
google_close.head()

Date
2004-08-19    49.98
2004-08-20    53.95
2004-08-23    54.50
2004-08-24    52.24
2004-08-25    52.80
Name: Close, dtype: float64

Find the single or multiple type for each value of Pokemon Series

In [105]:
pokemon_type.head()

Pokemon
Bulbasaur     Grass / Poison
Ivysaur       Grass / Poison
Venusaur      Grass / Poison
Charmander              Fire
Charmeleon              Fire
Name: Type, dtype: object

In [106]:
# Create a function to find the single or multiple type for each value of Pokemon Series
def single_or_multi(pokemon_type):
    if '/' in pokemon_type:
        return 'Multi'
    else:
        return 'Single'

In [107]:
pokemon_type.apply(single_or_multi).head()

Pokemon
Bulbasaur      Multi
Ivysaur        Multi
Venusaur       Multi
Charmander    Single
Charmeleon    Single
Name: Type, dtype: object

Count the single and multi types

In [108]:
pokemon_type.apply(single_or_multi).value_counts()

Type
Multi     405
Single    404
Name: count, dtype: int64

Task -which day of the week had the highest number of battles during the American Revolutionary War.

**strftime("%A") method to get the day of the week from a datetime object**

In [138]:
# strftime("%A") method to get the day of the week from a datetime object
import datetime as dt
today = dt.datetime(2020, 12, 26)
today.strftime("%A")

'Saturday'

In [None]:
# create a new Series day_of_war from battles DataFrame
day_of_war = battles = pd.read_csv(revolutionary_war_path, parse_dates=["Start Date"], usecols=['Start Date'])
day_of_war.head()

Unnamed: 0,Start Date
0,1774-09-01
1,1774-12-14
2,1775-04-19
3,1775-04-19
4,1775-04-20


In [None]:
# extract the "Start Date" column as a Series
day_of_war=day_of_war["Start Date"]

In [None]:
# check the type of day_of_war Series
type(day_of_war)

pandas.core.series.Series

In [139]:
# define a function to get the day of the week from a datetime object
def day_of_week(date):
    return date.strftime("%A")

In [None]:
# Find which day of the week had the highest number of battles during the American Revolutionary War.
day_of_war.dropna().apply(day_of_week).value_counts()

Start Date
Saturday     39
Friday       39
Wednesday    32
Thursday     31
Sunday       31
Tuesday      29
Monday       27
Name: count, dtype: int64