# Configuring pandas

In [1]:
# import numpy and pandas
import numpy as np # NumPy: Python library for matrices and multidimensional arrays (data structures, numerical computation)
import pandas as pd

# used for dates
# NOTE: the from-import on the second line rebinds the name `datetime`
# to the datetime class, replacing the module bound by `import datetime`
import datetime
from datetime import datetime, date

# Set some pandas options controlling output format
pd.set_option('display.notebook_repr_html', False) # plain-text reprs instead of HTML tables
pd.set_option('display.max_columns', 8)
pd.set_option('display.max_rows', 10)
pd.set_option('display.width', 80)

# bring in matplotlib for graphics
import matplotlib.pyplot as plt
%matplotlib inline

# The pandas Series

In [2]:
# build a Series holding four integers; since no index is
# given, pandas labels the items 0..3 automatically
s = pd.Series(data=[1, 2, 3, 4])
s

0    1
1    2
2    3
3    4
dtype: int64

In [3]:
# look up the value stored under index label 1
s.loc[1]

2

In [4]:
# select the items labelled 1 and 3; the result is again a Series
s.loc[[1, 3]]

1    2
3    4
dtype: int64

In [7]:
# build the Series again, this time supplying the index
# labels 'a'..'d' explicitly instead of the default 0..3
s = pd.Series(data=[1, 2, 3, 4], index=list('abcd'))
s

a    1
b    2
c    3
d    4
dtype: int64

In [6]:
# look up the items in the series that have the index labels 'a' and 'd'
s.loc[['a', 'd']]

a    1
d    4
dtype: int64

In [7]:
# a Series that has non-integer index labels can still be
# addressed by 0-based position like an array; use .iloc so the
# positional lookup is explicit (a bare s[[1, 2]] only works through
# a deprecated fallback that treats the integers as positions)
s.iloc[[1, 2]]

b    2
c    3
dtype: int64

In [8]:
# get only the index of the Series (an Index object holding the labels)
s.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [9]:
# build a DatetimeIndex containing every day between the two
# given dates (both endpoints included); it is used below as
# the index of the temperature Series
dates = pd.date_range(start='2016-04-01', end='2016-04-06')
dates

DatetimeIndex(['2016-04-01', '2016-04-02', '2016-04-03', '2016-04-04',
               '2016-04-05', '2016-04-06'],
              dtype='datetime64[ns]', freq='D')

In [10]:
# a Series of values (representing temperatures), one
# for each date in the index
temps1 = pd.Series(data=[80, 82, 85, 90, 83, 87], index=dates)
temps1

2016-04-01    80
2016-04-02    82
2016-04-03    85
2016-04-04    90
2016-04-05    83
2016-04-06    87
Freq: D, dtype: int64

In [11]:
# what's the temperature for 2016-04-04?
temps1.loc['2016-04-04']

90

In [12]:
# a second series of values that shares the same date index
temps2 = pd.Series(data=[70, 75, 69, 83, 79, 77], index=dates)
# subtraction aligns both operands on their index labels and
# computes the difference wherever the labels match
temp_diffs = temps1.sub(temps2)
temp_diffs

2016-04-01    10
2016-04-02     7
2016-04-03    16
2016-04-04     7
2016-04-05     4
2016-04-06    10
Freq: D, dtype: int64

In [13]:
# values can also be read by integer position, as if the
# series was an array; .iloc makes the positional lookup
# explicit (temp_diffs has date labels, so a bare temp_diffs[2]
# depends on a deprecated integer-as-position fallback)
temp_diffs.iloc[2]

16

In [14]:
# calculate the arithmetic mean of the values in the Series
temp_diffs.mean()

9.0

# The pandas DataFrame

In [15]:
# combine the two Series objects temps1 and temps2 into a
# DataFrame; the dictionary keys become the column names
temps_df = pd.DataFrame(data={'Missoula': temps1, 'Philadelphia': temps2})
temps_df

            Missoula  Philadelphia
2016-04-01        80            70
2016-04-02        82            75
2016-04-03        85            69
2016-04-04        90            83
2016-04-05        83            79
2016-04-06        87            77

In [16]:
# get the column with the name Missoula (returned as a Series)
temps_df['Missoula']

2016-04-01    80
2016-04-02    82
2016-04-03    85
2016-04-04    90
2016-04-05    83
2016-04-06    87
Freq: D, Name: Missoula, dtype: int64

In [17]:
# likewise we can get just the Philadelphia column
temps_df['Philadelphia']

2016-04-01    70
2016-04-02    75
2016-04-03    69
2016-04-04    83
2016-04-05    79
2016-04-06    77
Freq: D, Name: Philadelphia, dtype: int64

In [18]:
# passing a list of column names returns a DataFrame with those
# columns in the order listed (here: both columns, swapped)
temps_df[['Philadelphia', 'Missoula']]

            Philadelphia  Missoula
2016-04-01            70        80
2016-04-02            75        82
2016-04-03            69        85
2016-04-04            83        90
2016-04-05            79        83
2016-04-06            77        87

In [19]:
# retrieve the Missoula column through property (attribute) syntax
temps_df.Missoula

2016-04-01    80
2016-04-02    82
2016-04-03    85
2016-04-04    90
2016-04-05    83
2016-04-06    87
Freq: D, Name: Missoula, dtype: int64

In [20]:
# temperature difference between the two cities, date by date
temps_df['Missoula'] - temps_df['Philadelphia']

2016-04-01    10
2016-04-02     7
2016-04-03    16
2016-04-04     7
2016-04-05     4
2016-04-06    10
Freq: D, dtype: int64

In [21]:
# extend temps_df with a 'Difference' column that holds the
# difference in temps computed earlier
temps_df = temps_df.assign(Difference=temp_diffs)
temps_df

            Missoula  Philadelphia  Difference
2016-04-01        80            70          10
2016-04-02        82            75           7
2016-04-03        85            69          16
2016-04-04        90            83           7
2016-04-05        83            79           4
2016-04-06        87            77          10

In [22]:
# get the column labels, which are also held in an Index object
temps_df.columns

Index(['Missoula', 'Philadelphia', 'Difference'], dtype='object')

In [23]:
# slice the temp differences column by position (as though it
# is an array): rows 1 up to, but not including, 4
temps_df['Difference'].iloc[1:4]

2016-04-02     7
2016-04-03    16
2016-04-04     7
Freq: D, Name: Difference, dtype: int64

In [24]:
# get the row at array position 1 (0-based), returned as a Series
temps_df.iloc[1]

Missoula        82
Philadelphia    75
Difference       7
Name: 2016-04-02 00:00:00, dtype: int64

In [25]:
# the row comes back as a Series whose index holds the
# DataFrame's column names: they have been 'pivoted'
temps_df.iloc[1].index

Index(['Missoula', 'Philadelphia', 'Difference'], dtype='object')

In [26]:
# retrieve the row whose index label is 2016-04-05 using .loc
temps_df.loc['2016-04-05']

Missoula        83
Philadelphia    79
Difference       4
Name: 2016-04-05 00:00:00, dtype: int64

In [27]:
# get the values of the Difference column in rows 1, 3 and 5
# using 0-based location
temps_df['Difference'].iloc[[1, 3, 5]]

2016-04-02     7
2016-04-04     7
2016-04-06    10
Freq: 2D, Name: Difference, dtype: int64

In [28]:
# boolean Series: True where the Missoula value is > 82
temps_df['Missoula'].gt(82)

2016-04-01    False
2016-04-02    False
2016-04-03     True
2016-04-04     True
2016-04-05     True
2016-04-06     True
Freq: D, Name: Missoula, dtype: bool

In [29]:
# use the boolean mask to keep only the rows where the
# temps for Missoula are > 82
temps_df.loc[temps_df['Missoula'] > 82]

            Missoula  Philadelphia  Difference
2016-04-03        85            69          16
2016-04-04        90            83           7
2016-04-05        83            79           4
2016-04-06        87            77          10

# Loading data from a CSV file into a DataFrame

In [30]:
# display the contents of test1.csv
# which command to use depends on your OS
!head data/goog.csv # on non-windows systems
#!type data/test1.csv # on windows systems, all lines













In [8]:
# read the contents of the file into a DataFrame
# (`direct` holds the relative path of the CSV file and is reused by later cells)
direct = 'data/COVID 19 Cm data.csv'
df = pd.read_csv(direct)
df

        ID         Country  DateStart Dateend intended  ... Target city  \
0     163          Austria  16-Mar-20              NaN  ...         NaN   
1     132          Germany  01-Feb-20              NaN  ...         NaN   
2     578   United Kingdom  20-Mar-20              NaN  ...         NaN   
3     372   United Kingdom  16-Mar-20              NaN  ...         NaN   
4     357   United Kingdom  16-Mar-20              NaN  ...         NaN   
...    ...             ...        ...              ...  ...         ...   
1698  100     US:Louisiana  11-Mar-20              NaN  ...         NaN   
1699  100     US:Louisiana  13-Mar-20              NaN  ...         NaN   
1700  100     US:Louisiana  13-Mar-20              NaN  ...         NaN   
1701  100     US:Louisiana  13-Mar-20              NaN  ...         NaN   
1702  100     US:Louisiana  17-Mar-20              NaN  ...         NaN   

     Target country Target region Target state  
0               NaN           NaN          NaN  
1

In [9]:
# the contents of the DateStart (date) column
df.DateStart

0       16-Mar-20
1       01-Feb-20
2       20-Mar-20
3       16-Mar-20
4       16-Mar-20
          ...    
1698    11-Mar-20
1699    13-Mar-20
1700    13-Mar-20
1701    13-Mar-20
1702    17-Mar-20
Name: DateStart, Length: 1703, dtype: object

In [11]:
# fetch the first value of the date column
df['DateStart'].iloc[0]

'16-Mar-20'

In [13]:
# check the type of that first value: it is a string
type(df['DateStart'].iloc[0])

str

In [17]:
# read the data again and tell pandas that the DateStart column
# should be parsed as dates in the resulting DataFrame
direct = 'data/COVID 19 Cm data.csv'
df = pd.read_csv(direct, parse_dates = ['DateStart'])
df

        ID         Country  DateStart Dateend intended  ... Target city  \
0     163          Austria 2020-03-16              NaN  ...         NaN   
1     132          Germany 2020-02-01              NaN  ...         NaN   
2     578   United Kingdom 2020-03-20              NaN  ...         NaN   
3     372   United Kingdom 2020-03-16              NaN  ...         NaN   
4     357   United Kingdom 2020-03-16              NaN  ...         NaN   
...    ...             ...        ...              ...  ...         ...   
1698  100     US:Louisiana 2020-03-11              NaN  ...         NaN   
1699  100     US:Louisiana 2020-03-13              NaN  ...         NaN   
1700  100     US:Louisiana 2020-03-13              NaN  ...         NaN   
1701  100     US:Louisiana 2020-03-13              NaN  ...         NaN   
1702  100     US:Louisiana 2020-03-17              NaN  ...         NaN   

     Target country Target region Target state  
0               NaN           NaN          NaN  
1

In [19]:
# confirm that the first value is now a date;
# pandas represents it as a Timestamp
type(df['DateStart'].iloc[0])

pandas._libs.tslibs.timestamps.Timestamp

In [20]:
# unfortunately the index is still numeric (a RangeIndex), which
# makes accessing data by date more complicated
df.index

RangeIndex(start=0, stop=1703, step=1)

In [21]:
# read in again, now specifying the date column (DateStart) as
# the index of the resulting DataFrame
# (`direct` is the file path assigned in an earlier cell)
df = pd.read_csv(direct, parse_dates = ['DateStart'], index_col = 'DateStart')
df

              ID         Country Dateend intended  \
DateStart                                           
2020-03-16  163          Austria              NaN   
2020-02-01  132          Germany              NaN   
2020-03-20  578   United Kingdom              NaN   
2020-03-16  372   United Kingdom              NaN   
2020-03-16  357   United Kingdom              NaN   
...          ...             ...              ...   
2020-03-11  100     US:Louisiana              NaN   
2020-03-13  100     US:Louisiana              NaN   
2020-03-13  100     US:Louisiana              NaN   
2020-03-13  100     US:Louisiana              NaN   
2020-03-17  100     US:Louisiana              NaN   

                           Description of measure implemented  ...  \
DateStart                                                      ...   
2020-03-16  On 10 March 2020 government announced that all...  ...   
2020-02-01  102 German citizens and 26 relatives, all of w...  ...   
2020-03-20        All schools,

In [22]:
# and the index is now a DatetimeIndex named DateStart
df.index

DatetimeIndex(['2020-03-16', '2020-02-01', '2020-03-20', '2020-03-16',
               '2020-03-16', '2020-03-16', '2020-03-14', '2020-02-26',
               '2020-03-18', '2020-03-16',
               ...
               '2020-03-30', '2020-03-17', '2020-04-02', '2020-03-17',
               '2020-03-17', '2020-03-11', '2020-03-13', '2020-03-13',
               '2020-03-13', '2020-03-17'],
              dtype='datetime64[ns]', name='DateStart', length=1703, freq=None)

# Visualization