# Configuring pandas

In [2]:
# import numpy and pandas
import numpy as np
import pandas as pd

# used for dates
import datetime
from datetime import datetime, date

# Set some pandas options controlling output format
pd.set_option('display.notebook_repr_html', False)
pd.set_option('display.max_columns', 8)
pd.set_option('display.max_rows', 10)
pd.set_option('display.width', 80)

# bring in matplotlib for graphics
import matplotlib.pyplot as plt
%matplotlib inline

# Creating a DataFrame using NumPy function results

In [5]:
# create a DataFrame from a 2-d ndarray
# np.array 를 활룡하여 결과에 적합한 df라는 DataFrame을 작성합니다.
df = pd.DataFrame(np.array([[10, 11, 12], [20, 21, 22], [20, 21, 22]]),columns=['Missoula', 'Philadelphia', 'oo'])
df


   Missoula  Philadelphia  oo
0        10            11  12
1        20            21  22
2        20            21  22

In [11]:
df1 = df[:2]
df1

   Missoula  Philadelphia  oo    0
0        10            11  12  100
1        20            21  22  100

In [13]:
df1.loc[0] = 100

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [3]:
# specify column names

# np.array 를 활룡하여 결과에 적합한 데이터와 컬럼을 지정하여 df라는 DataFrame을 작성합니다.
df = pd.DataFrame(np.array([[70, 71], [90, 91]]),
                  columns=['Missoula', 'Philadelphia'])
df

   Missoula  Philadelphia
0        70            71
1        90            91

# Creating a DataFrame using a Python dictionary and pandas Series objects

# Creating a DataFrame from a CSV file

In [6]:
#sp500.csv 데이터를 활용하여 index_col = 'Symbol', usecols = 0, 2, 3, 7열을 활용하여 sp500 이름의 Dataframe을 작성합니다. 
# 파일 위치("../data/sp500.csv")
sp500 = pd.read_csv("../data/sp500.csv", 
                    index_col= 'Symbol',
                     usecols=[0, 2, 3, 7])
sp500

                        Sector   Price  Book Value
Symbol                                            
MMM                Industrials  141.14      26.668
ABT                Health Care   39.60      15.573
ABBV               Health Care   53.95       2.954
ACN     Information Technology   79.79       8.326
ACE                 Financials  102.91      86.897
...                        ...     ...         ...
YHOO    Information Technology   35.02      12.768
YUM     Consumer Discretionary   74.77       5.147
ZMH                Health Care  101.84      37.181
ZION                Financials   28.43      30.191
ZTS                Health Care   30.53       2.150

[500 rows x 3 columns]

In [9]:
# how many rows of data?  Should be 500
# sp500 Dataframe의 총 행의 개수를 계산합니다.
len(sp500)

500

# Selecting columns of a DataFrame

# Selecting rows of a DataFrame

In [10]:
# rows with label MMM and MSFT
# this is a DataFrame result
# 행의 레이블이 ACN, YHOO인 데이터를 Dataframe 결과로 추출합니다.(loc)
sp500.loc[['ACN', 'YHOO']]

                        Sector   Price  Book Value
Symbol                                            
MMM                Industrials  141.14      26.668
MSFT    Information Technology   40.12      10.584

In [11]:
# get the location of MMM and A in the index
i1 = sp500.index.get_loc('MMM') # 'MMM', 'A' 의 위치를 각각 i1, i2 객체에 대입한 실행 결과를 추출합니다. (get_loc 함수)
i2 = sp500.index.get_loc('A')
(i1, i2)

(0, 10)

# Scalar lookup by label or location using .at[] and .iat[] 

In [12]:
# by label in both the index and column # 행은 'MMM', 컬럼은 'Price' 일치하는 데이터를 추출합니다. (at)
sp500.at['MMM', 'Price']

141.14

# Slicing using the [] operator

In [13]:
# ABT through ACN labels
sp500['ABT':'ACN']  # sp500 Dataframe에서 'ABT':'ACN' 레이블의 행 데이터를 출력합니다.([  ])

                        Sector  Price  Book Value
Symbol                                           
ABT                Health Care  39.60      15.573
ABBV               Health Care  53.95       2.954
ACN     Information Technology  79.79       8.326

# Selecting rows using Boolean selection

In [14]:
# now get the rows with Price < 100
sp500[sp500.Price < 100]  # sp500 Dataframe에서 Price 컬럼에서 100보다 작은 데이터를 출력합니다. ([   ])

                        Sector  Price  Book Value
Symbol                                           
ABT                Health Care  39.60      15.573
ABBV               Health Care  53.95       2.954
ACN     Information Technology  79.79       8.326
ADBE    Information Technology  64.30      13.262
AES                  Utilities  13.61       5.781
...                        ...    ...         ...
XYL                Industrials  38.42      12.127
YHOO    Information Technology  35.02      12.768
YUM     Consumer Discretionary  74.77       5.147
ZION                Financials  28.43      30.191
ZTS                Health Care  30.53       2.150

[407 rows x 3 columns]

In [15]:
# price > 100 and in the Health Care Sector
r = sp500[(sp500.Sector == 'Health Care') & 
          (sp500.Price > 100.00)] [['Price', 'Sector']]  
# sp500 Dataframe에서 Sector 컬럼은 'Health Care'이고 Price 컬럼은 100보다 작은 데이터를 추출하는 'r'이라는 DateFrame을 작성 

r

         Price       Sector
Symbol                     
ACT     213.77  Health Care
ALXN    162.30  Health Care
AGN     166.92  Health Care
AMGN    114.33  Health Care
BCR     146.62  Health Care
...        ...          ...
REGN    297.77  Health Care
TMO     115.74  Health Care
WAT     100.54  Health Care
WLP     108.82  Health Care
ZMH     101.84  Health Care

[19 rows x 2 columns]

# Selecting across both rows and columns

In [56]:
# select the price and sector columns for ABT and ZTS
sp500.loc[['ABT', 'ZTS']][['Sector', 'Price']] 
# sp500 DataFrame 에서 행은 'ABT', 'ZTS', 컬럼은 'Sector', 'Price'인 데이터를 추출합니다.(loc)

             Sector  Price
Symbol                    
ABT     Health Care  39.60
ZTS     Health Care  30.53