### Reading multiple data files 

In [1]:
import pandas as pd
# Naive approach: one explicit read_csv call (and one variable) per monthly file.
dataframe0 = pd.read_csv('sales/sales-jan-2015.csv')
dataframe1 = pd.read_csv('sales/sales-feb-2015.csv')
# Preview the first three rows of the January file.
dataframe0.head(3)

Unnamed: 0,Date,Company,Product,Units
0,2015-01-21 19:13:21,Streeplex,Hardware,11
1,2015-01-09 05:23:51,Streeplex,Service,8
2,2015-01-06 17:19:34,Initech,Hardware,17


### Using a loop 

In [2]:
# Read each monthly sales file in turn and collect the resulting DataFrames
# in a list, in the same order as `filenames`.
filenames = [
    'sales/sales-jan-2015.csv',
    'sales/sales-feb-2015.csv',
]
dataframes = list()
for f in filenames:
    frame = pd.read_csv(f)
    dataframes.append(frame)

### Using a list comprehension

In [3]:
# Build one DataFrame per path in `filenames` in a single expression.
dataframes = [pd.read_csv(f) for f in filenames]
# `dataframes` is a plain list; index it (e.g. dataframes[0]) to inspect a frame.

### Using glob

In [4]:
from glob import glob

# glob() returns matching paths in arbitrary, platform-dependent order.
# Sorting makes the order of `filenames` -- and therefore of `dataframes` --
# reproducible from run to run and machine to machine.
filenames = sorted(glob('sales/sales*.csv'))
# One DataFrame per matched file, in the same (sorted) order as `filenames`.
dataframes = [pd.read_csv(f) for f in filenames]

### Reindexing DataFrames

- *indices*: many *index labels* within *Index data structures*
- *indexes*: many pandas *Index data structures*

In [5]:
# Load the Pittsburgh 2013 weather file, using the "Date" column as the index
# and asking pandas to parse that index as dates.
df = pd.read_csv('pittsburgh2013.csv', index_col="Date", parse_dates=True)

In [6]:
# Average every numeric column per calendar quarter.
# numeric_only=True is required because the frame also holds the text
# 'Events' column: pandas >= 2.0 raises a TypeError when asked for the mean
# of a non-numeric column instead of silently dropping it.
by_quarter = df.groupby(pd.PeriodIndex(df.index, freq="Q")).mean(numeric_only=True)
w_mean = by_quarter['Mean TemperatureF']
# Relabel the four quarters by the month in which each one starts.
w_mean.index = ['Jan', 'Apr', 'Jul', 'Oct']
w_mean.index.name = 'Month'
w_mean

Month
Jan    32.133333
Apr    61.956044
Jul    68.934783
Oct    43.434783
Name: Mean TemperatureF, dtype: float64

In [7]:
# Quarterly maxima: group the daily rows by calendar quarter, take the
# per-quarter maximum, and keep the daily-high temperature column.
quarter_of = pd.PeriodIndex(df.index, freq="Q")
by_quarter = df.groupby(quarter_of).max()
w_max = by_quarter['Max TemperatureF']
# Label each quarter by its first month and name the index in one step.
w_max.index = pd.Index(['Jan', 'Apr', 'Jul', 'Oct'], name='Month')
w_max

Month
Jan    68
Apr    89
Jul    91
Oct    84
Name: Max TemperatureF, dtype: int64

In [8]:
# Sort by index label. The labels are strings, so the result is in
# alphabetical order (Apr, Jan, Jul, Oct), not calendar order.
w_mean.sort_index()

Month
Apr    61.956044
Jan    32.133333
Jul    68.934783
Oct    43.434783
Name: Mean TemperatureF, dtype: float64

In [9]:
# Reorder w_mean so its rows follow the label order of w_max's index.
w_mean.reindex(w_max.index)

Month
Jan    32.133333
Apr    61.956044
Jul    68.934783
Oct    43.434783
Name: Mean TemperatureF, dtype: float64

In [10]:
# Reindex onto a list that includes a label w_mean does not have ('Dec');
# reindex() keeps the requested label and fills its value with NaN.
w_mean3 = w_mean.reindex(['Jan','Apr','Dec'])
w_mean3

Month
Jan    32.133333
Apr    61.956044
Dec          NaN
Name: Mean TemperatureF, dtype: float64

In [11]:
# Align w_max to w_mean3's labels. 'Dec' is absent from w_max, so it becomes
# NaN, and the integer values are shown as floats to accommodate that NaN.
w_max.reindex(w_mean3.index)

Month
Jan    68.0
Apr    89.0
Dec     NaN
Name: Max TemperatureF, dtype: float64

In [12]:
# Same alignment as above, then discard the rows that came back as NaN,
# leaving only the labels present in both Series.
w_max.reindex(w_mean3.index).dropna()

Month
Jan    68.0
Apr    89.0
Name: Max TemperatureF, dtype: float64

In [18]:
# Sorting by index and by values.
# Monthly maxima: group the daily rows by calendar month, take the per-month
# maximum, and keep the daily-high temperature column.
month_of = pd.PeriodIndex(df.index, freq="M")
by_month = df.groupby(month_of).max()
weather1 = by_month['Max TemperatureF']
# Replace the monthly periods with three-letter month labels.
weather1.index = pd.Index(
    ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
     'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
    name='Month',
)
weather1.head()

Month
Jan    68
Feb    60
Mar    68
Apr    84
May    88
Name: Max TemperatureF, dtype: int64

In [19]:
# Ascending sort on the string month labels: alphabetical, not chronological.
weather2 = weather1.sort_index()
weather2.head()

Month
Apr    84
Aug    86
Dec    68
Feb    60
Jan    68
Name: Max TemperatureF, dtype: int64

In [22]:
# Same label sort, in reverse alphabetical order.
weather3 = weather1.sort_index(ascending=False)
weather3.head()

Month
Sep    90
Oct    84
Nov    72
May    88
Mar    68
Name: Max TemperatureF, dtype: int64

In [25]:
# Order the rows by temperature value (ascending) instead of by label.
weather4 = weather1.sort_values()
weather4.head()

Month
Feb    60
Jan    68
Mar    68
Dec    68
Nov    72
Name: Max TemperatureF, dtype: int64

In [38]:
# Reindexing a DataFrame from a list.
# Start from the quarterly means as a one-column DataFrame, with rows in
# alphabetical label order.
# NOTE: this rebinds `weather1`, which an earlier cell used for monthly maxima.
weather1 = w_mean.to_frame().sort_index()
weather1

Unnamed: 0_level_0,Mean TemperatureF
Month,Unnamed: 1_level_1
Apr,61.956044
Jan,32.133333
Jul,68.934783
Oct,43.434783


In [40]:
# Calendar-ordered month labels used as the target index.
year = [
    'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
    'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec',
]
# Months with no row in weather1 are inserted with NaN values.
weather2 = weather1.reindex(index=year)
weather2.head()

Unnamed: 0_level_0,Mean TemperatureF
Month,Unnamed: 1_level_1
Jan,32.133333
Feb,
Mar,
Apr,61.956044
May,


In [41]:
# Reindex to the full calendar, then forward-fill: every month that came back
# as NaN takes the value of the closest preceding row that has one.
weather3 = weather1.reindex(year).ffill()
weather3

Unnamed: 0_level_0,Mean TemperatureF
Month,Unnamed: 1_level_1
Jan,32.133333
Feb,32.133333
Mar,32.133333
Apr,61.956044
May,61.956044
Jun,61.956044
Jul,68.934783
Aug,68.934783
Sep,68.934783
Oct,43.434783


In [48]:
# Both yearly files are read with the same options: no header row, explicit
# column names, and the first two columns (name, gender) as a two-level index.
baby_csv_options = dict(
    header=None,
    names=['name', 'gender', 'count'],
    index_col=(0, 1),
)
names_1981 = pd.read_csv('baby_names/names1981.csv', **baby_csv_options)
names_1881 = pd.read_csv('baby_names/names1881.csv', **baby_csv_options)

# Look up every 1881 (name, gender) pair in the 1981 table. Pairs that do not
# exist in 1981 come back as NaN and are dropped, so only pairs present in
# both years remain.
common_names = names_1981.reindex(names_1881.index).dropna()
common_names.shape

(1587, 1)

In [51]:
# Reload the weather file with a parsed date index so rows can be selected
# by date strings.
weather = pd.read_csv('pittsburgh2013.csv', parse_dates=True, index_col='Date')
# Label-based slicing includes both endpoints: 1 July through 7 July.
first_week = slice('2013-7-1', '2013-7-7')
weather.loc[first_week, 'PrecipitationIn']

Date
2013-07-01    0.18
2013-07-02    0.14
2013-07-03    0.00
2013-07-04    0.25
2013-07-05    0.02
2013-07-06    0.06
2013-07-07    0.10
Name: PrecipitationIn, dtype: float64

### Scalar multiplication

In [52]:
# Multiplying a Series by a scalar applies the factor to every element.
# NOTE(review): 2.54 is presumably the inches-to-centimetres factor (the
# column name ends in 'In') -- confirm the units of the source data.
weather.loc['2013-7-1':'2013-7-7', 'PrecipitationIn'] * 2.54

Date
2013-07-01    0.4572
2013-07-02    0.3556
2013-07-03    0.0000
2013-07-04    0.6350
2013-07-05    0.0508
2013-07-06    0.1524
2013-07-07    0.2540
Name: PrecipitationIn, dtype: float64

In [53]:
# Daily low and high temperatures for the first week of July.
temp_bounds = ['Min TemperatureF', 'Max TemperatureF']
week1_range = weather.loc['2013-07-01':'2013-07-07', temp_bounds]
week1_range

Unnamed: 0_level_0,Min TemperatureF,Max TemperatureF
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2013-07-01,66,79
2013-07-02,66,84
2013-07-03,71,86
2013-07-04,70,86
2013-07-05,69,86
2013-07-06,70,89
2013-07-07,70,77


In [54]:
# Daily mean temperature for the same seven days, as a Series.
first_july_week = slice('2013-07-01', '2013-07-07')
week1_mean = weather.loc[first_july_week, 'Mean TemperatureF']
week1_mean

Date
2013-07-01    72
2013-07-02    74
2013-07-03    78
2013-07-04    77
2013-07-05    76
2013-07-06    78
2013-07-07    72
Name: Mean TemperatureF, dtype: int64

In [55]:
# Divide each row of the min/max frame by that day's mean temperature.
# axis='rows' matches the Series to the frame by row label (the date),
# so the same divisor is applied to both columns of a given day.
week1_range.divide(week1_mean, axis='rows')

Unnamed: 0_level_0,Min TemperatureF,Max TemperatureF
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2013-07-01,0.916667,1.097222
2013-07-02,0.891892,1.135135
2013-07-03,0.910256,1.102564
2013-07-04,0.909091,1.116883
2013-07-05,0.907895,1.131579
2013-07-06,0.897436,1.141026
2013-07-07,0.972222,1.069444


In [56]:
# Day-over-day relative change, scaled to percent. The first day has no
# predecessor, so its value is NaN.
week1_mean.pct_change() * 100

Date
2013-07-01         NaN
2013-07-02    2.777778
2013-07-03    5.405405
2013-07-04   -1.282051
2013-07-05   -1.298701
2013-07-06    2.631579
2013-07-07   -7.692308
Name: Mean TemperatureF, dtype: float64

In [57]:
# Bronze-medal totals for five countries; the first CSV column (the country)
# becomes the index.
bronze = pd.read_csv('summer_olympic_medals/bronze_top5.csv', index_col=0)
bronze

Unnamed: 0_level_0,Total
Country,Unnamed: 1_level_1
United States,1052.0
Soviet Union,584.0
United Kingdom,505.0
France,475.0
Germany,454.0


In [58]:
# Silver-medal totals for five countries, indexed by country. The set of
# countries is not the same as in the bronze table.
silver = pd.read_csv('summer_olympic_medals/silver_top5.csv', index_col=0)
silver

Unnamed: 0_level_0,Total
Country,Unnamed: 1_level_1
United States,1195.0
Soviet Union,627.0
United Kingdom,591.0
France,461.0
Italy,394.0


In [59]:
# Gold-medal totals for five countries, indexed by country.
gold = pd.read_csv('summer_olympic_medals/gold_top5.csv', index_col=0)
gold

Unnamed: 0_level_0,Total
Country,Unnamed: 1_level_1
United States,2088.0
Soviet Union,838.0
United Kingdom,498.0
Italy,460.0
Germany,407.0


In [60]:
# `+` aligns the two frames on their index labels. A country that appears in
# only one of them has no partner value, so its sum is NaN.
bronze + silver

Unnamed: 0_level_0,Total
Country,Unnamed: 1_level_1
France,936.0
Germany,
Italy,
Soviet Union,1211.0
United Kingdom,1096.0
United States,2247.0


### Using a fill_value

In [61]:
# .add() with fill_value=0 treats a country missing from one frame as having
# 0 there, so its total is kept instead of turning into NaN.
bronze.add(silver, fill_value=0)

Unnamed: 0_level_0,Total
Country,Unnamed: 1_level_1
France,936.0
Germany,454.0
Italy,394.0
Soviet Union,1211.0
United Kingdom,1096.0
United States,2247.0


In [62]:
# Chained `+`: a country must be present in all three frames to get a total;
# a NaN produced by the first addition carries through the second.
bronze + silver + gold

Unnamed: 0_level_0,Total
Country,Unnamed: 1_level_1
France,
Germany,
Italy,
Soviet Union,2049.0
United Kingdom,1594.0
United States,4335.0


In [63]:
# Chained .add() calls with fill_value=0: every country that appears in any
# of the three frames gets a total, with missing medal counts taken as 0.
bronze.add(silver, fill_value=0).add(gold, fill_value=0)

Unnamed: 0_level_0,Total
Country,Unnamed: 1_level_1
France,936.0
Germany,861.0
Italy,854.0
Soviet Union,2049.0
United Kingdom,1594.0
United States,4335.0


In [65]:
# Broadcasting in arithmetic formulas.
# Reload the weather file. parse_dates is not passed here, so the 'Date'
# index keeps the raw text from the file (e.g. '2013-1-1').
# NOTE: this rebinds `weather`, replacing the date-parsed frame loaded earlier.
weather = pd.read_csv('pittsburgh2013.csv', index_col='Date')
weather.head()

Unnamed: 0_level_0,Max TemperatureF,Mean TemperatureF,Min TemperatureF,Max Dew PointF,MeanDew PointF,Min DewpointF,Max Humidity,Mean Humidity,Min Humidity,Max Sea Level PressureIn,...,Max VisibilityMiles,Mean VisibilityMiles,Min VisibilityMiles,Max Wind SpeedMPH,Mean Wind SpeedMPH,Max Gust SpeedMPH,PrecipitationIn,CloudCover,Events,WindDirDegrees
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-1-1,32,28,21,30,27,16,100,89,77,30.1,...,10,6,2,10,8,,0.0,8,Snow,277
2013-1-2,25,21,17,14,12,10,77,67,55,30.27,...,10,10,10,14,5,,0.0,4,,272
2013-1-3,32,24,16,19,15,9,77,67,56,30.25,...,10,10,10,17,8,26.0,0.0,3,,229
2013-1-4,30,28,27,21,19,17,75,68,59,30.28,...,10,10,6,23,16,32.0,0.0,4,,250
2013-1-5,34,30,25,23,20,16,75,68,61,30.42,...,10,10,10,16,10,23.0,0.21,5,,221


In [67]:
# Keep only the three Fahrenheit temperature columns, in min / mean / max order.
temp_columns = ['Min TemperatureF', 'Mean TemperatureF', 'Max TemperatureF']
temps_f = weather[temp_columns]
temps_f.head()

Unnamed: 0_level_0,Min TemperatureF,Mean TemperatureF,Max TemperatureF
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2013-1-1,21,28,32
2013-1-2,17,21,25
2013-1-3,16,24,32
2013-1-4,27,28,30
2013-1-5,25,30,34


In [71]:
# Fahrenheit -> Celsius: subtract 32, then scale by 5/9. The scalars are
# broadcast across every cell of the frame.
temps_c = (temps_f - 32) * 5/9

In [74]:
# Rename the unit suffix on each column from F to C.
# The pattern is anchored to the end of the name (F$) so that only the
# trailing unit letter is rewritten; an unanchored replace would change every
# capital 'F' anywhere in a column name. regex=True is passed explicitly
# because the default of that parameter differs between pandas versions.
temps_c.columns = temps_c.columns.str.replace(r'F$', 'C', regex=True)
temps_c.head()

Unnamed: 0_level_0,Min TemperatureC,Mean TemperatureC,Max TemperatureC
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2013-1-1,-6.111111,-2.222222,0.0
2013-1-2,-8.333333,-6.111111,-3.888889
2013-1-3,-8.888889,-4.444444,0.0
2013-1-4,-2.777778,-2.222222,-1.111111
2013-1-5,-3.888889,-1.111111,1.111111


In [77]:
# US GDP series, indexed by its 'DATE' column with the index parsed as dates.
gdp = pd.read_csv(
    'GDP/gdp_usa.csv',
    index_col='DATE',
    parse_dates=True,
)
gdp.head()

Unnamed: 0_level_0,VALUE
DATE,Unnamed: 1_level_1
1947-01-01,243.1
1947-04-01,246.3
1947-07-01,250.1
1947-10-01,260.3
1948-01-01,266.2


In [79]:
# Open-ended date slice: every row from 2008-01-01 onward.
post2008 = gdp.loc['2008-01-01':]
post2008.tail()

Unnamed: 0_level_0,VALUE
DATE,Unnamed: 1_level_1
2015-04-01,17998.3
2015-07-01,18141.9
2015-10-01,18222.8
2016-01-01,18281.6
2016-04-01,18436.5


In [80]:
# Downsample to one row per calendar year ('A' = annual, labelled at year
# end), keeping the last observation available in each year.
# NOTE(review): recent pandas releases deprecate the 'A' alias in favour of
# 'YE' -- confirm against the pandas version this notebook targets.
yearly = post2008.resample('A').last()
yearly

Unnamed: 0_level_0,VALUE
DATE,Unnamed: 1_level_1
2008-12-31,14549.9
2009-12-31,14566.5
2010-12-31,15230.2
2011-12-31,15785.3
2012-12-31,16297.3
2013-12-31,16999.9
2014-12-31,17692.2
2015-12-31,18222.8
2016-12-31,18436.5


In [81]:
# Year-over-year GDP growth in percent.
# The change is computed from the 'VALUE' column only. Taking pct_change() of
# the whole frame works only while 'VALUE' is its sole column: once 'growth'
# exists (e.g. when this cell is run a second time) the result has two
# columns and can no longer be assigned to a single column.
yearly['growth'] = yearly['VALUE'].pct_change() * 100
yearly

Unnamed: 0_level_0,VALUE,growth
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1
2008-12-31,14549.9,
2009-12-31,14566.5,0.11409
2010-12-31,15230.2,4.556345
2011-12-31,15785.3,3.644732
2012-12-31,16297.3,3.243524
2013-12-31,16999.9,4.311144
2014-12-31,17692.2,4.072377
2015-12-31,18222.8,2.999062
2016-12-31,18436.5,1.172707


In [82]:
# Load the S&P 500 prices and the exchange-rate table. Both use a parsed
# 'Date' index, which lets later arithmetic align them row by row on date.
sp500 = pd.read_csv('sp500.csv', parse_dates=True, index_col='Date')
exchange = pd.read_csv('exchange.csv', parse_dates=True, index_col='Date')

In [83]:
# Keep only the opening and closing prices.
price_columns = ['Open', 'Close']
dollars = sp500[price_columns]
dollars.head()

Unnamed: 0_level_0,Open,Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2015-01-02,2058.899902,2058.199951
2015-01-05,2054.439941,2020.579956
2015-01-06,2022.150024,2002.609985
2015-01-07,2005.550049,2025.900024
2015-01-08,2030.609985,2062.139893


In [85]:
# Scale each day's Open and Close by that day's 'GBP/USD' rate.
# axis='rows' aligns the rate Series with the price frame on the date index.
gbp_rate = exchange['GBP/USD']
pounds = dollars.multiply(gbp_rate, axis='rows')
pounds.head()

Unnamed: 0_level_0,Open,Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2015-01-02,1340.364425,1339.90875
2015-01-05,1348.616555,1326.389506
2015-01-06,1332.51598,1319.639876
2015-01-07,1330.562125,1344.063112
2015-01-08,1343.268811,1364.126161
