In [6]:
'''Video 1 - Reading multiple data files
How to read multiple files into a collection of dataframes
'''

#tools for pandas data import
#used read_csv so far
#has 50 calling params
#other similar tools:
    #pd.read_excel()
    #pd.read_html()
    #pd.read_json()

#load data into their seperate and distinct dataframes
import pandas as pd
dataframe0 = pd.read_csv('sales-jan-2015.csv')
dataframe1 = pd.read_csv('sales-feb-2015.csv')

#it's easier to iterate over a collection of file names
filenames = ['sales-jan-2015.csv', 'sales-feb-2015.csv']
dataframes = []
for f in filenames:
    dataframes.append(pd.read_csv(f))

len(dataframes)

#can do the above loop in a list comprehension
dataframes1 = [pd.read_csv(f) for f in filenames]
list(dataframes1)

#when many filenames have a similar pattern, the glob module from Python's standard lib is useful         ****REVIEW*****
from glob import glob

filenames = glob('sales*.csv') 
print(type(filenames)) #glob returns an iterable list
dataframes3 = [pd.read_csv(f) for f in filenames]
#list(dataframes3)


<class 'list'>


In [7]:
#Ex1 - Reading DataFrames from multiple files
#goal load three different csv into their own distinct dataframes

# Import pandas
import pandas as pd

# Read 'Bronze.csv' into a DataFrame: bronze
bronze = pd.read_csv('Bronze.csv')

# Read 'Silver.csv' into a DataFrame: silver
silver = pd.read_csv('Silver.csv')

# Read 'Gold.csv' into a DataFrame: gold
gold = pd.read_csv('Gold.csv')

# Print the first five rows of gold
print(gold.head())

   NOC         Country   Total
0  USA   United States  2088.0
1  URS    Soviet Union   838.0
2  GBR  United Kingdom   498.0
3  FRA          France   378.0
4  GER         Germany   407.0


In [8]:
#Ex2 - Reading DataFrames from multiple files in a loop
#it is more efficient to read multiple csv via a loop or a list comp
#GOAL: apply a loop to load multiple csv files into a df

# Import pandas
import pandas as pd

# Create the list of file names: filenames
filenames = ['Gold.csv', 'Silver.csv', 'Bronze.csv']

# Create the list of three DataFrames: dataframes
dataframes = []
for filename in filenames:
    dataframes.append(pd.read_csv(filename))

# Print top 5 rows of 1st DataFrame in dataframes
print(dataframes[0].head())

   NOC         Country   Total
0  USA   United States  2088.0
1  URS    Soviet Union   838.0
2  GBR  United Kingdom   498.0
3  FRA          France   378.0
4  GER         Germany   407.0


In [None]:
#for Ex3:
# Read 'Bronze.csv' into a DataFrame: bronze
bronze = pd.read_csv('Bronze.csv')

# Read 'Silver.csv' into a DataFrame: silver
silver = pd.read_csv('Silver.csv')

# Read 'Gold.csv' into a DataFrame: gold
gold = pd.read_csv('Gold.csv')

In [9]:
#Ex3 - COmbining DataFrames from multiple data files
#GOAL: combine three different DF's into a single one called medals
#Note - this specc'd approach is 'clumsy'

# Import pandas
import pandas as pd

# Make a copy of gold: medals
medals = gold.copy()

# Create list of new column labels: new_labels
new_labels = ['NOC', 'Country', 'Gold']
#'Gold' in new_labels replaces 'Total' from the original DF's

# Rename the columns of medals using new_labels
medals.columns = new_labels

# Add columns 'Silver' & 'Bronze' to medals
medals['Silver'] = silver['Total']
medals['Bronze'] = bronze['Total']

# Print the head of medals
print(medals.head())

   NOC         Country    Gold  Silver  Bronze
0  USA   United States  2088.0  1195.0  1052.0
1  URS    Soviet Union   838.0   627.0   584.0
2  GBR  United Kingdom   498.0   591.0   505.0
3  FRA          France   378.0   461.0   475.0
4  GER         Germany   407.0   350.0   454.0


In [81]:
'''
Video 2 - Reindexing DataFrames
How to share info between DFs and their Indexes
This is ESSENTIAL for combining DFs later, as Indexes are the means by which DF rows are labeled
'''
#Indices : many index labels within Index data structures
#Indexes : many pandas Index data structures
meanCols = ['Date', 'Mean TemperatureF']
maxCols = ['Date', 'Max TemperatureF']

w_mean = pd.read_csv('pittsburgh2013.csv', usecols=meanCols, index_col = 'Date', parse_dates=True)
w_mean = w_mean.resample('M').mean()
#Renaming datetime object to month only found here: 
# https://stackoverflow.com/questions/32699950/how-to-convert-pandas-index-to-month-name
w_mean.rename(index=lambda x: x.strftime('%B'), inplace=True)
w_mean.index.name = 'Month'


test = pd.read_csv('pittsburgh2013.csv', usecols=maxCols, index_col='Date',  parse_dates=True)
w_max = test.resample('M').max() #resample works on datetime objects, arg='M' means resample to month
w_max.rename(index=lambda x: x.strftime('%B'), inplace=True)
w_max.index.name = 'Month'


#REMEMBER: The index is a 'priviledged' column in Pandas providing convenient access to
#Series or DataFrame rows

#Dataframe indexes are accessed directly with .index attribute

#can impose a deliberate ordering for index labels using .reindex()
'''
ordered = [some list]
w_mean2 = w_mean.reindex(ordered)
'''

#sort_index() will sort alphabetically

#the arg for .reindex(arg) can be another DF index
#w_mean.reindex(w_max.index)
#when it is suitable, this method spares us from manual list creation or index sorting

#If a new row is inserted during .reindex() the value for that row will be NaN

#.dropna() remove entire rows in which null values occur

#Order matters


           Max TemperatureF
Month                      
January                  68
February                 60
March                    68
April                    84
May                      88
June                     89
July                     91
August                   86
September                90
October                  84
November                 72
December                 68            Mean TemperatureF
Month                       
January            32.354839
February           28.714286
March              35.000000
April              53.100000
May                62.612903
June               70.133333
July               72.870968
August             70.000000
September          63.766667
October            55.451613
November           39.800000
December           34.935484


In [82]:
weather1 = w_max #changing name for Ex4
weather1.describe

<bound method NDFrame.describe of            Max TemperatureF
Month                      
January                  68
February                 60
March                    68
April                    84
May                      88
June                     89
July                     91
August                   86
September                90
October                  84
November                 72
December                 68>

In [87]:
#Ex4 - Sorting DataFrame with the Index & columns
#Useful to rearrange sequence of rows of a DF by sorting
#methods are already implemented via .sort_index() and .sort_values
#GOAL: sort alpha using Index and sort numerically using a column
#Purpose: understand what sorting methods do

# Import pandas
#import pandas as pd #commented out, not necessary for this Jupyter notebook

# Read 'monthly_max_temp.csv' into a DataFrame: weather1
#weather1 = pd.read_csv('monthly_max_temp.csv', index_col = 'Month')

# Print the head of weather1
print(weather1.head())

# Sort the index of weather1 in alphabetical order: weather2
weather2 = weather1.sort_index()

# Print the head of weather2
print(weather2.head())

# Sort the index of weather1 in reverse alphabetical order: weather3
weather3 = weather1.sort_index(ascending=False)

# Print the head of weather3
print(weather3.head())

# Sort weather1 numerically using the values of 'Max TemperatureF': weather4
weather4 = weather1.sort_values('Max TemperatureF')

# Print the head of weather4
print(weather4.head())


          Max TemperatureF
Month                     
January                 68
February                60
March                   68
April                   84
May                     88
          Max TemperatureF
Month                     
April                   84
August                  86
December                68
February                60
January                 68
           Max TemperatureF
Month                      
September                90
October                  84
November                 72
May                      88
March                    68
          Max TemperatureF
Month                     
February                60
January                 68
March                   68
December                68
November                72


In [107]:
#Ex5 - Reindexing DataFrame from a list
#.reindex() is another way to change DF indexes
#this Ex5 is a good review of upsampling and bfill / ffill from pandas Foundations course
#I didn't use Jupyter then so pay attention here
weather1 = pd.DataFrame({'Month':['April', 'Jan', 'Jul', 'Oct'],
                         'Mean TemperatureF': [61.956044, 32.133333, 68.934783, 43.434783]
                        }
                       )
weather1 = weather1.set_index('Month')
#this was to setup for the exercise


year = ['Jan',
 'Feb',
 'Mar',
 'Apr',
 'May',
 'Jun',
 'Jul',
 'Aug',
 'Sep',
 'Oct',
 'Nov',
 'Dec']

#GOAL: Change DF's index by resampling from quarterly to monthly

# Reindex weather1 using the list year: weather2
weather2 = weather1.reindex(year) #used list year to reindex
#list year will reorder by alpha and add more months to the index
#adding months to index will add NaNs by default

# Print weather2
print(weather2)

# Reindex weather1 using the list year with forward-fill: weather3
weather3 = weather1.reindex(year).ffill()
#.ffill() will replace NaN going forward from available values

# Print weather3
print(weather3)


       Mean TemperatureF
Month                   
Jan            32.133333
Feb                  NaN
Mar                  NaN
Apr                  NaN
May                  NaN
Jun                  NaN
Jul            68.934783
Aug                  NaN
Sep                  NaN
Oct            43.434783
Nov                  NaN
Dec                  NaN
       Mean TemperatureF
Month                   
Jan            32.133333
Feb            32.133333
Mar            32.133333
Apr            32.133333
May            32.133333
Jun            32.133333
Jul            68.934783
Aug            68.934783
Sep            68.934783
Oct            43.434783
Nov            43.434783
Dec            43.434783


In [112]:
names_1981 = pd.read_csv('names1981.csv', header=None, names=['name','gender','count'], index_col=(0,1))
#names = ['name', 'gender', 'count'] is like usecols from above
#index_col(0,1) creates a Multi-level index using 'name', 'gender' for the index
names_1881 = pd.read_csv('names1881.csv', header=None, names=['name','gender','count'], index_col=(0,1))

#setup complete for Ex6

(1935, 1)

In [None]:
#Ex6 - Reindexing using another DataFrame Index
#can .reindex(anotherDataFrame'sIndex)
#can access Index of a DF with .index attribute

#GOAL: Use .reindex() and .dropna() to make a new DF 'common_names' counting names from 1881 that
#were still popular in 1981

# Reindex names_1981 with index of names_1881: common_names
common_names = names_1981.reindex(names_1881.index)

# Print shape of common_names
print(common_names.shape)

# Drop rows with null counts: common_names
common_names = common_names.dropna()

# Print shape of new common_names
print(common_names.shape)

In [113]:
'''
Video 3 - Arithmetic with Series & DataFrames
'''
weather = pd.read_csv('pittsburgh2013.csv', index_col ='Date', parse_dates=True)
weather.loc['July 1, 2013': 'July 7, 2013', 'PrecipitationIn']

Date
2013-07-01    0.18
2013-07-02    0.14
2013-07-03    0.00
2013-07-04    0.25
2013-07-05    0.02
2013-07-06    0.06
2013-07-07    0.10
Name: PrecipitationIn, dtype: float64

In [114]:
#Precipitation data is in inches - let's convert them to centimeters
# Scalar multiplication
    # use the * to multiply the series element wise by whatever amount
    #Broadcast = applied to all entries in the DF
weather.loc['July 1, 2013': 'July 7, 2013', 'PrecipitationIn'] * 2.54

Date
2013-07-01    0.4572
2013-07-02    0.3556
2013-07-03    0.0000
2013-07-04    0.6350
2013-07-05    0.0508
2013-07-06    0.1524
2013-07-07    0.2540
Name: PrecipitationIn, dtype: float64

In [119]:
#Absolute Temp range
#find the daily min/max temp expressed as a percentage of the daily mean temp
week1_range = weather.loc['July 1, 2013': 'July 7, 2013', ['Min TemperatureF','Max TemperatureF']]
print(week1_range)
week1_mean = weather.loc['July 1, 2013': 'July 7, 2013', 'Mean TemperatureF']
print(week1_mean, type(week1_mean))

            Min TemperatureF  Max TemperatureF
Date                                          
2013-07-01                66                79
2013-07-02                66                84
2013-07-03                71                86
2013-07-04                70                86
2013-07-05                69                86
2013-07-06                70                89
2013-07-07                70                77
Date
2013-07-01    72
2013-07-02    74
2013-07-03    78
2013-07-04    77
2013-07-05    76
2013-07-06    78
2013-07-07    72
Name: Mean TemperatureF, dtype: int64 <class 'pandas.core.series.Series'>


In [117]:
#Relative Temperature range
week1_range / week1_mean
#the column labels don't match, so the results are NaN values

Unnamed: 0_level_0,2013-07-01 00:00:00,2013-07-02 00:00:00,2013-07-03 00:00:00,2013-07-04 00:00:00,2013-07-05 00:00:00,2013-07-06 00:00:00,2013-07-07 00:00:00,Max TemperatureF,Min TemperatureF
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2013-07-01,,,,,,,,,
2013-07-02,,,,,,,,,
2013-07-03,,,,,,,,,
2013-07-04,,,,,,,,,
2013-07-05,,,,,,,,,
2013-07-06,,,,,,,,,
2013-07-07,,,,,,,,,


In [118]:
#can use the df Series.divide(Series2, axis='rows')
week1_range.divide(week1_mean, axis='rows')
#.divide() provides more fine-grained control than / operator
#it broadcasts the Series

Unnamed: 0_level_0,Min TemperatureF,Max TemperatureF
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2013-07-01,0.916667,1.097222
2013-07-02,0.891892,1.135135
2013-07-03,0.910256,1.102564
2013-07-04,0.909091,1.116883
2013-07-05,0.907895,1.131579
2013-07-06,0.897436,1.141026
2013-07-07,0.972222,1.069444


In [120]:
#Compute Percantage changes
# Series.pct_change() * 100
week1_mean.pct_change() * 100 #another broadcast

Date
2013-07-01         NaN
2013-07-02    2.777778
2013-07-03    5.405405
2013-07-04   -1.282051
2013-07-05   -1.298701
2013-07-06    2.631579
2013-07-07   -7.692308
Name: Mean TemperatureF, dtype: float64

In [121]:
#How do arithmetic operations work between distinct Series or DFs with non-aligned indexes?
bronze.describe


<bound method NDFrame.describe of      NOC               Country   Total
0    USA         United States  1052.0
1    URS          Soviet Union   584.0
2    GBR        United Kingdom   505.0
3    FRA                France   475.0
4    GER               Germany   454.0
..   ...                   ...     ...
133  SEN               Senegal     NaN
134  SUD                 Sudan     NaN
135  TGA                 Tonga     NaN
136  BDI               Burundi     NaN
137  UAE  United Arab Emirates     NaN

[138 rows x 3 columns]>

In [122]:
bronze = pd.read_csv('bronze_top5.csv', index_col=0)
bronze

Unnamed: 0_level_0,Total
Country,Unnamed: 1_level_1
United States,1052.0
Soviet Union,584.0
United Kingdom,505.0
France,475.0
Germany,454.0


In [123]:
silver = pd.read_csv('silver_top5.csv', index_col = 0)
silver

Unnamed: 0_level_0,Total
Country,Unnamed: 1_level_1
United States,1195.0
Soviet Union,627.0
United Kingdom,591.0
France,461.0
Italy,394.0


In [124]:
gold = pd.read_csv('gold_top5.csv', index_col=0)
gold

Unnamed: 0_level_0,Total
Country,Unnamed: 1_level_1
United States,2088.0
Soviet Union,838.0
United Kingdom,498.0
Italy,460.0
Germany,407.0


In [126]:
bronze.index == silver.index

array([ True,  True,  True,  True, False])

In [127]:
silver.index == gold.index

array([ True,  True,  True, False, False])

In [None]:
All three indices have the same first three values
the next two are either 'France', 'Germany', or 'Italy'

In [128]:
#Adding bronze, silver
bronze + silver #added 2 series of 5 rows and returns a series of 6 rows
#the index of the sum is the union from the two series
#Since Germany not in Silver and Italy not in Bronze: NaN in sum


Unnamed: 0_level_0,Total
Country,Unnamed: 1_level_1
France,936.0
Germany,
Italy,
Soviet Union,1211.0
United Kingdom,1096.0
United States,2247.0


In [139]:
print(bronze.loc['United States', 'Total'], silver.loc['United States', 'Total'])


1052.0 1195.0


In [138]:
bronze.loc['United States', 'Total'] + silver.loc['United States', 'Total']

2247.0

In [None]:
The Total value from bronze + silver is equivalent to the 'United States' values from bronze and silver

In [140]:
#another way to add
bronze.add(silver)

Unnamed: 0_level_0,Total
Country,Unnamed: 1_level_1
France,936.0
Germany,
Italy,
Soviet Union,1211.0
United Kingdom,1096.0
United States,2247.0


In [141]:
# in the .add() can specify fill_value
bronze.add(silver, fill_value=0)
# .add() is more flexible than the + operator

Unnamed: 0_level_0,Total
Country,Unnamed: 1_level_1
France,936.0
Germany,454.0
Italy,394.0
Soviet Union,1211.0
United Kingdom,1096.0
United States,2247.0


In [142]:
#adding bronze, silver, gold
bronze + silver + gold
#France, Germany and Italy aren't in all three series, so each of those rows are NaN in sum

Unnamed: 0_level_0,Total
Country,Unnamed: 1_level_1
France,
Germany,
Italy,
Soviet Union,2049.0
United Kingdom,1594.0
United States,4335.0


In [143]:
#can chain multiple .add() methods
bronze.add(silver, fill_value = 0).add(gold, fill_value=0)


Unnamed: 0_level_0,Total
Country,Unnamed: 1_level_1
France,936.0
Germany,861.0
Italy,854.0
Soviet Union,2049.0
United Kingdom,1594.0
United States,4335.0


In [145]:
weather.head() #confirmed matches with provided df in Ex7

Unnamed: 0_level_0,Max TemperatureF,Mean TemperatureF,Min TemperatureF,Max Dew PointF,MeanDew PointF,Min DewpointF,Max Humidity,Mean Humidity,Min Humidity,Max Sea Level PressureIn,...,Max VisibilityMiles,Mean VisibilityMiles,Min VisibilityMiles,Max Wind SpeedMPH,Mean Wind SpeedMPH,Max Gust SpeedMPH,PrecipitationIn,CloudCover,Events,WindDirDegrees
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-01,32,28,21,30,27,16,100,89,77,30.1,...,10,6,2,10,8,,0.0,8,Snow,277
2013-01-02,25,21,17,14,12,10,77,67,55,30.27,...,10,10,10,14,5,,0.0,4,,272
2013-01-03,32,24,16,19,15,9,77,67,56,30.25,...,10,10,10,17,8,26.0,0.0,3,,229
2013-01-04,30,28,27,21,19,17,75,68,59,30.28,...,10,10,6,23,16,32.0,0.0,4,,250
2013-01-05,34,30,25,23,20,16,75,68,61,30.42,...,10,10,10,16,10,23.0,0.21,5,,221


In [146]:
#Ex7 - Broadcasting arithmetic formulas
#GOALS: subset a collection of columsn related to temp measurements in degrees F. convert to degrees C
# relabel columns of new DF to reflect the change of units

# Extract selected columns from weather as new DataFrame: temps_f
temps_f = weather[['Min TemperatureF', 'Mean TemperatureF', 'Max TemperatureF']]

# Convert temps_f to celsius: temps_c
temps_c = (temps_f - 32) * 5/9

# Rename 'F' in column names with 'C': temps_c.columns
temps_c.columns = temps_c.columns.str.replace('F', 'C')

# Print first 5 rows of temps_c
print(temps_c.head())


            Min TemperatureC  Mean TemperatureC  Max TemperatureC
Date                                                             
2013-01-01         -6.111111          -2.222222          0.000000
2013-01-02         -8.333333          -6.111111         -3.888889
2013-01-03         -8.888889          -4.444444          0.000000
2013-01-04         -2.777778          -2.222222         -1.111111
2013-01-05         -3.888889          -1.111111          1.111111


In [148]:
gdp = pd.read_csv('gdp_usa.csv', parse_dates=True, index_col='DATE')
gdp.shape
gdp.describe


<bound method NDFrame.describe of               VALUE
DATE               
1947-01-01    243.1
1947-04-01    246.3
1947-07-01    250.1
1947-10-01    260.3
1948-01-01    266.2
...             ...
2015-04-01  17998.3
2015-07-01  18141.9
2015-10-01  18222.8
2016-01-01  18281.6
2016-04-01  18436.5

[278 rows x 1 columns]>

In [150]:
#Ex8 - Computing percentage growth of GDP
#GOAL: resample to annual sampling then compute annual growth of GDP

# Read 'GDP.csv' into a DataFrame: gdp
#gdp = pd.read_csv('GDP.csv', parse_dates=True, index_col='DATE')

# Slice all the gdp data from 2008 onward: post2008
post2008 = gdp.loc['2008':, :]

# Print the last 8 rows of post2008
print(post2008.tail(8))

# Resample post2008 by year, keeping last(): yearly
yearly = post2008.resample('A').last()

# Print yearly
print(yearly)

# Compute percentage growth of yearly: yearly['growth']
yearly['growth'] = yearly.pct_change() * 100

# Print yearly again
print(yearly)

              VALUE
DATE               
2014-07-01  17569.4
2014-10-01  17692.2
2015-01-01  17783.6
2015-04-01  17998.3
2015-07-01  18141.9
2015-10-01  18222.8
2016-01-01  18281.6
2016-04-01  18436.5
              VALUE
DATE               
2008-12-31  14549.9
2009-12-31  14566.5
2010-12-31  15230.2
2011-12-31  15785.3
2012-12-31  16297.3
2013-12-31  16999.9
2014-12-31  17692.2
2015-12-31  18222.8
2016-12-31  18436.5
              VALUE    growth
DATE                         
2008-12-31  14549.9       NaN
2009-12-31  14566.5  0.114090
2010-12-31  15230.2  4.556345
2011-12-31  15785.3  3.644732
2012-12-31  16297.3  3.243524
2013-12-31  16999.9  4.311144
2014-12-31  17692.2  4.072377
2015-12-31  18222.8  2.999062
2016-12-31  18436.5  1.172707


In [156]:
#Ex9 - Converting currency of stocks
sp500, exchange = pd.read_csv('sp500.csv', parse_dates=True, index_col='Date'), pd.read_csv('exchange.csv', parse_dates=True, index_col='Date')
sp500.shape
exchange.shape

# Import pandas
import pandas as pd

# Read 'sp500.csv' into a DataFrame: sp500
#sp500 = pd.read_csv('sp500.csv', parse_dates=True, index_col='Date')

# Read 'exchange.csv' into a DataFrame: exchange
#exchange = pd.read_csv('exchange.csv', parse_dates=True, index_col='Date')

# Subset 'Open' & 'Close' columns from sp500: dollars
dollars = sp500[['Open', 'Close']]

# Print the head of dollars
print(dollars.head())

# Convert dollars to pounds: pounds
#dollars and exchange shares the datetimeindex, so by using axis='rows' it should align
pounds = dollars.multiply(exchange['GBP/USD'], axis='rows')

# Print the head of pounds
print(pounds.head())

                   Open        Close
Date                                
2015-01-02  2058.899902  2058.199951
2015-01-05  2054.439941  2020.579956
2015-01-06  2022.150024  2002.609985
2015-01-07  2005.550049  2025.900024
2015-01-08  2030.609985  2062.139893
                   Open        Close
Date                                
2015-01-02  1340.364425  1339.908750
2015-01-05  1348.616555  1326.389506
2015-01-06  1332.515980  1319.639876
2015-01-07  1330.562125  1344.063112
2015-01-08  1343.268811  1364.126161
