In [None]:
'''Video 1 - Appending and concatenating Series
We can now combine DFs
'''

In [None]:
#V1 - Stack Series on top of each other by concatenating
#Tools - .append() and pd.concat()
#for append() - Series1.append(Series2)
    #rows of Series2 are stacked UNDERNEATH Series1
        #this means it can only stack VERTICALLY
    #works for both DFs and Series
    
#for concat() - pd.concat([s1, s2, s3])
    #a pandas module function
    #accepts a LIST or sequence of several Series or DFs to concatenate
    #can stack row-wise (Vertically) or column-wise (horizontally)
    
'''
When stacking multiple Series .concat() == .append()
'''
    

In [1]:
import pandas as pd
# State abbreviations grouped by region (northeast, south, midwest, west).
# Each Series is built from a plain list, so it gets the default integer index starting at 0.
northeast = pd.Series([
    'CT', 'ME', 'MA', 'NH', 'RI', 'VT',
    'NJ', 'NY', 'PA',
])

south = pd.Series([
    'DE', 'FL', 'GA', 'MD', 'NC', 'SC', 'VA', 'DC', 'WV',
    'AL', 'KY', 'MS', 'TN',
    'AR', 'LA', 'OK', 'TX',
])

midwest = pd.Series([
    'IL', 'IN', 'MN', 'MO', 'NE', 'ND', 'SD',
    'IA', 'KS', 'MI', 'OH', 'WI',
])

west = pd.Series([
    'AZ', 'CO', 'ID', 'MT', 'NV', 'NM', 'UT', 'WY',
    'AK', 'CA', 'HI', 'OR', 'WA',
])

In [2]:
#Remember all four Series are indexed by integers starting at 0
#Stack south underneath northeast. This was originally northeast.append(south);
#Series.append() was deprecated in pandas 1.4 and removed in 2.0, and pd.concat()
#performs the same vertical stack while keeping each input's own index labels.
east = pd.concat([northeast, south])
east

0     CT
1     ME
2     MA
3     NH
4     RI
5     VT
6     NJ
7     NY
8     PA
0     DE
1     FL
2     GA
3     MD
4     NC
5     SC
6     VA
7     DC
8     WV
9     AL
10    KY
11    MS
12    TN
13    AR
14    LA
15    OK
16    TX
dtype: object

In [3]:
#note the indexing! the stacked Series keeps the integer labels of each input Series
#append()/concat() do not adjust (renumber) the index labels
east.index
#east carries the index of the first Series followed by the index of the second, in order

Int64Index([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  0,  1,  2,  3,  4,  5,  6,  7,
             8,  9, 10, 11, 12, 13, 14, 15, 16],
           dtype='int64')

In [4]:
#label 3 occurs twice in east's index (once per source Series),
#so .loc[3] returns 2 rows: one from northeast and one from south
east.loc[3]

3    NH
3    MD
dtype: object

In [5]:
#having a unique index is pretty important, no?
#reset_index(drop=True) to the rescue
#drop=True discards the old index with repeated entries
#(the stacking step was originally northeast.append(south); Series.append() was removed
# in pandas 2.0, so pd.concat() is used for the same vertical stack)
new_east = pd.concat([northeast, south]).reset_index(drop=True)
new_east.head(11)

0     CT
1     ME
2     MA
3     NH
4     RI
5     VT
6     NJ
7     NY
8     PA
9     DE
10    FL
dtype: object

In [6]:
#check which kind of Index new_east ended up with after the reset
type(new_east.index)
#the new_east Index is type Range (RangeIndex)

pandas.core.indexes.range.RangeIndex

In [7]:
#Vertical stacking of two Series: pd.concat() gives the same result that append() gave for east
east = pd.concat(objs=[northeast, south], axis=0) #objs must be a list (or other sequence) of Series or DFs
east.head(11)
#the labels restart at 0 after PA because every input keeps its own index

0    CT
1    ME
2    MA
3    NH
4    RI
5    VT
6    NJ
7    NY
8    PA
0    DE
1    FL
dtype: object <class 'pandas.core.indexes.numeric.Int64Index'>


In [8]:
#without ignore_index=True, concat() keeps the inputs' integer labels (a numeric Int64 index, NOT a RangeIndex)
#with ignore_index=True, new_east gets fresh 0..n-1 labels, just like append() + reset_index(drop=True)
new_east = pd.concat(objs=[northeast, south], ignore_index=True)
print(new_east.head(n=11), type(new_east.index))

0     CT
1     ME
2     MA
3     NH
4     RI
5     VT
6     NJ
7     NY
8     PA
9     DE
10    FL
dtype: object <class 'pandas.core.indexes.range.RangeIndex'>


In [None]:
#Index labels are all unique, and because we used ignore_index=True the type changes to Range instead of Numeric
''' 
End V1
'''

In [None]:
#Ex1 - Appending Series w/ nonunique Indices
#Non-unique indices will be repeated in the appended result.

In [10]:
# Read the three monthly sales files with the default integer index and compare their shapes.
jan, feb, mar = (
    pd.read_csv(csv_path)
    for csv_path in ('sales-jan-2015.csv', 'sales-feb-2015.csv', 'sales-mar-2015.csv')
)
print(jan.shape, feb.shape, mar.shape)

(20, 4) (20, 4) (20, 4)


In [11]:
#Ex2
#GOAL: Load data, extract the 'Units' Series from each monthly DF, and stack them vertically into one quarterly Series
#NOTE: the exercise originally used method chaining: jan_units.append(feb_units).append(mar_units).
#      Series.append() was deprecated in pandas 1.4 and removed in 2.0, so pd.concat() does the stacking here;
#      the result is the same Series, with the three inputs stacked in order.

# Import pandas
import pandas as pd

# Load each month's sales file into a DataFrame indexed by the parsed 'Date' column: jan, feb, mar
jan = pd.read_csv('sales-jan-2015.csv', index_col='Date', parse_dates=True)
feb = pd.read_csv('sales-feb-2015.csv', index_col='Date', parse_dates=True)
mar = pd.read_csv('sales-mar-2015.csv', index_col='Date', parse_dates=True)

# Extract the 'Units' column from each month: jan_units, feb_units, mar_units
jan_units = jan['Units']
feb_units = feb['Units']
mar_units = mar['Units']

# Stack feb_units and then mar_units underneath jan_units: quarter1
quarter1 = pd.concat([jan_units, feb_units, mar_units])

#Verify that quarter1 has the individual Series stacked vertically by slicing across the month boundaries.
#NOTE(review): the recorded output shows the dates are not in chronological order; newer pandas versions
#              may require quarter1.sort_index() before label-based date slicing - confirm on the target version.
# Print the first slice from quarter1 (end of January into February)
print(quarter1.loc['jan 27, 2015':'feb 2, 2015'])

# Print the second slice from quarter1 (end of February into March)
print(quarter1.loc['feb 26, 2015': 'mar 7, 2015'])

# Compute & print total sales in quarter1
print(quarter1.sum())

Date
2015-01-27 07:11:55    18
2015-02-02 08:33:01     3
2015-02-02 20:54:49     9
Name: Units, dtype: int64
Date
2015-02-26 08:57:45     4
2015-02-26 08:58:51     1
2015-03-06 10:11:45    17
2015-03-06 02:03:56    17
Name: Units, dtype: int64
642


In [12]:
#Ex3 - Concatenating pandas Series along row axis
#Ex2 stacked the three 'Units' Series one after another; here they are collected in a list
#and stacked with a single pd.concat() call.
#GOAL: achieve the same quarter1 Series as in Ex2 using pd.concat()
#Question? What is the diff between .concat() and .append()
#Answer: .append() is a specific case of concatenation (vertical stacking only)
    # while .concat() offers more flexibility *WILL LEARN LATER

# Build the list of 'Units' Series, one per month: units
units = [month_df['Units'] for month_df in (jan, feb, mar)]

# Concatenate the list: quarter1
quarter1 = pd.concat(units, axis='rows') #axis='rows' means we want to stack vertically

# Print slices from quarter1
print(quarter1.loc['jan 27, 2015':'feb 2, 2015'])
print(quarter1.loc['feb 26, 2015':'mar 7, 2015'])
#should look exactly like Ex2 - confirm NS

Date
2015-01-27 07:11:55    18
2015-02-02 08:33:01     3
2015-02-02 20:54:49     9
Name: Units, dtype: int64
Date
2015-02-26 08:57:45     4
2015-02-26 08:58:51     1
2015-03-06 10:11:45    17
2015-03-06 02:03:56    17
Name: Units, dtype: int64


In [None]:
'''
Video 2- Appending and Concatenating DataFrames
How do .append() and pd.concat() work with DF's? Let's find out!
'''

In [25]:

# Two small census tables keyed by ZIP code.
# ZIP codes stay strings (they are labels, not quantities); populations are integers so they can be
# summed and compared, consistent with the `population` frame built further down in this notebook.
pop1 = pd.DataFrame({
    'Zip Code ZCTA': ['66407', '72732', '50579', '46241'],
    '2010 Census Population': [479, 4716, 2405, 30670],
}).set_index('Zip Code ZCTA')

pop2 = pd.DataFrame({
    'Zip Code ZCTA': ['12776', '76092', '98360', '49464'],
    '2010 Census Population': [2180, 26669, 12221, 27481],
}).set_index('Zip Code ZCTA')


In [26]:
#when appended/concatenated these DFs are stacked row-wise (VERTICALLY): pop2's rows go underneath pop1's
#(originally pop1.append(pop2); DataFrame.append() was removed in pandas 2.0, pd.concat() is the equivalent)
pd.concat([pop1, pop2])


Unnamed: 0_level_0,2010 Census Population
Zip Code ZCTA,Unnamed: 1_level_1
66407,479
72732,4716
50579,2405
46241,30670
12776,2180
76092,26669
98360,12221
49464,27481


In [28]:
#pop1 and pop2 have identical index names and column labels
for frame in (pop1, pop2):
    print(frame.index.name, frame.columns)

Zip Code ZCTA Index(['2010 Census Population'], dtype='object')
Zip Code ZCTA Index(['2010 Census Population'], dtype='object')


In [36]:
#.append() and .concat() preserve the row indices in this case
# Population per ZIP code, indexed by 'Zip Code ZCTA'
population = pd.DataFrame(
    {'2010 Census Population': [322, 130, 40038, 45199]},
    index=pd.Index(['57538', '59916', '37660', '2860'], name='Zip Code ZCTA'),
)
# Unemployment rate and participant count per ZIP code, indexed by 'Zip'
unemployment = pd.DataFrame(
    {
        'unemployment': [0.11, 0.02, 0.33, 0.07],
        'participants': [34447, 4800, 42, 4310],
    },
    index=pd.Index(['2860', '46167', '1097', '80808'], name='Zip'),
)

population


Unnamed: 0_level_0,2010 Census Population
Zip Code ZCTA,Unnamed: 1_level_1
57538,322
59916,130
37660,40038
2860,45199


In [33]:
#display unemployment: indexed by 'Zip', with columns 'unemployment' and 'participants'
unemployment

Unnamed: 0_level_0,unemployment,participants
Zip,Unnamed: 1_level_1,Unnamed: 2_level_1
2860,0.11,34447
46167,0.02,4800
1097,0.33,42
80808,0.07,4310


In [31]:
#population and unemployment use different names for their ZIP-code index
print(f"{population.index.name} {unemployment.index.name}")

Zip Code ZCTA Zip


In [None]:
#Note how 2860 is the only shared index value between population and unemployment

In [37]:
#stack unemployment underneath population: produces a DF with 8 rows and 3 columns
#(originally population.append(unemployment); DataFrame.append() was removed in pandas 2.0.
# sort=True keeps the alphabetical column order that append() produced in the recorded output)
pd.concat([population, unemployment], sort=True)

Unnamed: 0,2010 Census Population,participants,unemployment
57538,322.0,,
59916,130.0,,
37660,40038.0,,
2860,45199.0,,
2860,,34447.0,0.11
46167,,4800.0,0.02
1097,,42.0,0.33
80808,,4310.0,0.07


In [None]:
#columns are the union from the 2 inputted DF's
#top 4 rows are from Population df have NaN values filling participant and unemployment columns
#bottom 4 rows are from unemployment df and have NaN Values filling in 2010 Census Population column
#NaN are inserted because the appended DFs have disjoint column labels
#topRow.append(thisgoesbelow_topRow)

#note how 2860 has 2 rows...
#1st 2860 row comes from population
#2nd 2860 comes from unemployment

In [38]:
#axis=0 means stacking rows vertically: unemployment's rows go underneath population's
#the inputs have different column labels; leaving `sort` unspecified makes older pandas emit a FutureWarning
#and gives a different column order on newer pandas, so sort=True pins the alphabetical column order
pd.concat([population, unemployment], axis=0, sort=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


Unnamed: 0,2010 Census Population,participants,unemployment
57538,322.0,,
59916,130.0,,
37660,40038.0,,
2860,45199.0,,
2860,,34447.0,0.11
46167,,4800.0,0.02
1097,,42.0,0.33
80808,,4310.0,0.07


In [39]:
#note axis=0 (or axis='index' / axis='rows') is optional. It is also the default behavior
#axis=1 or axis='columns' stacks DFs horizontally, on the right
#the inputs have different row labels; sort=True pins the sorted row order shown below and
#avoids the FutureWarning older pandas emits when `sort` is left unspecified

pd.concat([population, unemployment], axis=1, sort=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  after removing the cwd from sys.path.


Unnamed: 0,2010 Census Population,unemployment,participants
1097,,0.33,42.0
2860,45199.0,0.11,34447.0
37660,40038.0,,
46167,,0.02,4800.0
57538,322.0,,
59916,130.0,,
80808,,0.07,4310.0


In [71]:
#horizontally stacked Df has 7 rows and 3 columns
#the rows with index val 2860 gets aligned with concatenating horizontally 
#values get propagated from both unemployment and population DF to fill that row - no more NaN in 2860

#NaNs will populate columns if they are disjointed 
#this is called an 'OUTER JOIN'
''' Video 2 - END'''

' Video 2 - END'

In [107]:
#Ex4 - Appending DataFrames with ignore_index
#Question - what if the concatenation axis lacks meaningful index information?

# The files have no header row, so the columns are labelled 0, 1, 2
names_1881 = pd.read_csv('names1881.csv', header=None)
names_1981 = pd.read_csv('names1981.csv', header=None)
#confirmed matched with DataCamp exercise - NS

# Add 'year' column to names_1881 and names_1981
names_1881['year'] = 1881
names_1981['year'] = 1981 #this is a scalar, will broadcast value throughout


# Stack names_1981 after names_1881 with ignore_index=True: combined_names
# ignore_index=True makes a new RangeIndex of unique integers.
# (originally names_1881.append(names_1981, ignore_index=True); DataFrame.append() was removed in
#  pandas 2.0 and pd.concat() with the same flag is the equivalent)
combined_names = pd.concat([names_1881, names_1981], ignore_index=True)

# Print shapes of names_1981, names_1881, and combined_names
print(names_1981.shape)
print(names_1881.shape)
print(combined_names.shape)

# Give the positional columns readable names
combined_names = combined_names.rename(columns={0: 'name', 1: 'gender', 2: 'count'})

# Print all rows that contain the name 'Morgan'
# Alternative boolean-mask spellings (noted as not accepted by DataCamp's checker):
#myFilter = combined_names.name == 'Morgan'
#print(combined_names.loc[myFilter])
#print(combined_names[combined_names.name=='Morgan'])

# General pattern to remember: df.loc[df['column name'] == 'Value']
print(combined_names.loc[combined_names['name'] == 'Morgan']) #this was accepted!

(19455, 4)
(1935, 4)
(21390, 4)
         name gender  count  year
1283   Morgan      M     23  1881
2096   Morgan      F   1769  1981
14390  Morgan      M    766  1981


In [111]:
#Ex5 - Concatenating pandas DataFrames along the column axis
#pd.concat() can concat DFs horizontally as well as vertically; vertical is the default
#a horizontal concat MUST specify axis=1 or axis='columns'
#GOAL: combine datasets sampled at different rates (quarterly vs monthly). Where one input has no row
# for a label, NaNs are inserted. This corresponds to an 'outer join'

# Maximum temperature for four months, indexed by month abbreviation
weather_max = pd.DataFrame(
    {'Max TemperatureF': [68, 89, 91, 84]},
    index=pd.Index(['Jan', 'Apr', 'Jul', 'Oct'], name='Month'),
)
weather_max

Unnamed: 0_level_0,Max TemperatureF
Month,Unnamed: 1_level_1
Jan,68
Apr,89
Jul,91
Oct,84


In [112]:
# Mean temperature for all twelve months, indexed by month abbreviation (alphabetical order)
weather_mean = pd.DataFrame(
    {
        'Mean Temperature F': [
            53.100000, 70.00000, 34.935484, 28.714286,
            32.354839, 72.870968, 70.133333, 35.00000,
            62.612903, 39.800000, 55.451613, 63.766667,
        ],
    },
    index=pd.Index(
        ['Apr', 'Aug', 'Dec', 'Feb', 'Jan', 'Jul', 'Jun', 'Mar', 'May', 'Nov', 'Oct', 'Sep'],
        name='Month',
    ),
)
weather_mean

Unnamed: 0_level_0,Mean Temperature F
Month,Unnamed: 1_level_1
Apr,53.1
Aug,70.0
Dec,34.935484
Feb,28.714286
Jan,32.354839
Jul,72.870968
Jun,70.133333
Mar,35.0
May,62.612903
Nov,39.8


In [113]:
 # Create a list of weather_max and weather_mean
weather_list = [weather_max, weather_mean]

# Concatenate weather_list horizontally (outer join on the month labels).
# The two inputs do not share the same row labels; sort=True pins the alphabetically sorted
# month order shown in the output and avoids the FutureWarning older pandas emits when
# `sort` is left unspecified.
weather = pd.concat(weather_list, axis=1, sort=True)

# Print weather: months missing from weather_max show NaN in 'Max TemperatureF'
print(weather)

     Max TemperatureF  Mean Temperature F
Apr              89.0           53.100000
Aug               NaN           70.000000
Dec               NaN           34.935484
Feb               NaN           28.714286
Jan              68.0           32.354839
Jul              91.0           72.870968
Jun               NaN           70.133333
Mar               NaN           35.000000
May               NaN           62.612903
Nov               NaN           39.800000
Oct              84.0           55.451613
Sep               NaN           63.766667


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """


In [115]:
#Ex6 - Reading multiple files to build a DataFrame *** VERY IMPORTANT PATTERN - NS ***
#Convenient to parse many files as DFs and concat them all at once
#Goal: concat 3 files together
medal_types = ['bronze', 'silver', 'gold']

#Initialize an empty list: medals
medals = []

for medal in medal_types:
    # Create the file name: file_name
    file_name = "%s_top5.csv" % medal
    # Create list of column names: columns (the value column is named after the medal type)
    columns = ['Country', medal]
    # Read file_name into a DataFrame: medal_df
    # header=0 together with names=columns replaces the file's own header row with these names
    medal_df = pd.read_csv(file_name, header=0, index_col='Country', names=columns)
    # Append medal_df to medals
    medals.append(medal_df)

# Concatenate medals horizontally: medals_df
# The files do not all list the same countries; sort=True pins the alphabetical country order
# shown in the output and avoids the FutureWarning older pandas emits when `sort` is unspecified.
medals_df = pd.concat(medals, axis='columns', sort=True)

# Print medals_df
print(medals_df)
#Note: this is an outer join - a NaN marks a country that is missing from that medal type's file

                bronze  silver    gold
France           475.0   461.0     NaN
Germany          454.0     NaN   407.0
Italy              NaN   394.0   460.0
Soviet Union     584.0   627.0   838.0
United Kingdom   505.0   591.0   498.0
United States   1052.0  1195.0  2088.0


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




In [None]:
'''
Video 3 - Concatenation, keys, and MultiIndexes
'''

In [None]:
#Often want to concat DFs that share common row labels and common column names
#rain2013 and rain2014 share the index [Jan, Feb, Mar] and the column name 'Precipitation'
#(illustrative values, defined here so that this cell runs instead of raising NameError)
rain2013 = pd.DataFrame(
    {'Precipitation': [0.096129, 0.067143, 0.061613]},
    index=['Jan', 'Feb', 'Mar'],
)
rain2014 = pd.DataFrame(
    {'Precipitation': [0.050323, 0.082143, 0.070968]},
    index=['Jan', 'Feb', 'Mar'],
)

pd.concat([rain2013, rain2014], axis = 0) #concat vertically
#single column with repeated row labels. This obscures the fact that the months in the bottom 3 rows are from
# a different year than the months in the top 3 rows

#How to fix? - Use a multi-level index for the rows
rain1314 = pd.concat([rain2013, rain2014], keys=[2013, 2014], axis=0) #keys assigns an outer index label to each
# of the original input DataFrames
#the order of the list of keys must match the order of the list of DataFrames

#a multi-index can be sliced with .loc[a key from keys]
rain1314.loc[2014]

In [None]:
#A different strategy is to concat to the right - 'horizontally' (axis=1 or axis='columns')
#both DFs have a 'Precipitation' column, so the result has two columns with that same name,
#which obscures which column comes from which year

#FIX: use a multi-index on the columns - keys become the outer column level
rain1314 = pd.concat(objs=[rain2013, rain2014], axis=1, keys=[2013, 2014])

In [None]:
#.concat() can also accept a dict (rather than a list) of DFs as input
#the dict keys then play the role of the `keys` argument
rain_dict = dict(zip([2013, 2014], [rain2013, rain2014]))
rain1314 = pd.concat(rain_dict, axis=1) #two 'Precipitation' columns persist, labelled by the dict keys
rain1314 #needs rain2013 and rain2014 to exist in the kernel (NameError otherwise)
''' Video 3 - END'''

In [116]:
#Ex7 - Concatenating vertically to get MultiIndexed Rows
# Read one DataFrame per medal type, in the order given by medal_types
medal_frames = []
for medal in medal_types:

    file_name = "%s_top5.csv" % medal

    # Read file_name into a DataFrame indexed by country: medal_df
    medal_df = pd.read_csv(file_name, index_col='Country')

    # Collect medal_df
    medal_frames.append(medal_df)

# Concatenate vertically: medals
# keys adds the medal type as the outer row-index level. Reusing medal_types (instead of a second
# hard-coded list) guarantees the keys are in the same order as the frames that were read.
medals = pd.concat(medal_frames, keys=medal_types)

# Print medals in entirety
print(medals)

                        Total
       Country               
bronze United States   1052.0
       Soviet Union     584.0
       United Kingdom   505.0
       France           475.0
       Germany          454.0
silver United States   1195.0
       Soviet Union     627.0
       United Kingdom   591.0
       France           461.0
       Italy            394.0
gold   United States   2088.0
       Soviet Union     838.0
       United Kingdom   498.0
       Italy            460.0
       Germany          407.0


In [120]:
#Ex8 - Slicing MultiIndex DataFrames
#GOAL: Using the DF created in Ex7 sort the dataFrame and use pd.IndexSlice to extract specific slices
# Sort the entries of medals: medals_sorted
medals_sorted = medals.sort_index(level=0) #sorts the rows by index label (medal type, then country - see output); it does NOT sort by the values in the Total column
#print(medals_sorted)
# Print the number of Bronze medals won by Germany
print(medals_sorted.loc[('bronze','Germany')]) #.loc[(outermost index, innermost index)] - a tuple selects one row
# Print data about silver medals
print(medals_sorted.loc['silver']) #.loc[outermost index] - returns all rows under that outer label

# Create alias for pd.IndexSlice: idx
idx = pd.IndexSlice #a slicer is required when slicing on the INNER level of a MultiIndex

# Print all the data on medals won by the United Kingdom
print(medals_sorted.loc[idx[:, 'United Kingdom'], :]) #all outer labels, inner label 'United Kingdom', all columns

Total    454.0
Name: (bronze, Germany), dtype: float64
                 Total
Country               
France           461.0
Italy            394.0
Soviet Union     627.0
United Kingdom   591.0
United States   1195.0
                       Total
       Country              
bronze United Kingdom  505.0
gold   United Kingdom  498.0
silver United Kingdom  591.0


In [None]:
#Ex9 - Concatenating Horizontally to get MultiIndexed columns
#NOTE(review): `dataframes` is not defined anywhere in this notebook; presumably it is a list of three DFs
#              (one per key below) supplied by the exercise environment - it must exist before this cell runs.
# Concatenate dataframes: february
february = pd.concat(dataframes, axis='columns', keys=['Hardware', 'Software', 'Service'])

# Summarize february. .info() prints the summary itself and returns None,
# so wrapping it in print() would only add a stray "None" line.
february.info()

# Assign pd.IndexSlice: idx
idx = pd.IndexSlice #we're gonna slice some innermost multi-indexed column
#there are three columns in the nested dfs: 'Company', 'Products', 'Units'

# Create the slice: slice_2_8 (rows Feb 2 - Feb 8, 2015; the 'Company' column under every outer key)
slice_2_8 = february.loc['Feb 2, 2015':'Feb 8, 2015', idx[:, 'Company']]

# Print slice_2_8
print(slice_2_8)

In [122]:

#jan still has 'Date' as its index at this point, so only 3 columns are counted
jan.shape

(20, 3)

In [123]:
#peek at the first two rows: 'Date' is the index; columns are Company, Product, Units
jan.head(2)

Unnamed: 0_level_0,Company,Product,Units
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2015-01-21 19:13:21,Streeplex,Hardware,11
2015-01-09 05:23:51,Streeplex,Service,8


In [124]:
#move 'Date' from the index back into an ordinary column (3 -> 4 columns)
#guarded so that re-running the cell is harmless: an unconditional reset_index() would add
#an extra 'index' column on the second run and raise on the third
if jan.index.name == 'Date':
    jan = jan.reset_index()
jan.shape

(20, 4)

In [125]:
#move 'Date' from the index back into an ordinary column (3 -> 4 columns)
#guarded so that re-running the cell is harmless: an unconditional reset_index() would add
#an extra 'index' column on the second run and raise on the third
if feb.index.name == 'Date':
    feb = feb.reset_index()
feb.shape

(20, 4)

In [126]:
#move 'Date' from the index back into an ordinary column (3 -> 4 columns)
#guarded so that re-running the cell is harmless: an unconditional reset_index() would add
#an extra 'index' column on the second run and raise on the third
if mar.index.name == 'Date':
    mar = mar.reset_index()
mar.shape

(20, 4)

In [133]:
#ok all set for Ex10
#Ex10 = Concatenating DFs from a dict
#GOAL: Aggregate the sum of all sales over the 'Company' column into a single DF
# Construct a dict of these DFs then concat them


# Make the list of tuples: month_list
month_list = [('january', jan), ('february', feb), ('march', mar)]

# Create an empty dictionary: month_dict
month_dict = dict()

for month_name, month_data in month_list:

    # Group month_data by company and total the units sold: month_dict[month_name]
    # 'Units' is selected explicitly: a bare .sum() relies on pandas silently dropping the
    # non-numeric columns (Date, Product), which newer pandas versions no longer do.
    month_dict[month_name] = month_data.groupby('Company')[['Units']].sum()

# Concatenate data in month_dict: sales
# the dict keys (month names) become the outer level of the row MultiIndex
sales = pd.concat(month_dict)

# Print sales
print('\tSALES', '\n', sales)

# Print all sales by Mediacore (inner index level, so an IndexSlice is needed)
idx = pd.IndexSlice
print('\n\tAll Sales by Mediacore', '\n', sales.loc[idx[:, 'Mediacore'], :])

	SALES 
                           Units
         Company               
january  Acme Coporation     76
         Hooli               70
         Initech             37
         Mediacore           15
         Streeplex           50
february Acme Coporation     34
         Hooli               30
         Initech             30
         Mediacore           45
         Streeplex           37
march    Acme Coporation      5
         Hooli               37
         Initech             68
         Mediacore           68
         Streeplex           40

	All Sales by Mediacore 
                     Units
         Company         
january  Mediacore     15
february Mediacore     45
march    Mediacore     68


In [None]:
''' Video 5 - Outer and Inner Joins'''
#Cheat Sheet
#vertically axis= 0
#horizontally axis = 1
#Outer Join - preserves the indices in the original tables filling NaNs for missing rows
#Inner Join - has only index labels common to both tables (like a set intersection)

In [147]:
#how does concat DF and Series work? MATH!!!
import numpy as np
import pandas as pd
#Three small float matrices with different shapes, used below to show which stackings are possible
A = 0.1 + np.arange(8).reshape((2, 4))   #2x4 Matrix
B = 0.2 + np.arange(6).reshape((2, 3))   #2x3 Matrix
C = 0.3 + np.arange(12).reshape((3, 4))  #3x4 Matrix
#display the last one as a check
C

array([[ 0.3,  1.3,  2.3,  3.3],
       [ 4.3,  5.3,  6.3,  7.3],
       [ 8.3,  9.3, 10.3, 11.3]])

In [135]:
#B (2x3) and A (2x4) have the same number of rows, so they can be stacked horizontally into a 2x7 array
np.hstack([B, A]) #input is a list of np arrays

array([[0.2, 1.2, 2.2, 0.1, 1.1, 2.1, 3.1],
       [3.2, 4.2, 5.2, 4.1, 5.1, 6.1, 7.1]])

In [136]:
#np.concatenate([list of np arrays], axis=1) gives the same result as np.hstack: columns are appended horizontally
np.concatenate([B, A], axis=1)

array([[0.2, 1.2, 2.2, 0.1, 1.1, 2.1, 3.1],
       [3.2, 4.2, 5.2, 4.1, 5.1, 6.1, 7.1]])

In [None]:
#NOTE A and B MUST have the same number of rows but the number of columns can differ

In [137]:
#vertical stacking: np.vstack() or np.concatenate([list of np arrays], axis=0)
np.vstack([A, C]) #A (2x4) and C (3x4) have the same number of columns -> 5x4 result

array([[ 0.1,  1.1,  2.1,  3.1],
       [ 4.1,  5.1,  6.1,  7.1],
       [ 0.3,  1.3,  2.3,  3.3],
       [ 4.3,  5.3,  6.3,  7.3],
       [ 8.3,  9.3, 10.3, 11.3]])

In [138]:
np.concatenate([A, C], axis=0) #same result as np.vstack([A, C]); axis=0 is the default so it is optional

array([[ 0.1,  1.1,  2.1,  3.1],
       [ 4.1,  5.1,  6.1,  7.1],
       [ 0.3,  1.3,  2.3,  3.3],
       [ 4.3,  5.3,  6.3,  7.3],
       [ 8.3,  9.3, 10.3, 11.3]])

In [144]:
#np.concatenate raises ValueError when the arrays disagree on any axis OTHER than the concatenation axis
#vertical stacking (axis=0) needs matching column counts: A has 4 columns, B has 3
#the error is caught and printed so the demonstration does not stop a Run All
try:
    np.concatenate([A, B], axis=0)
except ValueError as err:
    print('ValueError:', err)

ValueError: all the input array dimensions except for the concatenation axis must match exactly

In [148]:
#horizontal stacking (axis=1) needs matching row counts: A has 2 rows, C has 3
#the error is caught and printed so the demonstration does not stop a Run All
try:
    np.concatenate([A, C], axis=1)
except ValueError as err:
    print('ValueError:', err)

ValueError: all the input array dimensions except for the concatenation axis must match exactly

In [149]:
#re-display population (indexed by 'Zip Code ZCTA') ahead of the join examples
population

Unnamed: 0_level_0,2010 Census Population
Zip Code ZCTA,Unnamed: 1_level_1
57538,322
59916,130
37660,40038
2860,45199


In [150]:
#re-display unemployment (indexed by 'Zip') ahead of the join examples
unemployment

Unnamed: 0_level_0,unemployment,participants
Zip,Unnamed: 1_level_1,Unnamed: 2_level_1
2860,0.11,34447
46167,0.02,4800
1097,0.33,42
80808,0.07,4310


In [152]:
#The indices of these 2 DFs differ in all but one row: 2860

In [154]:
#np.array() keeps only the values of population; row and column labels are not carried over
population_array = np.array(population)
print(population_array) #note index info is lost
#numpy array of 4x1 is created

[[  322]
 [  130]
 [40038]
 [45199]]


In [155]:
#same conversion for unemployment: values only, both columns end up in one float array
unemployment_array = np.array(unemployment)
print(unemployment_array) #4x2 np array is created - no index info

[[1.1000e-01 3.4447e+04]
 [2.0000e-02 4.8000e+03]
 [3.3000e-01 4.2000e+01]
 [7.0000e-02 4.3100e+03]]


In [None]:
#concatenating these two np arrays is meaningless. All the index and column labels are lost
#We got to join tables: 'meaningfully gluing index rows together' 
#also - 'Combining rows of multiple tables'
#Outer Join - preserves the indices in the original tables filling NaNs for missing rows
    #outer join table has all the indices of the original tables w/o repetition (like a set union)

#Inner Join - has only index labels common to both tables (like a set intersection)

In [156]:
#Concatenation with an inner join
#only row labels present in BOTH tables survive; 2860 is the only shared label, so expect a single row
#its column values are filled in from the corresponding columns of population and unemployment
pd.concat(objs=[population, unemployment], join='inner', axis='columns') #horizontal inner join

Unnamed: 0,2010 Census Population,unemployment,participants
2860,45199,0.11,34447


In [157]:
#Concatenation and Outer Join
#all row labels from both tables appear w/o repetition, so expect only one 2860 row
#sort=True pins the sorted row order shown in the output and avoids the FutureWarning
#older pandas emits when the row labels differ and `sort` is left unspecified
pd.concat([population, unemployment], axis=1, join='outer', sort=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


Unnamed: 0,2010 Census Population,unemployment,participants
1097,,0.33,42.0
2860,45199.0,0.11,34447.0
37660,40038.0,,
46167,,0.02,4800.0
57538,322.0,,
59916,130.0,,
80808,,0.07,4310.0


In [159]:
#Inner join along the other axis (axis=0)

#a vertical inner join keeps only the column labels shared by both inputs;
#population and unemployment share none, so the result has rows but no columns
test = pd.concat(objs=[population, unemployment], join='inner', axis='index') #vertical inner join

In [160]:
#no column label survives the vertical inner join
test.columns

Index([], dtype='object')

In [161]:
#all 8 row labels are kept, including the duplicated '2860'
test.index

Index(['57538', '59916', '37660', '2860', '2860', '46167', '1097', '80808'], dtype='object')

In [163]:
#the result is still a DataFrame, even with zero columns
type(test)

pandas.core.frame.DataFrame

In [None]:
''' Video 5 - END'''

In [175]:
idx = pd.IndexSlice
#set-up for Ex11: pull each medal type's rows out of the row-MultiIndexed medals frame
#(this needs medals as built in Ex7, i.e. before Ex11 rebinds the name)
bronze, silver, gold = (
    medals.loc[idx[medal], :]
    for medal in ('bronze', 'silver', 'gold')
)

Unnamed: 0_level_0,Total
Country,Unnamed: 1_level_1
United States,2088.0
Soviet Union,838.0
United Kingdom,498.0
Italy,460.0
Germany,407.0


In [179]:
#Ex11 - Concatenating DFs with inner joins
#GOAL: compute an inner join

# The three per-medal DataFrames, in the same order as the keys below: medal_list
medal_list = [bronze, silver, gold]

# Inner join side by side: only countries present in all three frames are kept,
# and the keys become the outer column level: medals
medals = pd.concat(medal_list, axis='columns', join='inner', keys=['bronze', 'silver', 'gold'])

# Print medals
print(medals)

                bronze  silver    gold
                 Total   Total   Total
Country                               
United States   1052.0  1195.0  2088.0
Soviet Union     584.0   627.0   838.0
United Kingdom   505.0   591.0   498.0


In [206]:
# Load both GDP files with their date column parsed and used as the index
china = pd.read_csv(
    'gdp_china.csv',
    parse_dates=True,
    index_col='Year',
)
us = pd.read_csv(
    'gdp_usa.csv',
    parse_dates=True,
    index_col='DATE',
)
#my csv files seem to have an extra row at position 0, so my calcs will be off from DataCamp's

In [207]:
# Rename china's 'GDP' column to 'China' (in place).
china.rename(columns={'GDP': 'China'}, inplace=True)
china.head()  # not displayed: only the last expression of a cell is shown
#china.shape
us.shape

(278, 1)

In [198]:
# Drop the first row of china (the extra leading row noted when the CSV was loaded).
# NOTE(review): not idempotent - every run removes one more row, and reloading china from the CSV
#               brings the extra row back, so this cell must run exactly once after each load.
china.drop(china.index[0], inplace=True)
china.shape

(55, 1)

In [199]:
# check the first rows after the drop
china.head()
#china is wrangled to match DataCamp NS

Unnamed: 0_level_0,China
Year,Unnamed: 1_level_1
1961-01-01,49.55705
1962-01-01,46.685179
1963-01-01,50.097303
1964-01-01,59.062255
1965-01-01,69.709153


In [209]:
# only us.shape (the last expression) is displayed; the head() result is discarded
us.head() #column 'VALUE' must be renamed to US and the first row .index[0] must be dropped - NS
us.shape

(278, 1)

In [212]:
# Rename 'VALUE' to 'US' and drop the first row of us (both in place).
# NOTE(review): the drop is not idempotent - every run removes one more row, and reloading us from
#               the CSV undoes both steps, so this cell must run exactly once after each load.
us.rename(columns={'VALUE':'US'}, inplace=True)
us.drop(us.index[0], inplace=True)
us.head()  # not displayed: only the last expression of a cell is shown
us.shape
#us is wrangled to match DataCamp NS

(277, 1)

In [213]:
#Ex12 - Resampling & Concatenating DFs with inner join
#Note US starts in 1947 and is recorded quarterly
#Note China starts in 1961 and is recorded annually

#Use a combo of resampling and inner join to align index labels
#offset aliases: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html
#NOTE(review): newer pandas releases rename the year-end alias 'A' to 'YE' - confirm on the target version

#Spec - method-chain .resample() with .last() and .pct_change()


def _decade_pct_change(frame):
    """Take the last observation of each calendar year, then the percent change over 10 years."""
    yearly = frame.resample('A').last()
    return yearly.pct_change(10).dropna()


# Resample and tidy china: china_annual
china_annual = _decade_pct_change(china)

# Resample and tidy us: us_annual
us_annual = _decade_pct_change(us)

# Keep only the year-end dates present in both results: gdp
gdp = pd.concat([china_annual, us_annual], join='inner', axis=1)

# Resample gdp and print
print(gdp.resample('10A').last()) #10A means every decade
#Note my resampling does not match DataCamp's... somewhere the data was off

               China        US
Year                          
1970-12-31  0.546128  1.017187
1980-12-31  1.072537  1.742556
1990-12-31  0.892820  1.012126
2000-12-31  2.357522  0.738632
2010-12-31  4.011081  0.454332
2020-12-31  3.789936  0.361780
