# NumPy Broadcasting

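Two arrays can be broadcast together if their shapes are compatible: comparing the shapes dimension by dimension, starting from the last dimension, each pair of dimensions must either be equal or one of them must be 1 (the size-1 dimension is stretched to match the other).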

In [1]:
import numpy as np

In [3]:
# A is a 2x2 matrix; B is a single row stored as a two-dimensional 1x2 array.
A = np.array([
    [1, 2],
    [3, 4],
])
B = np.array([[2, 5]])

In [5]:
print(B.shape)
print(A.shape)

(1, 2)
(2, 2)


In [7]:
A * B # B's single row is reused for every row of A (1-based indices below):
      #   A(1,1)*B(1,1)   A(1,2)*B(1,2)
      #   A(2,1)*B(1,1)   A(2,2)*B(1,2)

array([[ 2, 10],
       [ 6, 20]])

In [8]:
np.ones((2, 2))

array([[ 1.,  1.],
       [ 1.,  1.]])

In [9]:
np.zeros((2,3))

array([[ 0.,  0.,  0.],
       [ 0.,  0.,  0.]])

In [10]:
np.eye(3)

array([[ 1.,  0.,  0.],
       [ 0.,  1.,  0.],
       [ 0.,  0.,  1.]])

# Reshaping arrays

In [13]:
A.reshape((4, 1)) # returns the four elements of the 2x2 array A as a 4x1 column, read row by row

array([[1],
       [2],
       [3],
       [4]])

In [14]:
A #reshape does not change original array

array([[1, 2],
       [3, 4]])

In [15]:
D = np.tile(A, (1,1,4))

In [17]:
D # A is treated as shape (1, 2, 2) and repeated 4 times along the last axis, giving shape (1, 2, 8)

array([[[1, 2, 1, 2, 1, 2, 1, 2],
        [3, 4, 3, 4, 3, 4, 3, 4]]])

In [18]:
D.shape

(1, 2, 8)

In [19]:
# Give each element of A its own trailing axis of length 1 (shape (2, 2, 1)),
# then repeat 4 times along that axis so every element becomes a run of 4 copies.
D = np.tile(A.reshape((2,2,1)), (1,1,4))

In [21]:
print(D)
D.shape

[[[1 1 1 1]
  [2 2 2 2]]

 [[3 3 3 3]
  [4 4 4 4]]]


(2, 2, 4)

# Pandas

In [23]:
import pandas as pd

In [24]:
# Raw data as a dictionary of equal-length columns (column name -> list of values).
data = {
    'school': ['Texas', 'Texas', 'Texas', 'UGA', 'UGA'],
    'year': [2014, 2015, 2016, 2015, 2016],
    'wins': [6, 5, 5, 10, 8],
}

In [25]:
data

{'school': ['Texas', 'Texas', 'Texas', 'UGA', 'UGA'],
 'wins': [6, 5, 5, 10, 8],
 'year': [2014, 2015, 2016, 2015, 2016]}

In [26]:
df = pd.DataFrame(data)

In [27]:
df

Unnamed: 0,school,wins,year
0,Texas,6,2014
1,Texas,5,2015
2,Texas,5,2016
3,UGA,10,2015
4,UGA,8,2016


In [29]:
df.columns #list of col names

Index(['school', 'wins', 'year'], dtype='object')

In [30]:
df.columns.tolist()

['school', 'wins', 'year']

In [33]:
df['wins']

0     6
1     5
2     5
3    10
4     8
Name: wins, dtype: int64

In [34]:
df.loc[4]

school     UGA
wins         8
year      2016
Name: 4, dtype: object

In [37]:
df.loc[0:2]

Unnamed: 0,school,wins,year
0,Texas,6,2014
1,Texas,5,2015
2,Texas,5,2016


In [39]:
df.groupby(['school'])['wins'].sum()

school
Texas    16
UGA      18
Name: wins, dtype: int64

In [42]:
df.groupby(['school'])['wins'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
school,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Texas,3.0,5.333333,0.57735,5.0,5.0,5.0,5.5,6.0
UGA,2.0,9.0,1.414214,8.0,8.5,9.0,9.5,10.0


In [44]:
df['wins squared'] = df['wins'] ** 2
df

Unnamed: 0,school,wins,year,wins squared
0,Texas,6,2014,36
1,Texas,5,2015,25
2,Texas,5,2016,25
3,UGA,10,2015,100
4,UGA,8,2016,64


In [45]:
df.head(n=3)

Unnamed: 0,school,wins,year,wins squared
0,Texas,6,2014,36
1,Texas,5,2015,25
2,Texas,5,2016,25


In [53]:
# Rename columns using a mapping of the form {old_name: new_name, ...}.
# rename() returns a new DataFrame; rebinding `df` to the result (instead of
# passing inplace=True) gives the same renamed `df` without mutating the
# frame that earlier cells displayed.
df = df.rename(columns={"wins": "Wins", "wins squared": "wins sq"})

In [54]:
df

Unnamed: 0,school,Wins,year,wins sq
0,Texas,6,2014,36
1,Texas,5,2015,25
2,Texas,5,2016,25
3,UGA,10,2015,100
4,UGA,8,2016,64


# Reading data

In [2]:
ls

[0m[01;32mBuiltinTypes.ipynb[0m*    [01;32mNotes.pdf[0m*             [01;32mPythonNumpyPandas.ipynb[0m*
[01;32mComp_Econ_8-31.ipynb[0m*  [01;32mNumpyIntro.pdf[0m*        [01;32mPythonReadIn.ipynb[0m*
[01;32mCompEcon_9-5.ipynb[0m*    [01;32mOOP.pdf[0m*               [01;32mPythonReshape.ipynb[0m*
[34;42mDataFiles[0m/             [01;32mPS2.pdf[0m*               [01;32mREADME.md[0m*
[01;32mDataFunctions.ipynb[0m*   [01;32mPythonBasics.ipynb[0m*    [01;32mStandardLibrary.pdf[0m*
[01;32mfibo.py[0m*               [01;32mPythonDescribe.ipynb[0m*
[01;32mkisa_2015.csv[0m*         [01;32mPythonFuncs.ipynb[0m*


In [10]:
#kisa_2015.csv
import csv

# Read the CSV into a list of lists, where each element is one row of the file.
# The `with` block closes the file handle as soon as reading is done, and
# newline='' is what the csv module expects so that it can handle line
# endings (including newlines inside quoted fields) itself.
with open('kisa_2015.csv', newline='') as csv_file:
    kisa_data = list(csv.reader(csv_file))
len(kisa_data)

96

In [14]:
kisa_data[1][3] #row 1, column 3...note even numerical data are strings

'57'

In [20]:
# Load the CSV into a NumPy array of floats.
import numpy as np
kisa_data = np.genfromtxt('kisa_2015.csv', delimiter = ",", skip_header = 1)# skip_header = 1 skips the first row (presumably the column names)

In [21]:
kisa_data[:5] # first five rows; values are parsed as floats, and missing or non-numeric entries (e.g. strings) show up as nan

array([[  1.20000000e+01,   4.20000000e+01,   5.00000000e+00,
          5.70000000e+01,   4.00000000e+00,   1.00000000e+00,
          1.40000000e+01,   4.00000000e+01,   1.00000000e+00,
          5.70000000e+01,   4.93400000e+04,   2.00000000e+00,
          1.50000000e+01,  -1.00000000e+00,   1.00000000e+00,
          2.01400000e+03,   4.00000000e+00,   1.00000000e+00,
          3.03298119e+03,   8.19000000e+03,   1.00000000e+01,
          8.19000000e+03,   1.00000000e+01,              nan,
          2.01500000e+03,   1.00000000e+00,   0.00000000e+00,
                     nan,   4.00000000e+01,   4.00000000e+01,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   2.69172442e+02,
          2.70433824e+02],
       [  1.20000000e+01,   3.90000000e+01,   7.00000000e+00,
          2.60000000e+01,   4.00000000e+00,   1.00000000e+00,
          1.40000000e+01,   4.00000000e+01,   1.00000000e+00,
          5.70000000e+01,   4.93400000e+04,

In [25]:
np.mean(kisa_data[:, 3]) #mean of column 3, all rows

43.178947368421049

In [27]:
import pandas as pd
#pandas
kisa_data = pd.read_csv('kisa_2015.csv')

In [30]:
kisa_data.head() #can handle strings, numerical, etc. missing values will be NaN

Unnamed: 0,month,grdatn,marstat,age,class,region,state,hours,mlr,natvty,...,homeown,hoursu1b,hoursu1b_t1,se15u,se15u_t1,ent015u,ent015ua,vet,wgtat,wgtat1
0,12,42,5,57,4,1,14,40,1,57,...,,40,40,0,0,0.0,0.0,0,269.172442,270.433824
1,12,39,7,26,4,1,14,40,1,57,...,,40,40,0,0,0.0,0.0,0,403.023478,404.912105
2,12,41,1,43,4,2,41,46,1,110,...,,46,40,0,0,0.0,0.0,0,402.790075,404.677609
3,12,39,1,38,4,2,41,40,1,57,...,,40,30,0,0,0.0,0.0,0,342.934489,344.541531
4,12,42,1,51,-1,3,58,-1,6,57,...,,-1,-1,0,0,0.0,0.0,0,560.224448,562.849743


In [31]:
kisa_data.describe()

Unnamed: 0,month,grdatn,marstat,age,class,region,state,hours,mlr,natvty,...,homeown,hoursu1b,hoursu1b_t1,se15u,se15u_t1,ent015u,ent015ua,vet,wgtat,wgtat1
count,95.0,95.0,95.0,95.0,95.0,95.0,95.0,95.0,95.0,95.0,...,0.0,95.0,95.0,95.0,95.0,87.0,85.0,95.0,95.0,95.0
mean,12.0,40.284211,2.568421,43.178947,2.810526,3.210526,69.084211,27.673684,2.557895,88.473684,...,,27.315789,27.221053,0.073684,0.084211,0.0,0.0,0.084211,320.829357,322.332811
std,0.0,2.562675,2.434908,11.816846,2.531902,0.921317,23.262454,21.014403,2.499854,79.168799,...,,20.111531,20.316212,0.262642,0.279177,0.0,0.0,0.279177,127.035039,127.630344
min,12.0,34.0,1.0,20.0,-1.0,1.0,14.0,-1.0,1.0,57.0,...,,-1.0,-1.0,0.0,0.0,0.0,0.0,0.0,40.806662,40.997888
25%,12.0,39.0,1.0,33.5,-1.0,3.0,63.0,-1.0,1.0,57.0,...,,-1.0,-1.0,0.0,0.0,0.0,0.0,0.0,244.629113,245.775481
50%,12.0,40.0,1.0,44.0,4.0,3.0,71.0,40.0,1.0,57.0,...,,40.0,40.0,0.0,0.0,0.0,0.0,0.0,351.493054,353.140203
75%,12.0,43.0,5.0,52.0,4.0,4.0,86.0,41.5,5.5,57.0,...,,40.0,40.0,0.0,0.0,0.0,0.0,0.0,405.081462,406.979734
max,12.0,46.0,7.0,64.0,7.0,4.0,94.0,65.0,7.0,364.0,...,,65.0,70.0,1.0,1.0,0.0,0.0,1.0,600.702105,603.517084


In [32]:
kisa_data['age'].mean()

43.17894736842105

In [37]:
import pickle

# Pickle writes bytes, so the file must be opened in binary mode ('wb').
# The `with` block guarantees the data is flushed and the handle is closed
# before the file is read back.
with open('kisa_df.pkl', 'wb') as pkl_file:
    pickle.dump(kisa_data, pkl_file)

In [35]:
# Read the pickled object back ('rb' = read binary); the `with` block closes the file afterwards.
# Only unpickle files you trust: pickle.load can execute arbitrary code.
with open('kisa_df.pkl', 'rb') as pkl_file:
    kisa2 = pickle.load(pkl_file)

In [36]:
kisa2

Unnamed: 0,month,grdatn,marstat,age,class,region,state,hours,mlr,natvty,...,homeown,hoursu1b,hoursu1b_t1,se15u,se15u_t1,ent015u,ent015ua,vet,wgtat,wgtat1
0,12,42,5,57,4,1,14,40,1,57,...,,40,40,0,0,0.0,0.0,0,269.172442,270.433824
1,12,39,7,26,4,1,14,40,1,57,...,,40,40,0,0,0.0,0.0,0,403.023478,404.912105
2,12,41,1,43,4,2,41,46,1,110,...,,46,40,0,0,0.0,0.0,0,402.790075,404.677609
3,12,39,1,38,4,2,41,40,1,57,...,,40,30,0,0,0.0,0.0,0,342.934489,344.541531
4,12,42,1,51,-1,3,58,-1,6,57,...,,-1,-1,0,0,0.0,0.0,0,560.224448,562.849743
5,12,42,1,50,4,3,58,50,1,57,...,,50,60,0,0,0.0,0.0,0,404.438860,406.334120
6,12,39,7,33,4,1,16,-1,4,57,...,,-1,-1,0,0,0.0,0.0,0,411.947146,413.877591
7,12,40,4,52,5,1,16,42,1,57,...,,40,10,0,0,0.0,,0,372.723136,374.469772
8,12,43,1,41,4,4,81,30,1,57,...,,30,30,0,0,0.0,0.0,0,40.806662,40.997888
9,12,39,1,43,6,4,81,40,1,57,...,,40,40,1,1,,,0,46.816138,47.035525


# Functions

In [40]:
def product(val1, val2):
    """Return val1 multiplied by val2."""
    return val1 * val2

In [41]:
product(5, 10)

50

In [44]:
# Lambda (anonymous) function: the parameters are listed before the colon, and
# the single expression after it is the return value (here, the product of x and y).
f = lambda x, y: x * y
f(4, 5)

20