# Numpy & Pandas basics

## Numpy
1. Numpy arrays
2. Basic operations

## Pandas
1. Loading tabular data into dataframe
2. Prevewing data
3. Selecting & Querying
4. Modifying dataframe
5. Group by

# Numpy

In [1]:
import numpy as np

## 1. Numpy arrays

In [2]:
# 1-dimensional numpy array
a_1d = np.array([1,2,3]) # slightly different from column vector & row vector
print(a_1d)
print(type(a_1d))
print(a_1d.shape)
print(a_1d[0], a_1d[-1])

a_1d[2] = 9
print(a_1d)

# 2-dimensional numpy array
a_2d = np.array([[1,2,3]]) # you can regard it as a row vector
print(a_2d)
print(type(a_2d))
print(a_2d.shape)

b_2d = np.array([[1],[2],[3]]) # you can regard it as a column vector
print(b_2d)
print(b_2d.shape)

c_2d = np.array([[1,2,3], [5,6,7]]) # you can regard it as a matrix
print(c_2d)
print(c_2d.shape)
print(c_2d[1,2])

[1 2 3]
<class 'numpy.ndarray'>
(3,)
1 3
[1 2 9]
[[1 2 3]]
<class 'numpy.ndarray'>
(1, 3)
[[1]
 [2]
 [3]]
(3, 1)
[[1 2 3]
 [5 6 7]]
(2, 3)
7


In [3]:
print(a_2d)
print('-----------------')
print(a_2d.T)
print('-----------------')
print(a_2d.reshape( [3,1] ))
print('-----------------')
print('-----------------')
print(c_2d)
print('-----------------')
print(c_2d.T)
print('-----------------')
print(c_2d.reshape(-1))
print('-----------------')
print(c_2d.reshape( [3,2] ))

[[1 2 3]]
-----------------
[[1]
 [2]
 [3]]
-----------------
[[1]
 [2]
 [3]]
-----------------
-----------------
[[1 2 3]
 [5 6 7]]
-----------------
[[1 5]
 [2 6]
 [3 7]]
-----------------
[1 2 3 5 6 7]
-----------------
[[1 2]
 [3 5]
 [6 7]]


In [4]:
d_2d = np.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]])

print(d_2d)
print('-----------------')
print(d_2d.sum(axis=0))
print(d_2d.sum(axis=1))
print('-----------------')
print(d_2d[:2, 3])
print('-----------------')
print(d_2d>6)
print('-----------------')
print(d_2d[ d_2d>6   ])
print('-----------------')
d_2d[d_2d>6] = 100
print(d_2d)

[[ 1  2  3  4]
 [ 5  6  7  8]
 [ 9 10 11 12]]
-----------------
[15 18 21 24]
[10 26 42]
-----------------
[4 8]
-----------------
[[False False False False]
 [False False  True  True]
 [ True  True  True  True]]
-----------------
[ 7  8  9 10 11 12]
-----------------
[[  1   2   3   4]
 [  5   6 100 100]
 [100 100 100 100]]


## 2. Basic operations

In [5]:
# element-wise operation

x = np.array([[1, 2], [3, 4]])
y = np.array([[5, 6], [7, 8]])
print(x)
print(y)
print('add -----------------')
print( x + y )
print(np.add(x, y))
print('subtract -----------------')
print( x - y )
print(np.subtract(x, y))
print('multiply -----------------')
print( x * y )
print(np.multiply(x, y))
print('divide -----------------')
print( x / y )
print(np.divide(x, y))
print('power&sqrt -----------------')
print( x ** y )
print(np.sqrt(x))

[[1 2]
 [3 4]]
[[5 6]
 [7 8]]
add -----------------
[[ 6  8]
 [10 12]]
[[ 6  8]
 [10 12]]
subtract -----------------
[[-4 -4]
 [-4 -4]]
[[-4 -4]
 [-4 -4]]
multiply -----------------
[[ 5 12]
 [21 32]]
[[ 5 12]
 [21 32]]
divide -----------------
[[0.2        0.33333333]
 [0.42857143 0.5       ]]
[[0.2        0.33333333]
 [0.42857143 0.5       ]]
power&sqrt -----------------
[[    1    64]
 [ 2187 65536]]
[[1.         1.41421356]
 [1.73205081 2.        ]]


In [6]:
# matrix multiplication
X = np.array([[1, 2], [3, 4]])
Y = np.array([[5, 6], [7, 8]])
print(X)
print(Y)

print('XY -----------------')
print(np.matmul(X,Y))
print('YX -----------------')
print(np.matmul(Y,X))

[[1 2]
 [3 4]]
[[5 6]
 [7 8]]
XY -----------------
[[19 22]
 [43 50]]
YX -----------------
[[23 34]
 [31 46]]


In [7]:
# (+Optional)Broadcasting
d_2d = np.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]])

v = np.array([4,4,4,4])
z = np.array([3,3,3]).reshape([3,1])
print(d_2d)
print('--------------')
print(v)
print(z)
print('d_2d + v --------------')
print(d_2d + v)
print('d_2d + z --------------')
print(d_2d + z)

[[ 1  2  3  4]
 [ 5  6  7  8]
 [ 9 10 11 12]]
--------------
[4 4 4 4]
[[3]
 [3]
 [3]]
d_2d + v --------------
[[ 5  6  7  8]
 [ 9 10 11 12]
 [13 14 15 16]]
d_2d + z --------------
[[ 4  5  6  7]
 [ 8  9 10 11]
 [12 13 14 15]]


In [8]:
# Numpy array to list
x = np.array([1,2,3,4,5,6,7,8])
print(x)
print(type(x))
print(' array-->list ')
z = x.tolist()
print(z)
print(type(z))


[1 2 3 4 5 6 7 8]
<class 'numpy.ndarray'>
 array-->list 
[1, 2, 3, 4, 5, 6, 7, 8]
<class 'list'>


# Pandas

In [9]:
import pandas as pd

## 1. Loading tabular data into dataframe

In [10]:
url = 'https://raw.githubusercontent.com/RayleighKim/Example_datasets/master/Graduate_apply.csv'

df = pd.read_csv(url)

df.head()

Unnamed: 0,admit,gre,gpa,rank
0,0,380,3.61,3
1,1,660,3.67,3
2,1,800,4.0,1
3,1,640,3.19,4
4,0,520,2.93,4


## 2. Previewing data

In [11]:
# Show first 5 rows
df.head()

Unnamed: 0,admit,gre,gpa,rank
0,0,380,3.61,3
1,1,660,3.67,3
2,1,800,4.0,1
3,1,640,3.19,4
4,0,520,2.93,4


In [12]:
# Show last 3 rows
df.tail(3)

Unnamed: 0,admit,gre,gpa,rank
397,0,460,2.63,2
398,0,700,3.65,2
399,0,600,3.89,3


In [13]:
# show the shape of a dataframe
df.shape

(400, 4)

In [14]:
# show the names of columns
df.columns

Index(['admit', 'gre', 'gpa', 'rank'], dtype='object')

In [15]:
# show a concise summary of a dataframe
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   admit   400 non-null    int64  
 1   gre     400 non-null    int64  
 2   gpa     400 non-null    float64
 3   rank    400 non-null    int64  
dtypes: float64(1), int64(3)
memory usage: 12.6 KB


In [16]:
# show basic descriptive statistics
df.describe()

Unnamed: 0,admit,gre,gpa,rank
count,400.0,400.0,400.0,400.0
mean,0.3175,587.7,3.3899,2.485
std,0.466087,115.516536,0.380567,0.94446
min,0.0,220.0,2.26,1.0
25%,0.0,520.0,3.13,2.0
50%,0.0,580.0,3.395,2.0
75%,1.0,660.0,3.67,3.0
max,1.0,800.0,4.0,4.0


In [17]:
# Dataframe --> numpy array
df.values

array([[  0.  , 380.  ,   3.61,   3.  ],
       [  1.  , 660.  ,   3.67,   3.  ],
       [  1.  , 800.  ,   4.  ,   1.  ],
       ...,
       [  0.  , 460.  ,   2.63,   2.  ],
       [  0.  , 700.  ,   3.65,   2.  ],
       [  0.  , 600.  ,   3.89,   3.  ]])

## 3. Selecting & Querying

In [18]:
# select specific columns
# df['admit']  # this generate pandas 'series', we don't use this 
df[['admit']]

Unnamed: 0,admit
0,0
1,1
2,1
3,1
4,0
...,...
395,0
396,0
397,0
398,0


In [19]:
df[['admit', 'gre']]

Unnamed: 0,admit,gre
0,0,380
1,1,660
2,1,800
3,1,640
4,0,520
...,...,...
395,0,620
396,0,560
397,0,460
398,0,700


In [20]:
# integer location
print(df.iloc[1])
print('-----------------')
print(df.iloc[1:3])
print('-----------------')
print(df.iloc[1:3, 1])
print('-----------------')
print(df.iloc[1:3, 1:3])

admit      1.00
gre      660.00
gpa        3.67
rank       3.00
Name: 1, dtype: float64
-----------------
   admit  gre   gpa  rank
1      1  660  3.67     3
2      1  800  4.00     1
-----------------
1    660
2    800
Name: gre, dtype: int64
-----------------
   gre   gpa
1  660  3.67
2  800  4.00


In [21]:
# location I
df.loc[1:3, ['gre', 'rank']]

Unnamed: 0,gre,rank
1,660,3
2,800,1
3,640,4


In [22]:
# location II
condition = df['gre'] > 780
df.loc[ condition, ['gre', 'rank'] ].head()

Unnamed: 0,gre,rank
2,800,1
10,800,4
18,800,2
25,800,1
33,800,3


In [23]:
# location III
condition = df['gre'].isin([710,720,730,740])
df.loc[ condition, ['gre', 'rank'] ].head()

Unnamed: 0,gre,rank
52,740,4
55,740,3
66,740,4
74,720,4
75,720,3


## 4. Modifying data frames

In [24]:
df_origin = df.copy()

In [26]:
print(df.head(1))

# modifying specific value
df.loc[0, 'gre'] = 780

print(df.head(1))

   admit  rank
0      0     3
   admit  rank    gre
0      0     3  780.0


In [25]:
df = df_origin
# drop specific columns
df = df.drop(['gre', 'gpa'], axis=1)
df.head()

Unnamed: 0,admit,rank
0,0,3
1,1,3
2,1,1
3,1,4
4,0,4


In [27]:
df = df_origin
# making a new column
df['gregpa'] = df['gre']/800 + df['gpa']/4
df.head()

Unnamed: 0,admit,gre,gpa,rank,gregpa
0,0,380,3.61,3,1.3775
1,1,660,3.67,3,1.7425
2,1,800,4.0,1,2.0
3,1,640,3.19,4,1.5975
4,0,520,2.93,4,1.3825


In [28]:
# Dummy variable

df = df_origin
df = pd.get_dummies(df, columns=['rank'])
print(df.head())

df = df_origin
df = pd.get_dummies(df, columns=['rank'], drop_first=True)
print(df.head())

   admit  gre   gpa  gregpa  rank_1  rank_2  rank_3  rank_4
0      0  380  3.61  1.3775       0       0       1       0
1      1  660  3.67  1.7425       0       0       1       0
2      1  800  4.00  2.0000       1       0       0       0
3      1  640  3.19  1.5975       0       0       0       1
4      0  520  2.93  1.3825       0       0       0       1
   admit  gre   gpa  gregpa  rank_2  rank_3  rank_4
0      0  380  3.61  1.3775       0       1       0
1      1  660  3.67  1.7425       0       1       0
2      1  800  4.00  2.0000       0       0       0
3      1  640  3.19  1.5975       0       0       1
4      0  520  2.93  1.3825       0       0       1


## 5. Group by

In [29]:
df = pd.read_csv(url)
df.head()

Unnamed: 0,admit,gre,gpa,rank
0,0,380,3.61,3
1,1,660,3.67,3
2,1,800,4.0,1
3,1,640,3.19,4
4,0,520,2.93,4


In [30]:
# group by operation
temp = df.groupby(by=['rank'], as_index = False)['gre'].mean()

print(temp)

temp.columns = ['rank', 'avg(gre)']

print(temp)

   rank         gre
0     1  611.803279
1     2  596.026490
2     3  574.876033
3     4  570.149254
   rank    avg(gre)
0     1  611.803279
1     2  596.026490
2     3  574.876033
3     4  570.149254


In [31]:
# group by operation
temp = df.groupby(by=['rank'], as_index = False)['gre', 'gpa'].mean()

temp.columns = ['rank', 'avg(gre)', 'avg(gpa)']

temp

  


Unnamed: 0,rank,avg(gre),avg(gpa)
0,1,611.803279,3.453115
1,2,596.02649,3.361656
2,3,574.876033,3.432893
3,4,570.149254,3.318358
