# A Numpy and Pandas introduction

## Numpy
Library for n-dimensional arrays, array mathematics, etc.

Each array has a single data type.

In [1]:
import numpy as np

In [6]:
# One-dimensional arrays: ten ones, and ten random integers drawn from 1..100
# (the upper bound 101 passed to randint is exclusive).
one_d_array = np.ones(10)
randint_one_d_array = np.random.randint(low=1, high=101, size=10)

# Print each array followed by its dtype, with a blank line between the two.
print(one_d_array, one_d_array.dtype, sep='\n')
print()
print(randint_one_d_array, randint_one_d_array.dtype, sep='\n')

[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
float64

[ 3 41 73 36 66 95 31  3 46 56]
int64


In [7]:
# Two-dimensional array: a 5x5 grid of ones.
two_d_array = np.ones((5, 5))
print(two_d_array)

[[1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1.]]


In [8]:
# Aside: arrays can have more than two dimensions.
# (3,) * k builds a shape tuple of k threes, e.g. (3, 3, 3) for k = 3.
print(f'3D Array:\n{np.array2string(np.ones((3,) * 3))}')
print()
print(f'4D Array:\n{np.array2string(np.ones((3,) * 4))}')

3D Array:
[[[1. 1. 1.]
  [1. 1. 1.]
  [1. 1. 1.]]

 [[1. 1. 1.]
  [1. 1. 1.]
  [1. 1. 1.]]

 [[1. 1. 1.]
  [1. 1. 1.]
  [1. 1. 1.]]]

4D Array:
[[[[1. 1. 1.]
   [1. 1. 1.]
   [1. 1. 1.]]

  [[1. 1. 1.]
   [1. 1. 1.]
   [1. 1. 1.]]

  [[1. 1. 1.]
   [1. 1. 1.]
   [1. 1. 1.]]]


 [[[1. 1. 1.]
   [1. 1. 1.]
   [1. 1. 1.]]

  [[1. 1. 1.]
   [1. 1. 1.]
   [1. 1. 1.]]

  [[1. 1. 1.]
   [1. 1. 1.]
   [1. 1. 1.]]]


 [[[1. 1. 1.]
   [1. 1. 1.]
   [1. 1. 1.]]

  [[1. 1. 1.]
   [1. 1. 1.]
   [1. 1. 1.]]

  [[1. 1. 1.]
   [1. 1. 1.]
   [1. 1. 1.]]]]


In [10]:
def _show(label, array):
    """Print ``array`` under ``label``, then its shape on a second line."""
    print(f'{label}:\n{np.array2string(array)}')
    print(f'{label} shape: {array.shape}')


# Start from a flat array of 15 ones.
reshape_me = np.ones(15)
_show('Original', reshape_me)
print()

# Both dimensions are given explicitly.
reshaped = reshape_me.reshape(3, 5)
_show('1st reshape', reshaped)
print()

# -1 as the second dimension: NumPy works out its size.
reshaped = reshape_me.reshape(3, -1)
_show('2nd reshape', reshaped)
print()

# -1 as the first dimension: NumPy works out its size.
reshaped = reshape_me.reshape(-1, 5)
_show('3rd reshape', reshaped)

Original:
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
Original shape: (15,)

Original:
[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]]
Original shape: (1, 15)

1st reshape:
[[1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1.]]
1st reshape shape: (3, 5)

2nd reshape:
[[1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1.]]
2nd reshape shape: (3, 5)

3rd reshape:
[[1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1.]]
3rd reshape shape: (3, 5)


## Pandas
Library for processing tabular data

In [11]:
import pandas as pd

### Pandas Series
- almost like a NumPy array
- has a single data type
- has a name
- has labels 'instead of' positional indexes

In [12]:
# A Series holding the values 0..4, labelled by name and itself named 'importance'.
pandas_series = pd.Series(
    data=range(5),
    index=['Luke', 'Paul', 'Gustav', 'Ellen', 'Zack'],
    name='importance',
)
print(pandas_series)

Luke      0
Paul      1
Gustav    2
Ellen     3
Zack      4
Name: importance, dtype: int64


### Pandas DataFrame
- 'horizontal stacking' of multiple Series
- all Series share the same labels
- each Series has its own datatype
    - entire DataFrame can contain data of different types

In [13]:
# A 5x7 frame of random integers from 1..10 (randint's upper bound 11 is
# exclusive), with one row label per person and the column labels 'a'..'g'.
pandas_dataframe = pd.DataFrame(
    data=np.random.randint(low=1, high=11, size=(5, 7)),
    index=['Luke', 'Paul', 'Gustav', 'Ellen', 'Zack'],
    columns=list('abcdefg'),
)

# Print the frame, a blank line, then the dtype of every column.
print(pandas_dataframe, pandas_dataframe.dtypes, sep='\n\n')

        a  b  c  d   e  f  g
Luke    6  3  5  8   2  6  3
Paul    9  3  6  4  10  7  9
Gustav  8  6  6  8   6  4  4
Ellen   4  3  4  7   8  6  1
Zack    7  1  9  8   3  7  9

a    int64
b    int64
c    int64
d    int64
e    int64
f    int64
g    int64
dtype: object


In [14]:
# Indexing the frame with a single column label yields a Series.
selected_series = pandas_dataframe['a']
# Print the selected data and, on the next line, its type.
print(selected_series, type(selected_series), sep='\n')

Luke      6
Paul      9
Gustav    8
Ellen     4
Zack      7
Name: a, dtype: int64
<class 'pandas.core.series.Series'>


In [22]:
# When selecting data in order to transform it, say explicitly that all rows
# are wanted: `.loc[:, 'a']` selects every row (`:`) of column 'a'.
selected_series = pandas_dataframe.loc[:, 'a']
# Print the selected data and, on the next line, its type.
print(selected_series, type(selected_series), sep='\n')

Luke      6
Paul      9
Gustav    8
Ellen     4
Zack      7
Name: a, dtype: int64
<class 'pandas.core.series.Series'>


In [27]:
# Indexing the frame with a list of column labels yields a DataFrame.
selected_df = pandas_dataframe[['a', 'b']]
# Print the selected data and, on the next line, its type.
print(selected_df, type(selected_df), sep='\n')

        a  b
Luke    6  3
Paul    9  3
Gustav  8  6
Ellen   4  3
Zack    7  1
<class 'pandas.core.frame.DataFrame'>


In [30]:
# Replace the data of column 'c' with an array of ones. The array length is
# taken from the frame itself rather than hard-coded, so the assignment keeps
# working if the number of rows changes.
pandas_dataframe['c'] = np.ones(len(pandas_dataframe))

# Print the frame, a blank line, then the dtype of every column so the effect
# of the assignment on column 'c' is visible.
print(pandas_dataframe)
print()
print(pandas_dataframe.dtypes)

        a  b    c  d   e  f  g
Luke    6  3  1.0  8   2  6  3
Paul    9  3  1.0  4  10  7  9
Gustav  8  6  1.0  8   6  4  4
Ellen   4  3  1.0  7   8  6  1
Zack    7  1  1.0  8   3  7  9

a      int64
b      int64
c    float64
d      int64
e      int64
f      int64
g      int64
dtype: object
