<span style="color:#bce35b;font-size:35px"><b>Python for Data Analysis</b></span>

<span style="color:#8c8c8c">Patrick Weatherford</span>

[Green]: <> (#bce35b)
[Purple]: <> (#ae8bd5)
[Coral]: <> (#9c4957)
[Grey]: <> (#8c8c8c)

***
***

In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets


In [2]:
# Load the iris bunch and assemble one DataFrame: the four measurement
# columns, then the integer class code, then the class label as a categorical.
iris = datasets.load_iris()
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(
    target=iris.target,
    species=pd.Categorical.from_codes(iris.target, iris.target_names),
)

iris_df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target,species
0,5.1,3.5,1.4,0.2,0,setosa
1,4.9,3.0,1.4,0.2,0,setosa
2,4.7,3.2,1.3,0.2,0,setosa
3,4.6,3.1,1.5,0.2,0,setosa
4,5.0,3.6,1.4,0.2,0,setosa
...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2,virginica
146,6.3,2.5,5.0,1.9,2,virginica
147,6.5,3.0,5.2,2.0,2,virginica
148,6.2,3.4,5.4,2.3,2,virginica


<br>

***

<br>

# <span style="color:#bce35b">NumPy</span>

Package used to efficiently handle single- or multi-dimensional arrays.
- <u>ndarray</u>: Array object for NumPy
- Can perform calculations on whole blocks of data at once, instead of looping over individual scalar elements.

## <span style="color:#ae8bd5">Array vs. List Efficiency</span>

In [3]:
# The same one million integers, held once as an ndarray and once as a plain list.
my_arr = np.arange(1_000_000)
my_list = [*range(1_000_000)]

In [4]:
# Time ten repetitions of doubling every element of my_arr with a single array multiplication.
%time for _ in range(10): my_arr2 = my_arr * 2

Wall time: 21.2 ms


In [5]:
# Time ten repetitions of doubling every element of my_list with a list comprehension.
%time for _ in range(10): my_list2 = [x * 2 for x in my_list]

Wall time: 869 ms


<br>

## <span style="color:#ae8bd5">Basic ndarray object</span>

<img src="Array Creation.png" width="400">

Single Dimension

In [6]:
## build a reproducible 1-D array of ten random floats
rng = np.random.default_rng(777)
rand_array = rng.random(10)

# One argument per report line; the empty strings at both ends reproduce the
# blank lines that frame the report.
print(
    "",
    f"Array: {rand_array}",
    f"Shape (r, c): {rand_array.shape}",
    f"Data Type: {rand_array.dtype}",
    f"# Dimenstions: {rand_array.ndim}",
    "",
    sep="\n",
)



Array: [0.61109393 0.38281659 0.60007053 0.96355787 0.19616256 0.33704142
 0.56847498 0.5395921  0.73604843 0.4308188 ]
Shape (r, c): (10,)
Data Type: float64
# Dimenstions: 1



<br>
Multi-Dimensional

In [7]:
# Four rows of three values: each inner list becomes one row of the 2-D array.
nested_list = [
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9],
    [10, 11, 12],
]
rand_array = np.asarray(nested_list)

# One argument per report line; the empty strings at both ends reproduce the
# blank lines that frame the report.
print(
    "",
    f"Array: {rand_array}",
    f"Shape (r, c): {rand_array.shape}",
    f"Data Type: {rand_array.dtype}",
    f"# Dimenstions: {rand_array.ndim}",
    "",
    sep="\n",
)


Array: [[ 1  2  3]
 [ 4  5  6]
 [ 7  8  9]
 [10 11 12]]
Shape (r, c): (4, 3)
Data Type: int32
# Dimenstions: 2



<br>

## <span style="color:#ae8bd5">Array with Predefined Size</span>

Useful when you need to loop over scalar elements and store calculated values in an array. Instead of dropping and re-creating the array on every iteration, which is inefficient, you can pre-allocate the array at its final size and then overwrite the value at each index with the scalar element.

In [8]:
# Source values to copy, and how many slots the target array needs.
my_list = [1, 2, 3, 4]
n = len(my_list)

# Allocate the target once, zero-filled, at its final length.
my_array = np.zeros(n)

# Overwrite slot i with the i-th list value; no re-allocation inside the loop.
for i, val in zip(range(n), my_list):
    my_array[i] = val

my_array

array([1., 2., 3., 4.])

<br>

## <span style="color:#ae8bd5">Array Slicing</span>

array[axis_0_slice, axis_1_slice, axis_2_slice, ..., axis_n_slice]

<img src="Array Slicing.png" width="300">

*2 dimensional array example*

In [9]:
# 3x3 example array used to demonstrate slicing along each axis.
my_array = np.asarray([[1, 2, 3], [4, 5, 6], [7, 8, 9]])

# Each label names the axis taken whole (":") and the axis pinned to a single
# index. In my_array[r, c] the first position is axis 0 and the second is
# axis 1, so [2, :] pins axis 0 and [:, 2] pins axis 1.
slicing_report = f"""
Array: 
{my_array}

Axis[:,0]: All in axis 0, i=0 in axis 1
{my_array[:, 0]}

Axis[0, :]: All in axis 1, i=0 in axis 0
{my_array[0, :]}

Axis[2, :]: All in axis 1, i=2 in axis 0
{my_array[2, :]}

Axis[:, 2]: All in axis 0, i=2 in axis 1
{my_array[:, 2]}
"""
print(slicing_report)



Array: 
[[1 2 3]
 [4 5 6]
 [7 8 9]]

Axis[:,0]: All in axis 0, i=0 in axis 1
[1 4 7]

Axis[0, :]: All in axis 1, i=0 in axis 0
[1 2 3]

Axis[2, :]: All in axis 1, i=2 in axis 1
[7 8 9]

Axis[:, 2]: All in axis 0, i=2 in axis 0
[3 6 9]



3-Dimensional Array Example

In [10]:
# Two 2x2 blocks stacked along the first axis, giving shape (2, 2, 2).
array_3d = np.asarray(
    [
        [[1, 2], [3, 4]],
        [[5, 6], [7, 8]],
    ]
)

# Show the array together with its shape.
(array_3d, array_3d.shape)

(array([[[1, 2],
         [3, 4]],
 
        [[5, 6],
         [7, 8]]]),
 (2, 2, 2))

In [11]:
# Keep every position on the leading axes and take index 0 on the last axis.
array_3d[..., 0]

array([[1, 3],
       [5, 7]])

<br>

## <span style="color:#ae8bd5">Array Indexing</span>

In [12]:
array_2d = np.asarray([
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9],
])

# Name the two masks once so every report section reads "label, then result".
at_most_4 = array_2d <= 4
at_most_4_or_at_least_8 = at_most_4 | (array_2d >= 8)

# One argument per output line; the empty strings produce the blank lines
# between sections and around the report.
print(
    "",
    "Array:",
    array_2d,
    "",
    "Boolean Condition mask: <= 4",
    at_most_4,
    "",
    "Boolean Applied to Array:",
    array_2d[at_most_4],
    "",
    "Boolean Applied to Array with multiple conditions:",
    array_2d[at_most_4_or_at_least_8],
    "",
    "Boolean Applied with np.where() function to retain dimensions: 0 if <= 4 else value",
    np.where(at_most_4, 0, array_2d),
    "",
    "Boolean Applied with np.where() function for multiple conditions and retain dimensions:",
    np.where(at_most_4_or_at_least_8, 0, array_2d),
    "",
    sep="\n",
)


Array:
[[1 2 3]
 [4 5 6]
 [7 8 9]]

Boolean Condition mask: <= 4
[[ True  True  True]
 [ True False False]
 [False False False]]

Boolean Applied to Array:
[1 2 3 4]

Boolean Applied to Array with multiple conditions:
[1 2 3 4 8 9]

Boolean Applied with np.where() function to retain dimensions: 0 if <= 4 else value
[[0 0 0]
 [0 5 6]
 [7 8 9]]

Boolean Applied with np.where() function for multiple conditions and retain dimensions:
[[0 0 0]
 [0 5 6]
 [7 0 0]]



## <span style="color:#ae8bd5">Hypothesis Testing</span>

In [13]:
# Simulate many simple random walks at once and count how many of them ever
# get at least 30 steps away from the origin in either direction.
nwalks = 5000  # number of independent walks (one per row)
nsteps = 1000  # number of steps in each walk (one per column)

# Seeded Generator so the count below is the same on every Restart & Run All.
walk_rng = np.random.default_rng(seed=777)

# Coin flips: integers() excludes `high`, so every draw is either 0 or 1.
draws = walk_rng.integers(low=0, high=2, size=(nwalks, nsteps))

# Map each flip to a unit step: 1 -> +1, 0 -> -1.
steps = np.where(draws > 0, 1, -1)

# Running position of every walk after each step.
walks = steps.cumsum(axis=1)

# True for each walk whose absolute position reaches 30 at any step.
hits30 = (np.abs(walks) >= 30).any(axis=1)

# Number of walks that reached the +/-30 threshold.
hits30.sum()


3373

<br><br>

# <span style="color:#bce35b">Pandas</span>

## <span style="color:#ae8bd5">Basics</span>

In [54]:
## Series from a plain list (pandas supplies the default 0..n-1 index)
pd.Series(data=[1, 2, 3])

0    1
1    2
2    3
dtype: int64

In [55]:
## Series with explicit index labels
index = list('abc')
pd.Series([1, 2, 3], index=index)

a    1
b    2
c    3
dtype: int64

In [56]:
## Series with explicit index labels and a name
index = list('abc')
pd.Series([1, 2, 3], index=index, name='Char')

a    1
b    2
c    3
Name: Char, dtype: int64

In [57]:
## single-column DataFrame from a plain list
pd.DataFrame(data=[1, 2, 3])

Unnamed: 0,0
0,1
1,2
2,3


In [58]:
## single-column DataFrame with explicit row labels
pd.DataFrame(data=[1, 2, 3], index=list('abc'))

Unnamed: 0,0
a,1
b,2
c,3


In [63]:
## DataFrame with named columns and a named index
index = list('abc')
df = pd.DataFrame(
    {
        'Values': [1, 2, 3],
        'Type': ['Yo', 'Blah', 'Blarg'],
    },
    index=index,
).rename_axis('Char')  # label the row index "Char"
df

Unnamed: 0_level_0,Values,Type
Char,Unnamed: 1_level_1,Unnamed: 2_level_1
a,1,Yo
b,2,Blah
c,3,Blarg


In [64]:
# Display the iris DataFrame that was assembled in the data-loading cell near the top.
iris_df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target,species
0,5.1,3.5,1.4,0.2,0,setosa
1,4.9,3.0,1.4,0.2,0,setosa
2,4.7,3.2,1.3,0.2,0,setosa
3,4.6,3.1,1.5,0.2,0,setosa
4,5.0,3.6,1.4,0.2,0,setosa
...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2,virginica
146,6.3,2.5,5.0,1.9,2,virginica
147,6.5,3.0,5.2,2.0,2,virginica
148,6.2,3.4,5.4,2.3,2,virginica
