<a href="https://colab.research.google.com/github/JaiswalFelipe/Learning-Python/blob/main/DataManipulationWithNumPy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **Dealing with missing values**

In [None]:
import numpy as np

In [None]:
# Clean dataset: it has no missing values, so it serves as the "nothing to find" case for np.isnan().
# Hint: np.loadtxt() would have thrown an error if the file contained missing values.
numeric_data_path = "/content/drive/MyDrive/Colab Notebooks/365 Data Science/Programming/Numpy/Files for NumpyPart2/Importing and Saving Data with Numpy/Lending-Company-Numeric-Data.csv"
lending_co_data_numeric = np.loadtxt(numeric_data_path, delimiter = ",")

In [None]:
# Dataset that DOES contain missing values: np.genfromtxt() imports each missing field as NaN.
# Note that this file is ";"-separated.
nan_data_path = "/content/drive/MyDrive/Colab Notebooks/365 Data Science/Programming/Numpy/Files for NumpyPart2/Importing and Saving Data with Numpy/Lending-Company-Numeric-Data-NAN.csv"
lending_co_data_numeric_NAN = np.genfromtxt(nan_data_path, delimiter = ";")

In [None]:
# Fixed: consolidated version of the missing-value treatment that the cells below build step by step.
# The cell is self-contained: it re-reads the raw file, so the result does not depend on what
# lending_co_data_numeric_NAN currently holds (no need to re-run another cell first).
nan_data_path = "/content/drive/MyDrive/Colab Notebooks/365 Data Science/Programming/Numpy/Files for NumpyPart2/Importing and Saving Data with Numpy/Lending-Company-Numeric-Data-NAN.csv"

# 1) Raw import: every missing field is still NaN here
raw_with_nans = np.genfromtxt(nan_data_path, delimiter = ";")

# 2) Statistics taken while the NaNs are still present, so the nan-aware functions ignore them
temporary_fill = np.nanmax(raw_with_nans).round(2) + 1           # a value greater than every real value
temporary_mean = np.nanmean(raw_with_nans, axis = 0).round(2)    # one mean per column

# 3) Re-import, marking every missing field with the filler
lending_co_data_numeric_NAN = np.genfromtxt(nan_data_path,
                                            delimiter = ";",
                                            filling_values = temporary_fill)

# 4) Swap each filler for the mean of its column.
#    temporary_mean holds one value per column, so it broadcasts across all rows: no column loop needed.
lending_co_data_numeric_NAN = np.where(lending_co_data_numeric_NAN == temporary_fill,
                                       temporary_mean,
                                       lending_co_data_numeric_NAN)

lending_co_data_numeric_NAN


array([[ 2000.  ,    40.  ,   365.  ,  3121.  ,  4241.  , 13621.  ],
       [ 2000.  ,    40.  ,   365.  ,  3061.  ,  4171.  , 15041.  ],
       [ 1000.  ,    40.  ,   365.  ,  2160.  ,  3280.  , 15340.  ],
       ...,
       [ 2250.25,    40.  ,   365.  ,  4201.  ,  5001.  , 16600.  ],
       [ 1000.  ,    40.  ,   365.  ,  2080.  ,  3320.  , 15600.  ],
       [ 2000.  ,    40.  ,   365.  ,  4601.  ,  4601.  , 16600.  ]])

##### Checking for missing values
- Nan == Not a Number

In [None]:
# np.isnan() on its own returns an array of True/False values; .sum() counts the True entries
nan_count_clean = np.isnan(lending_co_data_numeric).sum()
nan_count_with_missing = np.isnan(lending_co_data_numeric_NAN).sum()

# First the clean dataset, then an empty line, then the dataset that has the NaNs
print(nan_count_clean, "", nan_count_with_missing, sep = "\n")

0

1043


In [None]:
# Option 1: replace the missing values with 0 during the import ("filling_values = 0")
# WARNING! THIS IS NOT ALWAYS A GREAT IDEA
nan_data_path = "/content/drive/MyDrive/Colab Notebooks/365 Data Science/Programming/Numpy/Files for NumpyPart2/Importing and Saving Data with Numpy/Lending-Company-Numeric-Data-NAN.csv"
lending_co_data_numeric_NAN = np.genfromtxt(nan_data_path, delimiter = ";", filling_values = 0)

# Test: count the NaNs that are left after the import
remaining_nans = np.isnan(lending_co_data_numeric_NAN).sum()
print(remaining_nans)

In [None]:
# Option 2: fill the missing values with a number "GREATER THAN" the highest value of the dataset
# np.nanmax() returns the highest value while ignoring NaNs
# To test on the raw data, re-run the import cell above without "filling_values = 0"
# Rounding is a good practice when working with decimal numbers
highest_value = np.nanmax(lending_co_data_numeric_NAN)
temporary_fill = np.round(highest_value, 2) + 1
temporary_fill

64002.0

In [None]:
# Re-import, this time filling the missing values with "temporary_fill"
nan_data_path = "/content/drive/MyDrive/Colab Notebooks/365 Data Science/Programming/Numpy/Files for NumpyPart2/Importing and Saving Data with Numpy/Lending-Company-Numeric-Data-NAN.csv"
lending_co_data_numeric_NAN = np.genfromtxt(nan_data_path, delimiter = ";", filling_values = temporary_fill)

# Test: count the NaNs that are left after the import
remaining_nans = np.isnan(lending_co_data_numeric_NAN).sum()
print(remaining_nans)

0


##### Substituting Missing Values in Ndarrays
- np.nanmean()



In [None]:
# Filling with the "mean": won't change the overall interpretation of the dataset
# All missing values would be considered average
# NOTE: NOT ALWAYS VALID, BUT OFTEN IS THE PREFERRED APPROACH

# The means must come from data in which the missing values are still NaN.
# At this point lending_co_data_numeric_NAN has already been imported with a filler value,
# which np.nanmean() would treat as real data, so take the means from a fresh raw import instead.
nan_data_path = "/content/drive/MyDrive/Colab Notebooks/365 Data Science/Programming/Numpy/Files for NumpyPart2/Importing and Saving Data with Numpy/Lending-Company-Numeric-Data-NAN.csv"
raw_with_nans = np.genfromtxt(nan_data_path, delimiter = ";")

# Find the mean of every column (axis = 0), ignoring NaNs, and store it in a variable
temporary_mean = np.nanmean(raw_with_nans, axis = 0).round(2)

# Check the mean of the first column
temporary_mean[0]

2250.25

In [None]:
nan_data_path = "/content/drive/MyDrive/Colab Notebooks/365 Data Science/Programming/Numpy/Files for NumpyPart2/Importing and Saving Data with Numpy/Lending-Company-Numeric-Data-NAN.csv"

# Derive the filler from the raw data (missing values still NaN).
# Taking np.nanmax() of an array that was already filled would return the previous filler,
# so the filler would grow by 1 every time this cell is run.
temporary_fill = np.nanmax(np.genfromtxt(nan_data_path, delimiter = ";")).round(2) + 1

# Import again, marking every missing value with the filler
lending_co_data_numeric_NAN = np.genfromtxt(nan_data_path,
                                            delimiter = ";",
                                            filling_values = temporary_fill)

# This value is much greater than the mean of the first column, which makes the filler an OUTLIER
temporary_fill

64002.0

In [None]:
# Next, take the average of the first column now that its missing values hold the filler
# Slicing with [:,0] selects only the column with index 0
first_column = lending_co_data_numeric_NAN[:,0]
first_column.mean().round(2)

# This mean is distorted because the NaNs were filled with "maximum value + 1"
# Before filling, the mean of the first column was "temporary_mean[0] == 2250.25"
# The filler roughly doubles the mean, which can lead to MISLEADING insights from the data

4263.25

In [None]:
# To fix this, we use np.where(condition, value_if_true, value_if_false)
first_column = lending_co_data_numeric_NAN[:,0]
is_filler = first_column == temporary_fill

lending_co_data_numeric_NAN[:,0] = np.where(is_filler,
                                            temporary_mean[0],   # fillers become the column mean
                                            first_column)        # everything else stays equal to itself

# Now every "non-filler" value remains unchanged,
# and every "filler" contains the mean of the column instead

# Check: the mean of the first column
lending_co_data_numeric_NAN[:,0].mean().round(2)

2250.25

In [None]:
# Generalize the replacement to "ALL" the columns of the dataset

# .shape[1] is the number of columns (.shape[0] would be the number of rows)
n_columns = lending_co_data_numeric_NAN.shape[1]

# Same np.where() as for the first column, with the fixed index 0 replaced by the loop index
for col in range(n_columns):
  column_values = lending_co_data_numeric_NAN[:,col]
  lending_co_data_numeric_NAN[:,col] = np.where(column_values == temporary_fill,
                                                temporary_mean[col],   # filler -> the mean of that column
                                                column_values)         # non-filler -> unchanged

In [None]:
# Replace every "negative value" with 0
# The comparison and np.where() work on the whole array at once;
# assigning through [:] writes the result back into the existing array
is_negative = lending_co_data_numeric_NAN < 0
lending_co_data_numeric_NAN[:] = np.where(is_negative, 0, lending_co_data_numeric_NAN)

##### Reshaping Ndarrays
- np.reshape() NOT ALWAYS NEEDED
- The act of morphing the shape of an object a certain way
- INSTEAD: Use np.transpose()

In [None]:
# Display the clean dataset (re-run the first import cell, the one using np.loadtxt(), beforehand if needed)
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [None]:
# Shape before reshaping, as (rows, columns)
lending_co_data_numeric.shape

(1043, 6)

In [None]:
# np.reshape() keeps the elements in their original order and only re-cuts them into rows of a
# different length, so this (columns, rows) result is NOT the transpose of the dataset.
# The target shape is taken from the array itself instead of hard-coding (6,1043).
np.reshape(lending_co_data_numeric, lending_co_data_numeric.shape[::-1])

array([[ 2000.,    40.,   365., ...,   365.,  1581.,  3041.],
       [12277.,  2000.,    40., ...,    50.,   365.,  5350.],
       [ 6850., 15150.,  1000., ...,  2000.,    40.,   365.],
       [ 3101.,  4351., 16600., ..., 16600.,  2000.,    40.],
       [  365.,  3441.,  4661., ...,  8450., 22250.,  2000.],
       [   40.,   365.,  3701., ...,  4601.,  4601., 16600.]])

In [None]:
# Preferred: transpose, i.e. turn "rows" into "columns" (.T is the attribute form of np.transpose())
lending_co_data_numeric.T

array([[ 2000.,  2000.,  1000., ...,  2000.,  1000.,  2000.],
       [   40.,    40.,    40., ...,    40.,    40.,    40.],
       [  365.,   365.,   365., ...,   365.,   365.,   365.],
       [ 3121.,  3061.,  2160., ...,  4201.,  2080.,  4601.],
       [ 4241.,  4171.,  3280., ...,  5001.,  3320.,  4601.],
       [13621., 15041., 15340., ..., 16600., 15600., 16600.]])

##### Removing Values from Ndarrays
- np.delete()

In [None]:
# Display the clean dataset (re-run the first import cell, the one using np.loadtxt(), beforehand if needed)
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [None]:
# Without an axis, np.delete() works on the flattened array: remove its first value
flattened_without_first = np.delete(lending_co_data_numeric, 0)
print(flattened_without_first)
print("")

# To verify: the result is 1-D and one element shorter...
print(flattened_without_first.shape)
print("")

# ...than the original, whose number of elements is:
print(lending_co_data_numeric.size)

[   40.   365.  3121. ...  4601.  4601. 16600.]

(6257,)

6258


In [None]:
# Remove entire rows or columns by passing a value to the "axis" argument
# Remember: indexing starts at 0

# axis = 0 removes a "row": here the first one
without_first_row = np.delete(lending_co_data_numeric, 0, axis = 0)

# axis = 1 removes a "column": here the first one
without_first_column = np.delete(lending_co_data_numeric, 0, axis = 1)

print(without_first_row, "", without_first_column, sep = "\n")

[[ 2000.    40.   365.  3061.  4171. 15041.]
 [ 1000.    40.   365.  2160.  3280. 15340.]
 [ 2000.    40.   365.  3041.  4241. 15321.]
 ...
 [ 2000.    40.   365.  4201.  5001. 16600.]
 [ 1000.    40.   365.  2080.  3320. 15600.]
 [ 2000.    40.   365.  4601.  4601. 16600.]]

[[   40.   365.  3121.  4241. 13621.]
 [   40.   365.  3061.  4171. 15041.]
 [   40.   365.  2160.  3280. 15340.]
 ...
 [   40.   365.  4201.  5001. 16600.]
 [   40.   365.  2080.  3320. 15600.]
 [   40.   365.  4601.  4601. 16600.]]


In [None]:
# Several rows/columns can be removed at once by passing a tuple or a list of indices

# Remove the 1st, 3rd and 5th columns (indices 0, 2 and 4)
columns_to_drop = [0,2,4]
np.delete(lending_co_data_numeric, columns_to_drop, axis = 1)

# Delete both rows and columns simultaneously:
#np.delete(np.delete(lending_co_data_numeric, [0,2,4], axis = 1), [0,2,-1], axis = 0)

array([[   40.,  3121., 13621.],
       [   40.,  3061., 15041.],
       [   40.,  2160., 15340.],
       ...,
       [   40.,  4201., 16600.],
       [   40.,  2080., 15600.],
       [   40.,  4601., 16600.]])

##### Sorting Ndarrays
- np.sort() takes an array and returns a sorted version (in ascending order)

In [None]:
# Display the clean dataset (re-run the first import cell, the one using np.loadtxt(), beforehand if needed)
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [None]:
# The default sort: np.sort() works along the last axis (axis = -1) when no axis is given,
# so for this 2-D dataset the values inside every row are sorted in ascending order
np.sort(lending_co_data_numeric, axis = -1)

array([[   40.,   365.,  2000.,  3121.,  4241., 13621.],
       [   40.,   365.,  2000.,  3061.,  4171., 15041.],
       [   40.,   365.,  1000.,  2160.,  3280., 15340.],
       ...,
       [   40.,   365.,  2000.,  4201.,  5001., 16600.],
       [   40.,   365.,  1000.,  2080.,  3320., 15600.],
       [   40.,   365.,  2000.,  4601.,  4601., 16600.]])

In [None]:
# axis = 0: returns an array in which every column is sorted in ascending order.
# Each column is sorted on its own, so a row of the result no longer matches a row of the original data.
np.sort(lending_co_data_numeric, axis = 0)

array([[ 1.0000e+03,  3.5000e+01,  3.6500e+02, -2.8700e+03, -2.8700e+03,
        -3.5000e+02],
       [ 1.0000e+03,  3.5000e+01,  3.6500e+02, -2.5500e+03, -2.1000e+03,
         1.5000e+02],
       [ 1.0000e+03,  3.5000e+01,  3.6500e+02, -2.4500e+03, -2.0000e+03,
         1.1000e+03],
       ...,
       [ 9.0000e+03,  1.2500e+02,  3.6500e+02,  1.6751e+04,  1.8751e+04,
         5.4625e+04],
       [ 9.0000e+03,  1.6500e+02,  3.6500e+02,  1.7650e+04,  2.0001e+04,
         5.4625e+04],
       [ 9.0000e+03,  1.6500e+02,  3.6500e+02,  1.9001e+04,  2.2001e+04,
         6.4001e+04]])

In [None]:
# Get rid of scientific notation
# WARNING: THESE SETTINGS WILL APPLY TO OUR ENTIRE WORK, rather than a single cell

#np.set_printoptions(suppress = True)
# Then rerun above cell to check

In [None]:
# Descending order: negate every element first (np.negative(x) is the same as -x, i.e. multiplying by -1),
# then sort. The order is now descending in magnitude, but every sign is flipped
negated_values = np.negative(lending_co_data_numeric)
np.sort(negated_values)

# To restore the signs: put a "-" in front of np.sort() as well
#-np.sort(-lending_co_data_numeric)

array([[-13621.,  -4241.,  -3121.,  -2000.,   -365.,    -40.],
       [-15041.,  -4171.,  -3061.,  -2000.,   -365.,    -40.],
       [-15340.,  -3280.,  -2160.,  -1000.,   -365.,    -40.],
       ...,
       [-16600.,  -5001.,  -4201.,  -2000.,   -365.,    -40.],
       [-15600.,  -3320.,  -2080.,  -1000.,   -365.,    -40.],
       [-16600.,  -4601.,  -4601.,  -2000.,   -365.,    -40.]])

In [None]:
# Individual columns can be sorted by indexing them first
# Here: the 4th column (index 3); np.sort() returns a sorted copy
fourth_column = lending_co_data_numeric[:,3]
np.sort(fourth_column)

# The .sort() method, by contrast, overwrites the original array's "4th" column
#lending_co_data_numeric[:,3].sort()

# Or the "ENTIRE" matrix
#lending_co_data_numeric.sort(axis = 0)

array([-2870., -2550., -2450., ..., 16751., 17650., 19001.])

##### Argument Sort in NumPy
- np.argsort()

In [None]:
# Display the clean dataset (re-run the first import cell, the one using np.loadtxt(), beforehand if needed)
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [None]:
# Returns the INDICES that would sort the values, not the sorted values themselves.
# With no axis given, each row is handled separately (as the output below shows).
np.argsort(lending_co_data_numeric)

array([[1, 2, 0, 3, 4, 5],
       [1, 2, 0, 3, 4, 5],
       [1, 2, 0, 3, 4, 5],
       ...,
       [1, 2, 0, 3, 4, 5],
       [1, 2, 0, 3, 4, 5],
       [1, 2, 0, 3, 4, 5]])

In [None]:
# Indices that would sort the first column (index 0)
first_column = lending_co_data_numeric[:,0]
first_column.argsort()

# Use those indices to reorder the whole dataset by its first column and store the result:
#lending_co_data_numeric = lending_co_data_numeric[np.argsort(lending_co_data_numeric[:,0])]
#lending_co_data_numeric

# This can be "extremely" useful when each row of a dataset contains information about a specific client (or date)

array([537, 639, 849, ...,  27, 277, 420])

##### Argument Where in NumPy
- np.argwhere() can be used with any mathematical conditions
- We can use this function to pick out the elements that interest us



- WHERE DOES THE NAME "ARGUMENT" ORIGINATE FROM -->  ASTRONOMY
 - "That which elucidates something else"
 - COORDINATES IN SPACE == Indices in N-D arrays
- Therefore, functions that return "coordinates" within an array are called "argument" functions

In [None]:
# Display the clean dataset (re-run the first import cell, the one using np.loadtxt(), beforehand if needed)
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [None]:
# np.argwhere() goes over the entire ndarray and checks whether each individual element satisfies a condition
# The output holds the indices of all the individual elements for which the condition is met
# Passing the array itself checks for values different from 0; that condition is spelled out here
is_non_zero = lending_co_data_numeric != 0
np.argwhere(is_non_zero)

# In the output, the first column == row index, the 2nd column == column index

array([[   0,    0],
       [   0,    1],
       [   0,    2],
       ...,
       [1042,    3],
       [1042,    4],
       [1042,    5]])

In [None]:
# To find where (and therefore how many) values in the dataset are equal to 0
np.argwhere(lending_co_data_numeric == 0)

array([[116,   4],
       [430,   3]])

In [None]:
# To verify: print the two rows reported above; each should contain a 0
for row_index in (116, 430):
  print(lending_co_data_numeric[row_index])

[ 1000.    50.   365. -1450.     0. 13850.]
[1000.   50.  365.    0.  550. 5650.]


In [None]:
# WHY IS THIS VERY HANDY???
#np.argwhere(lending_co_data_numeric > 1000)
#np.argwhere(lending_co_data_numeric %2 == 0)    # Is even

In [None]:
# Synergy between np.argwhere() + np.isnan()
# Re-run the 2nd cell first (the plain np.genfromtxt() import), so that the missing values of
# lending_co_data_numeric_NAN are NaN again
lending_co_data_numeric_NAN

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [   nan,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [None]:
# np.isnan() marks the NaNs with True; np.argwhere() returns the (row, column) index pair of each True
np.argwhere(np.isnan(lending_co_data_numeric_NAN))

# Same technique to verify: index the dataset with one of the returned rows

In [None]:
# A way to fill the NaNs: np.isnan() gives a boolean mask of the same positions that
# np.argwhere(np.isnan(...)) lists one by one; assigning through the mask sets all of them
# to 0 in a single step, with no Python-level loop over the index pairs
lending_co_data_numeric_NAN[np.isnan(lending_co_data_numeric_NAN)] = 0

In [None]:
# To check: the number of NaNs left in the dataset (should be 0)
np.isnan(lending_co_data_numeric_NAN).sum()

0

##### Shuffling Ndarrays
- np.random.shuffle() takes an ndarray and shuffles it in place
- Rearranging the parts of a dataset
- Without a fixed pattern
- End goal is that a random sample would be representative of the entire dataset

In [None]:
# Re-run the first import cell (the one using np.loadtxt()) beforehand if needed

# For the initial part, we use only the first 8 rows.
# .copy() matters here: a plain slice is a view onto lending_co_data_numeric, so shuffling it
# in place (next cell) would also reorder the first 8 rows of the full dataset.
first_8 = lending_co_data_numeric[:8].copy()
first_8

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       [ 2000.,    40.,   365.,  3041.,  4241., 15321.],
       [ 2000.,    50.,   365.,  3470.,  4820., 13720.],
       [ 2000.,    40.,   365.,  3201.,  4141., 14141.],
       [ 2000.,    50.,   365.,  1851.,  3251., 17701.],
       [ 2000.,    40.,   365.,  3971.,  4131., 15351.]])

In [None]:
# np.random.shuffle() takes an ndarray and shuffles its rows in place
# It returns nothing (the shuffled array overwrites the original), so display the array on its own line:
np.random.shuffle(first_8)
first_8

# No seed is set, so every run outputs a different order

array([[ 2000.,    40.,   365.,  3041.,  4241., 15321.],
       [ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    50.,   365.,  3470.,  4820., 13720.],
       [ 2000.,    40.,   365.,  3201.,  4141., 14141.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 2000.,    50.,   365.,  1851.,  3251., 17701.],
       [ 2000.,    40.,   365.,  3971.,  4131., 15351.]])

In [None]:
# When using the same module multiple times, import it directly
from numpy.random import shuffle
from numpy.random import Generator as gen
from numpy.random import PCG64 as pcg

In [None]:
# Shuffle the rows of the whole dataset in place, using the directly imported shuffle()
# NOTE: lending_co_data_numeric itself is reordered; re-run the import cell to get the original order back
shuffle(lending_co_data_numeric)
lending_co_data_numeric

array([[ 2000.,    50.,   365.,  1651.,  3051., 20250.],
       [ 4000.,    50.,   365.,  5450.,  7000., 20750.],
       [ 1000.,    50.,   365.,   800.,  2200., 17550.],
       ...,
       [ 4000.,    50.,   365.,  5400.,  6900., 22250.],
       [ 1000.,    40.,   365.,  3676.,  4836., 15600.],
       [ 2000.,    40.,   365.,  3418.,  4618., 16600.]])

In [None]:
# Combining the two: build a random Generator on top of the PCG64 bit generator
# (imported above as gen / pcg) and use the generator's own in-place .shuffle() method
# No seed is passed to pcg(), so the order is not expected to repeat between runs
array_RG = gen(pcg())
array_RG.shuffle(lending_co_data_numeric)
lending_co_data_numeric

array([[ 2000.,    50.,   365.,  3351.,  5051., 16551.],
       [ 4000.,    50.,   365.,  5453.,  5623., 13443.],
       [ 2000.,    40.,   365.,  3401.,  4601., 16600.],
       ...,
       [ 4000.,    50.,   365.,  8960., 12950., 22250.],
       [ 2000.,    40.,   365.,  3400.,  4350., 15010.],
       [ 2000.,    50.,   365.,  4251.,  4951., 20250.]])

##### Casting Ndarrays
- .astype() == "as type": returns the array's values cast as the requested type
- Taking an object with values of a certain datatype and creating an identical object that contains values of a *different datatype*
- In Numpy: Creating a new array that stores the values of the original array under a different type
- I.E. Float <---> Int
- Additional: CHAIN METHOD

In [None]:
# Re-run the first import cell (the one using np.loadtxt()) first: the shuffles above reordered the rows in place
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [None]:
# .astype() returns a new array holding the same values under the requested type (here: 32-bit integers)
lending_co_data_numeric.astype(np.int32)

# When going from str --> int, DO: str --> float --> int
# Either overwrite the original dataset with its float version and THEN cast that to int: astype(dtype = np.int32)

# Or use the CHAIN METHOD:
# Assuming we imported the data as str -->
# lending_co_data_numeric.astype(dtype = np.float32).astype(dtype = np.int32)

array([[ 2000,    40,   365,  3121,  4241, 13621],
       [ 2000,    40,   365,  3061,  4171, 15041],
       [ 1000,    40,   365,  2160,  3280, 15340],
       ...,
       [ 2000,    40,   365,  4201,  5001, 16600],
       [ 1000,    40,   365,  2080,  3320, 15600],
       [ 2000,    40,   365,  4601,  4601, 16600]], dtype=int32)

##### Stripping Values from Ndarrays
- np.chararray.strip()
- Removing specific parts of strings (excess strings)

In [None]:
# Text dataset for the stripping examples: import the values as strings and keep only
# the columns at index 1, 2 and 4
total_price_path = "/content/drive/MyDrive/Colab Notebooks/365 Data Science/Programming/Numpy/Files for NumpyPart2/Importing and Saving Data with Numpy/Lending-Company-Total-Price.csv"
lending_co_total_price = np.genfromtxt(total_price_path,
                                       delimiter = ',',
                                       dtype = str,       # Remember: use the built-in str (the np.str alias was deprecated and later removed from NumPy)
                                       skip_header = 1,   # skip the first line of the file
                                       usecols = [1,2,4])
lending_co_total_price

array([['id_1', 'Product B', 'Location 2'],
       ['id_2', 'Product B', 'Location 3'],
       ['id_3', 'Product C', 'Location 5'],
       ...,
       ['id_413', 'Product B', 'Location 135'],
       ['id_414', 'Product C', 'Location 200'],
       ['id_415', 'Product A', 'Location 8']], dtype='<U12')

In [None]:
# Remove the "id_" part of the first column
#np.char.strip(lending_co_total_price[:,0], "id_")
# strip() returns a new array and does not overwrite the original, SO assign the result back:

# np.char.strip() is the free-function form of the same operation; np.chararray is a legacy class
# that NumPy keeps only for backwards compatibility.
# CAREFUL: the 2nd argument is a SET of characters removed from both ends of every string,
# not a literal prefix. It works here because the parts we keep ('1', 'B', '135', ...) neither
# start nor end with any of the characters being stripped.
lending_co_total_price[:,0] = np.char.strip(lending_co_total_price[:,0], "id_")
lending_co_total_price[:,1] = np.char.strip(lending_co_total_price[:,1], "Product ")
lending_co_total_price[:,2] = np.char.strip(lending_co_total_price[:,2], "Location ")
lending_co_total_price

array([['1', 'B', '2'],
       ['2', 'B', '3'],
       ['3', 'C', '5'],
       ...,
       ['413', 'B', '135'],
       ['414', 'C', '200'],
       ['415', 'A', '8']], dtype='<U12')

In [None]:
# We can also apply np.where() to transform the letters in the second column into numeric values.
# The array still holds strings at this point, so the replacements are given as strings too
# (the cast to int happens afterwards); this keeps np.where() from having to mix ints and strings.
product_codes = {'A': '1', 'B': '2', 'C': '3', 'D': '4', 'E': '5', 'F': '6'}

# One np.where() pass per letter: matching entries get the code, everything else stays as it is
for letter, code in product_codes.items():
  lending_co_total_price[:,1] = np.where(lending_co_total_price[:,1] == letter,
                                         code,
                                         lending_co_total_price[:,1])

lending_co_total_price

array([['1', '2', '2'],
       ['2', '2', '3'],
       ['3', '3', '5'],
       ...,
       ['413', '2', '135'],
       ['414', '3', '200'],
       ['415', '1', '8']], dtype='<U12')

In [None]:
# Every value is still a string at this point, so cast the whole array to 32-bit integers
lending_co_total_price = lending_co_total_price.astype(np.int32)
lending_co_total_price

array([[  1,   2,   2],
       [  2,   2,   3],
       [  3,   3,   5],
       ...,
       [413,   2, 135],
       [414,   3, 200],
       [415,   1,   8]], dtype=int32)

##### Stacking Ndarrays
- np.stack() 
- np.vstack() = vertical stack: stacks 2-D arrays vertically
- np.hstack() = horizontal stack
- np.dstack() = depth stack: stacks arrays in the 3rd dimension

- Stacking: Placing multiple objects on top of one another to create a bigger object
- Arrays **MUST** be the same shape

In [None]:
# Stacking needs both datasets: re-run the imports of the one without missing values and the one with

# Without missing values
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [None]:
# With missing values: rebuild the mean-filled dataset.
# The cell is self-contained: it re-reads the raw file, so the result does not depend on what
# earlier cells have done to lending_co_data_numeric_NAN.
nan_data_path = "/content/drive/MyDrive/Colab Notebooks/365 Data Science/Programming/Numpy/Files for NumpyPart2/Importing and Saving Data with Numpy/Lending-Company-Numeric-Data-NAN.csv"

# 1) Raw import: every missing field is still NaN here
raw_with_nans = np.genfromtxt(nan_data_path, delimiter = ";")

# 2) Statistics taken while the NaNs are still present, so the nan-aware functions ignore them
temporary_fill = np.nanmax(raw_with_nans).round(2) + 1           # a value greater than every real value
temporary_mean = np.nanmean(raw_with_nans, axis = 0).round(2)    # one mean per column

# 3) Re-import, marking every missing field with the filler
lending_co_data_numeric_NAN = np.genfromtxt(nan_data_path,
                                            delimiter = ";",
                                            filling_values = temporary_fill)

# 4) Swap each filler for the mean of its column.
#    temporary_mean holds one value per column, so it broadcasts across all rows: no column loop needed.
lending_co_data_numeric_NAN = np.where(lending_co_data_numeric_NAN == temporary_fill,
                                       temporary_mean,
                                       lending_co_data_numeric_NAN)

lending_co_data_numeric_NAN


array([[ 2000.  ,    40.  ,   365.  ,  3121.  ,  4241.  , 13621.  ],
       [ 2000.  ,    40.  ,   365.  ,  3061.  ,  4171.  , 15041.  ],
       [ 1000.  ,    40.  ,   365.  ,  2160.  ,  3280.  , 15340.  ],
       ...,
       [ 2250.25,    40.  ,   365.  ,  4201.  ,  5001.  , 16600.  ],
       [ 1000.  ,    40.  ,   365.  ,  2080.  ,  3320.  , 15600.  ],
       [ 2000.  ,    40.  ,   365.  ,  4601.  ,  4601.  , 16600.  ]])

In [None]:
# np.stack() keeps the order of its inputs: the first array comes first, followed by the second, and so on
first_column = lending_co_data_numeric[:,0]
second_column = lending_co_data_numeric[:,1]
np.stack((first_column, second_column))

array([[2000., 2000., 1000., ..., 2000., 1000., 2000.],
       [  40.,   40.,   40., ...,   40.,   40.,   40.]])

In [None]:
# With axis = 1 the inputs are placed side by side instead
# Any number of columns can be stacked: here the first three
first_three_columns = [lending_co_data_numeric[:,col] for col in range(3)]
np.stack(first_three_columns, axis = 1)

array([[2000.,   40.,  365.],
       [2000.,   40.,  365.],
       [1000.,   40.,  365.],
       ...,
       [2000.,   40.,  365.],
       [1000.,   40.,  365.],
       [2000.,   40.,  365.]])

In [None]:
# np.vstack() = vertical stack: stacks 2-D arrays vertically,
# placing the first array on top of the second one
stacked_vertically = np.vstack([lending_co_data_numeric, lending_co_data_numeric_NAN])
stacked_vertically.shape

# The number of rows has doubled

(2086, 6)

In [None]:
# np.hstack() = horizontal stack: places the arrays side by side
stacked_horizontally = np.hstack([lending_co_data_numeric, lending_co_data_numeric_NAN])
stacked_horizontally.shape

# The number of columns has doubled

(1043, 12)

In [None]:
# np.dstack() = depth stack: stacks the arrays along the 3rd dimension,
# so it returns an array of a higher dimension than its inputs
stacked_in_depth = np.dstack([lending_co_data_numeric, lending_co_data_numeric_NAN])
stacked_in_depth.shape

# The result can be sliced as well:
#np.dstack((lending_co_data_numeric, lending_co_data_numeric_NAN))[0]

# Result: 1043 arrays of shape 6x2

(1043, 6, 2)

##### Concatenating Ndarrays
- Linking together objects in a chain
- Creating a new (larger) array by merging existing smaller arrays along a given axis
- The inputs and the outputs of the np.concatenate() function always have the same number of dimensions
- Concatenating in 1-D does not require the inputs to have the same shape (UNLIKE STACKING)
- But their dimensions **MUST** match

In [None]:
# Display the dataset used in the concatenation examples
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [None]:
# Concatenating the first row and the 2nd row into one longer 1-D array
first_row = lending_co_data_numeric[0]
second_row = lending_co_data_numeric[1]
np.concatenate((first_row, second_row))

array([ 2000.,    40.,   365.,  3121.,  4241., 13621.,  2000.,    40.,
         365.,  3061.,  4171., 15041.])

In [None]:
# Concatenating 2-D arrays
# Re-run the 2nd and 3rd cells from the top first (the two imports)
# Along axis 0 (the default) this is similar to np.vstack()
joined_rows = np.concatenate((lending_co_data_numeric, lending_co_data_numeric_NAN), axis = 0)
joined_rows.shape

(2086, 6)

In [None]:
# Along axis 1 the arrays are joined side by side, similar to np.hstack()
joined_columns = np.concatenate((lending_co_data_numeric, lending_co_data_numeric_NAN), axis = 1)
joined_columns.shape

(1043, 12)

In [None]:
# 3D example: a single 5x5 matrix wrapped in one extra outer dimension, i.e. shape (1, 5, 5)
example_rows = [[1,2,3,4,5],
                [1,2,7,4,0],
                [0,2,6,4,9],
                [6,2,5,4,3],
                [6,2,3,5,8]]
array_ex_1 = np.array([example_rows])

# Second array of the same shape: every element doubled
array_ex_2 = np.multiply(array_ex_1, 2)

In [None]:
# Same as np.vstack(): joins along the first axis, so two (1, 5, 5) arrays give a (2, 5, 5) result
np.concatenate((array_ex_1, array_ex_2), axis = 0) 

array([[[ 1,  2,  3,  4,  5],
        [ 1,  2,  7,  4,  0],
        [ 0,  2,  6,  4,  9],
        [ 6,  2,  5,  4,  3],
        [ 6,  2,  3,  5,  8]],

       [[ 2,  4,  6,  8, 10],
        [ 2,  4, 14,  8,  0],
        [ 0,  4, 12,  8, 18],
        [12,  4, 10,  8,  6],
        [12,  4,  6, 10, 16]]])

In [None]:
# Same as np.hstack(): joins along the second axis, so two (1, 5, 5) arrays give a (1, 10, 5) result
np.concatenate((array_ex_1, array_ex_2), axis = 1)

array([[[ 1,  2,  3,  4,  5],
        [ 1,  2,  7,  4,  0],
        [ 0,  2,  6,  4,  9],
        [ 6,  2,  5,  4,  3],
        [ 6,  2,  3,  5,  8],
        [ 2,  4,  6,  8, 10],
        [ 2,  4, 14,  8,  0],
        [ 0,  4, 12,  8, 18],
        [12,  4, 10,  8,  6],
        [12,  4,  6, 10, 16]]])

In [None]:
# Same as np.dstack(): joins along the third axis, so two (1, 5, 5) arrays give a (1, 5, 10) result
np.concatenate((array_ex_1, array_ex_2), axis = 2)

array([[[ 1,  2,  3,  4,  5,  2,  4,  6,  8, 10],
        [ 1,  2,  7,  4,  0,  2,  4, 14,  8,  0],
        [ 0,  2,  6,  4,  9,  0,  4, 12,  8, 18],
        [ 6,  2,  5,  4,  3, 12,  4, 10,  8,  6],
        [ 6,  2,  3,  5,  8, 12,  4,  6, 10, 16]]])

##### Finding Unique Values in Ndarrays
- np.unique() takes an array as an input and creates another array that contains all the different values from the first one 

In [None]:
# Display the clean dataset (re-run the first import cell, the one using np.loadtxt(), beforehand if needed)
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [None]:
# Entire dataset: every distinct value found anywhere in the array, returned as one sorted 1-D array
np.unique(lending_co_data_numeric)

array([-2870., -2550., -2450., ..., 52751., 54625., 64001.])

In [None]:
# With indexing: the distinct values of a single column (here the one with index 1)
np.unique(lending_co_data_numeric[:,1])

# The result is sorted:
# "numeric" values = ascending order
# "non-numeric" values = alphabetical order

array([ 35.,  40.,  50., 125., 165.])

In [None]:
# Array 1 = Unique values
# Array 2 = Indices
# Array 3 = Their frequencies
np.unique(lending_co_data_numeric[:,1], return_counts = True, return_index = True)

(array([ 35.,  40.,  50., 125., 165.]),
 array([327,   0,   4,  19,  27]),
 array([  4, 567, 451,  19,   2]))