# Python Libraries 1

_May 13, 2020_ 

Agenda today:
- Introduction to Numpy: array math
- Introduction to Pandas: importing, indexing, and math

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Part I. Numpy
The basic data structure that exist in numpy is called numpy arrays. NP arrays are very similar to python lists. The __difference__ between a python list and a Numpy array is that list can only a mix of data types but array can only contain the same data type.

But what is the benefits of using NumPy array instead of the base python lists?
- Speed 
- Broadcasting Property

In [2]:
names_list=['Bob','John','Sally']
names_array=np.char.array(['Bob','John','Sally']) #use numpy.array for numbers and numpy.char.array for strings
print(names_list)
print(names_array)

['Bob', 'John', 'Sally']
['Bob' 'John' 'Sally']


In [3]:
import time

size_of_seq = 100000

def pure_python_version():
    tic = time.time()
    X = range(size_of_seq)
    Y = range(size_of_seq)
    Z = [X[i] + Y[i] for i in range(len(X)) ]
    toc = time.time()
    return toc - tic

def numpy_version():
    tic = time.time()
    X = np.arange(size_of_seq)
    Y = np.arange(size_of_seq)
    Z = X + Y 
    toc = time.time()
    return toc - tic


t1 = pure_python_version()
t2 = numpy_version()
print("python: " + str(t1), "numpy: "+ str(t2))
print("Numpy is in this example " + str(t1/t2) + " times faster!")

python: 0.04297184944152832 numpy: 0.0018038749694824219
Numpy is in this example 23.821966693100713 times faster!


In [4]:
## broadcasting and array math

np.array([2,3,4,6]) * 5

array([10, 15, 20, 30])

In [5]:
li = [2,3,4,6]
li * 5

[2, 3, 4, 6, 2, 3, 4, 6, 2, 3, 4, 6, 2, 3, 4, 6, 2, 3, 4, 6]

In [8]:
## simulation with numpy - in normal distribution 


There are many other wonderous things numpy can do, you will encounter them later in the course of the program. 

## Part II. Pandas
Pandas stand for paneled data, and it is the most popular library for data scientists to manipulate, clean, and organize dataset in Python. The most fundamental data structure that exists in Pandas is called **DataFrames**. 

In [9]:
## importing data and look at optional parameters
df = pd.read_csv('auto-mpg.csv')

In [10]:
# examine and read the data
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [11]:
# examine the information in this dataframe
df.info

<bound method DataFrame.info of       mpg  cylinders  displacement horsepower  weight  acceleration  \
0    18.0          8         307.0        130    3504          12.0   
1    15.0          8         350.0        165    3693          11.5   
2    18.0          8         318.0        150    3436          11.0   
3    16.0          8         304.0        150    3433          12.0   
4    17.0          8         302.0        140    3449          10.5   
..    ...        ...           ...        ...     ...           ...   
393  27.0          4         140.0         86    2790          15.6   
394  44.0          4          97.0         52    2130          24.6   
395  32.0          4         135.0         84    2295          11.6   
396  28.0          4         120.0         79    2625          18.6   
397  31.0          4         119.0         82    2720          19.4   

     model year  origin                   car name  
0            70       1  chevrolet chevelle malibu  
1        

In [None]:
# examine the datatypes of the dataframe
type

In [13]:
# talk about series and dataframe 
type(df.weight)

pandas.core.series.Series

In [16]:
# indexing and subsetting 
df.head()
# loc vs iloc

# just 1 column
df.horsepower
# multiple columns
df[['cylinders', 'horsepower']]

Unnamed: 0,cylinders,horsepower
0,8,130
1,8,165
2,8,150
3,8,150
4,8,140
...,...,...
393,4,86
394,4,52
395,4,84
396,4,79


In [22]:
# examine whether we have missing value - it could really affect the data!
df.head()
# iloc = index by value of row and columns
# loc = index by name of the column
df.iloc[3,2] 
# group of specific values
df.iloc[2:5,3:5]
# only want weight > 4000
df['weight'] > 4000
# only want weight > 4000, horsepower < 150
df[(df.weight > 400) & (df.horsepower < 150)]

0      False
1      False
2      False
3      False
4      False
       ...  
393    False
394    False
395    False
396    False
397    False
Name: weight, Length: 398, dtype: bool

In [24]:
# but why do we still have anomaly in our data?

# try converting the datatype!
df.sort_values(by = 'horsepower', ascending = False)
# convert ? to 0
df.applymap(lambda x:0 if x == '?' else x)
df.sort_values(by = 'horsepower', ascending = False)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
354,34.5,4,100.0,?,2320,15.8,81,2,renault 18i
374,23.0,4,151.0,?,3035,20.5,82,1,amc concord dl
126,21.0,6,200.0,?,2875,17.0,74,1,ford maverick
330,40.9,4,85.0,?,1835,17.3,80,2,renault lecar deluxe
32,25.0,4,98.0,?,2046,19.0,71,1,ford pinto
...,...,...,...,...,...,...,...,...,...
98,16.0,6,250.0,100,3278,18.0,73,1,chevrolet nova custom
99,18.0,6,232.0,100,2945,16.0,73,1,amc hornet
107,18.0,6,232.0,100,2789,15.0,73,1,amc gremlin
127,19.0,6,232.0,100,2901,16.0,74,1,amc hornet


In [None]:
# how should we get rid of the anomaly in this case?

In [None]:
# exercise - get the cars that are heavier than 3000 but has a horsepower less than 150 
