# Python Libraries 1

_Aug 5, 2020_ 

Agenda today:
- Introduction to Numpy: array math
- Introduction to Pandas: importing, indexing, and math

In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Part I. Numpy
The basic data structure that exist in numpy is called numpy arrays. NP arrays are very similar to python lists. The __difference__ between a python list and a Numpy array is that list can only a mix of data types but array can only contain the same data type.

But what is the benefits of using NumPy array instead of the base python lists?
- Speed 
- Broadcasting Property

In [2]:
names_list=['Bob','John','Sally']
names_array=np.char.array(['Bob','John','Sally']) #use numpy.array for numbers and numpy.char.array for strings
print(names_list)
print(names_array)

['Bob', 'John', 'Sally']
['Bob' 'John' 'Sally']


In [3]:
import time

size_of_seq = 100000

def pure_python_version():
    tic = time.time()
    X = range(size_of_seq)
    Y = range(size_of_seq)
    Z = [X[i] + Y[i] for i in range(len(X)) ]
    toc = time.time()
    return toc - tic

def numpy_version():
    tic = time.time()
    X = np.arange(size_of_seq)
    Y = np.arange(size_of_seq)
    Z = X + Y 
    toc = time.time()
    return toc - tic


t1 = pure_python_version()
t2 = numpy_version()
print("python: " + str(t1), "numpy: "+ str(t2))
print("Numpy is in this example " + str(t1/t2) + " times faster!")

python: 0.027966022491455078 numpy: 0.0007822513580322266
Numpy is in this example 35.750685766534595 times faster!


In [4]:
## broadcasting and array math

np.array([2,3,4,6]) * 5

array([10, 15, 20, 30])

In [5]:
li = [2,3,4,6]
li * 5

[2, 3, 4, 6, 2, 3, 4, 6, 2, 3, 4, 6, 2, 3, 4, 6, 2, 3, 4, 6]

In [None]:
## simulation with numpy - in normal distribution 

There are many other wonderous things numpy can do, you will encounter them later in the course of the program. 

## Part II. Pandas
Pandas stand for paneled data, and it is the most popular library for data scientists to manipulate, clean, and organize dataset in Python. The most fundamental data structure that exists in Pandas is called **DataFrames**. 

In [39]:
## importing data and look at optional parameters
df = pd.read_csv('auto-mpg.csv')
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [40]:
# examine and read the data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    object 
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   car name      398 non-null    object 
dtypes: float64(3), int64(4), object(2)
memory usage: 28.1+ KB


In [None]:
# examine the information in this dataframe


In [None]:
# examine the datatypes of the dataframe

In [None]:
# talk about series and dataframe 

In [None]:
# indexing and subsetting 


In [41]:
# examine whether we have missing value - it could really affect the data!
df.isna().sum()

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model year      0
origin          0
car name        0
dtype: int64

In [42]:
df.drop(df.loc[df['horsepower']=='?'].index, inplace=True)

In [54]:
df.horsepower = df['horsepower'].apply(lambda x: int(x))

In [59]:
# exercise - get the cars that are heavier than 3000 but has a horsepower less than 150 

# should be 150 cars that satisfy the criteria 

df[(df['weight']>3000)&(df['horsepower']<150)]

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino
34,16.0,6,225.0,105,3439,15.5,71,1,plymouth satellite custom
35,17.0,6,250.0,100,3329,15.5,71,1,chevrolet chevelle malibu
36,19.0,6,250.0,88,3302,15.5,71,1,ford torino 500
...,...,...,...,...,...,...,...,...,...
363,22.4,6,231.0,110,3415,15.8,81,1,buick century
364,26.6,8,350.0,105,3725,19.0,81,1,oldsmobile cutlass ls
365,20.2,6,200.0,88,3060,17.1,81,1,ford granada gl
366,17.6,6,225.0,85,3465,16.6,81,1,chrysler lebaron salon


In [None]:
# why do we seem to have anomaly in our data?


In [None]:
# get rid of anomaly


In [60]:
# exercise - get a list of car name where the mpg is less than 18 and weight is greater than 3500

# there should be 91 cars

df[(df['mpg']<18)&(df['weight']>3500)]

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
5,15.0,8,429.0,198,4341,10.0,70,1,ford galaxie 500
6,14.0,8,454.0,220,4354,9.0,70,1,chevrolet impala
7,14.0,8,440.0,215,4312,8.5,70,1,plymouth fury iii
8,14.0,8,455.0,225,4425,10.0,70,1,pontiac catalina
...,...,...,...,...,...,...,...,...,...
285,17.0,8,305.0,130,3840,15.4,79,1,chevrolet caprice classic
286,17.6,8,302.0,129,3725,13.4,79,1,ford ltd landau
287,16.5,8,351.0,138,3955,13.2,79,1,mercury grand marquis
289,16.9,8,350.0,155,4360,14.9,79,1,buick estate wagon (sw)
