# Intro to Pandas
pandas is a high-level data manipulation package built on top of NumPy. Its key data structures are the Series and the DataFrame.

## Series
A Series is a one-dimensional array whose elements are accessed through labels (an index).

In [3]:
# ensure to import!
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [5]:
# Build a Series from a plain Python list.
# No index is supplied, so pandas assigns the default 0..n-1 labels.
x = pd.Series(data=[10, 20, 30, 40, 50])
x

0    10
1    20
2    30
3    40
4    50
dtype: int64

In [6]:
# A Series exposes its components as attributes.
# `.index` returns the labels; with no explicit index pandas uses a default RangeIndex.
x.index

RangeIndex(start=0, stop=5, step=1)

In [7]:
# Access the underlying values as a NumPy array.
# `to_numpy()` is the accessor pandas recommends over the older `.values` attribute.
x.to_numpy()

array([10, 20, 30, 40, 50])

In [9]:
# Access the data type.
# A Series is backed by a single array, so all of its values share one dtype.
x.dtype

In [10]:
# Build a Series with explicit index labels instead of the default 0..n-1.
data = [450, 650, 720]
Sales = pd.Series(data=data, index=["a", "b", "c"])
Sales

a    450
b    650
c    720
dtype: int64

In [11]:
# Confirm that `Sales` is a pandas Series.
type(Sales)

pandas.core.series.Series

In [12]:
# Let's look at the index: it holds the custom labels rather than a default RangeIndex.
Sales.index

Index(['a', 'b', 'c'], dtype='object')

### Accessing Values


In [13]:
# Look up a single value by its index label.
Sales.loc["b"]

np.int64(650)

### Checking for Conditions

In [16]:
# Comparing a Series against a scalar is applied element-wise and
# returns a boolean Series that shares the original index.
Sales>500

a    False
b     True
c     True
dtype: bool

In [17]:
# A list of booleans works as a mask: positions marked True are kept.
Sales.loc[[False, True, True]]

b    650
c    720
dtype: int64

In [18]:
# Use the comparison result directly as the mask to keep only sales above 500.
Sales.loc[Sales > 500]

b    650
c    720
dtype: int64

In [19]:
# Membership on a Series is tested against its index labels.
"c" in Sales.index

True

In [21]:
# `in` tests membership against the index labels, not the values.
# 450 is a value rather than a label, so this evaluates to False.
450 in Sales

False

### Working with Dictionaries

In [22]:
# Convert the Series to a dictionary: index labels become the keys.
sales_dict=Sales.to_dict()
sales_dict

{'a': 450, 'b': 650, 'c': 720}

In [23]:
# Convert the dictionary back to a Series: the keys become the index labels.
sales_ser=Series(sales_dict)
sales_ser

a    450
b    650
c    720
dtype: int64

### Adding entries and working with null values

In [25]:
# Create a new Series from an existing one, selecting entries by label.
# Labels that are not in the original index ("f", "z") are filled with NaN,
# which also changes the dtype from int64 to float64.
new_sales=Series(Sales,index=["a","c","f","z"])
new_sales

a    450.0
c    720.0
f      NaN
z      NaN
dtype: float64

In [36]:
# Append entries by concatenating two Series.
# pd.concat returns a new Series; new_sales itself is left unchanged.
newnew_sales=Series([120],index=["h"])
nsales=pd.concat([new_sales,newnew_sales])
nsales

a    450.0
c    720.0
f      NaN
z      NaN
h    120.0
dtype: float64

In [26]:
# Flag NaN entries element-wise with NumPy (returns a boolean Series).
# NOTE: np.isnan only supports numeric data; it raises TypeError on object/string Series.
np.isnan(new_sales)

a    False
c    False
f     True
z     True
dtype: bool

In [28]:
# Flag missing values element-wise with pandas.
# pd.isnull is an alias of pd.isna and, unlike np.isnan, also handles non-numeric data.
pd.isnull(new_sales)

a    False
c    False
f     True
z     True
dtype: bool

### Naming components


In [30]:
# Give the index a name; it is shown as a header above the labels.
# This assignment modifies Sales in place.
Sales.index.name="Sales Person"
Sales

Sales Person
a    450
b    650
c    720
dtype: int64

In [31]:
# Give the Series itself a name; it appears in the footer of the output.
# This assignment modifies Sales in place.
Sales.name="Total sales"
Sales

Sales Person
a    450
b    650
c    720
Name: Total sales, dtype: int64

## Data Frames
DataFrames are two-dimensional, size-mutable, potentially heterogeneous tabular data structures. A DataFrame has two labeled axes: the rows (index) and the columns.

In [34]:
# Build a DataFrame from a list of rows (one inner list per row).
# The constructor accepts column names and, optionally, a dtype.
data = [["a", 20], ["b", 23], ["c", 41]]
df = pd.DataFrame(data=data, columns=["Name", "Age"])
df

Unnamed: 0,Name,Age
0,a,20
1,b,23
2,c,41
