# Intro to Pandas
Pandas is a high-level data manipulation package built on top of NumPy. Its key data structures are the Series and the DataFrame.

## Series
A Series is a one-dimensional array whose elements are accessed through labels (an index).

In [104]:
# ensure to import!
import numpy as np
import pandas as pd
#from pandas import Series, DataFrame

In [5]:
# Build a Series from a plain Python list.
# No index is given, so the entries are labelled 0, 1, 2, ...
values = [10, 20, 30, 40, 50]
x = pd.Series(values)
x

0    10
1    20
2    30
3    40
4    50
dtype: int64

In [6]:
# A Series has several components that can be inspected separately.
# .index holds the labels (a default RangeIndex here, since none were supplied).
x.index

RangeIndex(start=0, stop=5, step=1)

In [7]:
# access the values as a NumPy array
# (.to_numpy() is the accessor pandas recommends over the older .values attribute)
x.to_numpy()

array([10, 20, 30, 40, 50])

In [9]:
# access data type
# a Series keeps its values in a single array, so it has exactly one dtype
x.dtype

In [10]:
# create a Series with custom index labels
# (only `pd` is imported, so the constructor is reached as pd.Series)
data = [450, 650, 720]
Sales = pd.Series(data, index=["a", "b", "c"])
Sales

a    450
b    650
c    720
dtype: int64

In [11]:
# confirm that the object is a pandas Series
type(Sales)

pandas.core.series.Series

In [12]:
# let's look at the index: it now holds the custom labels
Sales.index

Index(['a', 'b', 'c'], dtype='object')

### Accessing Values


In [13]:
# look up a single value by its index label
Sales["b"]

np.int64(650)

### Checking for Conditions

In [16]:
# comparing a Series with a scalar is elementwise and returns a boolean Series
Sales>500

a    False
b     True
c     True
dtype: bool

In [17]:
# a list of booleans selects the entries whose position is True
Sales[[False,True,True]]

b    650
c    720
dtype: int64

In [18]:
# to keep only the entries >500, index with the boolean Series the comparison produces
Sales[Sales>500]

b    650
c    720
dtype: int64

In [19]:
# `in` tests membership against the index labels
"c" in Sales

True

In [21]:
450 in Sales
# 450 is a value, not an index label, so this is False

False

### Working with Dictionaries

In [22]:
# convert a Series to a dictionary (index labels become the keys)
sales_dict=Sales.to_dict()
sales_dict

{'a': 450, 'b': 650, 'c': 720}

In [23]:
# convert a dictionary back to a Series (keys become the index labels)
# (only `pd` is imported, so the constructor is reached as pd.Series)
sales_ser = pd.Series(sales_dict)
sales_ser

a    450
b    650
c    720
dtype: int64

### Adding entries and working with null values

In [25]:
# create a new Series from an existing one
# labels requested in `index` that the original does not have are assigned NaN
# (only `pd` is imported, so the constructor is reached as pd.Series)
new_sales = pd.Series(Sales, index=["a", "c", "f", "z"])
new_sales

a    450.0
c    720.0
f      NaN
z      NaN
dtype: float64

In [41]:
# add values by concatenating a second Series with pd.concat
# (only `pd` is imported, so the constructor is reached as pd.Series)
newnew_sales = pd.Series([120], index=["h"])
nsales = pd.concat([new_sales, newnew_sales])
nsales

a    450.0
c    720.0
f      NaN
z      NaN
h    120.0
dtype: float64

In [49]:
# delete a value by its label
# .drop returns a new Series and, with errors="ignore", tolerates a missing
# label, so this cell can be re-run safely (an in-place .pop("a") raises
# KeyError the second time because "a" is already gone)
nsales = nsales.drop("a", errors="ignore")
nsales

c    720.0
f      NaN
z      NaN
h    120.0
dtype: float64

In [45]:
# check for NaN values in a Series with NumPy
# np.isnan is applied elementwise and returns a boolean Series
np.isnan(new_sales)

a    False
c    False
f     True
z     True
dtype: bool

In [28]:
# check for null values with pandas
# pd.isnull returns a boolean Series that is True where a value is missing
pd.isnull(new_sales)

a    False
c    False
f     True
z     True
dtype: bool

### Naming components


In [30]:
# give the index a name; it is displayed above the index labels
Sales.index.name="Sales Person"
Sales

Sales Person
a    450
b    650
c    720
dtype: int64

In [31]:
# give the Series itself a name; it is shown in the last line of the output
Sales.name="Total sales"
Sales

Sales Person
a    450
b    650
c    720
Name: Total sales, dtype: int64

## Data Frames
Data frames are 2D, size-mutable, potentially heterogeneous tabular data structures. A DataFrame has TWO labeled axes: rows and columns.

### Creating a Dataframe

In [62]:
# Build a DataFrame from a list of rows (one inner list per row).
# Column names (and, optionally, data types) can be given at construction time.
data = [
    ["a", 20],
    ["b", 23],
    ["c", 41],
]
df = pd.DataFrame(data, columns=["Name", "Age"])
df

Unnamed: 0,Name,Age
0,a,20
1,b,23
2,c,41


In [63]:
# Build a DataFrame from a dictionary: each key is a column label and
# each value holds that column's entries.
data = {
    "Name": ["a", "b", "c"],
    "Age": [20, 23, 41],
}
df = pd.DataFrame.from_dict(data, orient="columns")
df

Unnamed: 0,Name,Age
0,a,20
1,b,23
2,c,41


In [56]:
# Build a DataFrame from a list of dictionaries: each dictionary is one row,
# keyed by column label.
data = [
    {"Name": "a", "Age": 20},
    {"Name": "b", "Age": 23},
    {"Name": "c", "Age": 41},
]
df = pd.DataFrame(data)
df

Unnamed: 0,Name,Age
0,a,20
1,b,23
2,c,41


In [64]:
# create a DataFrame from Series: put the Series into a dictionary keyed by
# column label and pass that dictionary to pd.DataFrame
age = pd.Series([20, 23, 41], name="Age")
name = pd.Series(["a", "b", "c"], name="Name")
# called `columns` rather than `dict` so the built-in `dict` is not shadowed
columns = {"Name": name, "Age": age}
df = pd.DataFrame(columns)
df

Unnamed: 0,Name,Age
0,a,20
1,b,23
2,c,41


### Adding custom indices

In [61]:
# to add index labels when initialising a DataFrame, pass index=:
data = [["a", 20], ["b", 23], ["c", 41]]
df = pd.DataFrame(data, columns=["Name", "Age"], index=[10, 20, 30])
df

Unnamed: 0,Name,Age
10,a,20
20,b,23
30,c,41


In [55]:
# to relabel an already-built DataFrame, assign the new labels to its .index attribute
# (this changes the existing frame rather than creating a new one)
df.index=["a","b","c"]
df

Unnamed: 0,Name,Age
a,a,20
b,b,23
c,c,41


### Adding data to existing dataframe

In [119]:
# Append a Series as a new column with pd.concat along the column axis.
# The Series has no name, so the new column is labelled 0.
data = [["a", 20], ["b", 23], ["c", 41]]
result = pd.Series(["Pass", "Pass", "Fail"])
people = pd.DataFrame(data, columns=["Name", "Age"])
df = pd.concat([people, result], axis="columns")
df

Unnamed: 0,Name,Age,0
0,a,20,Pass
1,b,23,Pass
2,c,41,Fail


### Setting an index
Sometimes we want to use a specific column or array for the index.


In [117]:
# Promote an existing column to the index with .set_index.
# The column leaves the regular columns and becomes the row labels.
data = [["a", 20], ["b", 23], ["c", 41]]
df = pd.DataFrame(data, columns=["Name", "Age"]).set_index("Name")
df

Unnamed: 0_level_0,Age
Name,Unnamed: 1_level_1
a,20
b,23
c,41


In [125]:
# introducing a new index:
# start from a fresh frame so that "Name" is still a regular column;
# .set_index replaces the current index, so applying it to a frame that is
# already indexed by "Name" would discard the names
data = [["a", 20], ["b", 23], ["c", 41]]
df = pd.DataFrame(data, columns=["Name", "Age"])
df = df.set_index(pd.Index([2, 3, 7]))
df

Unnamed: 0,Name,Age
2,a,20
3,b,23
7,c,41


In [128]:
# Use a computed index: the row labels are the squares of another Series.
data = [["a", 20], ["b", 23], ["c", 41]]
s = pd.Series([1, 2, 3])
squared_labels = s ** 2
df = pd.DataFrame(data, columns=["Name", "Age"]).set_index([squared_labels])
df

Unnamed: 0,Name,Age
1,a,20
4,b,23
9,c,41


### Filling in missing data


In [129]:
# fill any NaN with a specific number, use .fillna
# (the data is called `scores` rather than `dict` so the built-in `dict`
# is not shadowed for the rest of the notebook)
scores = {'First Score': [100, 90, np.nan, 95],
          'Second Score': [30, 45, 56, np.nan],
          'Third Score': [np.nan, 40, 80, 98]}
df = pd.DataFrame(scores)
df.fillna(0)

Unnamed: 0,First Score,Second Score,Third Score
0,100.0,30.0,0.0
1,90.0,45.0,40.0
2,0.0,56.0,80.0
3,95.0,0.0,98.0


In [131]:
# fill any NaN with the value above it, use .ffill() [.bfill() takes the value below]
# .fillna(method='ffill') is deprecated in pandas and emits a FutureWarning
# a NaN in the first row has nothing above it and therefore stays NaN
# (the data is called `scores` rather than `dict` so the built-in `dict`
# is not shadowed)
scores = {'First Score': [100, 90, np.nan, 95],
          'Second Score': [30, 45, 56, np.nan],
          'Third Score': [np.nan, 40, 80, 98]}
df = pd.DataFrame(scores)
df.ffill()

  df.fillna(method='ffill')


Unnamed: 0,First Score,Second Score,Third Score
0,100.0,30.0,
1,90.0,45.0,40.0
2,90.0,56.0,80.0
3,95.0,56.0,98.0


In [133]:
# Missing values can also be interpolated, i.e. estimated from the known
# values around them.
gappy_columns = {
    'A': [1, 2, np.nan, 4],
    'B': [5, np.nan, np.nan, 8],
    'C': [9, 10, 11, 12],
}
df = pd.DataFrame(gappy_columns)
df.interpolate()

Unnamed: 0,A,B,C
0,1.0,5.0,9
1,2.0,6.0,10
2,3.0,7.0,11
3,4.0,8.0,12


### Dropping from a dataframe

In [139]:
# Remove a whole row by passing its index label to .drop.
# The result is a new frame; `df` itself is left untouched.
data = {
    "name": ["Sally", "Mary", "John"],
    "age": [50, 40, 30],
    "qualified": [True, False, False],
}
df = pd.DataFrame(data)
df.drop(index=0)

Unnamed: 0,name,age,qualified
1,Mary,40,False
2,John,30,False


In [138]:
# Remove a whole column: name it through columns=, which is equivalent to
# passing the label together with axis='columns'.
data = {
    "name": ["Sally", "Mary", "John"],
    "age": [50, 40, 30],
    "qualified": [True, False, False],
}
df = pd.DataFrame(data)
df.drop(columns="age")

Unnamed: 0,name,qualified
0,Sally,True
1,Mary,False
2,John,False


In [141]:
# dropping rows that meet a condition, using a boolean mask:
data = {
  "name": ["Sally", "Mary", "John"],
  "age": [50, 40, 30],
  "qualified": [True, False, False]
}
df = pd.DataFrame(data)
# negate the boolean column with ~ instead of comparing it with `== False`
mask = ~df['qualified']  # True for the rows to drop (qualified is False)
df[~mask]  # keep every row the mask does not flag

Unnamed: 0,name,age,qualified
0,Sally,50,True


### Dealing with duplication
There are times where duplicates need to either be dropped or found.

In [143]:
# Remove repeated entries with .drop_duplicates; `subset` names the column
# that is compared, and the first row for each value is the one kept.
df = pd.DataFrame(
    {
        'Name': ['Alice', 'Bob', 'Alice', 'David'],
        'Age': [25, 30, 25, 40],
        'City': ['NY', 'LA', 'SF', 'Chicago'],
    }
)
df.drop_duplicates(subset='Name')

Unnamed: 0,Name,Age,City
0,Alice,25,NY
1,Bob,30,LA
3,David,40,Chicago


In [145]:
# find duplicates with .duplicated: True marks a row whose Name already appeared in an earlier row
df.duplicated(subset=["Name"])

0    False
1    False
2     True
3    False
dtype: bool

### Selecting data in a dataframe


In [146]:
# select a single column by its label; the result is a Series
df['Name']

0    Alice
1      Bob
2    Alice
3    David
Name: Name, dtype: object

In [None]:
# look for a 