# Introduction to Pandas

Pandas is a high-level data manipulation package built on top of NumPy. Its key data structures are the Series and the DataFrame.

## Series

A Series is a one-dimensional array with axis labels (an index).

In [3]:
# Importing libraries and packages
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [5]:
# Build a Series from a plain Python list.
# No index is given, so pandas assigns the default positions 0..4 as labels.
x = pd.Series(data=[10, 20, 30, 40, 50])
x

0    10
1    20
2    30
3    40
4    50
dtype: int64

In [6]:
# We can access the different components of a Series separately:

# Accessing the index (the axis labels of the Series)
x.index

RangeIndex(start=0, stop=5, step=1)

In [7]:
# Accessing the values as a NumPy array
# `to_numpy()` is the accessor the pandas docs recommend over the older `.values` attribute
x.to_numpy()

array([10, 20, 30, 40, 50])

In [9]:
# Accessing the dtype
# A Series carries a single dtype that applies to all of its values (int64 here)
x.dtype

dtype('int64')

In [10]:
# Build a Series with custom index labels: one label per value, in the same order
data = [450, 650, 870]
sales = Series(data=data, index=["Don", "Mike", "Edwin"])
sales

Don      450
Mike     650
Edwin    870
dtype: int64

In [11]:
# Check the type of the object we just created
type(sales)

pandas.core.series.Series

In [12]:
# The index of sales now holds the labels we supplied, rather than a default RangeIndex
sales.index

Index(['Don', 'Mike', 'Edwin'], dtype='object')

In [14]:
# Look up a single value by its index label
sales["Don"]

np.int64(450)

In [15]:
# Look up a single value by its integer position.
# Use .iloc for positional access: plain sales[0] on a label-indexed Series relies on a
# deprecated positional fallback and emits a FutureWarning.
sales.iloc[0]

  sales[0]


np.int64(450)

### Checking for conditions

In [16]:
# Comparing a Series against a scalar is evaluated element by element
sales>500
# The result is a boolean Series that keeps the same index labels

Don      False
Mike      True
Edwin     True
dtype: bool

In [17]:
# A list of booleans can be used as a mask: only the positions marked True are kept
sales[[False, True, True]]

Mike     650
Edwin    870
dtype: int64

In [19]:
# To keep only the values greater than 500, pass the boolean Series itself as the mask
sales[sales>500]

Mike     650
Edwin    870
dtype: int64

In [20]:
# `in` checks whether a label is present in the index
"Don" in sales

True

In [21]:
# False example: "Sally" is not one of the index labels
"Sally" in sales

False

In [23]:
# What about a number that is stored in the Series?
450 in sales
# False: 450 is a value, not an index label, and `in` only looks at the labels

False

## Working with Dictionaries

In [25]:
# Converting a Series to a dictionary
sales_dict = sales.to_dict()
sales_dict
# The index labels become the dictionary keys; the Series values become the dictionary values

{'Don': 450, 'Mike': 650, 'Edwin': 870}

In [26]:
# Converting a dictionary back to a Series (the keys become the index labels)
sales_ser = Series(sales_dict)
sales_ser

Don      450
Mike     650
Edwin    870
dtype: int64

## Adding entries and working with NaN/null values

In [28]:
# We can create a new Series from an existing Series
# Labels in the new index that were NOT in the original Series get NaN as their value
# (the result is shown as float64 because NaN is a floating-point value)
new_sales = Series(sales, index = ["Don", "Mike", "Sally", "Edwin", "Lucy"])
new_sales

Don      450.0
Mike     650.0
Sally      NaN
Edwin    870.0
Lucy       NaN
dtype: float64

In [30]:
# We can check if there are any NaN values in a Series
# NumPy's isnan is applied element by element and returns a boolean Series
np.isnan(new_sales)
# Shows True for any NaN values

Don      False
Mike     False
Sally     True
Edwin    False
Lucy      True
dtype: bool

In [31]:
# pandas has its own null check; here it flags the same missing entries as np.isnan
pd.isnull(new_sales)

Don      False
Mike     False
Sally     True
Edwin    False
Lucy      True
dtype: bool

## Naming components in a Series

In [33]:
# Give the index a name; it is displayed as a header above the labels
# Note: this modifies `sales` in place
sales.index.name = "Sales Person"
sales

Sales Person
Don      450
Mike     650
Edwin    870
dtype: int64

In [34]:
# Give the Series itself a name; it is displayed in the footer next to the dtype
# Note: this modifies `sales` in place
sales.name = "Total TV Sales"
sales

Sales Person
Don      450
Mike     650
Edwin    870
Name: Total TV Sales, dtype: int64

# DataFrames

DataFrames are two-dimensional, size-mutable, potentially heterogeneous tabular data structures. A DataFrame has TWO labelled axes (rows and columns).

## Creating a DataFrame

In [38]:
# Build a DataFrame from a list of rows: each inner list is one row
data = [
    ["Adrian", 20],
    ["Bethany", 23],
    ["Chloe", 41],
]

# Column labels are supplied through `columns`; the dtypes are left for pandas to infer
df = pd.DataFrame(data=data, columns=["Name", "Age"])
df

Unnamed: 0,Name,Age
0,Adrian,20
1,Bethany,23
2,Chloe,41
