# Introduction to Pandas

Pandas is a high-level data manipulation package built on top of NumPy. Its key data structures are the Series and the DataFrame.

## Series

A Series is a one-dimensional array with axis labels (an index).

In [2]:
# Importing libraries and packages
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [3]:
# Build a Series from a plain Python list; no index is given,
# so pandas labels the entries 0, 1, 2, ...
values = [10, 20, 30, 40, 50]
x = pd.Series(data=values)
x

0    10
1    20
2    30
3    40
4    50
dtype: int64

In [4]:
# We can access the different components of a Series separately:

# Accessing the index (a default RangeIndex here, because no index was supplied)
x.index

RangeIndex(start=0, stop=5, step=1)

In [5]:
# Accessing the values on their own, returned as a NumPy array without the index labels
x.values

array([10, 20, 30, 40, 50])

In [6]:
# Accessing the dtype
# A Series reports a single dtype shared by all of its values (int64 here)
x.dtype

dtype('int64')

In [7]:
# Build a Series with explicit string labels instead of the default integer index
data = [450, 650, 870]
sales = Series(data=data, index=["Don", "Mike", "Edwin"])
sales

Don      450
Mike     650
Edwin    870
dtype: int64

In [8]:
# Check the type: the object built with the `Series` constructor is a pandas Series
type(sales)

pandas.core.series.Series

In [9]:
# With a custom index, `.index` returns the labels we supplied rather than a RangeIndex
sales.index

Index(['Don', 'Mike', 'Edwin'], dtype='object')

In [10]:
# Look up a single value by its index label
sales["Don"]

np.int64(450)

In [11]:
# Look up a single value by integer position.
# `.iloc` states the positional intent explicitly; plain `sales[0]` on a
# string-labelled Series relies on a deprecated positional fallback and
# emits a FutureWarning.
sales.iloc[0]

  sales[0]


np.int64(450)

### Checking for conditions

In [12]:
# Comparing a Series with a scalar is applied to every entry
sales>500
# The result is a boolean Series (dtype: bool) that keeps the same index labels

Don      False
Mike      True
Edwin     True
dtype: bool

In [13]:
# A list of booleans (one per entry) selects entries: True keeps the entry, False drops it
sales[[False, True, True]]

Mike     650
Edwin    870
dtype: int64

In [14]:
# Combine both steps: the comparison builds the boolean mask,
# and the mask keeps only the entries with values greater than 500
sales.loc[sales > 500]

Mike     650
Edwin    870
dtype: int64

In [15]:
# `in` checks whether a name is one of the index labels
"Don" in sales

True

In [16]:
# False example: "Sally" is not one of the index labels
"Sally" in sales

False

In [17]:
# What about a stored value rather than a label?
450 in sales
# False: 450 is a value, not an index label

False

## Working with Dictionaries

In [18]:
# Converting a Series to a dictionary
sales_dict = sales.to_dict()
sales_dict
# The index labels become the dictionary keys and the Series values become the dictionary values

{'Don': 450, 'Mike': 650, 'Edwin': 870}

In [19]:
# Going the other way: build a Series from the dictionary,
# whose keys become the index labels
sales_ser = pd.Series(sales_dict)
sales_ser

Don      450
Mike     650
Edwin    870
dtype: int64

## Adding entries and working with NaN/null values

In [20]:
# Build a new Series from an existing one against a longer list of labels.
# Labels that `sales` does not contain ("Sally", "Lucy") receive NaN.
new_sales = Series(
    data=sales,
    index=["Don", "Mike", "Sally", "Edwin", "Lucy"],
)
new_sales

Don      450.0
Mike     650.0
Sally      NaN
Edwin    870.0
Lucy       NaN
dtype: float64

In [21]:
# We can check if there are any NaN values in a Series
# For this we can use NumPy's isnan, applied to every entry
np.isnan(new_sales)
# Returns a boolean Series that is True wherever the value is NaN

Don      False
Mike     False
Sally     True
Edwin    False
Lucy      True
dtype: bool

In [22]:
# pandas provides its own null check, which gives the same boolean Series here
pd.isnull(new_sales)

Don      False
Mike     False
Sally     True
Edwin    False
Lucy      True
dtype: bool

## Naming components in a Series

In [23]:
# Name the index; the name is displayed above the index labels
# (this sets an attribute on the existing `sales` object)
sales.index.name = "Sales Person"
sales

Sales Person
Don      450
Mike     650
Edwin    870
dtype: int64

In [24]:
# Name the Series itself; the name is displayed in the footer next to the dtype
sales.name = "Total TV Sales"
sales

Sales Person
Don      450
Mike     650
Edwin    870
Name: Total TV Sales, dtype: int64

# DataFrames

DataFrames are two-dimensional, size-mutable, potentially heterogeneous tabular data structures. A DataFrame has TWO labelled axes (rows and columns).

## Creating a DataFrame

In [25]:
# Creating a DataFrame from a list of rows, each row being [name, age]
data = [
    ["Adrian", 20],
    ["Bethany", 23],
    ["Chloe", 41],
]

# `columns` supplies the column labels; the row index defaults to 0, 1, 2
df = pd.DataFrame(data=data, columns=["Name", "Age"])
df

Unnamed: 0,Name,Age
0,Adrian,20
1,Bethany,23
2,Chloe,41


## Creating a DataFrame from a Dictionary

In [27]:
# Create an example dictionary: one key per column, one list entry per row
dictionary = dict(
    Name=["Alice", "Bob", "Charlie"],
    Age=[25, 30, 35],
    City=["New York", "Los Angeles", "Chicago"],
)

# pd.DataFrame uses the keys as column names and the lists as the column entries
df_dict = pd.DataFrame(data=dictionary)
df_dict

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,Los Angeles
2,Charlie,35,Chicago


## Adding custom indexes for DataFrames

In [31]:
# Custom row labels are supplied through the index

# They can be given at creation time via the `index` parameter of pd.DataFrame:
df_dict = pd.DataFrame(dictionary, index = ['ID1', 'ID2', 'ID3'])
print(df_dict)

# Or they can be assigned afterwards through the `.index` attribute (this changes df_dict itself)
df_dict.index = ['id1', 'id2', 'id3']
print(df_dict)

        Name  Age         City
ID1    Alice   25     New York
ID2      Bob   30  Los Angeles
ID3  Charlie   35      Chicago
        Name  Age         City
id1    Alice   25     New York
id2      Bob   30  Los Angeles
id3  Charlie   35      Chicago


## Creating DataFrames from a list of Dictionaries

In [32]:
# The same people as before, this time written as one dictionary per row
dict_list = [
    dict(Name="Alice", Age=25, City="New York"),
    dict(Name="Bob", Age=30, City="Los Angeles"),
    dict(Name="Charlie", Age=35, City="Chicago"),
]
# pd.DataFrame turns each dictionary into a row and each key into a column
dict_list_df = pd.DataFrame(data=dict_list)
dict_list_df

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,Los Angeles
2,Charlie,35,Chicago


## Creating DataFrames from a Series

In [48]:
# A named Series converts directly into a DataFrame
# First create the Series; its name is used as the column label
s = pd.Series(data=["Alice", "Bob", "Charlie"], name="Name")

# Then turn it into a DataFrame
df_series = s.to_frame()
df_series

# The result has a single column: a Series is basically a single-column DataFrame

Unnamed: 0,Name
0,Alice
1,Bob
2,Charlie


## Adding a Series to a DataFrame

In [49]:
# Adding a Series to a DataFrame is equivalent to adding a column

new_col = pd.Series([25, 30, 35], name = "Age")

# Assigning the Series under a column label adds it as a new column.
# This changes df_series itself rather than creating a new DataFrame.
df_series["Age"] = new_col
df_series

Unnamed: 0,Name,Age
0,Alice,25
1,Bob,30
2,Charlie,35


## Changing/Shifting the index of a DataFrame

In [50]:
# To use an existing column as the index, pass its name to set_index:
df_series = df_series.set_index("Name")
df_series
# "Name" is now the index and no longer appears as a regular column
# NOTE(review): df_series is rebound here, so this cell presumably cannot be run twice
# in a row (the "Name" column is gone after the first run) — confirm.

Unnamed: 0_level_0,Age
Name,Unnamed: 1_level_1
Alice,25
Bob,30
Charlie,35


In [51]:
# You can also shift the data a number of steps along the index
df_series = df_series.shift(1)
df_series
# The index labels stay where they are; the values moved down by 1,
# leaving NaN in the top row and dropping the last value (35)

Unnamed: 0_level_0,Age
Name,Unnamed: 1_level_1
Alice,
Bob,25.0
Charlie,30.0


## How to fill missing values in a DataFrame?

In [57]:
# Example data with one gap (None) in every column
missing = dict(
    Name=["Alice", "Bob", "Charlie", None],
    Age=[25, 30, None, 40],
    City=["New York", None, "Chicago", "San Francisco"],
)

df_missing = pd.DataFrame(data=missing)

# fillna replaces every missing entry with the one value given
df_filled = df_missing.fillna(value="Unknown")

df_filled

Unnamed: 0,Name,Age,City
0,Alice,25.0,New York
1,Bob,30.0,Unknown
2,Charlie,Unknown,Chicago
3,Unknown,40.0,San Francisco


In [59]:
# You can also fill in missing values using the value from the previous or next row
df_filled = df_missing.ffill() # Forward fill: each missing value takes the value from the row above it
df_filled

Unnamed: 0,Name,Age,City
0,Alice,25.0,New York
1,Bob,30.0,New York
2,Charlie,30.0,Chicago
3,Charlie,40.0,San Francisco


In [60]:
df_filled = df_missing.bfill() # Backward fill: each missing value takes the value from the row below it
df_filled
# The missing Name in the last row stays empty because there is no row below it to copy from

Unnamed: 0,Name,Age,City
0,Alice,25.0,New York
1,Bob,30.0,Chicago
2,Charlie,40.0,Chicago
3,,40.0,San Francisco


In [None]:
# NOTE(review): unexecuted leftover cell. `e` is not assigned anywhere in the visible
# notebook, so this presumably raises NameError on Restart & Run All — confirm and remove.
e