# Introduction to Pandas

Pandas is a high level data manipulation package which was bulit on top of numpy. The key structures within pandas include series and dataframes.

## Series 

A series is a one dimensional array with axis labels(an index).

In [2]:
!pip install pandas



In [1]:
#importing libraries and packages
import numpy as np
import pandas as pd

In [2]:
x = pd.Series([10,20,30,40,50])
x

0    10
1    20
2    30
3    40
4    50
dtype: int64

In [4]:
#accessing the index
x.index

RangeIndex(start=0, stop=5, step=1)

In [5]:
# Accessing values
x.values

array([10, 20, 30, 40, 50], dtype=int64)

In [6]:
#accessing the dtype
# A series is an ndarray, its homogeneous and cannot store multiple dtypes
x.dtype

dtype('int64')

In [12]:
#creating a series with an index
data = [450, 650, 870]
sales = pd.Series(data, index=['Don', 'Mike', 'Edwin'])
sales

Don      450
Mike     650
Edwin    870
dtype: int64

In [13]:
# check the type 
type(sales)

pandas.core.series.Series

In [14]:
# check index of sales
sales.index

Index(['Don', 'Mike', 'Edwin'], dtype='object')

### Accessing Values

In [15]:
# you can access values using the index name
sales['Don']

450

In [16]:
sales[0]

  sales[0]


450

## Checking for conditions

In [17]:
#filter based on conditions
sales>500

Don      False
Mike      True
Edwin     True
dtype: bool

In [18]:
# we can use these booleans
sales[[False, True, True]]

Mike     650
Edwin    870
dtype: int64

In [19]:
#values greate than 500
sales[sales>500]

Mike     650
Edwin    870
dtype: int64

In [20]:
'Don' in sales

True

In [22]:
'Sally' in sales

False

## Working with dictionaries

In [23]:
#converting a series to dictionary
sales_dict = sales.to_dict()
sales_dict

{'Don': 450, 'Mike': 650, 'Edwin': 870}

In [25]:
#converting a dict to a series
sales_ser =pd.Series(sales_dict)
sales_ser

Don      450
Mike     650
Edwin    870
dtype: int64

## Adding entries and working with NaN/null values

In [26]:
# can create a new series from an existing series
#if we specify names in the index
new_sales=pd.Series(sales, index=['Don', 'Mike', 'Sally', 'Edwin', 'Lucy'])
new_sales

Don      450.0
Mike     650.0
Sally      NaN
Edwin    870.0
Lucy       NaN
dtype: float64

In [27]:
#check if any NaN values in the series
np.isnan(new_sales)

Don      False
Mike     False
Sally     True
Edwin    False
Lucy      True
dtype: bool

In [28]:
#check for null values
pd.isnull(new_sales)

Don      False
Mike     False
Sally     True
Edwin    False
Lucy      True
dtype: bool

## Naming components in a series

In [29]:
#name an index
sales.index.name = 'sales person'
sales

sales person
Don      450
Mike     650
Edwin    870
dtype: int64

In [30]:
#naming a series
sales.name = 'total tv sales'
sales

sales person
Don      450
Mike     650
Edwin    870
Name: total tv sales, dtype: int64

## DataFrames

Dataframes are two-dimensional, size mutable, potentially heterogeneous tabular data structures. This data structure contains TWO labelled axes (rows and coloumns).

### Creating a DataFrame

In [129]:
#Creating a dataframe from a list

data = [["Adrian" , 20], ["Bethany", 23], ["Chloe", 41]]

#to create a dataframe, specify what column names and the data type is 
df = pd.DataFrame(data, columns=["Name", "Age"])
df

Unnamed: 0,Name,Age
0,Adrian,20
1,Bethany,23
2,Chloe,41


# Pandas Research/Practice

In [130]:
# Creating a DataFrame from a dictionary
data_dict = {"Name": ["Adrian", "Bethany", "Chloe"], "Age": [20, 23, 41]}
df_dict = pd.DataFrame(data_dict)
df_dict

Unnamed: 0,Name,Age
0,Adrian,20
1,Bethany,23
2,Chloe,41


In [131]:
#adding custom indexes
df_dict.index = ["A", "B", "C"]
df_dict

Unnamed: 0,Name,Age
A,Adrian,20
B,Bethany,23
C,Chloe,41


In [154]:
# Creating a DF from a list of dictionaries
data_list_dicts = [{"Name": "Adrian", "Age": 20}, {"Name": "Bethany", "Age": 23}, {"Name": "Chloe", "Age": 41}]
df_list_dicts = pd.DataFrame(data_list_dicts) # Each dictionary represents a row in the DataFrame
df_list_dicts

Unnamed: 0,Name,Age
0,Adrian,20
1,Bethany,23
2,Chloe,41


In [133]:
# Creating a DF from a Series
names = pd.Series(["Adrian", "Bethany", "Chloe"], name="Name")
ages = pd.Series([20, 23, 41], name="Age")
df_series = pd.DataFrame({"Name": names, "Age": ages})
df_series

Unnamed: 0,Name,Age
0,Adrian,20
1,Bethany,23
2,Chloe,41


In [134]:
#adding a series to an existing DF
df_series["Gender"] = pd.Series(["Male", "Female", "Female"])
df_series #as you can see it has added a new column called Gender to the existing DF

Unnamed: 0,Name,Age,Gender
0,Adrian,20,Male
1,Bethany,23,Female
2,Chloe,41,Female


In [135]:
#shifting/changing a DFs index
df_series = df_series.set_index("Name")
df_series

Unnamed: 0_level_0,Age,Gender
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Adrian,20,Male
Bethany,23,Female
Chloe,41,Female


In [155]:
#missing values, want to change all NaN values to 0
df_filled = df_missing.fillna(0)
df_filled

Unnamed: 0,A,B
0,1.0,0.0
1,2.0,3.0
2,0.0,4.0


### Backfill - replaces NaN values with the next valid data point in the column
### Pad - fills NaN values with the previous valid data point in the column

In [156]:
df_filled_diff = df_missing.fillna({"A": 100, "B": 200})
df_filled_diff

Unnamed: 0,A,B
0,1.0,200.0
1,2.0,3.0
2,100.0,4.0


In [157]:
# backfill (bfill) and forward fill (ffill)
df_bfill = df_missing.bfill()
df_ffill = df_missing.ffill()
df_bfill
df_ffill


Unnamed: 0,A,B
0,1.0,
1,2.0,3.0
2,2.0,4.0


### .interpolate() in pandas is used to fill missing values NaNs in a DataFrame or Series by using interpolation

In [158]:
df_interp = df_missing.interpolate()
df_interp

Unnamed: 0,A,B
0,1.0,
1,2.0,3.0
2,2.0,4.0


In [160]:
# Dropping values
df_drop_cols = df_dict.drop(columns=["Age"])
df_drop_cols

Unnamed: 0,Name,Rank
A,Adrian,3.0
B,Bethany,2.0
C,Chloe,1.0


In [161]:
# Dropping based on a threshold
df_thresh = df_missing.dropna(thresh=1)
df_thresh

Unnamed: 0,A,B
0,1.0,
1,2.0,3.0
2,,4.0


In [162]:
# Dropping based on an index
df_drop_index = df_dict.drop(index="B")
df_drop_index # it has removed the 2 row 

Unnamed: 0,Name,Age,Rank
A,Adrian,20,3.0
C,Chloe,41,1.0


In [163]:
# Removing duplicate rows
df_dupes = pd.DataFrame({"A": [1, 1, 2], "B": [3, 3, 4]})
df_no_dupes = df_dupes.drop_duplicates()
df_no_dupes

Unnamed: 0,A,B
0,1,3
2,2,4


In [164]:
# Finding duplicate rows
duplicates = df_dupes.duplicated()
duplicates

0    False
1     True
2    False
dtype: bool

In [145]:
# Selecting an entire column
ages = df_dict["Age"]
ages # it has just outputted the age column

A    20
B    23
C    41
Name: Age, dtype: int64

## iloc lets you pick out data based on its integer position the example chosen was the first row

In [167]:
# Using iloc
df_iloc = df_dict.iloc[0]  # First row
df_iloc

Name    Adrian
Age         20
Rank       3.0
Name: A, dtype: object

In [168]:
# Using loc
df_loc = df_dict.loc["A"]  # Selecting row "A"
df_loc

Name    Adrian
Age         20
Rank       3.0
Name: A, dtype: object

In [169]:
# Filtering data
filtered_df = df_dict[df_dict["Age"] > 21]
filtered_df
#has only selected ages over 21

Unnamed: 0,Name,Age,Rank
B,Bethany,23,2.0
C,Chloe,41,1.0


In [170]:
# Sorting
df_sorted = df_dict.sort_values(by="Age", ascending=False)
df_sorted #This has chosen to sort the ages in descending order with Chloe being the oldest

Unnamed: 0,Name,Age,Rank
C,Chloe,41,1.0
B,Bethany,23,2.0
A,Adrian,20,3.0


In [171]:
# Ranking
df_dict["Rank"] = df_dict["Age"].rank(ascending=False)
print(df_dict) #as the ages were in descending order Chloe was ranked a 1 as it comes first in descending order

      Name  Age  Rank
A   Adrian   20   3.0
B  Bethany   23   2.0
C    Chloe   41   1.0


## describe() method in pandas is used to generate descriptive statistics of DataFrame columns


In [172]:
# describe()
descriptive_stats = df_dict.describe()
descriptive_stats

Unnamed: 0,Age,Rank
count,3.0,3.0
mean,28.0,2.0
std,11.357817,1.0
min,20.0,1.0
25%,21.5,1.5
50%,23.0,2.0
75%,32.0,2.5
max,41.0,3.0


In [152]:
# Other summary statistics - .mean()
mean_age = df_dict["Age"].mean()
mean_age


28.0

## Index hierarchy allows you to have multiple levels of indexing in rows or columns

In [173]:
# Creating MultiIndex
index = pd.MultiIndex.from_tuples([
    ('Alice', 'Math'), ('Alice', 'Science'),
    ('Bob', 'Math'), ('Bob', 'Science')
], names=['Student', 'Subject'])

# Creating DataFrame
df = pd.DataFrame({
    'Term 1': [85, 80, 78, 75],
    'Term 2': [90, 88, 82, 79]
}, index=index)

df

Unnamed: 0_level_0,Unnamed: 1_level_0,Term 1,Term 2
Student,Subject,Unnamed: 2_level_1,Unnamed: 3_level_1
Alice,Math,85,90
Alice,Science,80,88
Bob,Math,78,82
Bob,Science,75,79


In [None]:
#another example
