# Introduction To Pandas




Be sure you have `Pandas` installed on your local computer

In [None]:
# Uncomment the command below and run this cell to install pandas (if you do not have pandas installed yet)
# ! pip install pandas

## Importing the `Pandas` package

Like every other Python Package, we import pandas with<br>
`import pandas as pd`

==> `pd` is the generally acceptable alias for pandas

In [1]:
import pandas as pd

## Series

`Series` in Pandas is a one-dimentional array<br>
It can hold any datatype<br>

In [2]:
car_details = {'Name': 'Toyota Tundra', 'Price': 45000, 'Year': 2023}
series1 = pd.Series(car_details)

In [3]:
series1

Name     Toyota Tundra
Price            45000
Year              2023
dtype: object

In [4]:
testresults = {'Advanced Mathematics': 60, 'Data Structures': 45, 'Web Development': 52, 'Discrete Mathematics': 62}

In [5]:
# Quiz 1: Create a Pandas Series from the above dictionary

testresults = pd.Series(testresults
) # replace the None datatype with your solution

In [6]:
testresults = pd.Series(testresults)

In [7]:
# run this cell to see your output
print(testresults)

Advanced Mathematics    60
Data Structures         45
Web Development         52
Discrete Mathematics    62
dtype: int64


In [8]:
# sort the Series by the index (subjects)

testresults.sort_index()

Advanced Mathematics    60
Data Structures         45
Discrete Mathematics    62
Web Development         52
dtype: int64

In [9]:
# sort the series by the values (scores)

testresults.sort_values()

Data Structures         45
Web Development         52
Advanced Mathematics    60
Discrete Mathematics    62
dtype: int64

In [10]:
# count how many rows there are in the Series

testresults.count()

4

In [11]:
# obtain the maximum value in the series

testresults.max()

62

In [12]:
testresults.shape

(4,)

In [13]:
# obtain the maximum index and value (filter)

testresults[testresults == testresults.max()]

Discrete Mathematics    62
dtype: int64

In [14]:
testresults[testresults== testresults.min()]

Data Structures    45
dtype: int64

In [19]:
# Quiz 2: Get the minimum index and value (use the .min() method)

min_indices = testresults.idxmin()
min_rows = testresults.loc[min_indices]
print(min_rows)

45


In [25]:
# perform calculations on all of the values in the Series

percent_tst = testresults / 100

In [26]:
# convert the series to a dictionary

testresults.to_dict()

{'Advanced Mathematics': 60,
 'Data Structures': 45,
 'Web Development': 52,
 'Discrete Mathematics': 62}

## DataFrames

In [20]:
accommodation_venues = {
    'Name': ['Leisure Hotels & Suits', 'Reboot Guest House', 'Dove House', 'Live Rest Hotels', 'Khira\'s Place']
    , 'Accomodation Type': ['Hotel', 'Guest House', 'Guest House', 'Hotel', 'Motel']
    , 'Price Per Night': [20000, 14000, 18800, 32500, 3000]
}

In [21]:
# turn the dictionary to a pandas dataframe

accommodation_venues = pd.DataFrame(accommodation_venues)

In [23]:
accommodation_venues

Unnamed: 0,Name,Accomodation Type,Price Per Night
0,Leisure Hotels & Suits,Hotel,20000
1,Reboot Guest House,Guest House,14000
2,Dove House,Guest House,18800
3,Live Rest Hotels,Hotel,32500
4,Khira's Place,Motel,3000


In [22]:
# view details about the dataframe

accommodation_venues.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Name               5 non-null      object
 1   Accomodation Type  5 non-null      object
 2   Price Per Night    5 non-null      int64 
dtypes: int64(1), object(2)
memory usage: 252.0+ bytes


TypeError: 'tuple' object is not callable

In [27]:
accommodation_venues.describe()

Unnamed: 0,Price Per Night
count,5.0
mean,17660.0
std,10669.020574
min,3000.0
25%,14000.0
50%,18800.0
75%,20000.0
max,32500.0


In [24]:
# view the number of rows and columns in the dataframe


accommodation_venues.shape

(5, 3)

In [None]:
# view the content of the dataframe

accommodation_venues

In [29]:
# sort the dataframe by the Names of the venues in descending order

accommodation_venues.sort_values(by='Name', ascending=False)

Unnamed: 0,Name,Accomodation Type,Price Per Night
1,Reboot Guest House,Guest House,14000
3,Live Rest Hotels,Hotel,32500
0,Leisure Hotels & Suits,Hotel,20000
4,Khira's Place,Motel,3000
2,Dove House,Guest House,18800


## Series - From a Data Frame
A Series can be obtained from a dataframe in the sme manner as you would access a key in a Python Dictionary


In [30]:
# get only the 'Names' column (Series)

accommodation_venues['Name']

0    Leisure Hotels & Suits
1        Reboot Guest House
2                Dove House
3          Live Rest Hotels
4             Khira's Place
Name: Name, dtype: object

In [33]:
accommodation_venues[['Accomodation Type', 'Price Per Night']]

Unnamed: 0,Accomodation Type,Price Per Night
0,Hotel,20000
1,Guest House,14000
2,Guest House,18800
3,Hotel,32500
4,Motel,3000


In [31]:
# sort the values in the series

accommodation_venues['Name'].sort_values(ascending=True)

2                Dove House
4             Khira's Place
0    Leisure Hotels & Suits
3          Live Rest Hotels
1        Reboot Guest House
Name: Name, dtype: object

In [32]:
# slicing the Series

accomodateion_names = accommodation_venues['Name']

print(accomodateion_names)

0    Leisure Hotels & Suits
1        Reboot Guest House
2                Dove House
3          Live Rest Hotels
4             Khira's Place
Name: Name, dtype: object


In [None]:
# how many rows are there in the series

accomodateion_names.size 

## Selecting DataFrame Subsets

In [None]:
# Select only the names and the prices columns

accommodation_venues[['Name', 'Price Per Night']]

## Filtering DataFrames

In [None]:
# Get accommodation venues greater than 10000

accommodation_venues[accommodation_venues['Price Per Night'] > 10000]

In [34]:
# Quiz 3: Get the list of accomodation venues less than 15000 per night
av_filter = accommodation_venues['Price Per Night'] < 15000
accommodation_venues[av_filter]


Unnamed: 0,Name,Accomodation Type,Price Per Night
1,Reboot Guest House,Guest House,14000
4,Khira's Place,Motel,3000


In [36]:
# Quiz 4: What are the accommodation type and price per night of Reboot Guest House

accommodation_venues[accommodation_venues['Name'] == 'Reboot Guest House']

Unnamed: 0,Name,Accomodation Type,Price Per Night
1,Reboot Guest House,Guest House,14000


### Noting the Datatypes of your return values

In [39]:
 # selecting a single column (Python dictionary-key style) will return a Series datatype


type(accommodation_venues['Price Per Night'])

pandas.core.series.Series

In [None]:
# selecting more than one column, with the column names provided in a list returns a dataframe datatype

type(accommodation_venues[['Name', 'Price Per Night']])

## More on Filters - Conditions

To filter a dataset in Pandas with more than one condition, you make use of the pipe `|` and ampersand `&` symbols<br>

`|`      - the pipe symbol is equivalent to the `or` statement in Python<br>
`&`      - the ampersand symbol is equivalent to the `and` statement in Python<br>
`isin`   - the `.isin()` method takes in an iterable of optionsand returns the dataset of specified series containing those values

In [40]:
# Get a list of Guest Houses and Hotels only

accommodation_venues[accommodation_venues['Accomodation Type'].isin(('Guest House', 'Hotel'))]

Unnamed: 0,Name,Accomodation Type,Price Per Night
0,Leisure Hotels & Suits,Hotel,20000
1,Reboot Guest House,Guest House,14000
2,Dove House,Guest House,18800
3,Live Rest Hotels,Hotel,32500


In [None]:
# Get a list of venues that are either Guest Houses or have their Price per Night less than 10000

accommodation_venues[(accommodation_venues['Accomodation Type'] == 'Guest House') | (accommodation_venues['Price Per Night'] < 10000)]

In [43]:
# Quiz 5: Get a list of Guest Houses with prices above 15000
a = (accommodation_venues['Accomodation Type'] == 'Guest House') & (accommodation_venues['Price Per Night'] > 15000)
filtered_accommodation_venues = accommodation_venues[a]


## Exercises
Create a datafram from the dictionary defined below and answer the questions following

In [44]:
shoes = {
    'Owner': ['Lola', 'Bosun', 'Gideon', 'Eben', 'Patrick', 'Aminu', 'Rita', 'Nick', 'Dayo', 'Musa', 'Joy']
    , 'Size': [36.5, 45, 43.5, 41, 45, 43, 41, 42.5, 46.5, 39, 42]
    , 'Price': [6000, 10000, 12000, 5500, 6850, 7920, 2300, 4200, 7600, 4000, 3650]
    , 'Brand': ['Nike', 'Reebok', 'Converse', 'Puma', 'Gucci', 'Nike', 'Nike', 'Puma', 'Puma', 'Converse', 'Reebok']
}

In [45]:
# Q1: Create a dataframe of the shoes dictionary here

shoes_df = pd.DataFrame(shoes)
shoes_df

Unnamed: 0,Owner,Size,Price,Brand
0,Lola,36.5,6000,Nike
1,Bosun,45.0,10000,Reebok
2,Gideon,43.5,12000,Converse
3,Eben,41.0,5500,Puma
4,Patrick,45.0,6850,Gucci
5,Aminu,43.0,7920,Nike
6,Rita,41.0,2300,Nike
7,Nick,42.5,4200,Puma
8,Dayo,46.5,7600,Puma
9,Musa,39.0,4000,Converse


In [None]:
# Q2: How many rows and columns exist in the dataframe

shoes_df.shape

In [46]:
# Q3: Get a DataFrame of shoes of Nike and Reebok brands.
#     Store your result in a new variable

nike_rebook_df = shoes_df[(shoes_df['Brand'] == 'Nike') | (shoes_df['Brand'] == 'Reebok')]

In [None]:
# Q4. Order your result above by the shoe price in descending order

nike_rebook_df.sort_values(by=['Price'], ascending=False)

In [47]:
# Q5: Derive from the main dataframe Puma and Reebok shoes of sizes above 42

shoes_df[(shoes_df['Brand'] == 'Reebok') | (shoes_df['Brand'] == 'Puma') & (shoes_df['Size'] > 42)]

Unnamed: 0,Owner,Size,Price,Brand
1,Bosun,45.0,10000,Reebok
7,Nick,42.5,4200,Puma
8,Dayo,46.5,7600,Puma
10,Joy,42.0,3650,Reebok


In [48]:
shoes_df[((shoes_df['Brand'] == 'Reebok') | (shoes_df['Brand'] == 'Puma')) & (shoes_df['Size'] > 42)]


Unnamed: 0,Owner,Size,Price,Brand
1,Bosun,45.0,10000,Reebok
7,Nick,42.5,4200,Puma
8,Dayo,46.5,7600,Puma


: 