# Introduction to Pandas

<img src="https://raw.githubusercontent.com/fralfaro/DS-Cheat-Sheets/main/docs/examples/pandas/pandas.png" alt="pandas logo" width="300">

[Pandas](https://pandas.pydata.org/) is built on NumPy and provides easy-to-use
data structures and data analysis tools for the Python
programming language.

## Install and import Pandas

```bash
$ pip install pandas
```

In [None]:
# Import Pandas convention
import pandas as pd

## Pandas Data Structures

### Series

<img src="https://raw.githubusercontent.com/fralfaro/DS-Cheat-Sheets/main/docs/examples/pandas/serie.png" alt="pandas Series illustration">

A **one-dimensional** labeled array capable of holding any data type.

In [None]:
# Import pandas
import pandas as pd

# Monthly sales figures keyed by month label; the dict's insertion order
# becomes the order of the Series index
sales_data = pd.Series({
    'jan': 1500,
    'feb': 1200,
    'mar': 1800,
    'apr': 1600,
    'may': 1300,
    'jun': 1700,
    'jul': 1400,
    'aug': 1500,
    'sep': 1600,
    'oct': 1800,
})

# Show a heading, the Series itself and its Python type, one per line
print("Monthly Sales Data:", sales_data, type(sales_data), sep="\n")

### DataFrame

<img src="https://raw.githubusercontent.com/fralfaro/DS-Cheat-Sheets/main/docs/examples/pandas/df.png" alt="pandas DataFrame illustration">

A **two-dimensional** labeled data structure with columns of potentially different types.

In [None]:
# Column-oriented input: each key is a column name, each value the list of
# that column's entries (one entry per country, ten countries in total)
data = dict(
    country=['United States', 'China', 'Japan', 'Germany', 'United Kingdom', 'India', 'France', 'Italy', 'Brazil', 'Canada'],
    capital=['Washington, D.C.', 'Beijing', 'Tokyo', 'Berlin', 'London', 'New Delhi', 'Paris', 'Rome', 'Brasília', 'Ottawa'],
    population=[331449281, 1393000000, 126476461, 83783945, 67886011, 1303171035, 67186600, 60277900, 211050000, 37742154],
    GDP=[21.44, 14.34, 5.07, 4.01, 2.99, 3.11, 2.78, 2.15, 1.77, 1.73],
)

# `columns` pins the column order explicitly; here it equals the dict's key order
df = pd.DataFrame(data=data, columns=list(data))

# Print a heading, then display 'df' as the cell's output
print("\ndf:")
df

In [None]:
import pandas as pd

# Field names, stated once, followed by one tuple of values per country
fields = ("country", "capital", "population", "GDP")
rows = [
    ("United States", "Washington, D.C.", 331449281, 21.44),
    ("China", "Beijing", 1393000000, 14.34),
    ("Japan", "Tokyo", 126476461, 5.07),
    ("Germany", "Berlin", 83783945, 4.01),
    ("United Kingdom", "London", 67886011, 2.99),
    ("India", "New Delhi", 1303171035, 3.11),
    ("France", "Paris", 67186600, 2.78),
    ("Italy", "Rome", 60277900, 2.15),
    ("Brazil", "Brasília", 211050000, 1.77),
    ("Canada", "Ottawa", 37742154, 1.73),
]

# Row-oriented structure: a list holding one dict per record
data_list_of_dicts = [dict(zip(fields, row)) for row in rows]

# Creating DataFrame from list of dictionaries; `columns` fixes the column order
df = pd.DataFrame(data=data_list_of_dicts, columns=list(fields))

# Display one randomly chosen row (no seed is passed, so the row varies between runs)
df.sample()

# How to read Data

## Read csv files

In [None]:
import pandas as pd

In [36]:
# Read the CSV at a path relative to the notebook's working directory into a
# DataFrame (no index_col is given, so pandas assigns a default integer index),
# then display it as the cell's output
covid_df = pd.read_csv('./data/covid19-og.csv')
covid_df

Unnamed: 0,dateRep,day,month,year,cases,deaths,countriesAndTerritories,geoId,countryterritoryCode,popData2018
0,14/04/2020,14,4,2020,58,3,Afghanistan,AF,AFG,37172386.0
1,13/04/2020,13,4,2020,52,0,Afghanistan,AF,AFG,37172386.0
2,12/04/2020,12,4,2020,34,3,Afghanistan,AF,AFG,37172386.0
3,11/04/2020,11,4,2020,37,0,Afghanistan,AF,AFG,37172386.0
4,10/04/2020,10,4,2020,61,1,Afghanistan,AF,AFG,37172386.0
...,...,...,...,...,...,...,...,...,...,...
10737,25/03/2020,25,3,2020,0,0,Zimbabwe,ZW,ZWE,14439018.0
10738,24/03/2020,24,3,2020,0,1,Zimbabwe,ZW,ZWE,14439018.0
10739,23/03/2020,23,3,2020,0,0,Zimbabwe,ZW,ZWE,14439018.0
10740,22/03/2020,22,3,2020,1,0,Zimbabwe,ZW,ZWE,14439018.0


In [43]:
# Load the AirBnb NYC 2019 dataset, using the first CSV column as the row index.
# Note: the whole file is read (no `nrows` limit is passed); pass e.g. nrows=10
# to read_csv if only the first rows are wanted for quick inspection.
nyc_df = pd.read_csv('./data/AirBnb_NYC_2019.csv',index_col=0)
nyc_df

Unnamed: 0_level_0,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.94190,Private room,150,3,0,,,1,365
3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.10,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36484665,Charming one bedroom - newly renovated rowhouse,8232441,Sabrina,Brooklyn,Bedford-Stuyvesant,40.67853,-73.94995,Private room,70,2,0,,,2,9
36485057,Affordable room in Bushwick/East Williamsburg,6570630,Marisol,Brooklyn,Bushwick,40.70184,-73.93317,Private room,40,4,0,,,2,36
36485431,Sunny Studio at Historical Neighborhood,23492952,Ilgar & Aysel,Manhattan,Harlem,40.81475,-73.94867,Entire home/apt,115,10,0,,,1,27
36485609,43rd St. Time Square-cozy single bed,30985759,Taz,Manhattan,Hell's Kitchen,40.75751,-73.99112,Shared room,55,1,0,,,6,2


In [47]:
# Load the dataset with multi-level indices and headers
warehouse_df = pd.read_csv('./data/multi_index_warehouses.csv', index_col=[0,1,2], header=[0,1])
warehouse_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,2010,2010,2011,2011,2012,2012,2013,2013,2014,2014,2015,2015,2016,2016,2017,2017,2018,2018,2019,2019
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Jan-Jun,Jul-Dec,Jan-Jun,Jul-Dec,Jan-Jun,Jul-Dec,Jan-Jun,Jul-Dec,Jan-Jun,Jul-Dec,Jan-Jun,Jul-Dec,Jan-Jun,Jul-Dec,Jan-Jun,Jul-Dec,Jan-Jun,Jul-Dec,Jan-Jun,Jul-Dec
NY Warehouses,Buffalo,Mobile,26,12,10,23,18,10,10,26,16,18,20,21,20,26,29,20,11,21,25,16
NY Warehouses,Buffalo,TV,19,22,27,19,27,12,24,28,27,28,10,16,25,26,20,25,10,27,20,20
NY Warehouses,Buffalo,AC,16,24,20,23,29,15,10,20,16,16,25,21,19,12,21,19,11,28,19,19
NY Warehouses,Ithaca,Mobile,10,28,27,22,18,14,22,12,14,16,21,13,27,17,15,19,21,15,29,14
NY Warehouses,Ithaca,TV,13,13,11,15,12,15,27,17,10,25,20,27,17,16,13,23,15,26,15,28
NY Warehouses,Ithaca,AC,18,17,19,28,28,14,21,18,25,17,18,27,23,24,22,12,11,12,19,22
NY Warehouses,Beacon,Mobile,10,17,24,27,25,11,22,26,10,13,25,13,29,23,28,22,22,26,20,18
NY Warehouses,Beacon,TV,12,22,26,26,15,28,15,26,12,18,17,10,21,19,24,23,26,19,19,13
NY Warehouses,Beacon,AC,13,11,23,13,20,26,10,12,14,10,28,26,21,26,29,12,29,18,20,24
CA Warehouses,San Francisco,Mobile,15,25,18,16,13,13,19,15,21,23,11,26,27,16,16,18,29,22,25,20


## Read Excel files

In [54]:
# Read only the sheet named "gre" from the Excel workbook; the resulting
# DataFrame is displayed as the cell output and not stored in a variable
pd.read_excel('./data/covid19.xlsx',sheet_name="gre")


Unnamed: 0,name,gre
0,jack,325
1,anna,329
2,,300
3,jasmine,338


## Read JSON files

In [55]:
# Parse the JSON file into a DataFrame; the result is displayed as the cell
# output and not stored in a variable
pd.read_json('./data/admits.json')

Unnamed: 0,gpa,gre,toefl,workex,research,admit
0,6.80,326.0,106,0,3+,0
1,8.24,305.0,114,3.5,0,1
2,6.56,312.0,116,1,2,1
3,7.62,326.0,107,3,3,1
4,6.01,314.0,87,2,2,1
...,...,...,...,...,...,...
95,4.96,303.0,92,0.5,2,0
96,6.13,323.0,119,5+,1,1
97,8.65,333.0,119,2.5,2,1
98,5.79,303.0,91,5,none,1


# Editing the DataFrame

## Renaming indices/columns

In [None]:
# Rename a row label (axis=0 targets the index). `df` was built without an explicit
# index, so its labels are the integers 0..9; the mapping key must be one of those
# existing labels. A key that matches no label (such as "US") is silently ignored
# and the frame comes back unchanged.
# The renamed frame is only displayed here; `df` itself is not reassigned.
df.rename({0: "zero"}, axis=0)

In [None]:
# Rename the 'population' column (axis=1 targets the column labels).
# The result is only displayed; it is not assigned back to `df`.
df.rename({"population": "Population_number"}, axis=1)

## Getting Elements


In [None]:
# Get one element from a Series by its index label.
# (This is not the cell's last expression, so its value is not displayed.)
sales_data['jan']

# Another way to do it: attribute-style access to the same label
sales_data.jan

In [None]:
# Get a subset of a DataFrame: every row from position 1 onward
# (i.e. all rows except the first), selected explicitly by position
df.iloc[1:]

## Dropping


In [None]:
# Drop the entries labelled 'may' and 'mar' from the Series index (axis=0);
# the result is displayed and `sales_data` itself is not reassigned
sales_data.drop(labels=['may', 'mar'], axis=0)

In [None]:
# Drop the 'country' column; `columns=` is the keyword form of
# dropping that label along axis=1. `df` itself is not reassigned.
df.drop(columns='country')

## Applying Functions


In [None]:
# Define a function
def f(x):
    """Return ``x * 2``: doubles numbers and repeats strings/sequences twice."""
    return x * 2

In [None]:
# Apply function to DataFrame: with the default axis, `f` is called once per
# column and receives that whole column as a Series
df.apply(f)

In [None]:
# Apply function element-wise.
# DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map;
# use `map` when the installed pandas provides it, otherwise fall back to
# `applymap` so the cell also runs on older versions.
elementwise = df.map if hasattr(df, "map") else df.applymap
elementwise(f)

In [None]:

# Upper-case every country name by applying a function to each element of the
# column, then store the result back in the same column
upper_names = df["country"].apply(lambda name: name.upper())
df["country"] = upper_names

df

## TQDM with pandas

In [None]:
import time


def placeholder_function(x, delay=0.5):
    """Upper-case ``x`` after a pause, imitating a slow per-row computation.

    Args:
        x: Value exposing an ``upper()`` method (e.g. a ``str``).
        delay: Seconds to sleep before returning. Defaults to 0.5, the pause
            that was previously hard-coded; pass 0 to skip the wait.

    Returns:
        The result of ``x.upper()``.
    """
    time.sleep(delay)
    return x.upper()

In [None]:
from tqdm import tqdm
# Register tqdm's pandas integration: this creates new `pandas` methods
# (such as `progress_apply`) which report progress through `tqdm`
# (can use tqdm_gui, optional kwargs, etc.)
tqdm.pandas()

# Same call shape as .apply(placeholder_function), but with a progress bar;
# the result overwrites the 'country' column
df["country"] = df["country"].progress_apply(placeholder_function)

df

In [None]:
# Even better progress bar: `tqdm.auto` chooses the front-end for the current
# environment (rendering as a widget in notebooks when available)
from tqdm.auto import tqdm
# Create new `pandas` methods which use `tqdm` progress
# (can use tqdm_gui, optional kwargs, etc.)
tqdm.pandas()

# Re-run the slow function with the progress bar; the result overwrites 'country'
df["country"] = df["country"].progress_apply(placeholder_function)

df

## Basic Information


In [None]:
# Get the shape as a (number_of_rows, number_of_columns) tuple
df.shape

In [None]:
# Show the row index (the row labels) of the DataFrame
df.index

In [None]:
# Show the column labels of the DataFrame
df.columns

In [None]:
# Print a concise summary of the DataFrame (columns, non-null counts, dtypes)
df.info()

In [None]:
# Number of non-NA values in each column
df.count()

In [None]:
# Count how many times each distinct value occurs in the 'country' column
df["country"].value_counts()

## Summary

In [None]:
# All statistics below are computed on the 'population' column
population = df['population']

# Total and running total
sum_values = population.sum()
cumulative_sum_values = population.cumsum()

# Extremes, and the index labels at which they occur
min_values, max_values = population.min(), population.max()
idx_min_values, idx_max_values = population.idxmin(), population.idxmax()

# Descriptive statistics, mean and median
summary_stats = population.describe()
mean_values = population.mean()
median_values = population.median()

print("Example DataFrame:")
print(df)

# Each result is printed under its own heading, in the order computed above
report = [
    ("\nSum of values:", sum_values),
    ("\nCumulative sum of values:", cumulative_sum_values),
    ("\nMinimum values:", min_values),
    ("\nMaximum values:", max_values),
    ("\nIndex of minimum values:", idx_min_values),
    ("\nIndex of maximum values:", idx_max_values),
    ("\nSummary statistics:", summary_stats),
    ("\nMean values:", mean_values),
    ("\nMedian values:", median_values),
]
for heading, result in report:
    print(heading)
    print(result)

In [None]:
# Easier way to get the summaries: one call produces the summary statistics
# for the DataFrame's numeric columns
df.describe()
# Transposed alternative (one row per column): df.describe().T

## Introduction to data profiling

In [None]:
from ydata_profiling import ProfileReport
# Build a profiling report object for `df`. Nothing is rendered by this cell as
# written: all of the output calls below are commented out.
profile = ProfileReport(df, title="Profiling Report",explorative=True)
# Output options (uncomment one).
# NOTE(review): the descriptions are inferred from the method names; confirm
# against the installed ydata_profiling version.
# profile.to_widgets()                 # presumably an interactive widget view
# profile.to_notebook_iframe()         # presumably embeds the report inline
# profile.to_file("your_report.html")  # presumably writes the report to an HTML file