# Introduction to Pandas (DataFrame)

## Data Structure introduction to DataFrame

In [1]:
import pandas as pd # As "pd" alias a standard normal convention or practice use by majority. But can rename whatever you want.
import numpy as np

## DataFrame
is a 2-dimensional labeled data structure with columns of potentially different types. You can think of it like a spreadsheet or SQL table, or a dict of Series objects. It is generally the most commonly used pandas object. 

Like Series, DataFrame accepts many different kinds of input:

Dict of 1D ndarrays, lists, dicts, or Series

2-D numpy.ndarray

Structured or record ndarray

A Series

Another DataFrame

#### Basic method to create a dataframe
"df" is an acronym for DataFrame

df = pd.Series(data, index=index)

#### Generate data

In [2]:
# Function get_gameStat to generate a fictituous data.
def get_gameStat(size):
    df = pd.DataFrame()
    df['Position'] = np.random.choice(['Front', 'Back', 'Center'], size)
    df['Age'] = np.random.randint(18, 40, size)
    df['Team_color'] = np.random.choice(['Red', 'Blue', 'Black', 'Orange', 'White'], size)
    df['Outcome'] = np.random.choice(['Win', 'Lose', 'Tie'], size)
    df['Odds'] = np.random.uniform(0, 1, size)
    return df

#### Looking into the data

#### df.info()

In [3]:
# Generate a sample fictituos data
df = get_gameStat(10)

'''Print a concise summary of a DataFrame.
This method prints information about a DataFrame including the index dtype and columns, non-null values and memory usage.'''
df.info()

# Arguments
'''Whether to print the full summary. By default, the setting in pandas.options.display.max_info_columns is followed.'''
# df.info(verbose = False)

'''Whether to "show_counts" the non-null counts. By default, this is shown only if the DataFrame is smaller than 
pandas.options.display.max_info_rows and pandas.options.display.max_info_columns'''

'''Specifies whether total memory usage of the DataFrame elements (including the index) should be displayed. 
By default, this follows the pandas.options.display.memory_usage setting.'''

'''True always show memory usage. False never shows memory usage.
A value of ‘deep’ is equivalent to “True with deep introspection”. 
Memory usage is shown in human-readable units (base-2 representation).
With deep memory introspection, a real memory usage calculation is performed at the cost of computational resources.
'''
# df.info(show_counts = False, memory_usage = 'deep')



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Position    10 non-null     object 
 1   Age         10 non-null     int32  
 2   Team_color  10 non-null     object 
 3   Outcome     10 non-null     object 
 4   Odds        10 non-null     float64
dtypes: float64(1), int32(1), object(3)
memory usage: 488.0+ bytes


'True always show memory usage. False never shows memory usage.\nA value of ‘deep’ is equivalent to “True with deep introspection”. \nMemory usage is shown in human-readable units (base-2 representation).\nWith deep memory introspection, a real memory usage calculation is performed at the cost of computational resources.\n'

In [4]:
# Change the Datatype of a Series.
# do not use "Category" instead use lower case "c". An Error TypeError: data type 'Category' not understood
df['Position']=df.Position.astype(dtype='object')

# Two ways of converting the Position to a category
#1 astype
# df['Position'] = df.Position.astype('category')

#2 Categorical function 
# df['Position']=pd.Categorical(df.Position)
df['Position']=pd.Categorical(df['Position'])

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   Position    10 non-null     category
 1   Age         10 non-null     int32   
 2   Team_color  10 non-null     object  
 3   Outcome     10 non-null     object  
 4   Odds        10 non-null     float64 
dtypes: category(1), float64(1), int32(1), object(2)
memory usage: 550.0+ bytes


In [5]:
# Output will be different without the "()"
df.info

<bound method DataFrame.info of   Position  Age Team_color Outcome      Odds
0     Back   21     Orange     Win  0.964479
1    Front   22      Black    Lose  0.136885
2   Center   28       Blue     Win  0.977149
3     Back   19      Black     Win  0.089189
4   Center   38        Red    Lose  0.981001
5     Back   21       Blue     Win  0.337809
6   Center   20       Blue     Win  0.109058
7     Back   26      White     Tie  0.862672
8   Center   21      White     Tie  0.218450
9     Back   28      White    Lose  0.080355>

#### df.shape

In [6]:
'''Return a tuple representing the dimensionality of the DataFrame.
(Number of Rows, Number of Columns)."shape" is an attribute does not require a () '''

df.shape

(10, 5)

#### df.describe

In [7]:
'''Descriptive statistics include those that summarize the central tendency, 
dispersion and shape of a dataset’s distribution, excluding NaN values.'''

df.describe

<bound method NDFrame.describe of   Position  Age Team_color Outcome      Odds
0     Back   21     Orange     Win  0.964479
1    Front   22      Black    Lose  0.136885
2   Center   28       Blue     Win  0.977149
3     Back   19      Black     Win  0.089189
4   Center   38        Red    Lose  0.981001
5     Back   21       Blue     Win  0.337809
6   Center   20       Blue     Win  0.109058
7     Back   26      White     Tie  0.862672
8   Center   21      White     Tie  0.218450
9     Back   28      White    Lose  0.080355>

In [8]:
'''Descriptive statistics include those that summarize the central tendency, 
dispersion and shape of a dataset’s distribution, excluding NaN values.'''

# df.describe

# Argument
'''percentileslist-like of numbers, optional
The percentiles to include in the output. All should fall between 0 and 1. 
The default is [.25, .5, .75], which returns the 25th, 50th, and 75th percentiles.'''

# Below code describe "all" columns specifying its datatype.
df.describe(include = 'all')

Unnamed: 0,Position,Age,Team_color,Outcome,Odds
count,10,10.0,10,10,10.0
unique,3,,5,3,
top,Back,,Blue,Win,
freq,5,,3,5,
mean,,24.4,,,0.475705
std,,5.796551,,,0.413056
min,,19.0,,,0.080355
25%,,21.0,,,0.116015
50%,,21.5,,,0.27813
75%,,27.5,,,0.939027


In [9]:
# Show only percentile as listed.
df.describe(percentiles = [0.25, 0.5, 0.75])
# df.describe(percentiles = [0.25, 0.5, 0.75], exclude=[object])

Unnamed: 0,Age,Odds
count,10.0,10.0
mean,24.4,0.475705
std,5.796551,0.413056
min,19.0,0.080355
25%,21.0,0.116015
50%,21.5,0.27813
75%,27.5,0.939027
max,38.0,0.981001


In [10]:
# Shows columns only numeric datatype.
df.describe(include = [np.number])

Unnamed: 0,Age,Odds
count,10.0,10.0
mean,24.4,0.475705
std,5.796551,0.413056
min,19.0,0.080355
25%,21.0,0.116015
50%,21.5,0.27813
75%,27.5,0.939027
max,38.0,0.981001


In [11]:
# Including or Excluding a particular column/s
# df.describe(include=['category'])

'''Below code will exclude the column Position since it is a category.
But will show Team_color and Outcome since datatype is an object'''
df.describe(exclude=['category'])

Unnamed: 0,Age,Team_color,Outcome,Odds
count,10.0,10,10,10.0
unique,,5,3,
top,,Blue,Win,
freq,,3,5,
mean,24.4,,,0.475705
std,5.796551,,,0.413056
min,19.0,,,0.080355
25%,21.0,,,0.116015
50%,21.5,,,0.27813
75%,27.5,,,0.939027


#### df.head() / df.tail()

In [12]:
'''This function returns the first n rows for the object based on position.
It is useful for quickly testing if your object has the right type of data in it.'''

# n : int, default 5
# df.head()
df.head(8)

Unnamed: 0,Position,Age,Team_color,Outcome,Odds
0,Back,21,Orange,Win,0.964479
1,Front,22,Black,Lose,0.136885
2,Center,28,Blue,Win,0.977149
3,Back,19,Black,Win,0.089189
4,Center,38,Red,Lose,0.981001
5,Back,21,Blue,Win,0.337809
6,Center,20,Blue,Win,0.109058
7,Back,26,White,Tie,0.862672


In [13]:
'''This function returns last n rows from the object based on position. 
It is useful for quickly verifying data, for example, after sorting or appending rows.'''

# n : int, default 5
# df.tail()
df.tail(8)

Unnamed: 0,Position,Age,Team_color,Outcome,Odds
2,Center,28,Blue,Win,0.977149
3,Back,19,Black,Win,0.089189
4,Center,38,Red,Lose,0.981001
5,Back,21,Blue,Win,0.337809
6,Center,20,Blue,Win,0.109058
7,Back,26,White,Tie,0.862672
8,Center,21,White,Tie,0.21845
9,Back,28,White,Lose,0.080355


Data Type	Size	Description
byte	   1 byte	Stores whole numbers from -128 to 127
short	   2 bytes	Stores whole numbers from -32,768 to 32,767
int	       4 bytes	Stores whole numbers from -2,147,483,648 to 2,147,483,647
long	   8 bytes	Stores whole numbers from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
float	   4 bytes	Stores fractional numbers. Sufficient for storing 6 to 7 decimal digits
double	   8 bytes	Stores fractional numbers. Sufficient for storing 15 decimal digits
boolean	   1 bit	Stores true or false values
char	   2 bytes	Stores a single character/letter or ASCII values

#### df.iloc / df.loc / df.iat / df.at

In [14]:
df

Unnamed: 0,Position,Age,Team_color,Outcome,Odds
0,Back,21,Orange,Win,0.964479
1,Front,22,Black,Lose,0.136885
2,Center,28,Blue,Win,0.977149
3,Back,19,Black,Win,0.089189
4,Center,38,Red,Lose,0.981001
5,Back,21,Blue,Win,0.337809
6,Center,20,Blue,Win,0.109058
7,Back,26,White,Tie,0.862672
8,Center,21,White,Tie,0.21845
9,Back,28,White,Lose,0.080355


In [15]:
# position selection by number or integer index
df.iloc[1]

Position         Front
Age                 22
Team_color       Black
Outcome           Lose
Odds          0.136885
Name: 1, dtype: object

In [16]:
# Almost the same as above code of df.iloc[7]
df.loc[7]

Position          Back
Age                 26
Team_color       White
Outcome            Tie
Odds          0.862672
Name: 7, dtype: object

In [17]:
# Generate sample data
dates = pd.date_range('1/1/2023', periods=5)

# Generate a 12 by 4 array between integer 0 to 7 (exclude number 8)
# np.random.randint(8, size=(12, 4)), 
#                   index=dates, 
#                   columns=['Index', 'A', 'B', 'C', 'D'])

# Create a sample DataFrame
df2 = pd.DataFrame(np.random.randint(8, size=(5, 4)),
                  index=dates, 
                  columns=['A', 'B', 'C', 'D']).rename_axis('Datetime', axis=1)

df2

Datetime,A,B,C,D
2023-01-01,3,3,0,4
2023-01-02,1,7,5,3
2023-01-03,1,7,7,1
2023-01-04,3,0,6,2
2023-01-05,5,1,5,5


In [18]:
# Get the row data of index '2023-01-03' of all columns
df2.loc['2023-01-03']

Datetime
A    1
B    7
C    7
D    1
Name: 2023-01-03 00:00:00, dtype: int32

In [19]:
# Return a bool of row data '2023-01-03' that is less than 5.
df2.loc['2023-01-03'] < 5

Datetime
A     True
B    False
C    False
D     True
Name: 2023-01-03 00:00:00, dtype: bool

In [20]:
# Get rows of column "A" less than 5
df2.loc[df2['A'] < 5]

Datetime,A,B,C,D
2023-01-01,3,3,0,4
2023-01-02,1,7,5,3
2023-01-03,1,7,7,1
2023-01-04,3,0,6,2


In [21]:
df2.at

<pandas.core.indexing._AtIndexer at 0x2861380a7a0>

In [22]:
position = '2023-01-03'
label = 'A'

# Row and column pair. Looking in row 2023-01-03 at column "A"
df2.at[position, label] # Result 0

1

In [23]:
# Row column pair index
# Row index 2023-01-03 and column C
df2.iat[2, 2] # 7

# Row index 2023-01-02 at column D
df2.iat[1, 3] # 5

3