# My Pandas basics with examples

 1. This is a notebook to accompany the **PandasCheatSheet.ipynb** I prepared for data scientce coding tests on Pandas. It contains most common functions with examples and output to learn about the functions. 
 2. If you see a function in the "My Pandas Cheat Sheet" notebook that you are not sure what it is doing, you can check here on data. I found it very helpful for my understanding


In [1]:
import pandas as pd
import numpy as np

# Read and save data

In [2]:
# df=pd.read_csv("data.csv")
# df.to_csv("data.csv")


# Generate a simple dataset for exploring

In [3]:
df = pd._testing.makeMixedDataFrame()
# Change some cell data for following manipulation¶
df.loc[4]=df.loc[3]  # Copy row 3 to row 4
df.loc[2,"A"]=np.nan # Set NaN 
df

Unnamed: 0,A,B,C,D
0,0.0,0.0,foo1,2009-01-01
1,1.0,1.0,foo2,2009-01-02
2,,0.0,foo3,2009-01-05
3,3.0,1.0,foo4,2009-01-06
4,3.0,1.0,foo4,2009-01-06


# Check basic information

In [4]:
df.head()

Unnamed: 0,A,B,C,D
0,0.0,0.0,foo1,2009-01-01
1,1.0,1.0,foo2,2009-01-02
2,,0.0,foo3,2009-01-05
3,3.0,1.0,foo4,2009-01-06
4,3.0,1.0,foo4,2009-01-06


In [5]:
df.shape

(5, 4)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   A       4 non-null      float64       
 1   B       5 non-null      float64       
 2   C       5 non-null      object        
 3   D       5 non-null      datetime64[ns]
dtypes: datetime64[ns](1), float64(2), object(1)
memory usage: 292.0+ bytes


In [7]:
df.describe()

Unnamed: 0,A,B,D
count,4.0,5.0,5
mean,1.75,0.6,2009-01-04 00:00:00
min,0.0,0.0,2009-01-01 00:00:00
25%,0.75,0.0,2009-01-02 00:00:00
50%,2.0,1.0,2009-01-05 00:00:00
75%,3.0,1.0,2009-01-06 00:00:00
max,3.0,1.0,2009-01-06 00:00:00
std,1.5,0.547723,


In [8]:
df.isna()

Unnamed: 0,A,B,C,D
0,False,False,False,False
1,False,False,False,False
2,True,False,False,False
3,False,False,False,False
4,False,False,False,False


In [9]:
df.isna().sum()

A    1
B    0
C    0
D    0
dtype: int64

In [10]:
df.value_counts(dropna=False)

A    B    C     D         
3.0  1.0  foo4  2009-01-06    2
0.0  0.0  foo1  2009-01-01    1
1.0  1.0  foo2  2009-01-02    1
NaN  0.0  foo3  2009-01-05    1
Name: count, dtype: int64

# Selection 


## Slicing with [], on rows

In [11]:
a=df[::2] 
print (a)

b=df[::-1]
print (b)

c=df[:3]
print(c)

     A    B     C          D
0  0.0  0.0  foo1 2009-01-01
2  NaN  0.0  foo3 2009-01-05
4  3.0  1.0  foo4 2009-01-06
     A    B     C          D
4  3.0  1.0  foo4 2009-01-06
3  3.0  1.0  foo4 2009-01-06
2  NaN  0.0  foo3 2009-01-05
1  1.0  1.0  foo2 2009-01-02
0  0.0  0.0  foo1 2009-01-01
     A    B     C          D
0  0.0  0.0  foo1 2009-01-01
1  1.0  1.0  foo2 2009-01-02
2  NaN  0.0  foo3 2009-01-05


## .iloc[], by position, integer based indexing

In [12]:
a=df.iloc[0,1] # Return the cell value
print (a)
print ()

b=df.iloc[0] # This return a series object, not a data frame (table)
print(b)
print(type(b))
print ()

c=df.iloc[[0]] # This returns a table
print(c)
print()

d=df.iloc[[0,1]] # This returns a table with the first and second rows
print(d)
print()

e=df.iloc[:2,:2]
print (e)
print ()

f=df.iloc[-2:,-2:]
print(f)
print()

g=df.iloc[[1, 3], [1, 3]]
print(g)

0.0

A                    0.0
B                    0.0
C                   foo1
D    2009-01-01 00:00:00
Name: 0, dtype: object
<class 'pandas.core.series.Series'>

     A    B     C          D
0  0.0  0.0  foo1 2009-01-01

     A    B     C          D
0  0.0  0.0  foo1 2009-01-01
1  1.0  1.0  foo2 2009-01-02

     A    B
0  0.0  0.0
1  1.0  1.0

      C          D
3  foo4 2009-01-06
4  foo4 2009-01-06

     B          D
1  1.0 2009-01-02
3  1.0 2009-01-06


## .loc[], label based, may also be used with a boolean array
df.loc[row_indexer,column_indexer]

In [13]:
a=df.loc[0]
print (a)
print (type(a))
print ()

a1=df.loc[[0]]
print (a1)
print (type(a1))
print ()

b=df.loc[:3]
print (b)
print (type(b))

c=df.loc[:3,"A":"C"]
print(c)
print()

d=df.loc[[1,4],"C":]
print (d)
print()

e=df.loc[:,"A"]>0
print(e)
print (type(e))
print()

f=df.loc[df.loc[:,"A"]>0,:]
print (f)
print()

g=df.loc[lambda df: df.A>0,:] 
print (g)
print()

h=df.loc[lambda df: (df.A>0) & (df.D=="2009-01-02"),:] 
print(h)
print ()

i=df.loc[lambda df: (df.A>0) & (df.D=="2009-01-02"),:].assign(F=lambda df: df.A - 2, G=lambda df: df.F*2)
print(i)

A                    0.0
B                    0.0
C                   foo1
D    2009-01-01 00:00:00
Name: 0, dtype: object
<class 'pandas.core.series.Series'>

     A    B     C          D
0  0.0  0.0  foo1 2009-01-01
<class 'pandas.core.frame.DataFrame'>

     A    B     C          D
0  0.0  0.0  foo1 2009-01-01
1  1.0  1.0  foo2 2009-01-02
2  NaN  0.0  foo3 2009-01-05
3  3.0  1.0  foo4 2009-01-06
<class 'pandas.core.frame.DataFrame'>
     A    B     C
0  0.0  0.0  foo1
1  1.0  1.0  foo2
2  NaN  0.0  foo3
3  3.0  1.0  foo4

      C          D
1  foo2 2009-01-02
4  foo4 2009-01-06

0    False
1     True
2    False
3     True
4     True
Name: A, dtype: bool
<class 'pandas.core.series.Series'>

     A    B     C          D
1  1.0  1.0  foo2 2009-01-02
3  3.0  1.0  foo4 2009-01-06
4  3.0  1.0  foo4 2009-01-06

     A    B     C          D
1  1.0  1.0  foo2 2009-01-02
3  3.0  1.0  foo4 2009-01-06
4  3.0  1.0  foo4 2009-01-06

     A    B     C          D
1  1.0  1.0  foo2 2009-01-02

     

## Note:
lambda with .loc be more useful in complex operations or when chaining multiple methods together, as it allows you to pass functions dynamically.

# Data cleaning

In [14]:
# Make the DataFrame for testing
df = pd._testing.makeMixedDataFrame()
# Change some cell data for following manipulation¶
df.loc[4]=df.loc[3]  # Copy row 3 to row 4
df.loc[4,"B"]=0
df.loc[2,"A"]=np.nan # Set NaN 

# Change column or index names
df=df.rename(index=lambda x: x+1)
df=df.rename(columns={"D":"Date"})
df=df.set_index("C") # Set the DataFrame index using existing columns
df=df.reset_index()

In [15]:
# Dealing with null values
pd.isna(df)
pd.isna(df["A"])
pd.notna(df["A"])
df.dropna().reset_index() # Drops rows with null values and reset index ; default inplace=False
df.dropna(axis=1) # Drop columns that contain null values
df.dropna(thresh=1) # Drop rows that have less than thresh (1 here) non-null values
# df.fillna(0)
df["A"]=df["A"].fillna(df["A"].mean())
df

Unnamed: 0,C,A,B,Date
0,foo1,0.0,0.0,2009-01-01
1,foo2,1.0,1.0,2009-01-02
2,foo3,1.75,0.0,2009-01-05
3,foo4,3.0,1.0,2009-01-06
4,foo4,3.0,0.0,2009-01-06


# Filter, sort, groupby, and transform

In [16]:
# FILTERING
condition=df['A']>1  # Retrun a boolean Series
print(condition)
print(type(condition))
print ()

df[df['A']>1]
a=df[(df['A']>1)&(df['B']>0)]
print(a)
print()

b=df.loc[df.A>1,:] 
print(b)
print()

0    False
1    False
2     True
3     True
4     True
Name: A, dtype: bool
<class 'pandas.core.series.Series'>

      C    A    B       Date
3  foo4  3.0  1.0 2009-01-06

      C     A    B       Date
2  foo3  1.75  0.0 2009-01-05
3  foo4  3.00  1.0 2009-01-06
4  foo4  3.00  0.0 2009-01-06



In [17]:
# SORTING
a=df.sort_values("A")
print(a)
print()

b=df.sort_values(by=["A","B"],ascending=[False, True] )
print(b)

      C     A    B       Date
0  foo1  0.00  0.0 2009-01-01
1  foo2  1.00  1.0 2009-01-02
2  foo3  1.75  0.0 2009-01-05
3  foo4  3.00  1.0 2009-01-06
4  foo4  3.00  0.0 2009-01-06

      C     A    B       Date
4  foo4  3.00  0.0 2009-01-06
3  foo4  3.00  1.0 2009-01-06
2  foo3  1.75  0.0 2009-01-05
1  foo2  1.00  1.0 2009-01-02
0  foo1  0.00  0.0 2009-01-01


In [18]:
# GROUPING and TRANSFORMING
# Example of using groupby with multiple columns
data = {
    'Category': ['A', 'A', 'A', 'B', 'A','B'],
    'Subcategory': ['X', 'Y', 'X', 'Y', 'X','Y'],
    'Value1': [10, 20, 15, 25, 30, 20],
    'Value2': [11, 2, 15, 25, 35, 30]
}
df = pd.DataFrame(data)
print (df)

  Category Subcategory  Value1  Value2
0        A           X      10      11
1        A           Y      20       2
2        A           X      15      15
3        B           Y      25      25
4        A           X      30      35
5        B           Y      20      30


In [19]:
# Group by 'Category' and 'Subcategory' and calculate mean of 'Value'
grouped = df.groupby(['Category', 'Subcategory']).mean().reset_index()
print(grouped)

  Category Subcategory     Value1     Value2
0        A           X  18.333333  20.333333
1        A           Y  20.000000   2.000000
2        B           Y  22.500000  27.500000


In [20]:
 df.groupby(['Category', 'Subcategory']).describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,Value1,Value1,Value1,Value1,Value1,Value1,Value1,Value1,Value2,Value2,Value2,Value2,Value2,Value2,Value2,Value2
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
Category,Subcategory,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2
A,X,3.0,18.333333,10.40833,10.0,12.5,15.0,22.5,30.0,3.0,20.333333,12.858201,11.0,13.0,15.0,25.0,35.0
A,Y,1.0,20.0,,20.0,20.0,20.0,20.0,20.0,1.0,2.0,,2.0,2.0,2.0,2.0,2.0
B,Y,2.0,22.5,3.535534,20.0,21.25,22.5,23.75,25.0,2.0,27.5,3.535534,25.0,26.25,27.5,28.75,30.0


In [21]:
df.groupby(['Category', 'Subcategory']).nunique()

Unnamed: 0_level_0,Unnamed: 1_level_0,Value1,Value2
Category,Subcategory,Unnamed: 2_level_1,Unnamed: 3_level_1
A,X,3,3
A,Y,1,1
B,Y,2,2


In [22]:
df.groupby(['Category', 'Subcategory'])["Value1"].sum()

Category  Subcategory
A         X              55
          Y              20
B         Y              45
Name: Value1, dtype: int64

In [23]:
df.groupby('Category').head(2)

Unnamed: 0,Category,Subcategory,Value1,Value2
0,A,X,10,11
1,A,Y,20,2
3,B,Y,25,25
5,B,Y,20,30


In [40]:
df.groupby(['Category']).size()

Category
A    4
B    2
dtype: int64

In [38]:
df.groupby('Category').count()

Unnamed: 0_level_0,Subcategory,Value1,Value2,Value3
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A,4,4,4,4
B,2,2,2,2


In [24]:
a=df.groupby(['Category'])['Value1'].transform(lambda x: (x - x.mean()) / x.std())
print(a)
print(type(a))
print()

b=df.groupby(['Category'])[['Value1','Value2']].transform(lambda x: (x - x.mean()) / x.std())
print(b)

0   -1.024695
1    0.146385
2   -0.439155
3    0.707107
4    1.317465
5   -0.707107
Name: Value1, dtype: float64
<class 'pandas.core.series.Series'>

     Value1    Value2
0 -1.024695 -0.340811
1  0.146385 -0.986557
2 -0.439155 -0.053812
3  0.707107 -0.707107
4  1.317465  1.381180
5 -0.707107  0.707107


In [25]:
df.groupby(['Category'])[['Value1','Value2']].agg(['sum', 'mean', 'std']) 


Unnamed: 0_level_0,Value1,Value1,Value1,Value2,Value2,Value2
Unnamed: 0_level_1,sum,mean,std,sum,mean,std
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
A,75,18.75,8.539126,63,15.75,13.93736
B,45,22.5,3.535534,55,27.5,3.535534


In [26]:
a=df.apply("sum")
print(a)
print()

b=df.apply(np.sum)
print(b)

Category       AAABAB
Subcategory    XYXYXY
Value1            120
Value2            118
dtype: object

Category       AAABAB
Subcategory    XYXYXY
Value1            120
Value2            118
dtype: object


In [27]:
df['Value3']=df['Value1'].apply(lambda x: x**2)
print (df)

  Category Subcategory  Value1  Value2  Value3
0        A           X      10      11     100
1        A           Y      20       2     400
2        A           X      15      15     225
3        B           Y      25      25     625
4        A           X      30      35     900
5        B           Y      20      30     400


In [28]:
# pd.pivot_table
table=pd.pivot_table(df,index=['Category'],values=['Value1','Value2'],aggfunc="sum")
table

Unnamed: 0_level_0,Value1,Value2
Category,Unnamed: 1_level_1,Unnamed: 2_level_1
A,75,63
B,45,55


In [29]:
table[table.index=='A']

Unnamed: 0_level_0,Value1,Value2
Category,Unnamed: 1_level_1,Unnamed: 2_level_1
A,75,63


# JOIN/COMBINE

In [30]:
left = pd.DataFrame(
   {
      "key1": ["K0", "K0", "K1", "K2"],
      "key2": ["K0", "K1", "K0", "K1"],
      "A": ["A0", "A1", "A2", "A3"],
      "B": ["B0", "B1", "B2", "B3"],
   }
)
right = pd.DataFrame(
   {
      "key1": ["K0", "K1", "K1", "K2"],
      "key2": ["K0", "K0", "K0", "K0"],
      "C": ["C0", "C1", "C2", "C3"],
      "D": ["D0", "D1", "D2", "D3"],
   }
)
print (left)
print(right)

  key1 key2   A   B
0   K0   K0  A0  B0
1   K0   K1  A1  B1
2   K1   K0  A2  B2
3   K2   K1  A3  B3
  key1 key2   C   D
0   K0   K0  C0  D0
1   K1   K0  C1  D1
2   K1   K0  C2  D2
3   K2   K0  C3  D3


In [31]:
pd.merge(left, right, how="left", on=["key1", "key2"])

Unnamed: 0,key1,key2,A,B,C,D
0,K0,K0,A0,B0,C0,D0
1,K0,K1,A1,B1,,
2,K1,K0,A2,B2,C1,D1
3,K1,K0,A2,B2,C2,D2
4,K2,K1,A3,B3,,


In [32]:
pd.merge(left, right, how="right", on=["key1", "key2"])

Unnamed: 0,key1,key2,A,B,C,D
0,K0,K0,A0,B0,C0,D0
1,K1,K0,A2,B2,C1,D1
2,K1,K0,A2,B2,C2,D2
3,K2,K0,,,C3,D3


In [33]:
left = pd.DataFrame(
    {"A": ["A0", "A1", "A2"], "B": ["B0", "B1", "B2"]}, index=["K0", "K1", "K2"]
)
print(left)

     A   B
K0  A0  B0
K1  A1  B1
K2  A2  B2


In [34]:
right = pd.DataFrame(
    {"C": ["C0", "C2", "C3"], "D": ["D0", "D2", "D3"]}, index=["K0", "K2", "K3"]
)
print(right)

     C   D
K0  C0  D0
K2  C2  D2
K3  C3  D3


In [35]:
result = left.join(right)
result

Unnamed: 0,A,B,C,D
K0,A0,B0,C0,D0
K1,A1,B1,,
K2,A2,B2,C2,D2


# Working with Datetime
With its focus on time-series data, pandas has a suite of 
tools for managing dates and time: either as a point i 
time (a Timestamp) or as a span of time (a Period)

In [36]:
date = pd.Timestamp('2013-05-01')
t2 = pd.Timestamp('2013-01-01 21:15:06') 
t3 = pd.Timestamp('Sep 04, 1982 1:35.18')
print(date)
print(type(t))
print (t2)
print(t3)

2013-05-01 00:00:00


NameError: name 't' is not defined

In [None]:
# Separate element of a datatime object from buit-in attributes
year=date.year
month=date.month
day=date.day
hour=date.hour
minute=date.minute
month_name=date.month_name()
week_day=date.weekday() # Return the day of the week as a number,counting from 0 (for monday)
day_name=date.day_name() # Return the day of the week as a text
print(month)
print(month_name)
print(week_day)
print(day_name)

In [None]:
pd.Period('2013-01', freq='M')

In [None]:
# Work on time-series data
# Make the DataFrame for testing
df=pd.DataFrame({'datetime': ["24-10-01", "24-10-02","24-10-03","24-10-04","24-10-05"],'Value':[20,50,30,40,60]})
print (df)
df.info()

In [None]:
df['datetime'] = pd.to_datetime(df['datetime'], format="%y-%m-%d") # Convert to column 'datetime' (string object) to a datetime object
df.info()
print(df)

In [None]:
mask = (df['datetime'] >= pd.Timestamp('2024-10-02')) & (df.datetime < pd.Timestamp('2024-10-05'))
df[mask]

In [None]:
df2=df.set_index('datetime')
df2.loc['2024-10-02':'2024-10-04']

In [None]:
df2.loc['Oct 2 2024']