# PANDAS CHEAT SHEET

***Pandas is a library that provides easy-to-use data structures and data analysis tools for the Python programming language.***

**1. Import Convention for Pandas**

In [1]:
import numpy as np
import pandas as pd

**2. Pandas Data Structure**

**2.1. Series**

In [2]:
# A Series is a labelled one-dimensional array: four integers keyed by the letters a-d.
s = pd.Series(data=[1, 2, 3, 4], index=list('abcd'))
s

a    1
b    2
c    3
d    4
dtype: int64

**2.2. Data Frame**

In [3]:
# Column-oriented input: each key is a column name, each list holds that column's values.
data_fruit = dict(Fruit=['Apple', 'Orange'], Color=['Red', 'Orange'])
# `columns` states the column order explicitly: Fruit first, then Color.
df = pd.DataFrame(data=data_fruit, columns=['Fruit', 'Color'])
df

Unnamed: 0,Fruit,Color
0,Apple,Red
1,Orange,Orange


**3. Importing Data**

**pd.read_csv("filename")**

**pd.read_table("filename")**

**pd.read_excel("filename")**

**pd.read_sql(query, connection_object)**

**4. Exporting Data**

**df.to_csv("filename")**

**df.to_excel("filename")**

**df.to_sql(table_name, connection_object)**

**4. Creating Test Data**

In [4]:
# Build a 10x4 DataFrame of uniform random floats in [0, 1) with columns A-D.
# A seeded Generator makes the sample data identical on every
# "Restart Kernel -> Run All"; an unseeded np.random.rand call draws
# different numbers on each run, so saved outputs never match a re-run.
rng = np.random.default_rng(42)
df = pd.DataFrame(rng.random((10, 4)), columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
0,0.64354,0.334191,0.379616,0.370919
1,0.700844,0.365643,0.385233,0.069759
2,0.201622,0.179023,0.144862,0.717728
3,0.33775,0.956541,0.159898,0.668184
4,0.382504,0.735635,0.432578,0.31059
5,0.645458,0.622858,0.978791,0.2401
6,0.56772,0.647658,0.03347,0.00582
7,0.518737,0.161844,0.231548,0.260802
8,0.845187,0.565228,0.658303,0.169912
9,0.527357,0.56719,0.919655,0.439795


In [5]:
# A Series of 5 uniform random floats in [0, 1) with the default integer index.
# NOTE: despite the `df1` name this is a Series, not a DataFrame.
# Seeded so the values are reproducible across kernel restarts.
df1 = pd.Series(np.random.default_rng(7).random(5))
df1

0    0.086136
1    0.737258
2    0.365869
3    0.868327
4    0.519049
dtype: float64

**5. Operations on Data Frame**

**5.1. View DataFrame Contents**

In [6]:
# Preview the first rows of the DataFrame; with no argument, head() shows 5 rows.
df.head()

Unnamed: 0,A,B,C,D
0,0.64354,0.334191,0.379616,0.370919
1,0.700844,0.365643,0.385233,0.069759
2,0.201622,0.179023,0.144862,0.717728
3,0.33775,0.956541,0.159898,0.668184
4,0.382504,0.735635,0.432578,0.31059


In [7]:
# Preview the last rows of the DataFrame; with no argument, tail() shows 5 rows.
df.tail()

Unnamed: 0,A,B,C,D
5,0.645458,0.622858,0.978791,0.2401
6,0.56772,0.647658,0.03347,0.00582
7,0.518737,0.161844,0.231548,0.260802
8,0.845187,0.565228,0.658303,0.169912
9,0.527357,0.56719,0.919655,0.439795


In [8]:
# Dimensions of the DataFrame as a (number_of_rows, number_of_columns) tuple.
df.shape

(10, 4)

In [9]:
# Print a summary of the DataFrame: index range, per-column non-null counts
# and dtypes, and total memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       10 non-null     float64
 1   B       10 non-null     float64
 2   C       10 non-null     float64
 3   D       10 non-null     float64
dtypes: float64(4)
memory usage: 448.0 bytes


In [10]:
# Summary statistics for each numeric column:
# count, mean, std, min, quartiles (25% / 50% / 75%) and max.
df.describe()

Unnamed: 0,A,B,C,D
count,10.0,10.0,10.0,10.0
mean,0.537072,0.513581,0.432395,0.325361
std,0.189255,0.251814,0.324624,0.233312
min,0.201622,0.161844,0.03347,0.00582
25%,0.416562,0.342054,0.177811,0.187459
50%,0.547539,0.566209,0.382424,0.285696
75%,0.644978,0.641458,0.601872,0.422576
max,0.845187,0.956541,0.978791,0.717728


**5.2. Selection of Data Frame Contents**

**iloc Function**

In [11]:
# Select the first row by integer position (positions start at 0).
# The result is a Series indexed by the column names.
df.iloc[0]

A    0.643540
B    0.334191
C    0.379616
D    0.370919
Name: 0, dtype: float64

In [12]:
# Select the second row by integer position (position 1, since counting starts at 0).
df.iloc[1]

A    0.700844
B    0.365643
C    0.385233
D    0.069759
Name: 1, dtype: float64

In [13]:
# Select the last row: negative positions count back from the end, so -1 is the final row.
df.iloc[-1]

A    0.527357
B    0.567190
C    0.919655
D    0.439795
Name: 9, dtype: float64

In [14]:
# Select the first column by position: `:` keeps every row, 0 picks column position 0 ('A').
df.iloc[:,0]

0    0.643540
1    0.700844
2    0.201622
3    0.337750
4    0.382504
5    0.645458
6    0.567720
7    0.518737
8    0.845187
9    0.527357
Name: A, dtype: float64

In [15]:
# Select the second column by position: `:` keeps every row, 1 picks column position 1 ('B').
df.iloc[:,1]

0    0.334191
1    0.365643
2    0.179023
3    0.956541
4    0.735635
5    0.622858
6    0.647658
7    0.161844
8    0.565228
9    0.567190
Name: B, dtype: float64

**loc Function**

In [16]:
# Select a single value by row label and column label (.loc is label-based, not positional).
# The index here is the default 0-9 range, so label 5 happens to coincide with position 5.
df.loc[5,'B']

0.6228575514729365

In [17]:
# Slice rows and columns by label. Both endpoints of a .loc slice are included:
# this returns rows 0 through 5 and columns 'A' through 'C'.
df.loc[0:5, 'A':'C']

Unnamed: 0,A,B,C
0,0.64354,0.334191,0.379616
1,0.700844,0.365643,0.385233
2,0.201622,0.179023,0.144862
3,0.33775,0.956541,0.159898
4,0.382504,0.735635,0.432578
5,0.645458,0.622858,0.978791


**6. Sorting of Data Frame**

In [18]:
# Sort rows by their index labels. The index is already in order here,
# so the result looks the same as the original frame.
df.sort_index()

Unnamed: 0,A,B,C,D
0,0.64354,0.334191,0.379616,0.370919
1,0.700844,0.365643,0.385233,0.069759
2,0.201622,0.179023,0.144862,0.717728
3,0.33775,0.956541,0.159898,0.668184
4,0.382504,0.735635,0.432578,0.31059
5,0.645458,0.622858,0.978791,0.2401
6,0.56772,0.647658,0.03347,0.00582
7,0.518737,0.161844,0.231548,0.260802
8,0.845187,0.565228,0.658303,0.169912
9,0.527357,0.56719,0.919655,0.439795


In [19]:
# Sort rows by the values in column 'A', smallest first (ascending is the default order).
df.sort_values('A')

Unnamed: 0,A,B,C,D
2,0.201622,0.179023,0.144862,0.717728
3,0.33775,0.956541,0.159898,0.668184
4,0.382504,0.735635,0.432578,0.31059
7,0.518737,0.161844,0.231548,0.260802
9,0.527357,0.56719,0.919655,0.439795
6,0.56772,0.647658,0.03347,0.00582
0,0.64354,0.334191,0.379616,0.370919
5,0.645458,0.622858,0.978791,0.2401
1,0.700844,0.365643,0.385233,0.069759
8,0.845187,0.565228,0.658303,0.169912


In [20]:
# Sort rows by the values in column 'A', largest first.
df.sort_values('A', ascending=False)

Unnamed: 0,A,B,C,D
8,0.845187,0.565228,0.658303,0.169912
1,0.700844,0.365643,0.385233,0.069759
5,0.645458,0.622858,0.978791,0.2401
0,0.64354,0.334191,0.379616,0.370919
6,0.56772,0.647658,0.03347,0.00582
9,0.527357,0.56719,0.919655,0.439795
7,0.518737,0.161844,0.231548,0.260802
4,0.382504,0.735635,0.432578,0.31059
3,0.33775,0.956541,0.159898,0.668184
2,0.201622,0.179023,0.144862,0.717728


In [21]:
# Group rows by the combination of values in columns 'A' and 'D'.
# This only builds a DataFrameGroupBy object; nothing is computed until an
# aggregation (e.g. mean) is applied to it.
df.groupby(['A', 'D'])

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001C4C65FC6A0>

In [22]:
# Aggregate a grouping: group rows by column 'A', then take the mean of column 'D'
# within each group. Every value of 'A' in this random data is distinct, so each
# group holds a single row and its "mean" is just that row's 'D' value.
df.groupby('A')['D'].mean()

A
0.201622    0.717728
0.337750    0.668184
0.382504    0.310590
0.518737    0.260802
0.527357    0.439795
0.567720    0.005820
0.643540    0.370919
0.645458    0.240100
0.700844    0.069759
0.845187    0.169912
Name: D, dtype: float64

**7. Functions**

**7.1. Mean**

In [23]:
# Mean of each column (one value per column).
df.mean()

A    0.537072
B    0.513581
C    0.432395
D    0.325361
dtype: float64

**7.2. Median**

In [24]:
# Median of each column (one value per column).
df.median()

A    0.547539
B    0.566209
C    0.382424
D    0.285696
dtype: float64

**7.3. Standard Deviation**

In [25]:
# Standard deviation of each column (one value per column).
df.std()

A    0.189255
B    0.251814
C    0.324624
D    0.233312
dtype: float64

**7.4. Maximum**

In [26]:
# Highest value in each column.
df.max()

A    0.845187
B    0.956541
C    0.978791
D    0.717728
dtype: float64

**7.5. Minimum**

In [27]:
# Lowest value in each column.
df.min()

A    0.201622
B    0.161844
C    0.033470
D    0.005820
dtype: float64

**7.6. Count**

In [28]:
# Number of non-null values in each column.
df.count()

A    10
B    10
C    10
D    10
dtype: int64

**7.7. Describe**

In [29]:
# Summary statistics for each column in one call:
# count, mean, std, min, quartiles (25% / 50% / 75%) and max.
df.describe()

Unnamed: 0,A,B,C,D
count,10.0,10.0,10.0,10.0
mean,0.537072,0.513581,0.432395,0.325361
std,0.189255,0.251814,0.324624,0.233312
min,0.201622,0.161844,0.03347,0.00582
25%,0.416562,0.342054,0.177811,0.187459
50%,0.547539,0.566209,0.382424,0.285696
75%,0.644978,0.641458,0.601872,0.422576
max,0.845187,0.956541,0.978791,0.717728


**8. Plotting**

**8.1. Histogram**

df.plot.hist()

**8.2. Scatter Plot**

df.plot.scatter(x='column1', y='column2')