In [1]:
import pandas as pd
import numpy as np 

### pandas Series and DataFrame

In [4]:
# pandas series is a column
pd.Series([21,23,34,54,24,23],name='age',index=['a','b','c','d','e','f'])

a    21
b    23
c    34
d    54
e    24
f    23
Name: age, dtype: int64

In [7]:
# pandas dataframe is collection of columns
df = pd.DataFrame({'name':['a','b','c','d','e','f'],'age':[21,23,34,54,24,23]})
df

Unnamed: 0,name,age
0,a,21
1,b,23
2,c,34
3,d,54
4,e,24
5,f,23


### Saving dataframe into different file formats and then reading data from those file formats

In [16]:
# copies dataframe to clipboard, then read data from clipboard
df.to_clipboard()
pd.read_clipboard()

Unnamed: 0,name,age
0,a,21
1,b,23
2,c,34
3,d,54
4,e,24
5,f,23


In [9]:
df.to_csv('t1.csv')

In [17]:
pd.read_csv('t1.csv')

Unnamed: 0.1,Unnamed: 0,name,age
0,0,a,21
1,1,b,23
2,2,c,34
3,3,d,54
4,4,e,24
5,5,f,23


In [18]:
# making Unnamed: 0 as index column
pd.read_csv('t1.csv',index_col='Unnamed: 0')

Unnamed: 0,name,age
0,a,21
1,b,23
2,c,34
3,d,54
4,e,24
5,f,23


In [10]:
df.to_json('t1.json')

In [19]:
pd.read_json('t1.json')

Unnamed: 0,name,age
0,a,21
1,b,23
2,c,34
3,d,54
4,e,24
5,f,23


In [11]:
df.to_html('t1.html')

In [21]:
pd.read_html('t1.html',index_col='Unnamed: 0')

[  name  age
 0    a   21
 1    b   23
 2    c   34
 3    d   54
 4    e   24
 5    f   23]

In [14]:
# can be saved to specific sheet
df.to_excel('t1.xlsx',sheet_name='A')

In [25]:
pd.read_excel('t1.xlsx',sheet_name='A',index_col='Unnamed: 0')

Unnamed: 0,name,age
0,a,21
1,b,23
2,c,34
3,d,54
4,e,24
5,f,23


In [15]:
df.to_xml('t1.xml')

In [32]:
pd.read_xml('t1.xml')

Unnamed: 0,index,name,age
0,0,a,21
1,1,b,23
2,2,c,34
3,3,d,54
4,4,e,24
5,5,f,23


In [33]:
# save dataframe to sql table
import sqlite3 as sq3
with sq3.connect('mydatabase.sqlite3') as conn:
    df.to_sql('t1',conn)

In [34]:
# reading data from table t1 
with sq3.connect('mydatabase.sqlite3') as conn:
    dft = pd.read_sql(""" SELECT * FROM t1""",conn)
dft

Unnamed: 0,index,name,age
0,0,a,21
1,1,b,23
2,2,c,34
3,3,d,54
4,4,e,24
5,5,f,23


### dataframe operations

In [2]:
df = pd.read_csv('t1.csv',index_col='Unnamed: 0')
df.head(3)

Unnamed: 0,name,age
0,a,21
1,b,23
2,c,34


In [3]:
df.tail(3)

Unnamed: 0,name,age
3,d,54
4,e,24
5,f,23


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6 entries, 0 to 5
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   name    6 non-null      object
 1   age     6 non-null      int64 
dtypes: int64(1), object(1)
memory usage: 144.0+ bytes


In [5]:
#df.select_dtypes('object')
df.select_dtypes('int64')

Unnamed: 0,age
0,21
1,23
2,34
3,54
4,24
5,23


In [6]:
# single Bracket will return series and double Bracket will return dataframe
#df['age']
df[['age','name']]

Unnamed: 0,age,name
0,21,a
1,23,b
2,34,c
3,54,d
4,24,e
5,23,f


In [151]:
# copy dataframe to another dataframe
df1 = df.copy()
df1

Unnamed: 0,name,age
0,a,21
1,b,23
2,c,34
3,d,54
4,e,24
5,f,23


In [152]:
# temporary drop
df1.drop([4,5])

Unnamed: 0,name,age
0,a,21
1,b,23
2,c,34
3,d,54


In [153]:
# parmanent drop
df1.drop([4,5], inplace=True)

In [154]:
df1

Unnamed: 0,name,age
0,a,21
1,b,23
2,c,34
3,d,54


In [155]:
# drop duplicate rows
df1.drop_duplicates()

Unnamed: 0,name,age
0,a,21
1,b,23
2,c,34
3,d,54


In [156]:
# df1.name and df1['name'] returns series, df1[['name']] returns dataframe
#df1.name
#df1['name']
df1[['name']]

Unnamed: 0,name
0,a
1,b
2,c
3,d


In [157]:
# applying lambda function to column name to make records uppercase, temporary change
df1.name.apply(lambda x: x.upper()) 

0    A
1    B
2    C
3    D
Name: name, dtype: object

In [158]:
# parmanent change
# df1 = df1.name.apply(lambda x: x.upper()) turns dataframe into series
# below one remains in dataframe 
df1.name = df1.name.apply(lambda x: x.upper())
df1

Unnamed: 0,name,age
0,A,21
1,B,23
2,C,34
3,D,54


In [159]:
df1.columns

Index(['name', 'age'], dtype='object')

In [160]:
df1.index

Int64Index([0, 1, 2, 3], dtype='int64')

In [161]:
df1.reset_index(inplace=True)

In [162]:
df1

Unnamed: 0,index,name,age
0,0,A,21
1,1,B,23
2,2,C,34
3,3,D,54


In [163]:
# taking all rows from 2nd column
df1.values[:,1]

array(['A', 'B', 'C', 'D'], dtype=object)

In [164]:
df1.age = df1.age*2
df1

Unnamed: 0,index,name,age
0,0,A,42
1,1,B,46
2,2,C,68
3,3,D,108


In [165]:
# df1 = df1[['age']]/2 will only select age column and discards other columns
df1[['age']] = df1[['age']]/2
df1

Unnamed: 0,index,name,age
0,0,A,21.0
1,1,B,23.0
2,2,C,34.0
3,3,D,54.0


In [174]:
# converting dataframe to numpy array
df2 = df1.copy()
# df2.values only takes values, don't take column names
df2.values

array([[0, 'A', 21.0],
       [1, 'B', 23.0],
       [2, 'C', 34.0],
       [3, 'D', 54.0]], dtype=object)

In [176]:
df2.columns

Index(['index', 'name', 'age'], dtype='object')

In [177]:
df2.index

RangeIndex(start=0, stop=4, step=1)

In [182]:
df2.to_numpy()

array([[0, 'A', 21.0],
       [1, 'B', 23.0],
       [2, 'C', 34.0],
       [3, 'D', 54.0]], dtype=object)

In [183]:
df2[['age']].to_numpy()

array([[21.],
       [23.],
       [34.],
       [54.]])

In [184]:
df2 = df2.to_numpy()
df2

array([[0, 'A', 21.0],
       [1, 'B', 23.0],
       [2, 'C', 34.0],
       [3, 'D', 54.0]], dtype=object)

In [185]:
# converting numpy array to dataframe
df2 = pd.DataFrame(df2)
df2

Unnamed: 0,0,1,2
0,0,A,21.0
1,1,B,23.0
2,2,C,34.0
3,3,D,54.0


In [191]:
df1.dtypes

index      int64
name      object
age      float64
dtype: object

### Slicing

In [219]:
df1

Unnamed: 0,index,name,age
0,0,A,21.0
1,1,B,23.0
2,2,C,34.0
3,3,D,54.0


In [198]:
# shows values in 2nd index
df1.loc[2]

index       2
name        C
age      34.0
Name: 2, dtype: object

In [193]:
df1.loc[[2]]

Unnamed: 0,index,name,age
2,2,C,34.0


In [200]:
df1.loc[2,'name']

'C'

In [226]:
# loc takes last limit number, iloc don't
df1.loc[0:2,:]

Unnamed: 0,index,name,age
0,0,A,21.0
1,1,B,23.0
2,2,C,34.0


In [203]:
# iloc works as numpy array, iloc don't take column name, takes column index number
df1.iloc[0,2]

21.0

In [222]:
df1.iloc[:,]

Unnamed: 0,index,name,age
0,0,A,21.0
1,1,B,23.0
2,2,C,34.0
3,3,D,54.0


In [224]:
df1.iloc[:,0:3]

Unnamed: 0,index,name,age
0,0,A,21.0
1,1,B,23.0
2,2,C,34.0
3,3,D,54.0


In [225]:
df1.iloc[0:2,:]

Unnamed: 0,index,name,age
0,0,A,21.0
1,1,B,23.0


In [205]:
df1.age > 21

0    False
1     True
2     True
3     True
Name: age, dtype: bool

In [209]:
df1[df1.age > 21]

Unnamed: 0,index,name,age
1,1,B,23.0
2,2,C,34.0
3,3,D,54.0


In [210]:
df1[df1.age%2==0]

Unnamed: 0,index,name,age
2,2,C,34.0
3,3,D,54.0


In [212]:
df1[(df1.age > 21) & (df1.age<50)]

Unnamed: 0,index,name,age
1,1,B,23.0
2,2,C,34.0


### filter

In [214]:
df1.filter(items=['age','name'])

Unnamed: 0,age,name
0,21.0,A
1,23.0,B
2,34.0,C
3,54.0,D


In [215]:
df1.filter(like='ge')

Unnamed: 0,age
0,21.0
1,23.0
2,34.0
3,54.0


In [216]:
# ^a means starting with a
df1.filter(regex='^a')

Unnamed: 0,age
0,21.0
1,23.0
2,34.0
3,54.0


In [217]:
# me$ means end with me
df1.filter(regex='me$')

Unnamed: 0,name
0,A
1,B
2,C
3,D


In [218]:
df1.name.apply(lambda name: name.startswith('A'))

0     True
1    False
2    False
3    False
Name: name, dtype: bool

In [227]:
df1.where(df1.age>24)

Unnamed: 0,index,name,age
0,,,
1,,,
2,2.0,C,34.0
3,3.0,D,54.0


In [228]:
# dropna() by default drops row 
df1.where(df1.age>24).dropna()

Unnamed: 0,index,name,age
2,2.0,C,34.0
3,3.0,D,54.0


In [230]:
# dropna(axis=1) drops all columns containing null value
df1.where(df1.age>24).dropna(axis=1)

0
1
2
3


In [231]:
# fillna() used to fill missing value, here missing value are filled with 18
df1.where(df1.age>24)[['age']].fillna(18)

Unnamed: 0,age
0,18.0
1,18.0
2,34.0
3,54.0


In [232]:
# method='bfill' means fill missing value with value from row next to missing value row
df1.where(df1.age>24)[['age']].fillna(method='bfill')

Unnamed: 0,age
0,34.0
1,34.0
2,34.0
3,54.0


In [233]:
# method='ffill' means foeward fill
df1.where(df1.age>24)[['age']].fillna(method='ffill')

Unnamed: 0,age
0,
1,
2,34.0
3,54.0


In [234]:
# append new rows in df1 dataframe
df.append(pd.DataFrame({
    'index':[4,5],
    'name':['E','F'],
    'age':[33,34]
}))

  df.append(pd.DataFrame({


Unnamed: 0,name,age,index
0,a,21,
1,b,23,
2,c,34,
3,d,54,
4,e,24,
5,f,23,
0,E,33,4.0
1,F,34,5.0


In [236]:
df3 = pd.DataFrame({
    'gender':['M','F','M','F']
})

In [237]:
df3

Unnamed: 0,gender
0,M
1,F
2,M
3,F


In [238]:
# merging dataframe
pd.concat([df1,df3],axis=1)

Unnamed: 0,index,name,age,gender
0,0,A,21.0,M
1,1,B,23.0,F
2,2,C,34.0,M
3,3,D,54.0,F
