- ## .astype()
- ## .between()
- ## .clip()
- ## .duplicated()
- ## .drop_duplicates()
- ## .isnull()
- ## .dropna()
- ## .fillna()
- ## .isin()
- ## .apply()

In [90]:
import pandas as pd

# Load movie -> lead actor. With "movie" as the index a single column is left,
# so squeeze("columns") collapses the one-column frame into a Series.
movies = (
    pd.read_csv("./datasets/bollywood.csv", index_col="movie")
    .squeeze("columns")
)
movies.head()

movie
Uri: The Surgical Strike                   Vicky Kaushal
Battalion 609                                Vicky Ahuja
The Accidental Prime Minister (film)         Anupam Kher
Why Cheat India                            Emraan Hashmi
Evening Shadows                         Mona Ambegaonkar
Name: lead, dtype: object

In [93]:
# Load runs per match. With "match_no" as the index a single column is left,
# so squeeze("columns") collapses the one-column frame into a Series.
vk = (
    pd.read_csv("datasets/kohli_ipl.csv", index_col="match_no")
    .squeeze("columns")
)
vk.head()

match_no
1     1
2    23
3    13
4    12
5     1
Name: runs, dtype: int64

## .astype()

In [94]:
# Baseline memory footprint (in bytes) of the int64 Series, before downcasting
import sys
sys.getsizeof(vk)

3456

In [97]:
# Convert the values from int64 to int16 to reduce memory use.
# NOTE(review): assumes every value fits in the int16 range — confirm before reusing on other data.
change_size = vk.astype("int16")
change_size

match_no
1       1
2      23
3      13
4      12
5       1
       ..
211     0
212    20
213    73
214    25
215     7
Name: runs, Length: 215, dtype: int16

In [98]:
# Footprint of the int16 copy, for comparison with the int64 original
sys.getsizeof(change_size)

2166

## .between()

In [108]:
# Boolean mask: True where the runs fall within the range 80 to 100
mask = vk.between(80,100) # both endpoints (80 and 100) are included
mask

match_no
1      False
2      False
3      False
4      False
5      False
       ...  
211    False
212    False
213    False
214    False
215    False
Name: runs, Length: 215, dtype: bool

In [107]:
# Keep only the matches where the mask is True
vk[mask]

match_no
81      93
82      99
110     82
119     80
120    100
145     92
160     84
164    100
178     90
Name: runs, dtype: int64

## .clip()

In [110]:
vk

match_no
1       1
2      23
3      13
4      12
5       1
       ..
211     0
212    20
213    73
214    25
215     7
Name: runs, Length: 215, dtype: int64

In [109]:
vk.clip(80,100) # values below 80 are replaced with 80; values above 100 are replaced with 100

match_no
1      80
2      80
3      80
4      80
5      80
       ..
211    80
212    80
213    80
214    80
215    80
Name: runs, Length: 215, dtype: int64

## .duplicated()

In [112]:
vk.duplicated() # True where the same value already appeared in an earlier match

match_no
1      False
2      False
3      False
4      False
5       True
       ...  
211     True
212     True
213     True
214     True
215     True
Name: runs, Length: 215, dtype: bool

In [116]:
# Show only the entries flagged as duplicates
vk[vk.duplicated()]

match_no
5       1
13      3
22     38
23     19
26      2
       ..
211     0
212    20
213    73
214    25
215     7
Name: runs, Length: 137, dtype: int64

## .drop_duplicates()

In [120]:
# Drop repeated values, keeping the first occurrence of each
vk.drop_duplicates()

match_no
1       1
2      23
3      13
4      12
6       9
       ..
173    29
175    72
176    43
178    90
198    53
Name: runs, Length: 78, dtype: int64

In [122]:
vk.drop_duplicates(keep="last") # keep the last occurrence of each value instead of the first

match_no
26      2
37     37
41     71
49     15
58     22
       ..
211     0
212    20
213    73
214    25
215     7
Name: runs, Length: 78, dtype: int64

## .isnull()

In [124]:
# Small Series with two missing values (NaN) used to demonstrate the null-handling methods
import numpy as np
temp = pd.Series([1,2,3,np.nan,5,6,np.nan,7])
temp

0    1.0
1    2.0
2    3.0
3    NaN
4    5.0
5    6.0
6    NaN
7    7.0
dtype: float64

In [125]:
temp.count() # count() does not include NaN values

6

In [127]:
temp.size # .size is the total number of elements, including NaN

8

In [128]:
# Boolean mask: True where the value is missing (NaN)
temp.isnull()

0    False
1    False
2    False
3     True
4    False
5    False
6     True
7    False
dtype: bool

In [130]:
# Select only the missing entries
temp[temp.isnull()]

3   NaN
6   NaN
dtype: float64

## .dropna()

In [131]:
# Returns a new Series without the NaN entries; temp itself is left unchanged
temp.dropna()

0    1.0
1    2.0
2    3.0
4    5.0
5    6.0
7    7.0
dtype: float64

## .fillna()

In [132]:
temp

0    1.0
1    2.0
2    3.0
3    NaN
4    5.0
5    6.0
6    NaN
7    7.0
dtype: float64

In [133]:
temp.fillna(0) # replace missing (NaN) values with 0

0    1.0
1    2.0
2    3.0
3    0.0
4    5.0
5    6.0
6    0.0
7    7.0
dtype: float64

In [135]:
temp.fillna(temp.mean()) # replace missing values with the mean of the non-missing values

0    1.0
1    2.0
2    3.0
3    4.0
4    5.0
5    6.0
6    4.0
7    7.0
dtype: float64

## .isin()

In [136]:
vk

match_no
1       1
2      23
3      13
4      12
5       1
       ..
211     0
212    20
213    73
214    25
215     7
Name: runs, Length: 215, dtype: int64

In [137]:
# Select the matches where the score was exactly 49 or 52, using explicit comparisons
vk[(vk == 49) | (vk == 52)]

match_no
86     49
122    52
Name: runs, dtype: int64

In [138]:
vk.isin([49,52]) # compact equivalent of combining several == comparisons with |

match_no
1      False
2      False
3      False
4      False
5      False
       ...  
211    False
212    False
213    False
214    False
215    False
Name: runs, Length: 215, dtype: bool

In [139]:
# Filter with the isin() mask; gives the same rows as the explicit comparisons
vk[vk.isin([49,52])]

match_no
86     49
122    52
Name: runs, dtype: int64

## .apply()

In [140]:
movies

movie
Uri: The Surgical Strike                   Vicky Kaushal
Battalion 609                                Vicky Ahuja
The Accidental Prime Minister (film)         Anupam Kher
Why Cheat India                            Emraan Hashmi
Evening Shadows                         Mona Ambegaonkar
                                              ...       
Hum Tumhare Hain Sanam                    Shah Rukh Khan
Aankhen (2002 film)                     Amitabh Bachchan
Saathiya (film)                             Vivek Oberoi
Company (film)                                Ajay Devgn
Awara Paagal Deewana                        Akshay Kumar
Name: lead, Length: 1500, dtype: object

In [144]:
# apply() runs custom logic on every value; here each name is split into a list of words
movies.apply(lambda x : x.split())

movie
Uri: The Surgical Strike                   [Vicky, Kaushal]
Battalion 609                                [Vicky, Ahuja]
The Accidental Prime Minister (film)         [Anupam, Kher]
Why Cheat India                            [Emraan, Hashmi]
Evening Shadows                         [Mona, Ambegaonkar]
                                               ...         
Hum Tumhare Hain Sanam                   [Shah, Rukh, Khan]
Aankhen (2002 film)                     [Amitabh, Bachchan]
Saathiya (film)                             [Vivek, Oberoi]
Company (film)                                [Ajay, Devgn]
Awara Paagal Deewana                        [Akshay, Kumar]
Name: lead, Length: 1500, dtype: object

In [145]:
# Keep only the first word of each name
movies.apply(lambda x : x.split()[0])

movie
Uri: The Surgical Strike                  Vicky
Battalion 609                             Vicky
The Accidental Prime Minister (film)     Anupam
Why Cheat India                          Emraan
Evening Shadows                            Mona
                                         ...   
Hum Tumhare Hain Sanam                     Shah
Aankhen (2002 film)                     Amitabh
Saathiya (film)                           Vivek
Company (film)                             Ajay
Awara Paagal Deewana                     Akshay
Name: lead, Length: 1500, dtype: object

In [146]:
# First word of each name, converted to upper case
movies.apply(lambda x : x.split()[0].upper())

movie
Uri: The Surgical Strike                  VICKY
Battalion 609                             VICKY
The Accidental Prime Minister (film)     ANUPAM
Why Cheat India                          EMRAAN
Evening Shadows                            MONA
                                         ...   
Hum Tumhare Hain Sanam                     SHAH
Aankhen (2002 film)                     AMITABH
Saathiya (film)                           VIVEK
Company (film)                             AJAY
Awara Paagal Deewana                     AKSHAY
Name: lead, Length: 1500, dtype: object

In [149]:
def fname(x: str) -> str:
    """Return the first whitespace-separated word of ``x`` in upper case.

    Args:
        x: A name such as ``"Vicky Kaushal"``.

    Returns:
        The first word upper-cased (``"VICKY"``), or an empty string when
        ``x`` is empty or contains only whitespace.
    """
    # maxsplit=1 is enough: only the first token is needed.
    tokens = x.split(maxsplit=1)
    # A blank value would otherwise raise IndexError and abort the whole .apply().
    return tokens[0].upper() if tokens else ""

In [150]:
# Same transformation as the lambda version, using the named function
movies.apply(fname)

movie
Uri: The Surgical Strike                  VICKY
Battalion 609                             VICKY
The Accidental Prime Minister (film)     ANUPAM
Why Cheat India                          EMRAAN
Evening Shadows                            MONA
                                         ...   
Hum Tumhare Hain Sanam                     SHAH
Aankhen (2002 film)                     AMITABH
Saathiya (film)                           VIVEK
Company (film)                             AJAY
Awara Paagal Deewana                     AKSHAY
Name: lead, Length: 1500, dtype: object