In [28]:
import pandas as pd

In [29]:
# Read the salary file, storing Department as a categorical column, and
# discard any row in which every field is missing.
chicago = (
    pd.read_csv("chicago.csv", dtype={"Department": "category"})
    .dropna(how="all")
)
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


## .str prefix in pandas
- Plain Python string methods (`title()`, `lower()`, ...) are not available on a pandas Series directly; they must be called through the Series' `.str` accessor.

In [30]:
# Calling a plain string method directly on a Series raises AttributeError.
# The error is caught and printed so that the demonstration is still visible
# while the notebook keeps running under "Restart Kernel -> Run All".
try:
    chicago['Name'].title()
except AttributeError as err:
    print(f"{type(err).__name__}: {err}")

AttributeError: 'Series' object has no attribute 'title'

In [31]:
# Going through the .str accessor applies title() to every element.
# The same route works for lower(), upper(), strip(), lstrip(), rstrip() and len().
chicago["Name"] = chicago["Name"].str.title()

In [32]:
# Number of characters in each entry of the Department column.
chicago["Department"].str.len()

0        11
1         6
2         6
3        16
4        11
5        12
6        13
7         4
8        12
9         8
10        4
11        6
12       16
13        6
14        4
15        6
16        6
17        4
18        6
19        4
20       11
21        4
22        4
23        4
24       16
25       11
26       13
27       16
28        6
29        4
         ..
32032    16
32033     6
32034     6
32035     6
32036    13
32037     4
32038    11
32039     4
32040     6
32041     4
32042    16
32043    13
32044     6
32045     4
32046     7
32047     6
32048     3
32049     4
32050    11
32051     8
32052     6
32053     4
32054     6
32055     6
32056    16
32057    16
32058     6
32059     6
32060     6
32061     4
Name: Department, Length: 32062, dtype: int64

In [33]:
# Preview the first three rows; Name is now title-cased.
chicago.head(n=3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"Aaron, Elvia J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"Aaron, Jeffery M",POLICE OFFICER,POLICE,$84450.00
2,"Aaron, Karina",POLICE OFFICER,POLICE,$84450.00


## .str.replace method

In [34]:
# Expand the MGMNT abbreviation to MANAGEMENT in every department name.
# regex=False asks for a plain-text replacement instead of relying on the
# default of `regex`, which is not the same in every pandas version.
chicago['Department'] = chicago['Department'].str.replace('MGMNT', 'MANAGEMENT', regex=False)

# Remove the "$" sign from the salary and convert the column to float.
# "$" is the end-of-string anchor in a regular expression, so the literal
# match is requested explicitly here.
chicago['Employee Annual Salary'] = (
    chicago['Employee Annual Salary']
    .str.replace('$', '', regex=False)
    .astype(float)
)

In [35]:
# Preview the first three rows after the two replacements.
chicago.head(n=3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"Aaron, Elvia J",WATER RATE TAKER,WATER MANAGEMENT,90744.0
1,"Aaron, Jeffery M",POLICE OFFICER,POLICE,84450.0
2,"Aaron, Karina",POLICE OFFICER,POLICE,84450.0


## Filter DataFrame rows using string methods
- Uses the following methods: `.str.contains()`, `.str.startswith()`, `.str.endswith()`

In [37]:
# Reload the file so this section starts again from the unmodified data.
chicago = (
    pd.read_csv("chicago.csv", dtype={"Department": "category"})
    .dropna(how="all")
)
chicago.head(n=3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00


In [53]:
# Normalise the text first (here: lower-case it) so the search does not depend
# on how the titles happen to be capitalised.
# na=False makes a missing title count as "no match"; without it the mask would
# contain NaN for such rows and could not be used for boolean indexing.
mask = chicago['Position Title'].str.lower().str.contains("water", na=False)
chicago[mask].head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
554,"ALUISE, VINCENT G",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MGMNT,$102440.00
671,"ANDER, PERRY A",WATER CHEMIST II,WATER MGMNT,$82044.00


In [52]:
# Find all positions whose title starts with "water" (case-insensitive via lower()).
# na=False makes a missing title count as "no match", keeping the mask strictly
# boolean so it can be used for row selection.
mask = chicago['Position Title'].str.lower().str.startswith("water", na=False)
chicago[mask].head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
671,"ANDER, PERRY A",WATER CHEMIST II,WATER MGMNT,$82044.00
1054,"ASHLEY, KARMA T",WATER CHEMIST II,WATER MGMNT,$82044.00


In [54]:
# Find all positions whose title ends with "ist" (case-insensitive via lower()).
# na=False makes a missing title count as "no match", keeping the mask strictly
# boolean so it can be used for row selection.
mask = chicago['Position Title'].str.lower().str.endswith("ist", na=False)
chicago[mask].head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
184,"AFROZ, NAYYAR",PSYCHIATRIST,HEALTH,$99840.00
308,"ALARCON, LUIS J",LOAN PROCESSING SPECIALIST,COMMUNITY DEVELOPMENT,$81948.00
422,"ALLAIN, CAROLYN",SENIOR TELECOMMUNICATIONS SPECIALIST,DoIT,$89880.00


## Stripping the spaces

In [55]:
# Reload the file so this section starts again from the unmodified data.
chicago = (
    pd.read_csv("chicago.csv", dtype={"Department": "category"})
    .dropna(how="all")
)
chicago.head(n=3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00


In [58]:
# Trim whitespace in two steps: first from the left end, then from the right end.
(
    chicago["Name"]
    .str.lstrip()
    .str.rstrip()
    .head(n=3)
)

0      AARON,  ELVIA J
1    AARON,  JEFFERY M
2       AARON,  KARINA
Name: Name, dtype: object

In [59]:
# strip() trims both ends of each value in a single call.
chicago["Name"].str.strip().head(n=3)

0      AARON,  ELVIA J
1    AARON,  JEFFERY M
2       AARON,  KARINA
Name: Name, dtype: object

## Invoke String methods on DF Index and Columns

In [62]:
# Reload the file, this time using the Name column as the row index.
chicago = (
    pd.read_csv(
        "chicago.csv",
        index_col="Name",
        dtype={"Department": "category"},
    )
    .dropna(how="all")
)
chicago.head(n=3)

Unnamed: 0_level_0,Position Title,Department,Employee Annual Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00


In [66]:
# The index exposes the same .str accessor as a Series: title-case every label.
chicago.index = chicago.index.str.title()

In [67]:
# Preview the first three rows; the index labels are now title-cased.
chicago.head(n=3)

Unnamed: 0_level_0,Position Title,Department,Employee Annual Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Aaron, Elvia J",WATER RATE TAKER,WATER MGMNT,$90744.00
"Aaron, Jeffery M",POLICE OFFICER,POLICE,$84450.00
"Aaron, Karina",POLICE OFFICER,POLICE,$84450.00


In [70]:
# Column labels support .str as well: upper-case every column name.
chicago.columns = chicago.columns.str.upper()

In [71]:
# Preview the first three rows; the column names are now upper-cased.
chicago.head(n=3)

Unnamed: 0_level_0,POSITION TITLE,DEPARTMENT,EMPLOYEE ANNUAL SALARY
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Aaron, Elvia J",WATER RATE TAKER,WATER MGMNT,$90744.00
"Aaron, Jeffery M",POLICE OFFICER,POLICE,$84450.00
"Aaron, Karina",POLICE OFFICER,POLICE,$84450.00


## Split String by Characters with .str.split() Method

In [72]:
# Reload the file so this section starts again from the unmodified data.
chicago = (
    pd.read_csv("chicago.csv", dtype={"Department": "category"})
    .dropna(how="all")
)
chicago.head(n=3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00


In [78]:
# Replace Name with only the text before the comma, title-cased.
# (The values are laid out as "AARON, ELVIA J", so this keeps "Aaron".)
chicago["Name"] = (
    chicago["Name"]
    .str.split(",")
    .str.get(0)
    .str.title()
)

In [79]:
# Preview the first three rows; Name now holds only the text before the comma.
chicago.head(n=3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,Aaron,WATER RATE TAKER,WATER MGMNT,$90744.00
1,Aaron,POLICE OFFICER,POLICE,$84450.00
2,Aaron,POLICE OFFICER,POLICE,$84450.00


In [80]:
#Let's see which first words appear most often in Position Title

In [81]:
# Take the first space-separated word of every title and count how often each occurs.
(
    chicago["Position Title"]
    .str.split(" ")
    .str.get(0)
    .value_counts()
)

POLICE                   10856
FIREFIGHTER-EMT           1509
SERGEANT                  1186
POOL                       918
FIREFIGHTER                810
CROSSING                   775
MOTOR                      721
SANITATION                 715
PARAMEDIC                  641
ASST                       606
FIRE                       512
TRAFFIC                    512
SENIOR                     470
CONSTRUCTION               452
LIEUTENANT-EMT             394
ADMINISTRATIVE             375
LIBRARY                    365
LIBRARIAN                  335
LIEUTENANT                 332
OPERATING                  324
ELECTRICAL                 313
AVIATION                   309
FIREFIGHTER/PARAMEDIC      259
GENERAL                    257
STAFF                      250
CLERK                      242
FOREMAN                    237
HOISTING                   214
DEPUTY                     213
MACHINIST                  210
                         ...  
INSPECTOR                    1
VOLUNTEE

## Exploring 'expand' and 'n' parameters of split method

In [82]:
# Reload the file so this section starts again from the unmodified data.
chicago = (
    pd.read_csv("chicago.csv", dtype={"Department": "category"})
    .dropna(how="all")
)
chicago.head(n=3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00


In [83]:
# Default split: each row becomes a list of the pieces.
# Passing expand=True instead returns the pieces as the columns of a DataFrame.
chicago["Name"].str.split(",").head(n=3)

0      [AARON,   ELVIA J]
1    [AARON,   JEFFERY M]
2       [AARON,   KARINA]
Name: Name, dtype: object

In [84]:
# With expand=True the split pieces come back as separate DataFrame columns.
chicago["Name"].str.split(",", expand=True).head(n=3)

Unnamed: 0,0,1
0,AARON,ELVIA J
1,AARON,JEFFERY M
2,AARON,KARINA


In [85]:
# Store the two pieces of each name as new columns of the original DataFrame.
# Splitting on "," alone leaves the whitespace that follows the comma at the
# start of the second piece, so both pieces are stripped before being stored.
# NOTE: 'First Name' receives the text before the comma and 'Second Name' the
# text after it; the column labels are kept exactly as they were.
chicago[['First Name', 'Second Name']] = (
    chicago['Name']
    .str.split(",", expand=True)
    .apply(lambda part: part.str.strip())
)

In [86]:
# Preview the first three rows, including the two new name columns.
chicago.head(n=3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary,First Name,Second Name
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00,AARON,ELVIA J
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00,AARON,JEFFERY M
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00,AARON,KARINA


In [91]:
# The 'n' parameter limits how many splits are performed.
#
# Without a limit, every space produces a split. Titles have different numbers
# of words, so the expanded frame is as wide as the longest title and shorter
# titles are padded with missing values.
chicago["Position Title"].str.split(" ", expand=True).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,WATER,RATE,TAKER,,,,,,
1,POLICE,OFFICER,,,,,,,
2,POLICE,OFFICER,,,,,,,
3,CHIEF,CONTRACT,EXPEDITER,,,,,,
4,CIVIL,ENGINEER,IV,,,,,,


In [92]:
# n=1 performs a single split: the first word, then everything after it.
chicago["Position Title"].str.split(" ", n=1, expand=True).head()

Unnamed: 0,0,1
0,WATER,RATE TAKER
1,POLICE,OFFICER
2,POLICE,OFFICER
3,CHIEF,CONTRACT EXPEDITER
4,CIVIL,ENGINEER IV


In [93]:
# Store the first word of each title and the remainder as two new columns.
chicago[["First Position Word", "Remaining Words"]] = (
    chicago["Position Title"].str.split(" ", n=1, expand=True)
)

In [95]:
# Preview the first three rows, including the two new position-title columns.
chicago.head(n=3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary,First Name,Second Name,First Position Word,Remaining Words
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00,AARON,ELVIA J,WATER,RATE TAKER
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00,AARON,JEFFERY M,POLICE,OFFICER
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00,AARON,KARINA,POLICE,OFFICER
