# Working with Text Data


In [3]:
import pandas as pd

## This Module's Dataset
- This module's dataset (`chicago.csv`) is a collection of public sector employees in the city of Chicago
- Each row include's the employee's name, position, department and salary.

In [13]:
chicago_df = pd.read_csv("chicago.csv").dropna(how="all")
chicago_df['Department'] = chicago_df['Department'].astype("category")
chicago_df.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [15]:
chicago_df.info

<bound method DataFrame.info of                       Name                  Position Title        Department  \
0          AARON,  ELVIA J                WATER RATE TAKER       WATER MGMNT   
1        AARON,  JEFFERY M                  POLICE OFFICER            POLICE   
2           AARON,  KARINA                  POLICE OFFICER            POLICE   
3      AARON,  KIMBERLEI R        CHIEF CONTRACT EXPEDITER  GENERAL SERVICES   
4      ABAD JR,  VICENTE M               CIVIL ENGINEER IV       WATER MGMNT   
...                    ...                             ...               ...   
32057  ZYGADLO,  MICHAEL J  FRM OF MACHINISTS - AUTOMOTIVE  GENERAL SERVICES   
32058   ZYGOWICZ,  PETER J                  POLICE OFFICER            POLICE   
32059    ZYMANTAS,  MARK E                  POLICE OFFICER            POLICE   
32060  ZYRKOWSKI,  CARLO E                  POLICE OFFICER            POLICE   
32061  ZYSKOWSKI,  DARIUSZ         CHIEF DATA BASE ANALYST              DoIT   

      E

## Common String Methods

In [16]:
chicago = chicago_df.copy()
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [18]:
chicago['Name'].str.title()

0            Aaron,  Elvia J
1          Aaron,  Jeffery M
2             Aaron,  Karina
3        Aaron,  Kimberlei R
4        Abad Jr,  Vicente M
                ...         
32057    Zygadlo,  Michael J
32058     Zygowicz,  Peter J
32059      Zymantas,  Mark E
32060    Zyrkowski,  Carlo E
32061    Zyskowski,  Dariusz
Name: Name, Length: 32062, dtype: object

In [19]:
chicago['Name'].str.title().str.len() # since `str.title` function returns a series of string the `str` prop has to be used again

0        15
1        17
2        14
3        19
4        19
         ..
32057    19
32058    18
32059    17
32060    19
32061    19
Name: Name, Length: 32062, dtype: int64

## Filtering with String Methods
- `contains`
- `startswith`
- `endswith`

In [21]:
chicago = chicago_df.copy()
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [23]:
chicago[chicago["Name"].str.contains("AARON")]

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
63,"ACEVEDO, AARON F",POLICE OFFICER,POLICE,$81588.00
952,"ARMSTEAD, AARON P",POLICE OFFICER,POLICE,$84450.00
1716,"BEALS JR, AARON",LIEUTENANT-PARAMEDIC,FIRE,$121068.00
3200,"BRYANT, AARON S",LIBRARY PAGE,PUBLIC LIBRARY,$12844.00
3270,"BUCKLEY, AARON D",PARAMEDIC,FIRE,$81588.00
3416,"BURNS, AARON",POLICE OFFICER,POLICE,$74028.00


In [25]:
chicago[chicago ["Position Title"].str.lower().str.contains("water")]

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
554,"ALUISE, VINCENT G",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MGMNT,$102440.00
671,"ANDER, PERRY A",WATER CHEMIST II,WATER MGMNT,$82044.00
685,"ANDERSON, ANDREW J",DISTRICT SUPERINTENDENT OF WATER DISTRIBUTION,WATER MGMNT,$109272.00
702,"ANDERSON, DONALD",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MGMNT,$102440.00
...,...,...,...,...
29669,"VERMA, ANUPAM",MANAGING ENGINEER - WATER MANAGEMENT,WATER MGMNT,$111192.00
30239,"WASHINGTON, JOSEPH",WATER CHEMIST III,WATER MGMNT,$89676.00
30544,"WEST, THOMAS R",GEN SUPT OF WATER MANAGEMENT,WATER MGMNT,$115704.00
30991,"WILLIAMS, MATTHEW",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MGMNT,$102440.00


## Str Methods on Index and Columns

In [29]:
chicago = chicago_df.copy()
chicago = chicago.set_index("Name").sort_index()
chicago.head()

Unnamed: 0_level_0,Position Title,Department,Employee Annual Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [32]:
chicago[chicago.index.str.contains('JEFFERY')]

Unnamed: 0_level_0,Position Title,Department,Employee Annual Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
"BARBER, JEFFERY P",POLICE OFFICER,POLICE,$90618.00
"BLACK, JEFFERY L",POLICE OFFICER,POLICE,$87384.00
"BROSSEAU, JEFFERY J",POLICE OFFICER,POLICE,$90618.00
"BURKS, JEFFERY G",POLICE OFFICER,POLICE,$81588.00
"GABRIEL, JEFFERY S",FRM OF MACHINISTS - AUTOMOTIVE,GENERAL SERVICES,$99528.00
"GALLOWAY, JEFFERY M",POLICE OFFICER,POLICE,$90618.00
"HADDON, JEFFERY A",POLICE OFFICER,POLICE,$87384.00
"HAVELKA, JEFFERY S",POLICE OFFICER,POLICE,$84450.00
"HAZLE, JEFFERY S",POOL MOTOR TRUCK DRIVER,AVIATION,$72862.40


In [33]:
chicago.index = chicago.index.str.strip().str.title()

In [34]:
chicago.head()

Unnamed: 0_level_0,Position Title,Department,Employee Annual Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Aaron, Elvia J",WATER RATE TAKER,WATER MGMNT,$90744.00
"Aaron, Jeffery M",POLICE OFFICER,POLICE,$84450.00
"Aaron, Karina",POLICE OFFICER,POLICE,$84450.00
"Aaron, Kimberlei R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
"Abad Jr, Vicente M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [35]:
chicago.columns = chicago.columns.str.upper()

In [36]:
chicago.head()

Unnamed: 0_level_0,POSITION TITLE,DEPARTMENT,EMPLOYEE ANNUAL SALARY
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Aaron, Elvia J",WATER RATE TAKER,WATER MGMNT,$90744.00
"Aaron, Jeffery M",POLICE OFFICER,POLICE,$84450.00
"Aaron, Karina",POLICE OFFICER,POLICE,$84450.00
"Aaron, Kimberlei R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
"Abad Jr, Vicente M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


## The split Method

In [37]:
chicago = chicago_df.copy()
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [46]:
chicago['Position Title'].str.split(" ").str.get(0).value_counts()

Position Title
POLICE                10856
FIREFIGHTER-EMT        1509
SERGEANT               1186
POOL                    918
FIREFIGHTER             810
                      ...  
DEVELOPMENT               1
PURCHASING                1
MOBILE                    1
ANALYST                   1
TELECOMMUNICATIONS        1
Name: count, Length: 320, dtype: int64

In [57]:
chicago['Name'].str.title().str.split(', ').str.get(1).str.strip().str.split(' ').str.get(0).value_counts()

Name
Michael    1153
John        899
James       676
Robert      622
Joseph      537
           ... 
Samera        1
Kefeng        1
Edina         1
Tecla         1
Lilya         1
Name: count, Length: 5091, dtype: int64

## The expand and n Parameters of the split Method
- The `expand` parameter returns a **DataFrame** instead of a **Series** of lists.
- The `n` parmeter limits the number of splits.

In [59]:
chicago = chicago_df.copy()
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [66]:
chicago[['LastName', 'FirstName']]=chicago["Name"].str.split(",", expand=True) # expands into df

In [67]:
chicago

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary,FistName,LastName,FirstName
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00,AARON,AARON,ELVIA J
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00,AARON,AARON,JEFFERY M
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00,AARON,AARON,KARINA
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00,AARON,AARON,KIMBERLEI R
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00,ABAD JR,ABAD JR,VICENTE M
...,...,...,...,...,...,...,...
32057,"ZYGADLO, MICHAEL J",FRM OF MACHINISTS - AUTOMOTIVE,GENERAL SERVICES,$99528.00,ZYGADLO,ZYGADLO,MICHAEL J
32058,"ZYGOWICZ, PETER J",POLICE OFFICER,POLICE,$87384.00,ZYGOWICZ,ZYGOWICZ,PETER J
32059,"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00,ZYMANTAS,ZYMANTAS,MARK E
32060,"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00,ZYRKOWSKI,ZYRKOWSKI,CARLO E


In [70]:
chicago["Position Title"].str.split(' ', expand=True)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,WATER,RATE,TAKER,,,,,,
1,POLICE,OFFICER,,,,,,,
2,POLICE,OFFICER,,,,,,,
3,CHIEF,CONTRACT,EXPEDITER,,,,,,
4,CIVIL,ENGINEER,IV,,,,,,
...,...,...,...,...,...,...,...,...,...
32057,FRM,OF,MACHINISTS,-,AUTOMOTIVE,,,,
32058,POLICE,OFFICER,,,,,,,
32059,POLICE,OFFICER,,,,,,,
32060,POLICE,OFFICER,,,,,,,


In [72]:
chicago["Position Title"].str.split(' ', expand=True, n=2) ## Number of splits, Split n times then take remainder and put in last column of df

Unnamed: 0,0,1,2
0,WATER,RATE,TAKER
1,POLICE,OFFICER,
2,POLICE,OFFICER,
3,CHIEF,CONTRACT,EXPEDITER
4,CIVIL,ENGINEER,IV
...,...,...,...
32057,FRM,OF,MACHINISTS - AUTOMOTIVE
32058,POLICE,OFFICER,
32059,POLICE,OFFICER,
32060,POLICE,OFFICER,
