In [2]:
import pandas as pd

### Intro to the Working with Text Data Module

In [3]:
chicago = pd.read_csv('chicago.csv').dropna(how='all')
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [7]:
chicago.get_dtype_counts()
chicago.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32063 entries, 0 to 32062
Data columns (total 4 columns):
Name                      32062 non-null object
Position Title            32062 non-null object
Department                32062 non-null object
Employee Annual Salary    32062 non-null object
dtypes: object(4)
memory usage: 1002.0+ KB


In [13]:
## Make the data more efficient and less memory
chicago['Department'].nunique()
chicago['Department'].count()

32062

In [4]:
chicago['Department'] = chicago['Department'].astype('category')

In [15]:
chicago.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32063 entries, 0 to 32062
Data columns (total 4 columns):
Name                      32062 non-null object
Position Title            32062 non-null object
Department                32062 non-null category
Employee Annual Salary    32062 non-null object
dtypes: category(1), object(3)
memory usage: 784.4+ KB


### Common String Methods - .lower(), .upper(), .title() and .len()

In [19]:
## review those functions on single string
## .lower() function converts all characters in a string to lowercase, .upper() function does the reverse thing
'HELLO WORLD'.lower()
'hello world'.upper()

'HELLO WORLD'

In [24]:
## .title() function capitalize the first letter of every word(recognized by spaces)
'HELLO WORLD'.title()
'hello world'.title()

'Hello World'

In [22]:
## len() Python built-in function which tells us the number of characters including space
len('hello world')

11

In [38]:
##The syntax of these functions is a little different on series/columns
## chicago['Name'].lower()   -- This will give us an error, the syntax has to be .str.upper()/.str.lower()
chicago['Name'].str.lower().head(3)
chicago['Name'].str.lower().str.upper().head(3)

0      AARON,  ELVIA J
1    AARON,  JEFFERY M
2       AARON,  KARINA
Name: Name, dtype: object

In [46]:
chicago['Name'] = chicago['Name'].str.title()
chicago['Position Title'] = chicago['Position Title'].str.title()
chicago.head(5)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"Aaron, Elvia J",Water Rate Taker,WATER MGMNT,$90744.00
1,"Aaron, Jeffery M",Police Officer,POLICE,$84450.00
2,"Aaron, Karina",Police Officer,POLICE,$84450.00
3,"Aaron, Kimberlei R",Chief Contract Expediter,GENERAL SERVICES,$89880.00
4,"Abad Jr, Vicente M",Civil Engineer Iv,WATER MGMNT,$106836.00


In [44]:
len(chicago['Department'])   
## This will give me the number of records of this series in DataFrame including Null values, 32063 records

chicago['Department'].str.len().head(5)  
## This gives me the number of records in each category, but all rows will be shown

0    11.0
1     6.0
2     6.0
3    16.0
4    11.0
Name: Department, dtype: float64

### The .str.replace() Method

In [49]:
## Frist argument in .replace is going to look for, second argument is what we want to see in string
'Hello world'.replace("l", '!')

'He!!o wor!d'

In [54]:
chicago.dropna(how='all', inplace = True)
chicago.tail(1)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
32061,"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [55]:
chicago['Department'] = chicago['Department'].str.replace("MGMNT", "MANAGEMENT")

In [57]:
chicago['Employee Annual Salary'] = chicago['Employee Annual Salary'].str.replace("$", "").astype("float")

In [59]:
chicago['Employee Annual Salary'].mean()
chicago['Employee Annual Salary'].nlargest(10)


8184     300000.0
7954     216210.0
25532    202728.0
8924     197736.0
8042     197724.0
19208    195000.0
3706     187680.0
18556    187680.0
29466    187680.0
13754    185364.0
Name: Employee Annual Salary, dtype: float64

### Filtering with String Methods

In [66]:
## Extract the rows meet the condition based on string filter
## .str.contains() takes a single argument and that is the substring we want to search for
chicago['Position Title'].str.lower().str.contains("water")   ## This gives a boolean series
chicago[chicago['Position Title'].str.lower().str.contains("water")]

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
554,"ALUISE, VINCENT G",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MGMNT,$102440.00
671,"ANDER, PERRY A",WATER CHEMIST II,WATER MGMNT,$82044.00
685,"ANDERSON, ANDREW J",DISTRICT SUPERINTENDENT OF WATER DISTRIBUTION,WATER MGMNT,$109272.00
702,"ANDERSON, DONALD",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MGMNT,$102440.00
1054,"ASHLEY, KARMA T",WATER CHEMIST II,WATER MGMNT,$82044.00
1079,"ATKINS, JOANNA M",WATER CHEMIST II,WATER MGMNT,$82044.00
1181,"AZEEM, MOHAMMED A",WATER CHEMIST II,WATER MGMNT,$53172.00
1285,"BAJIC, JOHN A",WATER METER MACHINIST,WATER MGMNT,$82576.00
2400,"BOLTON, BRIAN E",WATER RATE TAKER,WATER MGMNT,$78948.00


In [69]:
## .str.startswith()  and .str.endswith() will check the begin or the end of the string
chicago[chicago['Position Title'].str.lower().str.startswith('water')].head(2)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
671,"ANDER, PERRY A",WATER CHEMIST II,WATER MGMNT,$82044.00


In [71]:
chicago[chicago['Position Title'].str.lower().str.endswith('ist')].head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
184,"AFROZ, NAYYAR",PSYCHIATRIST,HEALTH,$99840.00
308,"ALARCON, LUIS J",LOAN PROCESSING SPECIALIST,COMMUNITY DEVELOPMENT,$81948.00
422,"ALLAIN, CAROLYN",SENIOR TELECOMMUNICATIONS SPECIALIST,DoIT,$89880.00


### More String Methods - .strip(), .lstrip(), and .rstrip()

In [5]:
## .strip() function family is used to remove whitespace from a string
## .strip() functions on basic strings
'        Hello World  '.lstrip()  ## remove the left whitespaces

'Hello World  '

In [6]:
'        Hello World  '.rstrip()  ## remove the right whitespaces

'        Hello World'

In [7]:
'        Hello World  '.strip()   ## remove whitespaces at the beginninga dn end of the string

'Hello World'

In [9]:
chicago['Name'].str.lstrip()
chicago['Name'].str.rstrip()
chicago['Name'] = chicago['Name'].str.lstrip().str.rstrip()

In [10]:
chicago['Position Title'] = chicago['Position Title'].str.strip()

### String Methods on Index Labels and Columns Labels

In [23]:
chicago = pd.read_csv('chicago.csv', index_col='Name').dropna(how='all')
chicago['Department'] = chicago['Department'].astype('category')

In [16]:
chicago.index = chicago.index.str.strip().str.title()
chicago.head(3)

Unnamed: 0_level_0,Position Title,Department,Employee Annual Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Aaron, Elvia J",WATER RATE TAKER,WATER MGMNT,$90744.00
"Aaron, Jeffery M",POLICE OFFICER,POLICE,$84450.00
"Aaron, Karina",POLICE OFFICER,POLICE,$84450.00


In [22]:
chicago.columns = chicago.columns.str.upper()
chicago.head(1)

Unnamed: 0_level_0,POSITION TITLE,DEPARTMENT,EMPLOYEE ANNUAL SALARY
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Aaron, Elvia J",WATER RATE TAKER,WATER MGMNT,$90744.00


### Split Strings by Characters with .str.split() Method

In [24]:
## .split() function takes a single argument which represents what is called the delimiter or the separator
## .split() function: every time that it runs into the delimiter in the string, it is going to cut and return a list of 
## separate conponents of that string. Default delimitor is space
'Hello my name is Boris'.split()

['Hello', 'my', 'name', 'is', 'Boris']

In [32]:
chicago['Name'].str.split(',').head(2)   ## Get a two-item lists for every row

0      [AARON,   ELVIA J]
1    [AARON,   JEFFERY M]
Name: Name, dtype: object

In [38]:
## Pull the first item of each list of Name, we cannot use:
## chicago['Name'].str.split(',')[0], this will only give the first list, we need to use .get() function
chicago['Name'].str.split(',').str.get(0).str.title().value_counts()

Williams        293
Johnson         244
Smith           241
Brown           185
Jones           183
Rodriguez       171
Jackson         136
Garcia          130
Davis           127
Hernandez       110
Martinez        108
Lopez           106
Gonzalez        104
Perez           100
Wilson           94
Rivera           90
Thomas           89
Anderson         82
Torres           81
Murphy           80
Robinson         79
Moore            78
Sanchez          76
Harris           76
Miller           75
Lewis            74
Taylor           73
Martin           72
White            66
Clark            66
               ... 
Stanczak          1
Ambrosia          1
Byrne Iii         1
Givens Jr         1
Kriv              1
Faragalla         1
Economos          1
Felsenthal        1
Giovannielli      1
Mcgrone           1
Iskowitz          1
Maida Kohler      1
Errera            1
Napolillo         1
Baumhardt         1
Jamrok            1
Carrizal          1
Gricus            1
Gammal            1


In [49]:
chicago['Position Title'].str.split().str.get(0).value_counts().head(3)

POLICE             10856
FIREFIGHTER-EMT     1509
SERGEANT            1186
Name: Position Title, dtype: int64

## More Practice wth Splits

In [59]:
## We want to have the middle initial
chicago['Name'].str.split(', ').str.get(1).str.split()   ## one way
chicago['Name'].str.split(',').str.get(1).str.strip().str.split().str.get(-1).head(2)

0    J
1    M
Name: Name, dtype: object

In [61]:
## We want to have the first name
chicago['Name'].str.split(',').str.get(1).str.strip().str.split().str.get(0).head(2)
chicago['Name'].str.split(',').str.get(1).str.strip().str.split().str.get(0).value_counts().head()

MICHAEL    1153
JOHN        899
JAMES       676
ROBERT      622
JOSEPH      537
Name: Name, dtype: int64

### The expand and n Parametes of the .str.split() Method

In [65]:
## 1. expand =  parameter in split() function, default is set to False, if we set expand = True, it will return a dataframe
chicago[['First Name' , 'Last Name']] = chicago['Name'].str.split(',', expand = True)
chicago.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary,First Name,Last Name
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00,AARON,ELVIA J
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00,AARON,JEFFERY M
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00,AARON,KARINA


In [69]:
## 2. n =  parameter in split() function, n represent the maximum number of splits that we want
chicago[['First Title Word', 'Remaining Words']] = chicago['Position Title'].str.split(expand = True, n = 1)
chicago.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary,First Name,Last Name,First Title Word,Remaining Words
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00,AARON,ELVIA J,WATER,RATE TAKER
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00,AARON,JEFFERY M,POLICE,OFFICER
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00,AARON,KARINA,POLICE,OFFICER
