In [31]:
import pandas as pd

# Load the raw data only. The Department -> category conversion is done in its
# own cell further down, so that the first chicago.info() call still reports
# the unoptimized object dtypes and memory usage for comparison.
chicago = pd.read_csv('Datasets/chicago.csv')
chicago.head(10)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00
5,"ABARCA, ANABEL",ASST TO THE ALDERMAN,CITY COUNCIL,$70764.00
6,"ABARCA, EMMANUEL",GENERAL LABORER - DSS,STREETS & SAN,$41849.60
7,"ABASCAL, REECE E",TRAFFIC CONTROL AIDE-HOURLY,OEMC,$20051.20
8,"ABBASI, CHRISTOPHER",STAFF ASST TO THE ALDERMAN,CITY COUNCIL,$49452.00
9,"ABBATACOLA, ROBERT J",ELECTRICAL MECHANIC,AVIATION,$93600.00


In [7]:
# Summary of column dtypes, non-null counts and memory usage.
chicago.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32063 entries, 0 to 32062
Data columns (total 4 columns):
Name                      32062 non-null object
Position Title            32062 non-null object
Department                32062 non-null object
Employee Annual Salary    32062 non-null object
dtypes: object(4)
memory usage: 1002.1+ KB


In [8]:
# df.nunique() counts the distinct values in each column; columns with few
# unique values are candidates for the category data type.

chicago.nunique()

Name                      31776
Position Title             1093
Department                   35
Employee Annual Salary     1156
dtype: int64

In [2]:
# Converting the 'Department' column to the category data type.

chicago['Department'] = chicago['Department'].astype('category')

In [3]:
chicago.info()

# After converting the Department column to category, the reported memory usage
# drops (1002.1+ KB before vs. 784.4+ KB in the output below).

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32063 entries, 0 to 32062
Data columns (total 4 columns):
Name                      32062 non-null object
Position Title            32062 non-null object
Department                32062 non-null category
Employee Annual Salary    32062 non-null object
dtypes: category(1), object(3)
memory usage: 784.4+ KB


# # S.str methods

#### S.str methods are used to apply Python string operations to column values.
#### String methods can be applied to a Series only through the S.str accessor.

In [22]:
# Selecting a single column with [] returns it as a Series.

chicago['Department']

0             WATER MGMNT
1                  POLICE
2                  POLICE
3        GENERAL SERVICES
4             WATER MGMNT
               ...       
32058              POLICE
32059              POLICE
32060              POLICE
32061                DoIT
32062                 NaN
Name: Department, Length: 32063, dtype: category
Categories (35, object): [ADMIN HEARNG, ANIMAL CONTRL, AVIATION, BOARD OF ELECTION, ..., STREETS & SAN, TRANSPORTN, TREASURER, WATER MGMNT]

In [25]:
# Applying the .title() string method to every value in the Department column.
# str.title() upper-cases the first letter of each word and lower-cases the
# rest (e.g. 'DoIT' becomes 'Doit'); NaN values are left as NaN.
# The result is only displayed here, not assigned back.

chicago['Department'].str.title()

0             Water Mgmnt
1                  Police
2                  Police
3        General Services
4             Water Mgmnt
               ...       
32058              Police
32059              Police
32060              Police
32061                Doit
32062                 NaN
Name: Department, Length: 32063, dtype: object

In [26]:
# Title-casing the 'Department' column values and writing them back.
# Note: the .str result shown above has dtype object, so this assignment
# replaces the category dtype set earlier.

chicago['Department'] = chicago['Department'].str.title()

In [28]:
# Title-casing the 'Position Title' column values and writing them back.

chicago['Position Title'] = chicago['Position Title'].str.title()

In [30]:
# .str.len() gives the string length of each value in the column
# (NaN for missing values, hence the float64 result).

chicago['Name'].str.len()

0        15.0
1        17.0
2        14.0
3        19.0
4        19.0
         ... 
32058    18.0
32059    17.0
32060    19.0
32061    19.0
32062     NaN
Name: Name, Length: 32063, dtype: float64

### # S.str.replace() method

#### S.str.replace() takes two arguments, what to replace, and with what. To apply .replace use it after .str (.str.replace()).

In [35]:
# Replacing the abbreviation 'MGMNT' with 'MANAGEMENT' in every value of the
# 'Department' column. An earlier cell title-cases this column, which turns
# 'MGMNT' into 'Mgmnt'; the case-sensitive upper-case pattern alone would then
# match nothing, so the title-case spelling is handled as well.
# regex=False treats both patterns as plain text.

chicago['Department'] = (
    chicago['Department']
    .str.replace('MGMNT', 'MANAGEMENT', regex=False)
    .str.replace('Mgmnt', 'Management', regex=False)
)

In [40]:
# Converting employee salary to float.
# regex=False makes '$' a literal dollar sign: as a regular expression '$' is
# the end-of-string anchor, and the default value of `regex` differs between
# pandas versions, so the literal match is requested explicitly.

chicago['Employee Annual Salary'] = (
    chicago['Employee Annual Salary']
    .str.replace('$', '', regex=False)
    .astype('float32')
)

### S.str.contains() method

In [66]:
# Removing rows in which every column is NaN.

chicago = chicago.dropna(how='all')

#### # When creating a boolean Series from a string column with .str methods, it's advised to first convert the values to upper or lower case for consistent matching.

In [69]:
# Boolean Series: True where the lower-cased position title contains 'water'.
mask = chicago['Position Title'].str.lower().str.contains('water')

# Selecting the matching rows with the boolean Series.
chicago[mask]

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MANAGEMENT,90744.0
554,"ALUISE, VINCENT G",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MANAGEMENT,102440.0
671,"ANDER, PERRY A",WATER CHEMIST II,WATER MANAGEMENT,82044.0
685,"ANDERSON, ANDREW J",DISTRICT SUPERINTENDENT OF WATER DISTRIBUTION,WATER MANAGEMENT,109272.0
702,"ANDERSON, DONALD",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MANAGEMENT,102440.0
...,...,...,...,...
29669,"VERMA, ANUPAM",MANAGING ENGINEER - WATER MANAGEMENT,WATER MANAGEMENT,111192.0
30239,"WASHINGTON, JOSEPH",WATER CHEMIST III,WATER MANAGEMENT,89676.0
30544,"WEST, THOMAS R",GEN SUPT OF WATER MANAGEMENT,WATER MANAGEMENT,115704.0
30991,"WILLIAMS, MATTHEW",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MANAGEMENT,102440.0


### # S.str.startswith() method

#### S.str.startswith() takes a string and returns True if the string in the column starts with the passed string.

In [72]:
# True for every position title that starts with 'water' (case-insensitive,
# because the values are lower-cased first).

chicago['Position Title'].str.lower().str.startswith('water')

0         True
1        False
2        False
3        False
4        False
         ...  
32057    False
32058    False
32059    False
32060    False
32061    False
Name: Position Title, Length: 32062, dtype: bool

### # S.str.endswith() method

#### S.str.endswith() takes a string and returns True if the column string ends with passed string.

In [74]:
# True for every position title that ends with 'water' (case-insensitive,
# because the values are lower-cased first).

chicago['Position Title'].str.lower().str.endswith('water')

0        False
1        False
2        False
3        False
4        False
         ...  
32057    False
32058    False
32059    False
32060    False
32061    False
Name: Position Title, Length: 32062, dtype: bool

## # S.str.strip() methods

#### S.str.lstrip() strips whitespace from the left side of the string.
#### S.str.rstrip() strips whitespace from the right side of the string.
#### S.str.strip() strips whitespace from both sides of the string.

In [76]:
# Stripping leading and trailing whitespace from every name and writing it back.
chicago['Name'] = chicago['Name'].str.strip()

0            AARON,  ELVIA J
1          AARON,  JEFFERY M
2             AARON,  KARINA
3        AARON,  KIMBERLEI R
4        ABAD JR,  VICENTE M
                ...         
32057    ZYGADLO,  MICHAEL J
32058     ZYGOWICZ,  PETER J
32059      ZYMANTAS,  MARK E
32060    ZYRKOWSKI,  CARLO E
32061    ZYSKOWSKI,  DARIUSZ
Name: Name, Length: 32062, dtype: object

In [77]:
# Stripping whitespace from both sides through method chaining:
# lstrip() handles the left side, rstrip() the right.

chicago['Position Title'] = chicago['Position Title'].str.lstrip().str.rstrip()

## # applying S.str methods to index and columns

In [87]:
# Making the "Name" column the row index.

chicago = chicago.set_index('Name')

In [91]:
# The index supports the same .str methods as a Series: strip surrounding
# whitespace from the row labels, title-case them, and assign the result
# back as the DataFrame index.
cleaned_index = chicago.index.str.strip().str.title()
chicago.index = cleaned_index
chicago.head(3)

Unnamed: 0_level_0,Position Title,Department,Employee Annual Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Aaron, Elvia J",WATER RATE TAKER,WATER MANAGEMENT,90744.0
"Aaron, Jeffery M",POLICE OFFICER,POLICE,84450.0
"Aaron, Karina",POLICE OFFICER,POLICE,84450.0


In [94]:
# Converting the column headings to uppercase; .str works on chicago.columns too.

chicago.columns = chicago.columns.str.upper()

In [95]:
# Checking the renamed column headings.
chicago.head(3)

Unnamed: 0_level_0,POSITION TITLE,DEPARTMENT,EMPLOYEE ANNUAL SALARY
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Aaron, Elvia J",WATER RATE TAKER,WATER MANAGEMENT,90744.0
"Aaron, Jeffery M",POLICE OFFICER,POLICE,84450.0
"Aaron, Karina",POLICE OFFICER,POLICE,84450.0


# # S.str.split() method

In [98]:
# Setup: move "Name" back from the index into a regular column.
chicago = chicago.reset_index()

In [103]:
# Accessing the last names: split each name on ',' and take the first piece.

chicago['Name'].str.split(',').str.get(0)

0            Aaron
1            Aaron
2            Aaron
3            Aaron
4          Abad Jr
           ...    
32057      Zygadlo
32058     Zygowicz
32059     Zymantas
32060    Zyrkowski
32061    Zyskowski
Name: Name, Length: 32062, dtype: object

In [115]:
# Accessing the first names: take the part after the comma, split it on
# whitespace, and keep the first word (this drops any middle initial).

first_n = chicago['Name'].str.split(',').str.get(1).str.split().str.get(0)

In [116]:
# Counting how often each first name occurs.

first_n.value_counts()

Michael     1153
John         899
James        676
Robert       622
Joseph       537
            ... 
Shelia         1
Ashanti        1
Shadi          1
Deondria       1
Donita         1
Name: Name, Length: 5091, dtype: int64

## Parameters of .str.split() method

#### # The 'expand' kwarg makes the method return a DataFrame instead of a Series, which can then be merged into the main DataFrame.

In [122]:
# expand=True returns the split pieces as DataFrame columns (0 and 1 here).
chicago['Name'].str.split(',', expand=True)

Unnamed: 0,0,1
0,Aaron,Elvia J
1,Aaron,Jeffery M
2,Aaron,Karina
3,Aaron,Kimberlei R
4,Abad Jr,Vicente M
...,...,...
32057,Zygadlo,Michael J
32058,Zygowicz,Peter J
32059,Zymantas,Mark E
32060,Zyrkowski,Carlo E


#### # The 'n' parameter limits the number of splits performed, i.e. n=2 splits the string at most twice (at the first two occurrences of the separator).

In [124]:
# With n=1 only the first space is used as a split point, so the first name
# and middle initial stay together in the second column.

chicago['Name'].str.split(' ', expand=True, n=1)

Unnamed: 0,0,1
0,"Aaron,",Elvia J
1,"Aaron,",Jeffery M
2,"Aaron,",Karina
3,"Aaron,",Kimberlei R
4,Abad,"Jr, Vicente M"
...,...,...
32057,"Zygadlo,",Michael J
32058,"Zygowicz,",Peter J
32059,"Zymantas,",Mark E
32060,"Zyrkowski,",Carlo E
