In [1]:
import pandas as pd

# Working with Text Data:
# Intro (section 7.101):

- issues with text data: empty white space in the start or end of a string, wrong capitalization, mutiple values in the same cell, etc
- pandas allows text data to be better formatted for analysis
- chicago.csv stores all public employees in the city of Chicago

In [6]:
chi = pd.read_csv("chicago.csv")
chi["Department"] = chi["Department"].astype("category")
chi.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00


In [7]:
chi.info()
# department column is a good candidate to convert to a category

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32063 entries, 0 to 32062
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Name                    32062 non-null  object
 1   Position Title          32062 non-null  object
 2   Department              32062 non-null  object
 3   Employee Annual Salary  32062 non-null  object
dtypes: object(4)
memory usage: 1002.1+ KB


In [9]:
chi["Department"].nunique()
# can also call .nunique() on the whole df
chi.nunique()

Name                      31776
Position Title             1093
Department                   35
Employee Annual Salary     1156
dtype: int64

In [13]:
chi["Department"] = chi["Department"].astype("category")
chi.info()
# memory usage has decreased! 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32063 entries, 0 to 32062
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   Name                    32062 non-null  object  
 1   Position Title          32062 non-null  object  
 2   Department              32062 non-null  category
 3   Employee Annual Salary  32062 non-null  object  
dtypes: category(1), object(3)
memory usage: 784.2+ KB


# Common String Methods (section 7.102):
## .lower(), .upper(), .title(), and .len():

In [17]:
len("Hello World")
# spaces count
"HELLO WORLD".title()

chi = pd.read_csv("chicago.csv")
chi["Department"] = chi["Department"].astype("category")
chi.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00


In [21]:
# cant do chi["Name"].title(), need .str
chi["Name"] = chi["Name"].str.title()
chi["Position Title"] = chi["Position Title"].str.title()
chi["Department"] = chi["Department"].str.title()
chi.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"Aaron, Elvia J",Water Rate Taker,Water Mgmnt,$90744.00
1,"Aaron, Jeffery M",Police Officer,Police,$84450.00
2,"Aaron, Karina",Police Officer,Police,$84450.00
3,"Aaron, Kimberlei R",Chief Contract Expediter,General Services,$89880.00
4,"Abad Jr, Vicente M",Civil Engineer Iv,Water Mgmnt,$106836.00


In [27]:
len(chi["Name"])
# ^ returns the number of rows
chi["Name"].str.len()
# ^ returns the number of characters in every value
# when working with a series in pandas, len is an available method not just a built in python function

0        15.0
1        17.0
2        14.0
3        19.0
4        19.0
         ... 
32058    18.0
32059    17.0
32060    19.0
32061    19.0
32062     NaN
Name: Name, Length: 32063, dtype: float64

# The .str.replace() method (section 7.103):

- .str.replace() replaces all occurences of a character with another

In [31]:
chi = pd.read_csv("chicago.csv").dropna(how = "all")
chi["Department"] = chi["Department"].astype("category")
chi.tail(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
32059,"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
32060,"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
32061,"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [29]:
"Hello world".replace("w","B" )
# first arguement: what you're replacing
# second argumenet: what you're replacing with

'Hello Borld'

In [34]:
chi["Department"] = chi["Department"].str.replace("MGMNT", "MANAGEMENT")
chi.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MANAGEMENT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00


In [38]:
# python can't do math on strings
chi["Employee Annual Salary"] = chi["Employee Annual Salary"].str.replace("$","").astype(float)
chi.head()

  chi["Employee Annual Salary"] = chi["Employee Annual Salary"].str.replace("$","").astype(float)


Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MANAGEMENT,90744.0
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,84450.0
2,"AARON, KARINA",POLICE OFFICER,POLICE,84450.0
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,89880.0
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MANAGEMENT,106836.0


In [39]:
chi["Employee Annual Salary"].sum()

2571506375.36

In [40]:
chi["Employee Annual Salary"].mean()

80204.178633899

In [41]:
chi["Employee Annual Salary"].nlargest(3)

8184     300000.0
7954     216210.0
25532    202728.0
Name: Employee Annual Salary, dtype: float64

# Filtering with String Methods (section 7.104):

- need to generate a boolean series in order to filter
- str.contains(), str.startswith(), str.endswith()

In [42]:
chi = pd.read_csv("chicago.csv").dropna(how = "all")
chi["Department"] = chi["Department"].astype("category")
chi.tail(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
32059,"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
32060,"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
32061,"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [45]:
mask = chi["Position Title"].str.lower().str.contains("water")
# good practice to normalize the strings whene searching through strings
chi[mask]

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
554,"ALUISE, VINCENT G",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MGMNT,$102440.00
671,"ANDER, PERRY A",WATER CHEMIST II,WATER MGMNT,$82044.00
685,"ANDERSON, ANDREW J",DISTRICT SUPERINTENDENT OF WATER DISTRIBUTION,WATER MGMNT,$109272.00
702,"ANDERSON, DONALD",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MGMNT,$102440.00
...,...,...,...,...
29669,"VERMA, ANUPAM",MANAGING ENGINEER - WATER MANAGEMENT,WATER MGMNT,$111192.00
30239,"WASHINGTON, JOSEPH",WATER CHEMIST III,WATER MGMNT,$89676.00
30544,"WEST, THOMAS R",GEN SUPT OF WATER MANAGEMENT,WATER MGMNT,$115704.00
30991,"WILLIAMS, MATTHEW",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MGMNT,$102440.00


In [47]:
mask = chi["Position Title"].str.lower().str.startswith("water")
chi[mask]

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
671,"ANDER, PERRY A",WATER CHEMIST II,WATER MGMNT,$82044.00
1054,"ASHLEY, KARMA T",WATER CHEMIST II,WATER MGMNT,$82044.00
1079,"ATKINS, JOANNA M",WATER CHEMIST II,WATER MGMNT,$82044.00
1181,"AZEEM, MOHAMMED A",WATER CHEMIST II,WATER MGMNT,$53172.00
...,...,...,...,...
28574,"THREATT, DENISE R",WATER QUALITY INSPECTOR,WATER MGMNT,$62004.00
28602,"TIGNOR, DARRYL B",WATER RATE TAKER,WATER MGMNT,$78948.00
28955,"TRAVIS COOK, LESLIE R",WATER RATE TAKER,WATER MGMNT,$78948.00
29584,"VELAZQUEZ, JOHN",WATER RATE TAKER,WATER MGMNT,$78948.00


In [50]:
mask = chi["Position Title"].str.lower().str.endswith("ist")
chi[mask]

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
184,"AFROZ, NAYYAR",PSYCHIATRIST,HEALTH,$99840.00
308,"ALARCON, LUIS J",LOAN PROCESSING SPECIALIST,COMMUNITY DEVELOPMENT,$81948.00
422,"ALLAIN, CAROLYN",SENIOR TELECOMMUNICATIONS SPECIALIST,DoIT,$89880.00
472,"ALLEN, ROBERT",MACHINIST,WATER MGMNT,$94328.00
705,"ANDERSON, EDWARD M",SR PROCUREMENT SPECIALIST,PROCUREMENT,$91476.00
...,...,...,...,...
31667,"YODER, TERESA G",ARCHIVAL SPECIALIST,PUBLIC LIBRARY,$74304.00
31688,"YOUNGBLOOM, LAURENCE G",CRIMES SURVEILLANCE SPECIALIST,OEMC,$19676.80
31717,"YOUNG, KIMBERLY M",SR PROCUREMENT SPECIALIST,PROCUREMENT,$68556.00
31837,"ZAPATA, HUGO",SR PROCUREMENT SPECIALIST,PROCUREMENT,$87324.00


# More String Methods (section 7.105):
## .strip(), .lstrip(), and .rstrip()

- these methods remove whitespace from a string
- need to start them with .str

In [51]:
chi = pd.read_csv("chicago.csv").dropna(how = "all")
chi["Department"] = chi["Department"].astype("category")
chi.tail(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
32059,"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
32060,"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
32061,"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [52]:
"        Hello World    ".lstrip()

'Hello World    '

In [53]:
"        Hello World    ".rstrip()

'        Hello World'

In [54]:
"        Hello World    ".strip()

'Hello World'

In [60]:
chi["Name"] = chi["Name"].str.strip()
chi.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00


In [63]:
chi["Position Title"] = chi["Position Title"].str.strip()
chi.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00


# String Methods on Index and Column lables (section 7.106):


In [65]:
chi = pd.read_csv("chicago.csv", index_col = "Name").dropna(how = "all")
chi["Department"] = chi["Department"].astype("category")
chi.tail(3)
# the names column is now the index

Unnamed: 0_level_0,Position Title,Department,Employee Annual Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [70]:
chi.index = chi.index.str.strip().str.title()
chi.tail(3)

Unnamed: 0_level_0,Position Title,Department,Employee Annual Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Zymantas, Mark E",POLICE OFFICER,POLICE,$84450.00
"Zyrkowski, Carlo E",POLICE OFFICER,POLICE,$87384.00
"Zyskowski, Dariusz",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [73]:
chi.columns = chi.columns.str.upper()
chi.tail(3)

Unnamed: 0_level_0,POSITION TITLE,DEPARTMENT,EMPLOYEE ANNUAL SALARY
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Zymantas, Mark E",POLICE OFFICER,POLICE,$84450.00
"Zyrkowski, Carlo E",POLICE OFFICER,POLICE,$87384.00
"Zyskowski, Dariusz",CHIEF DATA BASE ANALYST,DoIT,$113664.00


# Split strings by Characters with .str.split() method (section 7.107):

- creates a list with all of the separated components of the string
- can give split an arguemnt to split by different characters

In [74]:
chi = pd.read_csv("chicago.csv").dropna(how = "all")
chi["Department"] = chi["Department"].astype("category")
chi.tail(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
32059,"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
32060,"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
32061,"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [79]:
"Hello my name is kyle".split()

['Hello', 'my', 'name', 'is', 'kyle']

In [81]:
chi["Name"].str.split(",").str.get()
# the comma in the outputted list is the comma that separates the list values, not the comma from the string

0            [AARON,   ELVIA J]
1          [AARON,   JEFFERY M]
2             [AARON,   KARINA]
3        [AARON,   KIMBERLEI R]
4        [ABAD JR,   VICENTE M]
                  ...          
32057    [ZYGADLO,   MICHAEL J]
32058     [ZYGOWICZ,   PETER J]
32059      [ZYMANTAS,   MARK E]
32060    [ZYRKOWSKI,   CARLO E]
32061    [ZYSKOWSKI,   DARIUSZ]
Name: Name, Length: 32062, dtype: object

In [84]:
chi["Name"].str.split(",").str.get(0).str.title().value_counts()
# .get() is given the index of the item from each list that you want (the last name is at index 0 in each list)

Williams     293
Johnson      244
Smith        241
Brown        185
Jones        183
            ... 
Horkavy        1
Horn           1
Horne Jr       1
Horner         1
Zyskowski      1
Name: Name, Length: 13829, dtype: int64

In [91]:
chi["Position Title"].str.split().str.get(0).str.title().value_counts()

Police             10856
Firefighter-Emt     1509
Sergeant            1186
Pool                 918
Firefighter          810
                   ...  
Dentist                1
Assoc                  1
Telephone              1
Mayor                  1
Prepress               1
Name: Position Title, Length: 320, dtype: int64

# More Practice with the .split() method (section 7.108):

In [92]:
chi = pd.read_csv("chicago.csv").dropna(how = "all")
chi["Department"] = chi["Department"].astype("category")
chi.tail(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
32059,"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
32060,"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
32061,"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [98]:
chi["Name"].str.split(",").str.get(1).str.strip().str.split().str.get(0).str.title().value_counts().head(5)

Michael    1153
John        899
James       676
Robert      622
Joseph      537
Name: Name, dtype: int64

# The expand and n parameters of the str.split() method (section 7.109):

- expand: set to false by default, returns a df when set to true
- n: the max number of columns that .split() will create, n = 1 means one total split (2 columns created)

In [99]:
chi = pd.read_csv("chicago.csv").dropna(how = "all")
chi["Department"] = chi["Department"].astype("category")
chi.tail(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
32059,"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
32060,"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
32061,"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [116]:
chi[["First Name", "Last Name"]] = chi["Name"].str.split(",", expand = True)
# adds the new columns to the right of the existing df
chi.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary,First Name,Last Name
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00,AARON,ELVIA J
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00,AARON,JEFFERY M
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00,AARON,KARINA
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00,AARON,KIMBERLEI R
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00,ABAD JR,VICENTE M


In [121]:
chi[["First Title Word", "Rest of Title"]] = chi["Position Title"].str.split(" ", expand = True, n = 1)
chi.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary,First Name,Last Name,First Title Word,Rest of Title
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00,AARON,ELVIA J,WATER,RATE TAKER
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00,AARON,JEFFERY M,POLICE,OFFICER
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00,AARON,KARINA,POLICE,OFFICER
