# Working with text data

How to fix and optimize it

## Exploring the data

In [525]:
import pandas as pd

In [526]:
chicago = pd.read_csv('db/chicago.csv')
chicago

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00
...,...,...,...,...
32058,"ZYGOWICZ, PETER J",POLICE OFFICER,POLICE,$87384.00
32059,"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
32060,"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
32061,"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [527]:
chicago.describe()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
count,32062,32062,32062,32062
unique,31776,1093,35,1156
top,"HERNANDEZ, JUAN C",POLICE OFFICER,POLICE,$87384.00
freq,4,9184,12618,2394


In [528]:
chicago.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32063 entries, 0 to 32062
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Name                    32062 non-null  object
 1   Position Title          32062 non-null  object
 2   Department              32062 non-null  object
 3   Employee Annual Salary  32062 non-null  object
dtypes: object(4)
memory usage: 1002.1+ KB


In [529]:
chicago.isnull().values.any()

True

In [530]:
chicago.nunique()

Name                      31776
Position Title             1093
Department                   35
Employee Annual Salary     1156
dtype: int64

In [531]:
chicago['Department'].unique()

array(['WATER MGMNT', 'POLICE', 'GENERAL SERVICES', 'CITY COUNCIL',
       'STREETS & SAN', 'OEMC', 'AVIATION', 'FIRE', 'FAMILY & SUPPORT',
       'IPRA', 'PUBLIC LIBRARY', 'BUSINESS AFFAIRS', 'TRANSPORTN',
       'HEALTH', "MAYOR'S OFFICE", 'LAW', 'FINANCE', 'CULTURAL AFFAIRS',
       'COMMUNITY DEVELOPMENT', 'BUILDINGS', 'ANIMAL CONTRL',
       'CITY CLERK', 'BOARD OF ELECTION', 'INSPECTOR GEN', 'TREASURER',
       'DISABILITIES', 'HUMAN RESOURCES', 'DoIT', 'BUDGET & MGMT',
       'PROCUREMENT', 'HUMAN RELATIONS', 'BOARD OF ETHICS',
       'POLICE BOARD', 'ADMIN HEARNG', 'LICENSE APPL COMM', nan],
      dtype=object)

## Optimizing it

    Dropping null lines

In [532]:
chicago.dropna(how='all', inplace=True) # without the 'inplace=True' this would onlycreate a view and not change the dataframe

chicago.isnull().values.any()

False

    Renaming the columns

In [533]:
chicago.columns = ['Name', 'Position_Title', 'Department', 'Annual_Salary']
chicago.columns

Index(['Name', 'Position_Title', 'Department', 'Annual_Salary'], dtype='object')

    Changing data types and formating

In order to apply a string method to a Serie or df we need to call the 'str' method first

In [534]:
chicago['Position_Title']=chicago['Position_Title'].str.title()

In [535]:
chicago['Department']=chicago['Department'].str.title()
chicago['Department']=chicago['Department'].astype('category')

In [536]:
chicago['Name']=chicago['Name'].str.title() # Capitalizes the first letter of each word

    Removing the '$' symbol an converting to float

In [537]:
chicago["Annual_Salary"]=(chicago['Annual_Salary'].str.replace('$','')).astype(float)

  chicago["Annual_Salary"]=(chicago['Annual_Salary'].str.replace('$','')).astype(float)


In [538]:
chicago.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32062 entries, 0 to 32061
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   Name            32062 non-null  object  
 1   Position_Title  32062 non-null  object  
 2   Department      32062 non-null  category
 3   Annual_Salary   32062 non-null  float64 
dtypes: category(1), float64(1), object(2)
memory usage: 1.0+ MB


In [539]:
chicago.describe()

Unnamed: 0,Annual_Salary
count,32062.0
mean,80204.178634
std,25098.329868
min,0.96
25%,72862.4
50%,84450.0
75%,93240.0
max,300000.0


### Checkpoint - 1

In [540]:
bkp1 = chicago.copy()

## Filtering with String methods

    .str.contains()

In [541]:
mask=chicago['Position_Title'].str.lower().str.contains('analyst')
chicago[mask].head(5)

Unnamed: 0,Name,Position_Title,Department,Annual_Salary
33,"Abouelkheir, Hassan A",Senior Programmer/Analyst,Family & Support,106836.0
240,"Agyekum, Kofi",Programmer/Analyst,Business Affairs,89676.0
253,"Ahmed, Khalid",Chief Programmer/Analyst,Water Mgmnt,113664.0
257,"Ahmed, Quazi S",Criminal History Analyst,Police,85764.0
258,"Ahmed, Rizwana P",Senior Research Analyst,Community Development,82044.0


    .str.startswith()

In [542]:
mask1=chicago['Department'].str.lower().str.startswith('water')
chicago[mask1]

Unnamed: 0,Name,Position_Title,Department,Annual_Salary
0,"Aaron, Elvia J",Water Rate Taker,Water Mgmnt,90744.0
4,"Abad Jr, Vicente M",Civil Engineer Iv,Water Mgmnt,106836.0
20,"Abdul-Karim, Muhammad A",Engineering Technician Vi,Water Mgmnt,108228.0
25,"Abdulsattar, Mudhar",Civil Engineer Ii,Water Mgmnt,58536.0
34,"Abraham, Girley T",Civil Engineer Iv,Water Mgmnt,106836.0
...,...,...,...,...
31983,"Zivat, Michael",Construction Laborer,Water Mgmnt,81536.0
31984,"Zizumbo, Daniel",Pool Motor Truck Driver,Water Mgmnt,72862.4
32008,"Zotta, Sandino",Mechanical Engineer Iv,Water Mgmnt,106836.0
32038,"Zuno, Erik",Laborer - Apprentice,Water Mgmnt,73382.4


    .str.endswith()

In [543]:
mask2=chicago['Position_Title'].str.lower().str.endswith('ist')
chicago[mask2]

Unnamed: 0,Name,Position_Title,Department,Annual_Salary
184,"Afroz, Nayyar",Psychiatrist,Health,99840.0
308,"Alarcon, Luis J",Loan Processing Specialist,Community Development,81948.0
422,"Allain, Carolyn",Senior Telecommunications Specialist,Doit,89880.0
472,"Allen, Robert",Machinist,Water Mgmnt,94328.0
705,"Anderson, Edward M",Sr Procurement Specialist,Procurement,91476.0
...,...,...,...,...
31667,"Yoder, Teresa G",Archival Specialist,Public Library,74304.0
31688,"Youngbloom, Laurence G",Crimes Surveillance Specialist,Oemc,19676.8
31717,"Young, Kimberly M",Sr Procurement Specialist,Procurement,68556.0
31837,"Zapata, Hugo",Sr Procurement Specialist,Procurement,87324.0


    .str.strip()  ->both sides
    .str.lstrip() ->left
    .str.rstrip() ->right

These methods remove empty spaces, always usefull to ensure the data does not have additional spaces

In [544]:
'   Example   '.strip()
'   Example   '.lstrip()
'   Example   '.rstrip()

'   Example'

### String methods on indexes and columns

In [545]:
#Example
# you just need to call the index or column, add the .str and then the method you wish

chicago.set_index(keys='Name', inplace=True)
chicago.index=chicago.index.str.strip() # for this the 'title' would not work if our indexes were integers

chicago.columns=chicago.columns.str.title()

chicago.head(3)

Unnamed: 0_level_0,Position_Title,Department,Annual_Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Aaron, Elvia J",Water Rate Taker,Water Mgmnt,90744.0
"Aaron, Jeffery M",Police Officer,Police,84450.0
"Aaron, Karina",Police Officer,Police,84450.0


In [546]:
chicago.reset_index(inplace=True)
chicago.head(3)

Unnamed: 0,Name,Position_Title,Department,Annual_Salary
0,"Aaron, Elvia J",Water Rate Taker,Water Mgmnt,90744.0
1,"Aaron, Jeffery M",Police Officer,Police,84450.0
2,"Aaron, Karina",Police Officer,Police,84450.0


    .str.split() and .str.get()

In [547]:
'This is an example'.split(' ')  #expand=True will create a data frame with the obtained values

['This', 'is', 'an', 'example']

In [552]:
chicago['Position_Title'].str.split(' ',expand=True, #expand=True will create a data frame with the obtained values in the order they appear
                          n=1 # n lets you define the total number of columns (exceeding data will be concatenated to the last column)
                          ).head(5)
#you may directly assign them to new columns providind the names

Unnamed: 0,0,1
0,Water,Rate Taker
1,Police,Officer
2,Police,Officer
3,Chief,Contract Expediter
4,Civil,Engineer Iv


In [549]:
chicago['Surname']=chicago['Name'].str.split(', ').str.get(0) #get first index '0'
chicago['Name']=chicago['Name'].str.split(', ').str.get(1) #get second index '1'

chicago=chicago[['Name', 'Surname', 'Position_Title', 'Department', 'Annual_Salary']]

chicago.head(5)

Unnamed: 0,Name,Surname,Position_Title,Department,Annual_Salary
0,Elvia J,Aaron,Water Rate Taker,Water Mgmnt,90744.0
1,Jeffery M,Aaron,Police Officer,Police,84450.0
2,Karina,Aaron,Police Officer,Police,84450.0
3,Kimberlei R,Aaron,Chief Contract Expediter,General Services,89880.0
4,Vicente M,Abad Jr,Civil Engineer Iv,Water Mgmnt,106836.0


In [550]:
chicago['Surname'].value_counts()

Williams     293
Johnson      244
Smith        241
Brown        185
Jones        183
            ... 
Horkavy        1
Horn           1
Horne Jr       1
Horner         1
Zyskowski      1
Name: Surname, Length: 13829, dtype: int64