# Working with Text Data

This notebook demonstrates the vectorized string methods that a pandas Series exposes through its `.str` accessor.

In [1]:
# Uncomment the lines below if numpy or pandas are not installed yet.
# !pip install numpy
# !pip install pandas

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Sample data: deliberately messy so that each `.str` method below has
# something to show.
s = pd.Series(
    [
        'Tommy   ',      # trailing spaces
        'William Rick',  # embedded space
        'John\n',        # trailing newline
        'ALBER@T',       # upper case with a symbol
        np.nan,          # missing value
        '1234',          # digits stored as a string
        'SteveSmith',    # mixed case, no space
        34,              # a real integer, not a string
    ]
)
s

0        Tommy   
1    William Rick
2          John\n
3         ALBER@T
4             NaN
5            1234
6      SteveSmith
7              34
dtype: object

In [4]:
# lower(): converts every string in the Series/Index to lower case.
# Entries that are not strings (NaN and the integer 34) come back as NaN.
s.str.lower()

0        tommy   
1    william rick
2          john\n
3         alber@t
4             NaN
5            1234
6      stevesmith
7             NaN
dtype: object

In [5]:
# upper(): converts every string in the Series/Index to upper case.
# Entries that are not strings (NaN and the integer 34) come back as NaN.
s.str.upper()

0        TOMMY   
1    WILLIAM RICK
2          JOHN\n
3         ALBER@T
4             NaN
5            1234
6      STEVESMITH
7             NaN
dtype: object

In [6]:
# swapcase(): swaps the case of every letter (lower becomes upper and vice versa).
# Characters without case, such as digits, spaces and '@', are left as they are.
s.str.swapcase()

0        tOMMY   
1    wILLIAM rICK
2          jOHN\n
3         alber@t
4             NaN
5            1234
6      sTEVEsMITH
7             NaN
dtype: object

In [7]:
# islower(): checks, for each string in the Series/Index, whether it contains at
# least one cased character and all of its cased characters are lower case.
# Returns a Boolean per string and NaN for non-string entries.
# Every name here contains a capital letter and '1234' has no letters, so all are False.
s.str.islower()

0    False
1    False
2    False
3    False
4      NaN
5    False
6    False
7      NaN
dtype: object

In [8]:
# Lower-case first, then test: every entry that contains a letter is now True,
# while '1234' stays False because it has no cased characters.
s.str.lower().str.islower()

0     True
1     True
2     True
3     True
4      NaN
5    False
6     True
7      NaN
dtype: object

In [9]:
# Display `s` again: the .str calls above returned new objects and left `s` unchanged.
s

0        Tommy   
1    William Rick
2          John\n
3         ALBER@T
4             NaN
5            1234
6      SteveSmith
7              34
dtype: object

In [10]:
# isupper(): checks, for each string in the Series/Index, whether it contains at
# least one cased character and all of its cased characters are upper case.
# Returns a Boolean per string and NaN for non-string entries.
# Only 'ALBER@T' qualifies here; the '@' has no case and does not affect the result.
s.str.isupper()

0    False
1    False
2    False
3     True
4      NaN
5    False
6    False
7      NaN
dtype: object

In [11]:
# isnumeric(): checks whether all characters in each string in the Series/Index are numeric.
# Returns a Boolean per string. Note that the integer 34 yields NaN, not True,
# because it is not a string; only the string '1234' is True.
s.str.isnumeric()

0    False
1    False
2    False
3    False
4      NaN
5     True
6    False
7      NaN
dtype: object

In [12]:
# len(): computes the length of each string in the Series/Index.
# Whitespace counts: 'Tommy   ' has length 8 and 'John\n' has length 5.
# Non-string entries yield NaN, which is why the result is shown as float64.
s.str.len()

0     8.0
1    12.0
2     5.0
3     7.0
4     NaN
5     4.0
6    10.0
7     NaN
dtype: float64

In [13]:
# strip(): strips whitespace (including newlines) from both ends of each string
# in the Series/Index.
# Observe that 'John\n' was changed to 'John' and 'Tommy   ' to 'Tommy'.
s.str.strip()

0           Tommy
1    William Rick
2            John
3         ALBER@T
4             NaN
5            1234
6      SteveSmith
7             NaN
dtype: object

In [14]:
# split(): splits each string around the given pattern. The result is a list for each row
# (NaN for non-string entries).
# Splitting on a single space yields empty strings for repeated or trailing spaces,
# as 'Tommy   ' shows below.
s.str.split(' ')

0      [Tommy, , , ]
1    [William, Rick]
2           [John\n]
3          [ALBER@T]
4                NaN
5             [1234]
6       [SteveSmith]
7                NaN
dtype: object

In [15]:
# Walk over the split result row by row. Each row is either a list of pieces
# or NaN (for entries that were not strings), so check the type before
# calling len() on it.
for r in s.str.split(' '):
    if isinstance(r, list):
        print('list with ', len(r), 'elements', r)
    else:
        print(r)

list with  4 elements ['Tommy', '', '', '']
list with  2 elements ['William', 'Rick']
list with  1 elements ['John\n']
list with  1 elements ['ALBER@T']
nan
list with  1 elements ['1234']
list with  1 elements ['SteveSmith']
nan


In [16]:
# cat(sep=...): joins all elements of the Series/Index into one single string,
# placing the given separator between consecutive elements.
# Note: `s` is rebound here to a new, all-string Series used by the following cells.
s = pd.Series(
    [
        'Tom ',
        ' John',
        'Will Smith',
        '123',
    ]
)
s.str.cat(sep='_')

'Tom _ John_Will Smith_123'

In [17]:
# contains(pattern): returns True for each element that contains the pattern, else False.
s.str.contains(' ')

0     True
1     True
2     True
3    False
dtype: bool

In [18]:
# replace(a, b): replaces every occurrence of a with b in each element.
# regex=False treats ' ' as a literal string. Being explicit keeps the result
# independent of the pandas version, because the default value of `regex`
# changed from True to False in pandas 2.0.
s.str.replace(' ', '_', regex=False)

0          Tom_
1         _John
2    Will_Smith
3           123
dtype: object

In [19]:
# repeat(n): repeats each element the specified number of times.
s.str.repeat(2)

0                Tom Tom 
1               John John
2    Will SmithWill Smith
3                  123123
dtype: object

In [20]:
# Same call with a larger count; long values are shown truncated with '...' in the output.
s.str.repeat(5)

0                                 Tom Tom Tom Tom Tom 
1                             John John John John John
2    Will SmithWill SmithWill SmithWill SmithWill S...
3                                      123123123123123
dtype: object

In [21]:
# Observe that the length of the Series (its number of elements) stays the same:
print(len(s))
print(len(s.str.repeat(5)))

4
4


In [22]:
# What changes is the length of the individual elements:
# 'Tom ' has 4 characters, so repeating it 5 times gives 20.
print(len(s[0]))
print(len(s.str.repeat(5)[0]))

4
20


In [23]:
# count(pattern): returns the number of times the pattern occurs in each element.
s.str.count('o')

0    1
1    1
2    0
3    0
dtype: int64

In [24]:
# startswith(pattern): returns True for each element in the Series/Index that starts with the pattern.
# Only ' John' begins with a space.
s.str.startswith(' ')

0    False
1     True
2    False
3    False
dtype: bool

In [25]:
# The match is case-sensitive: lower-case 'w' does not match 'Will Smith'.
s.str.startswith('w')

0    False
1    False
2    False
3    False
dtype: bool

In [26]:
# Upper-case 'W' does match 'Will Smith'.
s.str.startswith('W')

0    False
1    False
2     True
3    False
dtype: bool

In [27]:
# Lower-casing first makes the check effectively case-insensitive.
s.str.lower().str.startswith('w')

0    False
1    False
2     True
3    False
dtype: bool

In [28]:
# endswith(pattern): returns True for each element in the Series/Index that ends with the pattern.
# Only 'Tom ' ends with a space.
s.str.endswith(' ')

0     True
1    False
2    False
3    False
dtype: bool

In [29]:
# find(pattern): returns the position (0-based) of the first occurrence of the pattern
# in each element. It returns -1 if the pattern is not found.
s.str.find('2')

0   -1
1   -1
2   -1
3    1
dtype: int64

In [30]:
# Multi-character patterns work too: 'll' starts at position 2 in 'Will Smith'.
s.str.find('ll')

0   -1
1   -1
2    2
3   -1
dtype: int64

In [31]:
# findall(pattern): returns, for each element, a list of all occurrences of the pattern
# (an empty list when there is none).
s.str.findall('ll')

0      []
1      []
2    [ll]
3      []
dtype: object

In [32]:
# `s` is rebound to a Series of colour names for the remaining
# find / findall / endswith examples.
s = pd.Series(
    ['red', 'orange', 'yellow', 'green', 'blue']
)
# Position of the first 'e' in each colour name.
s.str.find('e')

0    1
1    5
2    1
3    2
4    3
dtype: int64

In [33]:
# findall lists every match, so 'green' yields two 'e' entries,
# whereas find only reported the position of the first one.
s.str.findall('e')

0       [e]
1       [e]
2       [e]
3    [e, e]
4       [e]
dtype: object

In [34]:
# Only 'orange' and 'blue' end with 'e'.
s.str.endswith('e')

0    False
1     True
2    False
3    False
4     True
dtype: bool

In [35]:
# get_dummies(): returns a DataFrame of one-hot encoded indicator columns,
# one column per distinct value found in the Series.
country = pd.Series(
    [
        'USA',
        'Colombia',
        'Ecuador',
        'Rep. Dominicana',
        'Puerto Rico',
    ]
)
country.str.get_dummies()

Unnamed: 0,Colombia,Ecuador,Puerto Rico,Rep. Dominicana,USA
0,0,0,0,0,1
1,1,0,0,0,0
2,0,1,0,0,0
3,0,0,0,1,0
4,0,0,1,0,0


In [36]:
# A two-category example: the result has one indicator column per category.
sex = pd.Series(data=['Male', 'Female'])
sex.str.get_dummies()

Unnamed: 0,Female,Male
0,0,1
1,1,0
