# Working with Text Data

In [1]:
"hello".upper()

'HELLO'

## Vectorized String Functions

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Names in inconsistent casing, plus one missing value (NaN).
data = [
    "tim",
    "Kate",
    "SUSan",
    np.nan,
    "aLEX",
]

In [4]:
# Wrap the raw list in a Series so the vectorized .str accessor is available.
name = pd.Series(data)

In [5]:
# .str applies the string method element-wise; the missing value stays NaN.
name.str.capitalize()

0      Tim
1     Kate
2    Susan
3      NaN
4     Alex
dtype: object

In [6]:
# Lower-case every element; the NaN entry passes through untouched.
name.str.lower()

0      tim
1     kate
2    susan
3      NaN
4     alex
dtype: object

In [7]:
# Length of each string. The result is float64 here because the NaN entry
# has to be represented alongside the counts.
name.str.len()

0    3.0
1    4.0
2    5.0
3    NaN
4    4.0
dtype: float64

In [8]:
# The match is case-sensitive: only "aLEX" starts with a lowercase "a".
# The missing entry comes back as NaN rather than False.
name.str.startswith("a")

0    False
1    False
2    False
3      NaN
4     True
dtype: object

In [9]:
# Small frame of random placeholder values; only the column labels matter
# for the examples that follow (the numbers differ on every run).
df = pd.DataFrame(
    data=np.random.randn(3, 2),
    index=range(3),
    columns=["Column A", "Column B"],
)
df

Unnamed: 0,Column A,Column B
0,-0.459978,0.200495
1,0.739367,-2.557691
2,0.371356,0.086189


In [10]:
# Column labels live in an Index, which also exposes the .str accessor.
df.columns

Index(['Column A', 'Column B'], dtype='object')

In [11]:
# Normalise the labels: lower-case them, then swap spaces for underscores.
# This returns a new Index; nothing is assigned back, so df is unchanged.
df.columns.str.lower().str.replace(" ","_")

Index(['column_a', 'column_b'], dtype='object')

In [12]:
# Underscore-delimited strings with one missing value, used for the split demos.
s = pd.Series(
    ["a_b_c", "c_d_e", np.nan, "f_g_h"]
)
s

0    a_b_c
1    c_d_e
2      NaN
3    f_g_h
dtype: object

In [13]:
# split() produces a list per element; .str[1] then picks the second piece
# of each list. The missing entry stays NaN.
s.str.split("_").str[1]

0      b
1      d
2    NaN
3      g
dtype: object

In [14]:
# n=1 stops after the first underscore (two pieces per string);
# expand=True spreads the pieces across DataFrame columns instead of lists.
s.str.split("_", n=1, expand=True)

Unnamed: 0,0,1
0,a,b_c
1,c,d_e
2,,
3,f,g_h


In [15]:
# Amounts stored as text, with inconsistent sign and currency markers.
money = pd.Series(["15", "-$20", "$30000"])
money

0        15
1      -$20
2    $30000
dtype: object

In [16]:
# Strip the "-$" prefix. The pattern is a regular expression, so "$" must be
# escaped to match a literal dollar sign. A raw string keeps the backslash
# from being read as an (invalid) Python escape sequence, and regex=True is
# spelled out because pandas' default for this flag differs between versions.
money.str.replace(r"-\$", "", regex=True)

0        15
1        20
2    $30000
dtype: object

In [17]:
# Replace the "-$" prefix with a bare minus sign. The pattern is a regular
# expression, so "$" is escaped to match a literal dollar sign; the raw string
# avoids an invalid Python escape sequence, and regex=True is explicit because
# pandas' default for this flag differs between versions.
money.str.replace(r"-\$", "-", regex=True)

0        15
1       -20
2    $30000
dtype: object

For the full list of vectorized string methods, see the "Working with text data" section of the pandas documentation at pandas.pydata.org.

In [18]:
# Download the movie-ratings CSV from a shortened URL (needs network access).
film = pd.read_csv("http://bit.ly/imdbratings")

In [19]:
# Preview the first five rows to see which columns were loaded.
film.head()

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
0,9.3,The Shawshank Redemption,R,Crime,142,"[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt..."
1,9.2,The Godfather,R,Crime,175,"[u'Marlon Brando', u'Al Pacino', u'James Caan']"
2,9.1,The Godfather: Part II,R,Crime,200,"[u'Al Pacino', u'Robert De Niro', u'Robert Duv..."
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E..."
4,8.9,Pulp Fiction,R,Crime,154,"[u'John Travolta', u'Uma Thurman', u'Samuel L...."


In [20]:
# Upper-case every title; the frame itself is left unchanged.
film["title"].str.upper()

0                             THE SHAWSHANK REDEMPTION
1                                        THE GODFATHER
2                               THE GODFATHER: PART II
3                                      THE DARK KNIGHT
4                                         PULP FICTION
                            ...                       
974                                            TOOTSIE
975                        BACK TO THE FUTURE PART III
976    MASTER AND COMMANDER: THE FAR SIDE OF THE WORLD
977                                        POLTERGEIST
978                                        WALL STREET
Name: title, Length: 979, dtype: object

In [21]:
# Capitalise the first letter of every column label, in place. From here on
# the columns must be referenced by their new names (e.g. Title, Actors_list).
film.columns = film.columns.str.capitalize()

In [22]:
# Confirm the rename: each column label now starts with a capital letter.
film.head()

Unnamed: 0,Star_rating,Title,Content_rating,Genre,Duration,Actors_list
0,9.3,The Shawshank Redemption,R,Crime,142,"[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt..."
1,9.2,The Godfather,R,Crime,175,"[u'Marlon Brando', u'Al Pacino', u'James Caan']"
2,9.1,The Godfather: Part II,R,Crime,200,"[u'Al Pacino', u'Robert De Niro', u'Robert Duv..."
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E..."
4,8.9,Pulp Fiction,R,Crime,154,"[u'John Travolta', u'Uma Thurman', u'Samuel L...."


In [23]:
# Boolean mask: keep only the rows whose actor list mentions "Brad Pitt".
film.loc[film["Actors_list"].str.contains("Brad Pitt")]

Unnamed: 0,Star_rating,Title,Content_rating,Genre,Duration,Actors_list
9,8.9,Fight Club,R,Drama,139,"[u'Brad Pitt', u'Edward Norton', u'Helena Bonh..."
24,8.7,Se7en,R,Drama,127,"[u'Morgan Freeman', u'Brad Pitt', u'Kevin Spac..."
106,8.3,Snatch.,R,Comedy,102,"[u'Jason Statham', u'Brad Pitt', u'Benicio Del..."
114,8.3,Inglourious Basterds,R,Adventure,153,"[u'Brad Pitt', u'Diane Kruger', u'Eli Roth']"
264,8.1,Twelve Monkeys,R,Mystery,129,"[u'Bruce Willis', u'Madeleine Stowe', u'Brad P..."
508,7.8,The Curious Case of Benjamin Button,PG-13,Drama,166,"[u'Brad Pitt', u'Cate Blanchett', u'Tilda Swin..."
577,7.8,Ocean's Eleven,PG-13,Crime,116,"[u'George Clooney', u'Brad Pitt', u'Julia Robe..."
683,7.7,Fury,R,Action,134,"[u'Brad Pitt', u'Shia LaBeouf', u'Logan Lerman']"
776,7.6,Moneyball,PG-13,Biography,133,"[u'Brad Pitt', u'Robin Wright', u'Jonah Hill']"
779,7.6,Interview with the Vampire: The Vampire Chroni...,R,Horror,123,"[u'Brad Pitt', u'Tom Cruise', u'Antonio Bander..."


In [24]:
# Remove the opening bracket. "[" is a regex metacharacter, so ask for a
# literal (non-regex) replacement explicitly instead of relying on pandas'
# version-dependent default.
film.Actors_list.str.replace("[", "", regex=False)

0      u'Tim Robbins', u'Morgan Freeman', u'Bob Gunton']
1         u'Marlon Brando', u'Al Pacino', u'James Caan']
2      u'Al Pacino', u'Robert De Niro', u'Robert Duva...
3      u'Christian Bale', u'Heath Ledger', u'Aaron Ec...
4      u'John Travolta', u'Uma Thurman', u'Samuel L. ...
                             ...                        
974    u'Dustin Hoffman', u'Jessica Lange', u'Teri Ga...
975    u'Michael J. Fox', u'Christopher Lloyd', u'Mar...
976    u'Russell Crowe', u'Paul Bettany', u'Billy Boyd']
977    u'JoBeth Williams', u"Heather O'Rourke", u'Cra...
978    u'Charlie Sheen', u'Michael Douglas', u'Tamara...
Name: Actors_list, Length: 979, dtype: object

In [25]:
# Strip both square brackets with two literal replacements. "[" and "]" are
# regex metacharacters, so regex=False is passed explicitly rather than
# relying on pandas' version-dependent default.
(
    film.Actors_list
    .str.replace("[", "", regex=False)
    .str.replace("]", "", regex=False)
)

0       u'Tim Robbins', u'Morgan Freeman', u'Bob Gunton'
1          u'Marlon Brando', u'Al Pacino', u'James Caan'
2      u'Al Pacino', u'Robert De Niro', u'Robert Duvall'
3      u'Christian Bale', u'Heath Ledger', u'Aaron Ec...
4      u'John Travolta', u'Uma Thurman', u'Samuel L. ...
                             ...                        
974    u'Dustin Hoffman', u'Jessica Lange', u'Teri Garr'
975    u'Michael J. Fox', u'Christopher Lloyd', u'Mar...
976     u'Russell Crowe', u'Paul Bettany', u'Billy Boyd'
977    u'JoBeth Williams', u"Heather O'Rourke", u'Cra...
978    u'Charlie Sheen', u'Michael Douglas', u'Tamara...
Name: Actors_list, Length: 979, dtype: object