## Text methods for string data

In [1]:
import numpy as np
import pandas as pd

In [2]:
email='gautam@gmail.com'

In [3]:
email.split('@')

['gautam', 'gmail.com']

In [8]:
# Sample names; the last entry is a digit-only string to contrast with the rest
names = pd.Series(['Gautam', 'pankaj', 'nikhil', 'jatin', '7'])

In [9]:
names

0    Gautam
1    pankaj
2    nikhil
3     jatin
4         7
dtype: object

In [10]:
names.str.upper()

0    GAUTAM
1    PANKAJ
2    NIKHIL
3     JATIN
4         7
dtype: object

In [11]:
names.str.isdigit()

0    False
1    False
2    False
3    False
4     True
dtype: bool

In [12]:
tech_finance=['GOOG,APPL,AMZN','JPM,BAC,GS']

In [13]:
tickers=pd.Series(tech_finance)

In [14]:
tickers

0    GOOG,APPL,AMZN
1        JPM,BAC,GS
dtype: object

In [16]:
tickers.str.split(',')  # splits each element (one string per row) on ',' into a list of tickers

0    [GOOG, APPL, AMZN]
1        [JPM, BAC, GS]
dtype: object

In [18]:
tickers.str.split(',').str[0]

0    GOOG
1     JPM
dtype: object

In [20]:
 tickers.str.split(',')[0]  # plain [0] selects row 0 of the Series (its whole list); use .str[0] for the first ticker of every row

['GOOG', 'APPL', 'AMZN']

In [21]:
#  expand=True returns a DataFrame with one column per split piece instead of a Series of lists
tickers.str.split(',',expand=True)

Unnamed: 0,0,1,2
0,GOOG,APPL,AMZN
1,JPM,BAC,GS


In [24]:
#  Cleaning up messy strings (stray whitespace and a stray ';').
#  NOTE: this rebinds `names`, which held a pandas Series earlier, to a plain Python list.
names=['gautam   ','  pank;aj ','   jatin    ']

In [25]:
messy_names=pd.Series(names)

In [26]:
messy_names

0       gautam   
1        pank;aj 
2       jatin    
dtype: object

In [28]:
messy_names.str.replace(';','')

0       gautam   
1         pankaj 
2       jatin    
dtype: object

In [33]:
messy_names.str.replace(';','').str.strip()   # to remove extra spaces on both sides

0    gautam
1    pankaj
2     jatin
dtype: object

In [34]:
# Full cleanup chain: drop ';', trim surrounding whitespace, then capitalize
(
    messy_names.str.replace(';', '')
    .str.strip()
    .str.capitalize()
)

0    Gautam
1    Pankaj
2     Jatin
dtype: object

In [36]:
#  Custom function
def cleanup(name):
    """Normalize one messy name: drop ';', trim whitespace, capitalize.

    Parameters
    ----------
    name : str or missing value
        Raw name such as '  pank;aj '.

    Returns
    -------
    str or the original object
        The cleaned string (e.g. 'Pankaj'). Anything that is not a string
        (None, NaN) is returned unchanged so that ``Series.apply(cleanup)``
        does not fail on missing data.
    """
    # None/NaN have no string methods; pass them through untouched.
    if not isinstance(name, str):
        return name
    return name.replace(';', '').strip().capitalize()

In [37]:
messy_names.apply(cleanup)

0    Gautam
1    Pankaj
2     Jatin
dtype: object

## Time methods for Date and Time Data

In [38]:
from datetime import datetime

In [39]:
# Date components (year, month, day) followed by time components (hour, minute, second)
myyear, mymonth, myday = 2015, 1, 1
myhour, mymin, mysec = 2, 30, 15

In [40]:
# Only the date part is supplied, so the time defaults to midnight
mydate = datetime(year=myyear, month=mymonth, day=myday)

In [41]:
mydate

datetime.datetime(2015, 1, 1, 0, 0)

In [42]:
# Full timestamp: date plus hour, minute and second
mydate = datetime(
    year=myyear, month=mymonth, day=myday,
    hour=myhour, minute=mymin, second=mysec,
)

In [43]:
mydate

datetime.datetime(2015, 1, 1, 2, 30, 15)

In [109]:
# Dates written in two different textual formats, plus a missing value
myser = pd.Series(['Dec 14, 2002', '2002-02-02', None])

In [110]:
myser

0    Dec 14, 2002
1      2002-02-02
2            None
dtype: object

In [113]:
# myser[0].year  # would raise AttributeError: the elements are still plain strings, not datetimes

In [114]:
# format='mixed' infers the format of each element individually; the missing value becomes NaT
pd.to_datetime(myser,format='mixed')

0   2002-12-14
1   2002-02-02
2          NaT
dtype: datetime64[ns]

In [115]:
euro_date='14-12-2002'

In [117]:
pd.to_datetime(euro_date,dayfirst=True)  # dayfirst=True reads '14-12-2002' as day-month-year

Timestamp('2002-12-14 00:00:00')

In [118]:
#  What if the data mixes European (day-first) and American (month-first) dates? Clean it into one consistent format first, then parse.

In [119]:
style_date='14--Dec--2002'

In [121]:
pd.to_datetime(style_date,format='%d--%b--%Y')  # explicit format string for a non-standard date layout

Timestamp('2002-12-14 00:00:00')

In [122]:
#  pandas has built-in parsing that can detect the date and time in free-form strings
custom_date="14th of Dec 2002"

In [123]:
pd.to_datetime(custom_date)

Timestamp('2002-12-14 00:00:00')

In [124]:
#  Reading a CSV and working with its date column.
#  Read this way, the DATE column comes back as plain strings (object dtype).
sales=pd.read_csv('RetailSales_BeerWineLiquor.csv')

In [125]:
sales

Unnamed: 0,DATE,MRTSSM4453USN
0,1992-01-01,1509
1,1992-02-01,1541
2,1992-03-01,1597
3,1992-04-01,1675
4,1992-05-01,1822
...,...,...
335,2019-12-01,6630
336,2020-01-01,4388
337,2020-02-01,4533
338,2020-03-01,5562


In [126]:
sales['DATE']

0      1992-01-01
1      1992-02-01
2      1992-03-01
3      1992-04-01
4      1992-05-01
          ...    
335    2019-12-01
336    2020-01-01
337    2020-02-01
338    2020-03-01
339    2020-04-01
Name: DATE, Length: 340, dtype: object

In [127]:
#  Convert the DATE column from object dtype (strings) to datetime64
sales['DATE']=pd.to_datetime(sales['DATE'])

In [128]:
sales

Unnamed: 0,DATE,MRTSSM4453USN
0,1992-01-01,1509
1,1992-02-01,1541
2,1992-03-01,1597
3,1992-04-01,1675
4,1992-05-01,1822
...,...,...
335,2019-12-01,6630
336,2020-01-01,4388
337,2020-02-01,4533
338,2020-03-01,5562


In [129]:
sales['DATE']

0     1992-01-01
1     1992-02-01
2     1992-03-01
3     1992-04-01
4     1992-05-01
         ...    
335   2019-12-01
336   2020-01-01
337   2020-02-01
338   2020-03-01
339   2020-04-01
Name: DATE, Length: 340, dtype: datetime64[ns]

In [130]:
# Second approach: let read_csv parse the dates while loading
sales=pd.read_csv('RetailSales_BeerWineLiquor.csv',parse_dates=[0])  # the DATE column is at position 0

In [131]:
sales['DATE']

0     1992-01-01
1     1992-02-01
2     1992-03-01
3     1992-04-01
4     1992-05-01
         ...    
335   2019-12-01
336   2020-01-01
337   2020-02-01
338   2020-03-01
339   2020-04-01
Name: DATE, Length: 340, dtype: datetime64[ns]

In [132]:
#  Resampling (a time-based group by) works on a time series whose index holds the datetimes

In [133]:
# Move DATE into the index so the frame can be resampled by time.
# NOTE: not idempotent; re-running this cell raises KeyError because DATE is no longer a column.
sales=sales.set_index('DATE')

In [134]:
sales

Unnamed: 0_level_0,MRTSSM4453USN
DATE,Unnamed: 1_level_1
1992-01-01,1509
1992-02-01,1541
1992-03-01,1597
1992-04-01,1675
1992-05-01,1822
...,...
2019-12-01,6630
2020-01-01,4388
2020-02-01,4533
2020-03-01,5562


In [137]:
# NOTE: pandas 2.2+ deprecates the 'A' alias in favour of 'YE' (year end); switch when upgrading.
sales.resample(rule='A').mean()  # group by year and average (year-end rule code -> 'A')

Unnamed: 0_level_0,MRTSSM4453USN
DATE,Unnamed: 1_level_1
1992-12-31,1807.25
1993-12-31,1794.833333
1994-12-31,1841.75
1995-12-31,1833.916667
1996-12-31,1929.75
1997-12-31,2006.75
1998-12-31,2115.166667
1999-12-31,2206.333333
2000-12-31,2375.583333
2001-12-31,2468.416667


In [141]:
# Reload the CSV so DATE is an ordinary column again (it was moved into the index by set_index earlier)
sales=pd.read_csv('RetailSales_BeerWineLiquor.csv',parse_dates=[0])

In [142]:
sales

Unnamed: 0,DATE,MRTSSM4453USN
0,1992-01-01,1509
1,1992-02-01,1541
2,1992-03-01,1597
3,1992-04-01,1675
4,1992-05-01,1822
...,...,...
335,2019-12-01,6630
336,2020-01-01,4388
337,2020-02-01,4533
338,2020-03-01,5562


In [146]:
sales['DATE'].dt.year  # .dt exposes datetime attributes on a datetime Series, just as .str does for string data

0      1992
1      1992
2      1992
3      1992
4      1992
       ... 
335    2019
336    2020
337    2020
338    2020
339    2020
Name: DATE, Length: 340, dtype: int32