In [1]:
import pandas as pd
import numpy as np

In [2]:
# create a series from a list, numpy array and dict
my_list = list('abcedfghijklmnopqrstuvwxyz')
my_arr = np.arange(len(my_list))
my_dict = dict(zip(my_list, my_arr))

ser = pd.Series(my_dict)

ser.head(5)

a    0
b    1
c    2
e    3
d    4
dtype: int32

In [3]:
# convert the index of a series into a column of a dataframe
df = pd.DataFrame(ser)
df.head(5)

Unnamed: 0,0
a,0
b,1
c,2
e,3
d,4


In [4]:
# combine many series to form a dataframe
ser1 = pd.Series(my_list)
ser2 = pd.Series(np.arange(len(my_list)))

df = pd.concat([ser1, ser2], axis=1)
df.head(5)

Unnamed: 0,0,1
0,a,0
1,b,1
2,c,2
3,e,3
4,d,4


In [5]:
# assign name to the series’ index
ser1.name = 'alphabets'
ser1.head()

0    a
1    b
2    c
3    e
4    d
Name: alphabets, dtype: object

In [6]:
# get the items of series A not present in series B
ser1 = pd.Series([1, 2, 3, 4, 5])
ser2 = pd.Series([4, 5, 6, 7, 8])

fin = ser1[~ser1.isin(ser2)]

fin

0    1
1    2
2    3
dtype: int64

In [7]:
# get the items not common to both series A and series B
union = pd.Series(np.union1d(ser1, ser2))

intersect = pd.Series(np.intersect1d(ser1, ser2))

notcommonseries = union[~union.isin(intersect)]

notcommonseries

0    1
1    2
2    3
5    6
6    7
7    8
dtype: int64

In [8]:
# get the minimum, 25th percentile, median, 75th, and max of a numeric series
ser = pd.Series(np.random.normal(10, 5, 25))
np.percentile(ser, q=[0, 25, 50, 75, 100])

array([-1.31987841,  6.82367297, 10.74022125, 13.10794353, 24.84288396])

In [9]:
# get frequency counts of unique items of a series
ser = pd.Series(np.take(list('abcdefgh'), np.random.randint(8, size=30)))
ser.value_counts()

a    6
e    5
h    5
d    4
f    4
b    3
c    2
g    1
dtype: int64

In [10]:
# keep only top 2 most frequent values as it is and replace everything else as ‘Other’
np.random.RandomState(100)
ser = pd.Series(np.random.randint(1, 5, [12]))
ser[~ser.isin(ser.value_counts().index[:2])] = 'Other'
ser

0     Other
1         3
2         4
3         3
4         3
5         3
6         4
7         3
8         3
9         4
10    Other
11        3
dtype: object

In [11]:
# bin a numeric series to 10 groups of equal size
ser = pd.Series(np.random.random(20))
pd.qcut(ser, q=[0, .10, .20, .3, .4, .5, .6, .7, .8, .9, 1], 
        labels=['1st', '2nd', '3rd', '4th', '5th', '6th', '7th', '8th', '9th', '10th']).head()

0    7th
1    9th
2    9th
3    3rd
4    5th
dtype: category
Categories (10, object): ['1st' < '2nd' < '3rd' < '4th' ... '7th' < '8th' < '9th' < '10th']

In [12]:
# convert a numpy array to a dataframe of given shape
ser = pd.Series(np.random.randint(1, 10, 35))

df = pd.DataFrame(ser.values.reshape(7,5))
df

Unnamed: 0,0,1,2,3,4
0,6,1,8,6,6
1,3,3,7,8,3
2,9,9,7,4,3
3,6,8,4,4,9
4,8,4,6,6,6
5,7,2,1,3,4
6,8,6,5,9,5


In [13]:
# find the positions of numbers that are multiples of 3 from a series
ser = pd.Series(np.random.randint(1, 10, 7))
result = np.where(ser % 3==0)
result

(array([0, 2, 3, 4, 6], dtype=int64),)

In [14]:
# extract items at given positions from a series
ser = pd.Series(list('abcdefghijklmnopqrstuvwxyz'))
pos = [0, 4, 8, 14, 20]

ser.take(pos)

0     a
4     e
8     i
14    o
20    u
dtype: object

In [15]:
# stack two series vertically and horizontally
ser1 = pd.Series(range(5))
ser2 = pd.Series(list('abcde'))

# Vertical
df = pd.concat([ser1, ser2], axis=0)
# Horizontal
df2 = pd.concat([ser1, ser2], axis=1)

print(df)
print(df2)

0    0
1    1
2    2
3    3
4    4
0    a
1    b
2    c
3    d
4    e
dtype: object
   0  1
0  0  a
1  1  b
2  2  c
3  3  d
4  4  e


In [16]:
# get the positions of items of series A in another series B
ser1 = pd.Series([10, 9, 6, 5, 3, 1, 12, 8, 13])
ser2 = pd.Series([1, 3, 10, 13])

result = [pd.Index(ser1).get_loc(i) for i in ser2]

result

[5, 4, 0, 8]

In [17]:
# compute the mean squared error on a truth and predicted series
truth = pd.Series(range(10))
pred = pd.Series(range(10)) + np.random.random(10)

np.mean((truth-pred)**2)

0.4417924277392851

In [18]:
# convert the first character of each element in a series to uppercase
ser = pd.Series(['how', 'to', 'kick', 'ass?'])

result = ser.map(lambda x: x.title())

result

0     How
1      To
2    Kick
3    Ass?
dtype: object

In [19]:
# calculate the number of characters in each word in a series
ser = pd.Series(['how', 'to', 'kick', 'ass?'])

result = ser.map(lambda x: len(x))
result

0    3
1    2
2    4
3    4
dtype: int64

In [20]:
# compute difference of differences between consequtive numbers of a series
ser = pd.Series([1, 3, 6, 10, 15, 21, 27, 35])

print(ser.diff().tolist())
print(ser.diff().diff().tolist())

[nan, 2.0, 3.0, 4.0, 5.0, 6.0, 6.0, 8.0]
[nan, nan, 1.0, 1.0, 1.0, 1.0, 0.0, 2.0]


In [21]:
# convert a series of date-strings to a timeseries
ser = pd.Series(['01 Jan 2010', '02-02-2011', '20120303', '2013/04/04', '2014-05-05', '2015-06-06T12:20'])

result = pd.to_datetime(ser)

result

0   2010-01-01 00:00:00
1   2011-02-02 00:00:00
2   2012-03-03 00:00:00
3   2013-04-04 00:00:00
4   2014-05-05 00:00:00
5   2015-06-06 12:20:00
dtype: datetime64[ns]

In [22]:
# get the day of month, week number, day of year and day of week from a series of date strings
ser = pd.Series(['01 Jan 2010', '02-02-2011', '20120303', '2013/04/04', '2014-05-05', '2015-06-06T12:20'])

from dateutil.parser import parse

ser_result = ser.map(lambda x: parse(x))

# day of month
print("Date: ", ser_result.dt.day.tolist())

# week number
print("Week number: ", ser_result.dt.weekofyear.tolist())

# day of year
print("Day number of year: ", ser_result.dt.dayofyear.tolist())

# day of week
print("Day of week: ", list(ser_result.dt.day_name()))


Date:  [1, 2, 3, 4, 5, 6]
Week number:  [53, 5, 9, 14, 19, 23]
Day number of year:  [1, 33, 63, 94, 125, 157]
Day of week:  ['Friday', 'Wednesday', 'Saturday', 'Thursday', 'Monday', 'Saturday']


  print("Week number: ", ser_result.dt.weekofyear.tolist())


In [23]:
# convert year-month string to dates corresponding to the 4th day of the month
ser = pd.Series(['Jan 2010', 'Feb 2011', 'Mar 2012'])

from dateutil.parser import parse

result = ser.map(lambda d: parse('4 ' + d))

result

0   2010-01-04
1   2011-02-04
2   2012-03-04
dtype: datetime64[ns]

In [24]:
# filter words that contain atleast 2 vowels from a series

ser = pd.Series(['Apple', 'Orange', 'Plan', 'Python', 'Money'])

result = ser[ser.str.count('(?i)[aeiou]') >=2]

result

0     Apple
1    Orange
4     Money
dtype: object

In [25]:
#  filter valid emails from a series
import re as regex

emails = pd.Series(['buying books at amazom.com', 'rameses@egypt.com', 'matt@t.co', 'narendra@modi.com'])
pattern ='[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,4}'

mapped_result = emails.map(lambda i: bool(regex.match(pattern, i)))

emails[mapped_result]

1    rameses@egypt.com
2            matt@t.co
3    narendra@modi.com
dtype: object

In [26]:
# get the mean of a series grouped by another series
fruit = pd.Series(np.random.choice(['apple', 'banana', 'carrot'], 10))
weights = pd.Series(np.linspace(1, 10, 10))
print(weights.tolist())
print(fruit.tolist())
#> [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]
#> ['banana', 'carrot', 'apple', 'carrot', 'carrot', 'apple', 'banana', 'carrot', 'apple', 'carrot']
weights.groupby(fruit).mean()

[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]
['banana', 'carrot', 'banana', 'carrot', 'apple', 'carrot', 'banana', 'carrot', 'apple', 'banana']


apple     7.00
banana    5.25
carrot    5.00
dtype: float64

In [37]:
# compute the euclidean distance between two series

p = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
q = pd.Series([10, 9, 8, 7, 6, 5, 4, 3, 2, 1])

result = sum((p - q) ** 2) ** 0.5

result

18.16590212458495

In [38]:
#  find all the local maxima (or peaks) in a numeric series

ser = pd.Series([2, 10, 3, 4, 9, 10, 2, 7, 3])

dd = np.diff(np.sign(np.diff(ser)))
peak_locs = np.where(dd == -2)[0] + 1
peak_locs

array([1, 5, 7], dtype=int64)

In [45]:
# replace missing spaces in a string with the least frequent character

my_str = 'dbc deb abed gade'

ser = pd.Series(list(my_str))

element_count = ser.value_counts()

current_freq = element_count.dropna().index[-1]

result = "".join(ser.replace(' ',current_freq))

print(f'{result} # least frequent is c')

dbcgdebgabedggade # least frequent is c


In [47]:
# create a TimeSeries starting ‘2000-01-01’ and 10 weekends (saturdays) after that having random numbers as values
dates = pd.date_range('2000-01-01', periods = 10, freq='W-SAT')
values = np.random.randint(0, 10, size = 10)

df = pd.Series(values, index=dates)
print(df)

2000-01-01    2
2000-01-08    2
2000-01-15    2
2000-01-22    3
2000-01-29    6
2000-02-05    5
2000-02-12    2
2000-02-19    3
2000-02-26    7
2000-03-04    8
Freq: W-SAT, dtype: int32


In [60]:
# compute the autocorrelations of a numeric series

ser = pd.Series(np.arange(20) + np.random.normal(1, 10, 20))
autocorrelations = [ser.autocorr(i).round(2) for i in range(11)]

print(autocorrelations[1:])

[0.1, -0.29, 0.32, 0.13, 0.06, 0.0, -0.29, 0.19, 0.28, -0.35]


In [71]:
# create a dataframe with rows as strides from a given series
L = pd.Series(range(15))

def gen_strides(a, stride_len=5, window_len=5):
    n_strides = ((a.size-window_len)//stride_len) + 1
    return np.array([a[s:(s+window_len)] for s in np.arange(0, a.size, stride_len)[:n_strides]])

gen_strides(L, stride_len=2, window_len=4)

array([[ 0,  1,  2,  3],
       [ 2,  3,  4,  5],
       [ 4,  5,  6,  7],
       [ 6,  7,  8,  9],
       [ 8,  9, 10, 11],
       [10, 11, 12, 13]], dtype=int64)