In [2]:
###1. How to import pandas and check the version?
import pandas as pd

In [7]:
###2. How to create a series from a list, numpy array and dict?

import numpy as np
mylist = list('abcedfghijklmnopqrstuvwxyz')
myarr = np.arange(26)
mydict = dict(zip(mylist, myarr))

ser1 = pd.Series(mylist)
ser2 = pd.Series(myarr)
ser3 = pd.Series(mydict)
print(ser1)

0     a
1     b
2     c
3     e
4     d
5     f
6     g
7     h
8     i
9     j
10    k
11    l
12    m
13    n
14    o
15    p
16    q
17    r
18    s
19    t
20    u
21    v
22    w
23    x
24    y
25    z
dtype: object


In [15]:
###3. How to convert the index of a series into a column of a dataframe?

mylist = list('abcedfghijklmnopqrstuvwxyz')
myarr = np.arange(26)
mydict = dict(zip(mylist, myarr))  #zip: A zip object yielding tuples until an input is exhausted
ser = pd.Series(mydict)

df = ser.to_frame().reset_index()  #ser.to_frame(name=None): coverts series to dataframe; reset_index:  the old index is added as a column, and a new sequential index is used
print(df)

   index   0
0      a   0
1      b   1
2      c   2
3      e   3
4      d   4
5      f   5
6      g   6
7      h   7
8      i   8
9      j   9
10     k  10
11     l  11
12     m  12
13     n  13
14     o  14
15     p  15
16     q  16
17     r  17
18     s  18
19     t  19
20     u  20
21     v  21
22     w  22
23     x  23
24     y  24
25     z  25


In [19]:
###4. How to combine many series to form a dataframe?

import numpy as np
ser1 = pd.Series(list('abcedfghijklmnopqrstuvwxyz'))
ser2 = pd.Series(np.arange(26))

df=pd.DataFrame({'Letters': ser1, 'Numbers': ser2})
print(df)

#df = pd.concat([ser1, ser2], axis=1)

   Letters  Numbers
0        a        0
1        b        1
2        c        2
3        e        3
4        d        4
5        f        5
6        g        6
7        h        7
8        i        8
9        j        9
10       k       10
11       l       11
12       m       12
13       n       13
14       o       14
15       p       15
16       q       16
17       r       17
18       s       18
19       t       19
20       u       20
21       v       21
22       w       22
23       x       23
24       y       24
25       z       25


In [25]:
###5. How to assign name to the series’ index?

ser = pd.Series(list('abcedfghijklmnopqrstuvwxyz'))

ser.name='letters'
ser

0     a
1     b
2     c
3     e
4     d
5     f
6     g
7     h
8     i
9     j
10    k
11    l
12    m
13    n
14    o
15    p
16    q
17    r
18    s
19    t
20    u
21    v
22    w
23    x
24    y
25    z
Name: letters, dtype: object

In [38]:
###6. How to get the items of series A not present in series B?

ser1 = pd.Series([1, 2, 3, 4, 5])
ser2 = pd.Series([4, 5, 6, 7, 8]) 

print('\n', ser1.isin(ser2)) #isin: Whether elements in Series are contained in values; poutpu boolean
print('\n',~ser1.isin(ser2)) # ~ negates/inverts 
ser1[~ser1.isin(ser2)] #


 0    False
1    False
2    False
3     True
4     True
dtype: bool

 0     True
1     True
2     True
3    False
4    False
dtype: bool


0    1
1    2
2    3
dtype: int64

In [39]:
###7. How to get the items not common to both series A and series B?

ser1 = pd.Series([1, 2, 3, 4, 5])
ser2 = pd.Series([4, 5, 6, 7, 8])

ser_union = pd.Series(np.union1d(ser1, ser2))  #np.union1d union Find the union of two arrays.
ser_intersect = pd.Series(np.intersect1d(ser1, ser2))  # intersect: Return the sorted, unique values that are in both of the input arrays.
ser_union[~ser_union.isin(ser_intersect)]


0    1
1    2
2    3
5    6
6    7
7    8
dtype: int64

In [58]:
###8. How to get the minimum, 25th percentile, median, 75th, and max of a numeric series?

ser = pd.Series(np.random.normal(10, 5, 25))

#np.percentile(ser, q=[0, 25, 50, 75, 100])

print(ser)
print('Minimum=', ser.min())
print('25th percentile=', ser.quantile(q=0.25))
print('Median=', ser.median())
print('75th percentile=', ser.quantile(q=0.75))
print('Maximum=', ser.max())


0     14.795260
1     15.370896
2      9.467883
3     10.391811
4      8.432052
5      7.431662
6     10.485203
7      9.290565
8     12.680049
9     20.965166
10     8.370685
11    15.032853
12    12.390250
13    16.497469
14     6.535550
15     8.201937
16    10.496849
17     9.675682
18     9.533651
19    11.937964
20     9.707716
21     9.619145
22     9.181846
23     9.298904
24    12.211635
dtype: float64
Minimum= 6.535550240832145
25th percentile= 9.290564584376176
Median= 9.707715623383008
75th percentile= 12.390249873304608
Maximum= 20.965165641106132


In [55]:
###9. How to get frequency counts of unique items of a series?

ser = pd.Series(np.take(list('abcdefgh'), np.random.randint(8, size=30)))

ser.value_counts()

d    5
g    5
c    5
b    5
e    4
a    2
h    2
f    2
dtype: int64

In [59]:
###10. How to keep only top 2 most frequent values as it is and replace everything else as ‘Other’?

np.random.RandomState(100)   #fixes random state, output always same numbers
ser = pd.Series(np.random.randint(1, 5, [12]))

print("Top 2 most frequent numbers are:", ser.value_counts())
ser[~ser.isin(ser.value_counts().index[:2])] = 'Other'
ser

Top 2 most frequent numbers are: 2    4
4    4
1    3
3    1
dtype: int64


0         2
1     Other
2     Other
3         4
4         2
5         2
6     Other
7         4
8     Other
9         4
10        4
11        2
dtype: object

In [63]:
###11. How to bin a numeric series to 10 groups of equal size?
#using quantiles to equally distribute numbers, for 10 groups (0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0), for 5 groups (0, .25, .5, .75, 1.)


ser = pd.Series(np.random.random(20))
print(ser)

 #Quantile-based discretization function (pandas.qcut(x, q, labels=None, retbins=False, precision=3, duplicates='raise')
pd.qcut(ser, q=[0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],   
        labels=['1st', '2nd', '3rd', '4th', '5th', '6th', '7th', '8th', '9th', '10th'])

0     0.380130
1     0.023064
2     0.313854
3     0.407468
4     0.274567
5     0.767278
6     0.289124
7     0.322816
8     0.869532
9     0.457191
10    0.844208
11    0.665293
12    0.345669
13    0.928985
14    0.419213
15    0.175442
16    0.930020
17    0.411809
18    0.651095
19    0.258372
dtype: float64


0      5th
1      1st
2      3rd
3      5th
4      2nd
5      8th
6      3rd
7      4th
8      9th
9      7th
10     9th
11     8th
12     4th
13    10th
14     6th
15     1st
16    10th
17     6th
18     7th
19     2nd
dtype: category
Categories (10, object): ['1st' < '2nd' < '3rd' < '4th' ... '7th' < '8th' < '9th' < '10th']

In [69]:
pd.qcut(ser, 10, labels=['1st', '2nd', '3rd', '4th', '5th', '6th', '7th', '8th', '9th', '10th'])

0      5th
1      1st
2      3rd
3      5th
4      2nd
5      8th
6      3rd
7      4th
8      9th
9      7th
10     9th
11     8th
12     4th
13    10th
14     6th
15     1st
16    10th
17     6th
18     7th
19     2nd
dtype: category
Categories (10, object): ['1st' < '2nd' < '3rd' < '4th' ... '7th' < '8th' < '9th' < '10th']

In [74]:
###12. How to convert a numpy array to a dataframe of given shape? (L1)

ser = pd.Series(np.random.randint(1, 10, 35))
#print(ser)
df = pd.DataFrame(ser.values.reshape(5,7))  #reshape size must fit to total count of element, i.e 35-5*7
print(df)

   0  1  2  3  4  5  6
0  9  2  4  6  1  6  5
1  7  2  3  8  3  3  2
2  4  2  9  2  4  4  9
3  7  8  7  8  1  6  7
4  9  2  9  2  1  4  1


In [108]:
###13. How to find the positions of numbers that are multiples of 3 from a series?

ser = pd.Series(np.random.randint(1, 10, 7))
b=(ser % 3 ==0)
print(b)

np.where(b)  

0    False
1    False
2     True
3    False
4     True
5    False
6     True
dtype: bool


(array([2, 4, 6], dtype=int64),)

In [110]:
###14. How to extract items at given positions from a series

ser = pd.Series(list('abcdefghijklmnopqrstuvwxyz'))
pos = [0, 4, 8, 14, 20]

ser.take(pos)  #Return the elements in the given *positional* indices along an axis.

0     a
4     e
8     i
14    o
20    u
dtype: object

In [111]:
###15. How to stack two series vertically and horizontally ?

ser1 = pd.Series(range(5))
ser2 = pd.Series(list('abcde'))

ser_vert=ser1.append(ser2) #vertically, append ser1 with ser2

df_hori=pd.concat([ser1, ser2], axis=1)  #horizontally

print(ser_vert)
print('\n',df_hori)


0    0
1    1
2    2
3    3
4    4
0    a
1    b
2    c
3    d
4    e
dtype: object

    0  1
0  0  a
1  1  b
2  2  c
3  3  d
4  4  e


In [117]:
###16. How to get the positions of items of series A in another series B?

ser1 = pd.Series([10, 9, 6, 5, 3, 1, 12, 8, 13])
ser2 = pd.Series([1, 3, 10, 13])

f=[pd.Index(ser1).get_loc(i) for i in ser2]  #get location for requested label

print(f, type(f))

[5, 4, 0, 8] <class 'list'>


In [118]:
###17. How to compute the mean squared error on a truth and predicted series?

truth = pd.Series(range(10))
pred = pd.Series(range(10)) + np.random.random(10)

np.mean((truth-pred)**2)

0.3368967691822092

In [130]:
###18. How to convert the first character of each element in a series to uppercase?

ser = pd.Series(['how', 'to', 'kick', 'ass?'])

ser2=pd.Series([i.title() for i in ser]) #pd.Series.str.title - Converts first character of each word to uppercase and remaining to lowercase.
print(ser2)


ser3=ser.str.title() #easier solution
print('\n',ser3)

0     How
1      To
2    Kick
3    Ass?
dtype: object

 0     How
1      To
2    Kick
3    Ass?
dtype: object


In [134]:
###19. How to calculate the number of characters in each word in a series?

ser = pd.Series(['how', 'to', 'kick', 'ass?'])

print(ser.map(lambda x: len(x)))

print('\n',ser.str.len()) #easier solution

0    3
1    2
2    4
3    4
dtype: int64

 0    3
1    2
2    4
3    4
dtype: int64


In [135]:
###20. How to compute difference of differences between consequtive numbers of a series?

ser = pd.Series([1, 3, 6, 10, 15, 21, 27, 35])

print(ser.diff().tolist())  #tolist--convert a series to list.
print(ser.diff().diff().tolist())

[nan, 2.0, 3.0, 4.0, 5.0, 6.0, 6.0, 8.0]
[nan, nan, 1.0, 1.0, 1.0, 1.0, 0.0, 2.0]


In [136]:
###21. How to convert a series of date-strings to a timeseries?

ser = pd.Series(['01 Jan 2010', '02-02-2011', '20120303', '2013/04/04', '2014-05-05', '2015-06-06T12:20'])

pd.to_datetime(ser) ##Convert argument to datetime.

0   2010-01-01 00:00:00
1   2011-02-02 00:00:00
2   2012-03-03 00:00:00
3   2013-04-04 00:00:00
4   2014-05-05 00:00:00
5   2015-06-06 12:20:00
dtype: datetime64[ns]

In [149]:
###22. How to get the day of month, week number, day of year and day of week from a series of date strings?

ser = pd.Series(['01 Jan 2010', '02-02-2011', '20120303', '2013/04/04', '2014-05-05', '2015-06-06T12:20'])

from dateutil.parser import parse  #parser module can parse datetime strings in many more formats

ser_parsed = ser.map(lambda x: parse(x))

print("Date: ", ser_parsed.dt.day.tolist())

print("Week number: ", ser_parsed.dt.weekofyear.tolist())  #figure out how to use isocalender().week

print("Day number of year: ", ser_parsed.dt.dayofyear.tolist())

print("Day of week: ", ser_parsed.dt.weekday.tolist())

Date:  [1, 2, 3, 4, 5, 6]
Week number:  [53, 5, 9, 14, 19, 23]
Day number of year:  [1, 33, 63, 94, 125, 157]
Day of week:  [4, 2, 5, 3, 0, 5]


  print("Week number: ", ser_parsed.dt.weekofyear.tolist())  #figure out how to use isocalender().week


In [152]:
###23. How to convert year-month string to dates corresponding to the 4th day of the month?

ser = pd.Series(['Jan 2010', 'Feb 2011', 'Mar 2012'])

from dateutil.parser import parse

ser_parsed = ser.map(lambda x: parse(x))
ser_date = ser_parsed.dt.year.astype('str') + '-' + ser_parsed.dt.month.astype('str') + '-' + '04'
ser_date

0    2010-1-04
1    2011-2-04
2    2012-3-04
dtype: object

In [153]:
###24. How to filter words that contain atleast 2 vowels from a series?

ser = pd.Series(['Apple', 'Orange', 'Plan', 'Python', 'Money'])

from collections import Counter

filter = ser.map(lambda x: sum([Counter(x.lower()).get(i, 0) for i in list('aeiou')]) >= 2)
ser[filter]

0     Apple
1    Orange
4     Money
dtype: object

In [154]:
###25. How to filter valid emails from a series?

emails = pd.Series(['buying books at amazom.com', 'rameses@egypt.com', 'matt@t.co', 'narendra@modi.com'])
pattern ='[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,4}'

import re

filter = emails.map(lambda x: bool(re.match(pattern, x)))
emails[filter]

1    rameses@egypt.com
2            matt@t.co
3    narendra@modi.com
dtype: object

In [157]:
###26. How to get the mean of a series grouped by another series?

fruit = pd.Series(np.random.choice(['apple', 'banana', 'carrot'], 10))
weights = pd.Series(np.linspace(1, 10, 10))
print('weights:',weights.tolist())
print('fruits:',fruit.tolist())

weights.groupby(fruit).mean()   #Group Series using a mapper or by a Series of columns.

weights: [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]
fruits: ['apple', 'carrot', 'banana', 'carrot', 'apple', 'banana', 'carrot', 'carrot', 'apple', 'banana']


apple     5.000000
banana    6.333333
carrot    5.250000
dtype: float64

In [168]:
###27. How to compute the euclidean distance between two series?

#length of a line segment between the two points. 
#It can be calculated from the Cartesian coordinates of the points using the Pythagorean theorem,
#therefore occasionally being called the Pythagorean distance.

p = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
q = pd.Series([10, 9, 8, 7, 6, 5, 4, 3, 2, 1])

euc_dist=(sum((p - q)**2))**.5  # d(p,q)={\sqrt {(p-q)^{2}}}
print(euc_dist)

#np.linalg.norm(p-q)

18.16590212458495


18.16590212458495

In [169]:
###28. How to find all the local maxima (or peaks) in a numeric series?

ser = pd.Series([2, 10, 3, 4, 9, 10, 2, 7, 3])
 
dd = np.diff(np.sign(np.diff(ser)))
peak_locs = np.where(dd == -2)[0] + 1
peak_locs

1