In [None]:
import numpy as np
import pandas as pd

### 24. Filter words that contain at least 2 vowels from a series

최소 두 모음이 들어간 단어

In [None]:
from collections import Counter

ser = pd.Series(['Apple', 'Orange', 'Plan', 'Python', 'Money'])


def _has_two_vowels(word):
    """Return True when `word` holds at least two vowels (a, e, i, o, u), ignoring case."""
    letter_counts = Counter(word.lower())
    vowel_total = 0
    for vowel in 'aeiou':
        # Letters absent from the word contribute zero.
        vowel_total += letter_counts.get(vowel, 0)
    return vowel_total >= 2


# Boolean mask aligned with `ser`; True marks the words to keep.
mask = ser.map(_has_two_vowels)
ser[mask]

0     Apple
1    Orange
4     Money
dtype: object

### 25. Filter valid emails from a series

In [None]:
import re

emails = pd.Series(['buying books at amazom.com', 'rameses@egypt.com', 'matt@t.co', 'narendra@modi.com'])
pattern ='[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,4}'

# Compile once instead of re-parsing the pattern for every element.
_email_re = re.compile(pattern)


def is_valid_email(text):
    """Return True only when the WHOLE of `text` is an e-mail address.

    `re.match` anchors at the start only, so a string that merely begins with
    an address (for example 'matt@t.co is my address') would be accepted.
    `fullmatch` requires the pattern to consume the entire string.
    Non-string entries (e.g. missing values) are treated as invalid.
    """
    return isinstance(text, str) and _email_re.fullmatch(text) is not None


# Boolean mask aligned with `emails`; True marks the valid addresses.
mask = emails.map(is_valid_email)
emails[mask]

1    rameses@egypt.com
2            matt@t.co
3    narendra@modi.com
dtype: object

### 26. Get the mean of a series grouped by another series

In [None]:
# Ten randomly drawn group labels (no seed is set, so they differ between runs).
fruit_labels = np.random.choice(['apple', 'banana', 'carrot'], 10)
fruit = pd.Series(fruit_labels)

# Ten evenly spaced values from 1 to 10, one per label.
weights = pd.Series(np.linspace(1, 10, 10))

# Mean weight per fruit label; the two series are matched on their shared index.
weights.groupby(by=fruit).agg('mean')

apple      5.0
banana    10.0
carrot     5.0
dtype: float64

### 27. Compute the euclidean distance between two series

In [None]:
p = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
q = pd.Series([10, 9, 8, 7, 6, 5, 4, 3, 2, 1])

# Euclidean distance: square root of the summed squared element-wise differences.
gap = p - q
squared_total = (gap ** 2).sum()
squared_total ** 0.5

18.16590212458495

### 28. Find all the local maxima in a numeric series

In [None]:
ser = pd.Series([2, 10, 3, 4, 9, 10, 2, 7, 3])

# Step between neighbours, the direction of each step (+1 up, -1 down),
# and how the direction changes from one step to the next.
steps = np.diff(ser)
directions = np.sign(steps)
dd = np.diff(directions)
print(steps)
print(directions)
print(dd)

# An up-step (+1) followed by a down-step (-1) gives a change of -2: a peak.
turning_points = np.where(dd == -2)
print(turning_points)
# `dd[i]` compares steps i and i+1, whose shared element is at position i + 1.
peak_locs = turning_points[0] + 1
print(peak_locs)

[ 8 -7  1  5  1 -8  5 -4]
[ 1 -1  1  1  1 -1  1 -1]
[-2  2  0  0 -2  2 -2]
(array([0, 4, 6]),)
[1 5 7]


### 29. Replace missing spaces in a string with the least frequent character

In [None]:
my_str = 'dbc deb abed gade'

ser = pd.Series(list(my_str))

# Count only the non-space characters: if the space were a candidate it could be
# picked as its own replacement, turning the replace below into a no-op.
# Sorting the counts by character makes ties deterministic: `idxmin` returns the
# first label holding the minimum, i.e. the alphabetically smallest of the tied
# characters ('c' rather than 'g' here), instead of relying on how
# `value_counts` happens to order equal counts.
freq = ser[ser != ' '].value_counts().sort_index()
least_freq = freq.idxmin()
print(least_freq)

# Swap every space for the chosen character and rebuild the string.
"".join(ser.replace(' ', least_freq))

c


'dbccdebcabedcgade'

### 30. Create a time series of 10 consecutive Saturdays starting on '2000-01-01', with random numbers as values

In [None]:
# Ten random integers drawn from 1..9 (the upper bound 10 is exclusive).
random_values = np.random.randint(1, 10, 10)

# Weekly dates anchored on Saturday, beginning with 2000-01-01.
ser = pd.Series(
    data=random_values,
    index=pd.date_range(start='2000-01-01', periods=10, freq='W-SAT'),
)
ser

2000-01-01    2
2000-01-08    7
2000-01-15    9
2000-01-22    2
2000-01-29    9
2000-02-05    7
2000-02-12    3
2000-02-19    4
2000-02-26    2
2000-03-04    2
Freq: W-SAT, dtype: int64

### 31. How to fill an intermittent time series so all missing dates show up with values of previous non-missing date?

In [None]:
observed_dates = pd.to_datetime(['2000-01-01', '2000-01-03', '2000-01-06', '2000-01-08'])
ser = pd.Series([1, 10, 3, np.nan], index=observed_dates)
print(ser)

# Upsample to one row per calendar day; the fill method decides what the
# newly created dates receive.
daily = ser.resample('D')

# Each new date takes the value of the previous observed date.
ser1 = daily.ffill()
print(ser1)

# Each new date takes the value of the next observed date.
ser2 = daily.bfill()
print(ser2)

# Next observed value first; whatever is still missing then takes the previous value.
ser3 = ser2.ffill()
print(ser3)

2000-01-01     1.0
2000-01-03    10.0
2000-01-06     3.0
2000-01-08     NaN
dtype: float64
2000-01-01     1.0
2000-01-02     1.0
2000-01-03    10.0
2000-01-04    10.0
2000-01-05    10.0
2000-01-06     3.0
2000-01-07     3.0
2000-01-08     NaN
Freq: D, dtype: float64
2000-01-01     1.0
2000-01-02    10.0
2000-01-03    10.0
2000-01-04     3.0
2000-01-05     3.0
2000-01-06     3.0
2000-01-07     NaN
2000-01-08     NaN
Freq: D, dtype: float64
2000-01-01     1.0
2000-01-02    10.0
2000-01-03    10.0
2000-01-04     3.0
2000-01-05     3.0
2000-01-06     3.0
2000-01-07     3.0
2000-01-08     3.0
Freq: D, dtype: float64


### 32. Compute the autocorrelations of a numeric series

In [None]:
# Upward trend 0..19 plus Gaussian noise (mean 1, standard deviation 10); unseeded.
noise = np.random.normal(1, 10, 20)
ser = pd.Series(np.arange(20) + noise)
print(ser)

# Autocorrelation at lags 0 through 10, each rounded to two decimals.
autocorrelations = []
for lag in range(11):
    autocorrelations.append(ser.autocorr(lag).round(2))

# Lag 0 is the series against itself, so it is left out of the comparison.
lagged = autocorrelations[1:]
print(lagged)
# `lagged` starts at lag 1, hence the +1 to turn a list position into a lag.
print('Lag having highest correlation: ', np.argmax(np.abs(lagged))+1)

0     10.516299
1     -1.493415
2      7.429744
3     -0.679119
4      1.914724
5      3.891275
6     28.292952
7     -9.872994
8     24.941538
9     10.925252
10    35.884891
11    10.339341
12    12.883537
13    13.855696
14    25.214551
15    18.811838
16     4.204122
17    24.771309
18    15.553677
19    32.433569
dtype: float64
[-0.16, 0.45, -0.16, 0.5, 0.0, 0.04, -0.09, 0.03, 0.44, -0.27]
Lag having highest correlation:  4


### 33. Import only every nth row from a csv file to create a dataframe

Import every 50th row of the BostonHousing dataset

In [None]:
# Read the CSV lazily: with `chunksize`, read_csv returns an iterator that
# yields the file as consecutive DataFrames of 50 rows each.
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv', chunksize=50)

# Keep the first row of every chunk, i.e. rows 0, 50, 100, ... of the file.
# The list indexer `iloc[[0]]` yields a one-row DataFrame, so concatenating
# row-wise keeps each column's own dtype. Selecting `iloc[0]` instead yields a
# Series per row, which forces all of that row's values into one common dtype
# (integer columns come back as floats) and needs a transpose afterwards.
df2 = pd.concat([chunk.iloc[[0]] for chunk in df])
df2

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
50,0.08873,21.0,5.64,0.0,0.439,5.963,45.7,6.8147,4.0,243.0,16.8,395.56,13.45,19.7
100,0.14866,0.0,8.56,0.0,0.52,6.727,79.9,2.7778,5.0,384.0,20.9,394.76,9.42,27.5
150,1.6566,0.0,19.58,0.0,0.871,6.122,97.3,1.618,5.0,403.0,14.7,372.8,14.1,21.5
200,0.01778,95.0,1.47,0.0,0.403,7.135,13.9,7.6534,3.0,402.0,17.0,384.3,4.45,32.9
250,0.1403,22.0,5.86,0.0,0.431,6.487,13.0,7.3967,7.0,330.0,19.1,396.28,5.9,24.4
300,0.04417,70.0,2.24,0.0,0.4,6.871,47.4,7.8278,5.0,358.0,14.8,390.86,6.07,24.8
350,0.06211,40.0,1.25,0.0,0.429,6.49,44.4,8.7921,1.0,335.0,19.7,396.9,5.98,22.9
400,25.0461,0.0,18.1,0.0,0.693,5.987,100.0,1.5888,24.0,666.0,20.2,396.9,26.77,5.6
450,6.71772,0.0,18.1,0.0,0.713,6.749,92.6,2.3236,24.0,666.0,20.2,0.32,17.44,13.4
