### Solve the following tasks:

In [2]:
import numpy as np  
import pandas as pd

1. How to create a series from a list, numpy array and dict?

In [5]:
# Inputs: 26 letters, the integers 0..25, and a letter -> integer mapping
import numpy as np
mylist = [*'abcedfghijklmnopqrstuvwxyz']
myarr = np.arange(0, 26)
mydict = {letter: number for letter, number in zip(mylist, myarr)}

# Solution: the Series constructor accepts a list, a numpy array or a dict
ser1, ser2, ser3 = (pd.Series(source) for source in (mylist, myarr, mydict))
print(ser3.iloc[:5])

s    4
a    7
d    6
dtype: int64


2. How to combine many series to form a dataframe?
Note: Combine ser1 and ser2 to form a dataframe.

In [4]:
# Inputs: a series of 26 letters and a series of the integers 0..25
ser1 = pd.Series([*'abcedfghijklmnopqrstuvwxyz'])
ser2 = pd.Series(np.arange(0, 26))

In [5]:
# Solution 1: place the two series side by side as columns
df = pd.concat((ser1, ser2), axis='columns')

# Solution 2: build the frame from a mapping so each column gets an explicit name
# (this rebinds df, so only Solution 2's frame is printed)
df = pd.DataFrame(dict(col1=ser1, col2=ser2))
print(df.head(5))

  col1  col2
0    a     0
1    b     1
2    c     2
3    e     3
4    d     4


3. How to assign a name to a series? Give the series ser the name ‘alphabets’.

In [6]:
# Input
ser = pd.Series([*'abcedfghijklmnopqrstuvwxyz'])

# Solution: attach the name; it is shown in the "Name:" footer of the displayed series
ser = ser.rename('alphabets')
ser.head(5)

0    a
1    b
2    c
3    e
4    d
Name: alphabets, dtype: object

4. How to get the items not common to both series A and series B?

In [10]:
# Input
ser1 = pd.Series([1, 2, 3, 4, 5])
ser2 = pd.Series([4, 5, 6, 7, 8])

# Solution: collect every distinct value from either series, then keep only
# the values that do not occur in both
ser_u = pd.Series(np.union1d(ser1, ser2))  # values found in ser1 or ser2
ser_i = pd.Series(np.intersect1d(ser1, ser2))  # values found in ser1 and ser2
ser_u.loc[~ser_u.isin(ser_i)]

0    1
1    2
2    3
5    6
6    7
7    8
dtype: int64

5. How to calculate the frequency counts of each unique value of a series?

In [None]:
# Input: 30 letters picked at random (with repetition) from 'a'..'h'
ser = pd.Series(np.array(list('abcdefgh'))[np.random.randint(0, 8, size=30)])

# Solution: tally how often each distinct value occurs
ser.value_counts()

6. How to keep only top 2 most frequent values as it is and replace everything else as ‘Other’?

In [40]:
# Input: 12 random integers drawn from {1, 2, 3, 4}
ser = pd.Series(np.random.randint(1, 5, [12]))

# Solution
# value_counts() orders values by descending frequency, so the first two index
# labels are the two most frequent values.
freq = ser.value_counts()
print("Top 2 Freq:", freq)
top2 = freq.index[:2]

# Cast to object before assigning: writing the string 'Other' into an integer
# Series relies on silent dtype upcasting, which newer pandas versions warn
# about and later reject.
ser = ser.astype(object)
ser[~ser.isin(top2)] = 'Other'
ser

Top 2 Freq: 3    5
1    4
4    3
dtype: int64


0         1
1         3
2         3
3     Other
4         3
5     Other
6         1
7         3
8     Other
9         3
10        1
11        1
dtype: object

7. How to compute the mean squared error on a truth and predicted series?

In [13]:
# Input: the ground truth 0..9 and predictions shifted by uniform noise from [0, 1)
truth = pd.Series(range(10))
pred = truth + np.random.random(10)

# Solution: average of the squared differences
((truth - pred) ** 2).mean()

0.134406881922369

8. How to change the order of columns of a dataframe?

    - In df, interchange columns 'a' and 'c'.
    - Create a generic function to interchange two columns, without hardcoding column names.
    - Sort the columns in reverse alphabetical order, that is column 'e' first through column 'a' last.

In [45]:
# np.arange(20) yields the evenly spaced integers 0..19; they are laid out as 4 rows x 5 columns
df = pd.DataFrame(np.arange(20).reshape(4, 5), columns=['a', 'b', 'c', 'd', 'e'])
df

Unnamed: 0,a,b,c,d,e
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19


In [46]:
# Solution Q1: select the columns in a new order, with 'a' and 'c' trading places
df[['c', 'b', 'a', 'd', 'e']]

Unnamed: 0,c,b,a,d,e
0,2,1,0,3,4
1,7,6,5,8,9
2,12,11,10,13,14
3,17,16,15,18,19


In [17]:
# Solution Q2 - No hard coding
def switch_columns(df, col1=None, col2=None):
    """Return a copy of ``df`` with the positions of two columns interchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are reordered; it is not modified.
    col1, col2 : column labels
        The two columns to swap. Both must be present in ``df.columns``.

    Returns
    -------
    pandas.DataFrame
        New frame with ``col1`` and ``col2`` exchanged and all other columns
        in their original positions.

    Raises
    ------
    ValueError
        If either label is missing from ``df.columns`` (this includes leaving
        an argument at its ``None`` default).
    """
    colnames = df.columns.tolist()
    for col in (col1, col2):
        if col not in colnames:
            raise ValueError(
                f"column {col!r} not found in DataFrame columns {colnames}"
            )
    i1, i2 = colnames.index(col1), colnames.index(col2)
    # Reorder by position rather than by label: selecting by label would
    # duplicate columns when the frame contains repeated column names.
    order = list(range(len(colnames)))
    order[i1], order[i2] = order[i2], order[i1]
    return df.iloc[:, order]

# Swap 'a' and 'c' through the generic helper, then display the reordered frame
df1 = switch_columns(df, col1='a', col2='c')

df1

Unnamed: 0,c,b,a,d,e
0,2,1,0,3,4
1,7,6,5,8,9
2,12,11,10,13,14
3,17,16,15,18,19


9. How to format or suppress scientific notations in a pandas dataframe?

In [20]:
# Input: four uniform draws from [0, 1), each raised to the 10th power
df = pd.DataFrame({'random': np.random.random(4) ** 10})

In [21]:
# Solution 1: round every value to four decimal places
df.round(decimals=4)

Unnamed: 0,random
0,0.1459
1,0.0016
2,0.0
3,0.0068


In [26]:
# Solution 2: Assign display.float_format
# option_context applies the format only inside the with-block and then restores
# whatever value the option had before, even if printing raises. A manual reset
# to None would instead discard any format that was configured earlier.
with pd.option_context('display.float_format', '{:.4f}'.format):
    print(df)

   random
0  0.6380
1  0.4966
2  0.1643
3  0.8312


10. How to format all the values in a dataframe as percentages? Format the values in column 'random' of df as percentages.

In [None]:
# Desired Output

#>      random
#> 0    68.97%
#> 1    95.72%
#> 2    15.91%
#> 3    2.10%

In [25]:
# Input: four uniform random fractions from [0, 1)
df = pd.DataFrame({'random': np.random.random(4)})

# Solution: style the frame so column 'random' is rendered as a percentage
# with two decimals
out = df.style.format(formatter={'random': '{0:.2%}'.format})

out

Unnamed: 0,random
0,63.80%
1,49.66%
2,16.43%
3,83.12%


11. How to find the position of the nth largest value greater than a given value?

In [16]:
# Input
ser = pd.Series(np.random.randint(1, 100, 15))

# Solution
print('ser: ', ser.tolist(), 'mean: ', round(ser.mean()))
# Convert the boolean mask to a plain numpy array first: handing a pandas
# Series to np.argwhere lets numpy try to wrap the result back into a Series,
# which fails on some pandas/numpy version combinations.
above_mean = (ser > ser.mean()).to_numpy()
# np.argwhere lists the positions of the True entries in order; [1] picks the
# second one (IndexError if fewer than two values exceed the mean).
np.argwhere(above_mean)[1]

ser:  [44, 10, 59, 7, 87, 58, 22, 10, 29, 35, 36, 24, 55, 44, 61] mean:  39


array([2], dtype=int64)

12. How to create one-hot encodings of a categorical variable (dummy variables)?

In [42]:
# Input: the integers 0..24 arranged as a 5 x 5 frame with columns 'a'..'e'
df = pd.DataFrame(np.arange(25).reshape(5, 5), columns=['a', 'b', 'c', 'd', 'e'])

df

Unnamed: 0,a,b,c,d,e
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19
4,20,21,22,23,24


In [44]:
# Solution: expand column 'a' into one indicator column per distinct value and
# append the remaining columns unchanged
df_onehot = pd.concat(
    objs=[pd.get_dummies(df['a']), df[['b', 'c', 'd', 'e']]],
    axis='columns',
)
print(df_onehot)


   0  5  10  15  20   b   c   d   e
0  1  0   0   0   0   1   2   3   4
1  0  1   0   0   0   6   7   8   9
2  0  0   1   0   0  11  12  13  14
3  0  0   0   1   0  16  17  18  19
4  0  0   0   0   1  21  22  23  24


13. How to normalize all columns in a dataframe?
    - Normalize all columns of df by subtracting the column mean and divide by standard deviation.
    - Range all columns of df such that the minimum value in each column is 0 and max is 1.


In [None]:
# Input: 8 rows x 10 columns of random integers from [1, 100)
df = pd.DataFrame(np.random.randint(1,100, 80).reshape(8, -1))

# Solution Q1: standardise each column (subtract its mean, divide by its
# standard deviation). Frame arithmetic aligns the per-column statistics on the
# column labels, so no apply/lambda is needed.
out1 = ((df - df.mean()) / df.std()).round(2)
print('Solution Q1\n',out1)

# Solution Q2: min-max scale each column so its minimum maps to 0 and its
# maximum to 1. The numerator must be (x - min); using (max - x) flips the
# scale and sends the minimum to 1.
# NOTE: a column whose values are all equal divides by zero and yields NaN.
out2 = ((df - df.min()) / (df.max() - df.min())).round(2)
print('Solution Q2\n', out2)

14. How to get the particular group of a groupby dataframe by key?

In [None]:
# Input
df = pd.DataFrame({'col1': ['apple', 'banana', 'orange'] * 3,
                   'col2': np.random.rand(9),
                   'col3': np.random.randint(0, 15, 9)})

# Group by the column label itself rather than a one-element list: with a list
# key, newer pandas versions use 1-tuples such as ('apple',) as group keys, so
# comparing a key with the plain string 'apple' would never match.
df_grouped = df.groupby('col1')

# Solution 1: look the group up directly by its key.
# The result is printed because only the last expression of a cell is displayed.
apple_group = df_grouped.get_group('apple')
print(apple_group)

# Solution 2: iterate over the (key, sub-frame) pairs and pick the matching key
for key, group in df_grouped:
    if key == 'apple':
        print(group)