# 1. How to import pandas and check the version?

In [15]:
import pandas as pd

pd.__version__

'1.3.3'

# 2. How to create a series from a list, numpy array and dict?

In [16]:
import numpy as np

mylist = list('abcedfghijklmnopqrstuvwxyz')
myarr = np.arange(26)
mydict = dict(zip(mylist, myarr))

ser_list = pd.Series(mylist)
ser_array = pd.Series(myarr)
ser_dict = pd.Series(mydict)



# 3. How to convert the index of a series into a column of a dataframe?

In [17]:
mylist = list('abcedfghijklmnopqrstuvwxyz')
myarr = np.arange(26)
mydict = dict(zip(mylist, myarr))
ser = pd.Series(mydict)

print(ser)
ser.index

df = ser.to_frame().reset_index()

print(df)

a     0
b     1
c     2
e     3
d     4
f     5
g     6
h     7
i     8
j     9
k    10
l    11
m    12
n    13
o    14
p    15
q    16
r    17
s    18
t    19
u    20
v    21
w    22
x    23
y    24
z    25
dtype: int32
   index   0
0      a   0
1      b   1
2      c   2
3      e   3
4      d   4
5      f   5
6      g   6
7      h   7
8      i   8
9      j   9
10     k  10
11     l  11
12     m  12
13     n  13
14     o  14
15     p  15
16     q  16
17     r  17
18     s  18
19     t  19
20     u  20
21     v  21
22     w  22
23     x  23
24     y  24
25     z  25


# 4. How to combine many series to form a dataframe?

In [18]:
ser1 = pd.Series(list('abcedfghijklmnopqrstuvwxyz'))
ser2 = pd.Series(np.arange(26))

In [19]:
df = pd.concat([ser1, ser2], axis=1)
print(df)

    0   1
0   a   0
1   b   1
2   c   2
3   e   3
4   d   4
5   f   5
6   g   6
7   h   7
8   i   8
9   j   9
10  k  10
11  l  11
12  m  12
13  n  13
14  o  14
15  p  15
16  q  16
17  r  17
18  s  18
19  t  19
20  u  20
21  v  21
22  w  22
23  x  23
24  y  24
25  z  25


In [20]:
df = pd.DataFrame({'col1': ser1, 'col2': ser2})
print(df.head())

  col1  col2
0    a     0
1    b     1
2    c     2
3    e     3
4    d     4


# 5. How to assign name to the series’ index?

In [21]:
ser = pd.Series(list('abcedfghijklmnopqrstuvwxyz'))

ser.name = "alphabets"

ser.head


<bound method NDFrame.head of 0     a
1     b
2     c
3     e
4     d
5     f
6     g
7     h
8     i
9     j
10    k
11    l
12    m
13    n
14    o
15    p
16    q
17    r
18    s
19    t
20    u
21    v
22    w
23    x
24    y
25    z
Name: alphabets, dtype: object>

# 6. How to get the items of series A not present in series B?

In [22]:
ser1 = pd.Series([1, 2, 3, 4, 5])
ser2 = pd.Series([4, 5, 6, 7, 8])

In [23]:
ser1[~ser1.isin(ser2)]

0    1
1    2
2    3
dtype: int64

# 7. How to get the items not common to both series A and series B?

In [24]:
ser3 = pd.concat([ser1[~ser1.isin(ser2)], ser2[~ser2.isin(ser1)]])

print(ser3)

0    1
1    2
2    3
2    6
3    7
4    8
dtype: int64


# 8 How to get the minimum, 25th percentile, median, 75th, and max of a numeric series?

In [25]:
ser = pd.Series(np.random.normal(10, 5, 25))

print(ser)

ser_min = print(ser.min())
ser_25 = ser.quantile(q=0.25)
ser_50 = print(ser.quantile(q=0.5))
ser_75 = ser.quantile(q=0.75)
ser_max = ser.max()

np.percentile(ser, q=[0, 25, 50, 75, 100])

0     12.862814
1     17.014515
2     17.678057
3     13.034268
4      5.311458
5      6.597315
6     16.212327
7     14.153294
8     11.613813
9     10.728301
10     4.816992
11    15.312965
12    13.031797
13    11.199946
14     2.929020
15    11.397564
16    11.964564
17    12.974423
18     7.454841
19    17.974499
20    18.967128
21     1.669077
22     3.059955
23     9.037979
24    15.112724
dtype: float64
1.669077030000473
11.964563525149087


array([ 1.66907703,  7.45484058, 11.96456353, 15.11272395, 18.96712752])

# 9. How to get frequency counts of unique items of a series?

In [26]:
ser = pd.Series(np.take(list('abcdefgh'), np.random.randint(8, size=30)))

print(ser)


0     h
1     g
2     g
3     h
4     g
5     g
6     b
7     d
8     e
9     a
10    e
11    d
12    d
13    d
14    c
15    e
16    g
17    g
18    g
19    c
20    h
21    h
22    c
23    a
24    c
25    c
26    a
27    h
28    c
29    d
dtype: object


In [27]:
ser.value_counts()

g    7
c    6
h    5
d    5
e    3
a    3
b    1
dtype: int64

# 10. How to keep only top 2 most frequent values as it is and replace everything else as ‘Other’?

In [28]:
np.random.RandomState(100)
ser = pd.Series(np.random.randint(1, 5, [12]))

In [29]:
ser[~ser.isin(ser.value_counts().index[:2])] = "Others"
ser

0          2
1          3
2          2
3     Others
4          3
5          3
6          3
7     Others
8          2
9     Others
10         3
11    Others
dtype: object

# 11. How to bin a numeric series to 10 groups of equal size?

In [30]:
ser = pd.Series(np.random.random(20))

print(ser)

0     0.302882
1     0.436645
2     0.611930
3     0.581362
4     0.814327
5     0.002903
6     0.545803
7     0.299957
8     0.459018
9     0.433219
10    0.537099
11    0.126923
12    0.696733
13    0.500696
14    0.281293
15    0.927366
16    0.542081
17    0.633054
18    0.272753
19    0.910736
dtype: float64


In [31]:
pd.qcut(ser, 10, labels = ['1st', '2nd', '3rd', '4th', '5th', '6th', '7th', '8th', '9th', '10th'])

0      3rd
1      4th
2      8th
3      7th
4      9th
5      1st
6      7th
7      3rd
8      5th
9      4th
10     6th
11     1st
12     9th
13     5th
14     2nd
15    10th
16     6th
17     8th
18     2nd
19    10th
dtype: category
Categories (10, object): ['1st' < '2nd' < '3rd' < '4th' ... '7th' < '8th' < '9th' < '10th']

# 12. How to convert a numpy array to a dataframe of given shape? (L1)

In [32]:
ser = pd.Series(np.random.randint(1, 10, 35))

df = pd.DataFrame(ser.values.reshape(7,5))

print(df)

   0  1  2  3  4
0  4  1  4  8  1
1  3  1  2  4  3
2  6  4  9  5  4
3  7  3  9  7  1
4  9  8  4  2  3
5  9  8  4  3  4
6  8  8  7  5  4


# 13. How to find the positions of numbers that are multiples of 3 from a series?

In [33]:
ser = pd.Series(np.random.randint(1, 10, 7))
print(ser)

0    9
1    5
2    7
3    7
4    7
5    9
6    8
dtype: int32


In [34]:
np.argwhere(ser.to_numpy() % 3==0)

array([[0],
       [5]], dtype=int64)

In [35]:
ser % 3 == 0

0     True
1    False
2    False
3    False
4    False
5     True
6    False
dtype: bool

# 15. How to stack two series vertically and horizontally ?

In [36]:
ser1 = pd.Series(range(5))
ser2 = pd.Series(list('abcde'))

In [37]:
# Vertically

pd.concat([ser1, ser2])



0    0
1    1
2    2
3    3
4    4
0    a
1    b
2    c
3    d
4    e
dtype: object

In [38]:
# Horizontally
pd.concat([ser1, ser2])

0    0
1    1
2    2
3    3
4    4
0    a
1    b
2    c
3    d
4    e
dtype: object

In [39]:
pd.concat([ser1, ser2], axis = 1)

Unnamed: 0,0,1
0,0,a
1,1,b
2,2,c
3,3,d
4,4,e


# 16. How to get the positions of items of series A in another series B?

In [40]:
ser1 = pd.Series([10, 9, 6, 5, 3, 1, 12, 8, 13])
ser2 = pd.Series([1, 3, 10, 13])

In [41]:
[np.where(i == ser1) for i in ser2]

[(array([5], dtype=int64),),
 (array([4], dtype=int64),),
 (array([0], dtype=int64),),
 (array([8], dtype=int64),)]

In [42]:
[np.where(i == ser1)[0][0] for i in ser2]

[5, 4, 0, 8]

# 17. How to compute the mean squared error on a truth and predicted series?

In [43]:
truth = pd.Series(range(10))
pred = pd.Series(range(10)) + np.random.random(10)

In [44]:
msr = np.mean((truth - pred)**2)

In [45]:
msr

0.3688785478783788

# 18. How to convert the first character of each element in a series to uppercase?

In [46]:
ser = pd.Series(['how', 'to', 'kick', 'ass?'])

In [47]:
ser1 = pd.Series([w.capitalize() for w in ser])

ser1

0     How
1      To
2    Kick
3    Ass?
dtype: object

In [48]:
ser.map(lambda x:x.title())

0     How
1      To
2    Kick
3    Ass?
dtype: object

# 19. How to calculate the number of characters in each word in a series?

In [49]:
ser = pd.Series(['how', 'to', 'kick', 'ass?'])

In [50]:
ser.map(lambda x:len(x))

0    3
1    2
2    4
3    4
dtype: int64

# 20. How to compute difference of differences between consequtive numbers of a series?

In [51]:
ser = pd.Series([1, 3, 6, 10, 15, 21, 27, 35])

In [52]:
print(ser.diff().tolist())

[nan, 2.0, 3.0, 4.0, 5.0, 6.0, 6.0, 8.0]


# 33. How to import only every nth row from a csv file to create a dataframe?

In [71]:
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv', chunksize=50)

In [59]:
df2 = pd.DataFrame()

for chunk in df:

    df2 = df2.append(chunk.iloc[0])

print(df2.head())

        crim    zn  indus  chas    nox     rm   age     dis  rad    tax  \
0    0.00632  18.0   2.31   0.0  0.538  6.575  65.2  4.0900  1.0  296.0   
50   0.08873  21.0   5.64   0.0  0.439  5.963  45.7  6.8147  4.0  243.0   
100  0.14866   0.0   8.56   0.0  0.520  6.727  79.9  2.7778  5.0  384.0   
150  1.65660   0.0  19.58   0.0  0.871  6.122  97.3  1.6180  5.0  403.0   
200  0.01778  95.0   1.47   0.0  0.403  7.135  13.9  7.6534  3.0  402.0   

     ptratio       b  lstat  medv  
0       15.3  396.90   4.98  24.0  
50      16.8  395.56  13.45  19.7  
100     20.9  394.76   9.42  27.5  
150     14.7  372.80  14.10  21.5  
200     17.0  384.30   4.45  32.9  


In [70]:
df2 = pd.concat([chunk.iloc[0] for chunk in df], axis=1)
df2 = df2.transpose()

In [75]:
import csv

with open('BostonHousing.csv', 'r') as f:
    reader = csv.reader(f)
    out = []
    for i, row in enumerate(reader):
        if i%50 == 0:
            out.append(row)


print(out)
df2 = pd.DataFrame(out[1:], columns=out[0])
print(df2.head())

[['crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax', 'ptratio', 'b', 'lstat', 'medv'], ['0.21977', '0', '6.91', '0', '0.448', '5.602', '62', '6.0877', '3', '233', '17.9', '396.9', '16.2', '19.4'], ['0.0686', '0', '2.89', '0', '0.445', '7.416', '62.5', '3.4952', '2', '276', '18', '396.9', '6.19', '33.2'], ['2.73397', '0', '19.58', '0', '0.871', '5.597', '94.9', '1.5257', '5', '403', '14.7', '351.85', '21.45', '15.4'], ['0.0315', '95', '1.47', '0', '0.403', '6.975', '15.3', '7.6534', '3', '402', '17', '396.9', '4.56', '34.9'], ['0.19073', '22', '5.86', '0', '0.431', '6.718', '17.5', '7.8265', '7', '330', '19.1', '393.74', '6.56', '26.2'], ['0.05561', '70', '2.24', '0', '0.4', '7.041', '10', '7.8278', '5', '358', '14.8', '371.58', '4.74', '29'], ['0.02899', '40', '1.25', '0', '0.429', '6.939', '34.5', '8.7921', '1', '335', '19.7', '389.85', '5.89', '26.6'], ['9.91655', '0', '18.1', '0', '0.693', '5.852', '77.8', '1.5004', '24', '666', '20.2', '338.16', '29.97', '6.3']

# 34. How to change column values when importing csv to a dataframe?

In [115]:
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv', chunksize=1)


In [116]:
df2 = pd.DataFrame()

for i,chunk in enumerate(df):

    chunk.loc[i,"medv"] = "High" if chunk.loc[i,"medv"] >= 25 else "Low"

    df2 = df2.append(chunk)

  

In [113]:
print(df2)

        crim    zn  indus  chas    nox     rm   age     dis  rad  tax  \
0    0.00632  18.0   2.31     0  0.538  6.575  65.2  4.0900    1  296   
1    0.02731   0.0   7.07     0  0.469  6.421  78.9  4.9671    2  242   
2    0.02729   0.0   7.07     0  0.469  7.185  61.1  4.9671    2  242   
3    0.03237   0.0   2.18     0  0.458  6.998  45.8  6.0622    3  222   
4    0.06905   0.0   2.18     0  0.458  7.147  54.2  6.0622    3  222   
..       ...   ...    ...   ...    ...    ...   ...     ...  ...  ...   
501  0.06263   0.0  11.93     0  0.573  6.593  69.1  2.4786    1  273   
502  0.04527   0.0  11.93     0  0.573  6.120  76.7  2.2875    1  273   
503  0.06076   0.0  11.93     0  0.573  6.976  91.0  2.1675    1  273   
504  0.10959   0.0  11.93     0  0.573  6.794  89.3  2.3889    1  273   
505  0.04741   0.0  11.93     0  0.573  6.030  80.8  2.5050    1  273   

     ptratio       b  lstat  medv  
0       15.3  396.90   4.98   Low  
1       17.8  396.90   9.14   Low  
2       17.8  3

In [118]:
with open('BostonHousing.csv', 'r') as f:
    reader = csv.reader(f)
    out = []
    for i, row in enumerate(reader):
        if i > 0:
            row[13] = 'High' if float(row[13]) > 25 else 'Low'
        out.append(row)

df = pd.DataFrame(out[1:], columns=out[0])
print(df.head())

      crim  zn indus chas    nox     rm   age     dis rad  tax ptratio  \
0  0.00632  18  2.31    0  0.538  6.575  65.2    4.09   1  296    15.3   
1  0.02731   0  7.07    0  0.469  6.421  78.9  4.9671   2  242    17.8   
2  0.02729   0  7.07    0  0.469  7.185  61.1  4.9671   2  242    17.8   
3  0.03237   0  2.18    0  0.458  6.998  45.8  6.0622   3  222    18.7   
4  0.06905   0  2.18    0  0.458  7.147  54.2  6.0622   3  222    18.7   

        b lstat  medv  
0   396.9  4.98   Low  
1   396.9  9.14   Low  
2  392.83  4.03  High  
3  394.63  2.94  High  
4   396.9  5.33  High  


In [117]:
# Using Converter Parameter

df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv', converters={'medv': lambda x: 'High' if float(x) > 25 else 'Low'})

# 36. How to import only specified columns from a csv file?

In [123]:
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv', usecols=["crim", "medv"])
print(df)

        crim  medv
0    0.00632  24.0
1    0.02731  21.6
2    0.02729  34.7
3    0.03237  33.4
4    0.06905  36.2
..       ...   ...
501  0.06263  22.4
502  0.04527  20.6
503  0.06076  23.9
504  0.10959  22.0
505  0.04741  11.9

[506 rows x 2 columns]


In [134]:
df = pd.read_csv("https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv")
df

Unnamed: 0,Manufacturer,Model,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,DriveTrain,...,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Weight,Origin,Make
0,Acura,Integra,Small,12.9,15.9,18.8,25.0,31.0,,Front,...,5.0,177.0,102.0,68.0,37.0,26.5,,2705.0,non-USA,Acura Integra
1,,Legend,Midsize,29.2,33.9,38.7,18.0,25.0,Driver & Passenger,Front,...,5.0,195.0,115.0,71.0,38.0,30.0,15.0,3560.0,non-USA,Acura Legend
2,Audi,90,Compact,25.9,29.1,32.3,20.0,26.0,Driver only,Front,...,5.0,180.0,102.0,67.0,37.0,28.0,14.0,3375.0,non-USA,Audi 90
3,Audi,100,Midsize,,37.7,44.6,19.0,26.0,Driver & Passenger,,...,6.0,193.0,106.0,,37.0,31.0,17.0,3405.0,non-USA,Audi 100
4,BMW,535i,Midsize,,30.0,,22.0,30.0,,Rear,...,4.0,186.0,109.0,69.0,39.0,27.0,13.0,3640.0,non-USA,BMW 535i
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88,Volkswagen,Eurovan,Van,16.6,19.7,22.7,17.0,21.0,,Front,...,7.0,187.0,115.0,72.0,38.0,34.0,,3960.0,,Volkswagen Eurovan
89,Volkswagen,Passat,Compact,17.6,20.0,22.4,21.0,30.0,,Front,...,5.0,180.0,103.0,67.0,35.0,31.5,14.0,2985.0,non-USA,Volkswagen Passat
90,Volkswagen,Corrado,Sporty,22.9,23.3,23.7,18.0,25.0,,Front,...,4.0,159.0,97.0,66.0,36.0,26.0,15.0,2810.0,non-USA,Volkswagen Corrado
91,Volvo,240,Compact,21.8,22.7,23.5,21.0,28.0,Driver only,Rear,...,5.0,190.0,104.0,67.0,37.0,29.5,14.0,2985.0,non-USA,Volvo 240


In [140]:
#  number of rows and columns
print(df.shape)

# datatypes
print(df.dtypes)

# how many columns under each dtype
print(df.dtypes.value_counts())

# summary statistics
df_stats = df.describe()
print(df_stats)

# numpy array 
df_arr = df.values

# list
df_list = df.values.tolist()



(93, 27)
Manufacturer           object
Model                  object
Type                   object
Min.Price             float64
Price                 float64
Max.Price             float64
MPG.city              float64
MPG.highway           float64
AirBags                object
DriveTrain             object
Cylinders              object
EngineSize            float64
Horsepower            float64
RPM                   float64
Rev.per.mile          float64
Man.trans.avail        object
Fuel.tank.capacity    float64
Passengers            float64
Length                float64
Wheelbase             float64
Width                 float64
Turn.circle           float64
Rear.seat.room        float64
Luggage.room          float64
Weight                float64
Origin                 object
Make                   object
dtype: object
float64    18
object      9
dtype: int64
       Min.Price      Price  Max.Price   MPG.city  MPG.highway  EngineSize  \
count  86.000000  91.000000  88.000000  84.00000

# 38. How to extract the row and column number of a particular cell with given criterion?

In [150]:
# Input
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')

# Solution
# Get Manufacturer with highest price
df.loc[df.Price == np.max(df.Price), ['Manufacturer', 'Model', 'Type']]

# Get Row and Column number
row, col = np.where(df.values == np.max(df.Price))

# Get the value
df.iat[row[0], col[0]]
df.iloc[row[0], col[0]]

# Alternates
df.at[row[0], 'Price']

61.9