In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Handling Missing Values

In [3]:
string_data = pd.Series(['aardvark', 'artichock', np.nan, 'avocado'])
string_data

0     aardvark
1    artichock
2          NaN
3      avocado
dtype: object

In [4]:
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [5]:
string_data.isna()

0    False
1    False
2     True
3    False
dtype: bool

In [7]:
string_data.notnull()

0     True
1     True
2    False
3     True
dtype: bool

In [11]:
# Forward fill
string_data.ffill()

0     aardvark
1    artichock
2    artichock
3      avocado
dtype: object

In [13]:
# Backward fill
string_data.bfill()

0     aardvark
1    artichock
2      avocado
3      avocado
dtype: object

In [14]:
string_data.dropna()

0     aardvark
1    artichock
3      avocado
dtype: object

In [15]:
from numpy import nan as NA

In [16]:
data = pd.Series([1, NA, 3.5, NA, 7])
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [17]:
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [18]:
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [19]:
data = pd.DataFrame([[1., 6.5, 3.], [1., NA, NA],
                     [NA, NA, NA], [NA, 6.5, 3.]])
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [20]:
cleaned = data.dropna()
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [21]:
data.dropna(how = 'all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [22]:
data[4] = NA
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [24]:
data.dropna(axis = 1, how = 'all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [25]:
df = pd.DataFrame(np.random.randn(7, 3))
df

Unnamed: 0,0,1,2
0,0.232103,0.019635,0.26781
1,-0.856861,0.770291,0.116512
2,1.216718,1.170547,1.076909
3,-0.538052,1.036629,-0.270283
4,0.178827,0.646211,0.266209
5,0.218063,-0.54175,0.034779
6,-1.19964,1.765948,-0.170857


In [26]:
df.iloc[:4, 1] = NA
df.iloc[:2, 2] = NA

df

Unnamed: 0,0,1,2
0,0.232103,,
1,-0.856861,,
2,1.216718,,1.076909
3,-0.538052,,-0.270283
4,0.178827,0.646211,0.266209
5,0.218063,-0.54175,0.034779
6,-1.19964,1.765948,-0.170857


In [27]:
df.dropna()

Unnamed: 0,0,1,2
4,0.178827,0.646211,0.266209
5,0.218063,-0.54175,0.034779
6,-1.19964,1.765948,-0.170857


In [28]:
df.dropna(thresh = 2)

Unnamed: 0,0,1,2
2,1.216718,,1.076909
3,-0.538052,,-0.270283
4,0.178827,0.646211,0.266209
5,0.218063,-0.54175,0.034779
6,-1.19964,1.765948,-0.170857


# Filling In Missing Data

In [29]:
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [30]:
df

Unnamed: 0,0,1,2
0,0.232103,,
1,-0.856861,,
2,1.216718,,1.076909
3,-0.538052,,-0.270283
4,0.178827,0.646211,0.266209
5,0.218063,-0.54175,0.034779
6,-1.19964,1.765948,-0.170857


In [31]:
df.fillna(0)

Unnamed: 0,0,1,2
0,0.232103,0.0,0.0
1,-0.856861,0.0,0.0
2,1.216718,0.0,1.076909
3,-0.538052,0.0,-0.270283
4,0.178827,0.646211,0.266209
5,0.218063,-0.54175,0.034779
6,-1.19964,1.765948,-0.170857


In [32]:
data.fillna(1)

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,1.0
1,1.0,1.0,1.0,1.0
2,1.0,1.0,1.0,1.0
3,1.0,6.5,3.0,1.0


In [33]:
data.fillna(7)

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,7.0
1,1.0,7.0,7.0,7.0
2,7.0,7.0,7.0,7.0
3,7.0,6.5,3.0,7.0


In [34]:
np.mean(data)

0    1.0
1    6.5
2    3.0
4    NaN
dtype: float64

In [37]:
data.fillna(np.mean(data))

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,6.5,3.0,
2,1.0,6.5,3.0,
3,1.0,6.5,3.0,


In [35]:
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [36]:
print(df)
df.fillna({1: 0.5, 2: 0})

          0         1         2
0  0.232103       NaN       NaN
1 -0.856861       NaN       NaN
2  1.216718       NaN  1.076909
3 -0.538052       NaN -0.270283
4  0.178827  0.646211  0.266209
5  0.218063 -0.541750  0.034779
6 -1.199640  1.765948 -0.170857


Unnamed: 0,0,1,2
0,0.232103,0.5,0.0
1,-0.856861,0.5,0.0
2,1.216718,0.5,1.076909
3,-0.538052,0.5,-0.270283
4,0.178827,0.646211,0.266209
5,0.218063,-0.54175,0.034779
6,-1.19964,1.765948,-0.170857


In [38]:
df

Unnamed: 0,0,1,2
0,0.232103,,
1,-0.856861,,
2,1.216718,,1.076909
3,-0.538052,,-0.270283
4,0.178827,0.646211,0.266209
5,0.218063,-0.54175,0.034779
6,-1.19964,1.765948,-0.170857


In [39]:
df.fillna(0, inplace = True)

In [40]:
df

Unnamed: 0,0,1,2
0,0.232103,0.0,0.0
1,-0.856861,0.0,0.0
2,1.216718,0.0,1.076909
3,-0.538052,0.0,-0.270283
4,0.178827,0.646211,0.266209
5,0.218063,-0.54175,0.034779
6,-1.19964,1.765948,-0.170857


In [41]:
df = pd.DataFrame(np.random.randn(6, 3))
df.iloc[2:, 1] = NA
df.iloc[4:, 2] = NA

df

Unnamed: 0,0,1,2
0,-0.408528,0.970634,0.897874
1,0.062746,-0.45829,0.998983
2,1.903408,,0.257876
3,-2.525647,,-2.75149
4,-1.323219,,
5,-1.090989,,


In [42]:
df.fillna(method = "ffill")

Unnamed: 0,0,1,2
0,-0.408528,0.970634,0.897874
1,0.062746,-0.45829,0.998983
2,1.903408,-0.45829,0.257876
3,-2.525647,-0.45829,-2.75149
4,-1.323219,-0.45829,-2.75149
5,-1.090989,-0.45829,-2.75149


In [43]:
df.fillna(method = "bfill")

Unnamed: 0,0,1,2
0,-0.408528,0.970634,0.897874
1,0.062746,-0.45829,0.998983
2,1.903408,,0.257876
3,-2.525647,,-2.75149
4,-1.323219,,
5,-1.090989,,


In [44]:
print(df)
df.fillna(method = "ffill", axis = 1)

          0         1         2
0 -0.408528  0.970634  0.897874
1  0.062746 -0.458290  0.998983
2  1.903408       NaN  0.257876
3 -2.525647       NaN -2.751490
4 -1.323219       NaN       NaN
5 -1.090989       NaN       NaN


Unnamed: 0,0,1,2
0,-0.408528,0.970634,0.897874
1,0.062746,-0.45829,0.998983
2,1.903408,1.903408,0.257876
3,-2.525647,-2.525647,-2.75149
4,-1.323219,-1.323219,-1.323219
5,-1.090989,-1.090989,-1.090989


In [45]:
df.fillna(method = "ffill", limit = 2)

Unnamed: 0,0,1,2
0,-0.408528,0.970634,0.897874
1,0.062746,-0.45829,0.998983
2,1.903408,-0.45829,0.257876
3,-2.525647,-0.45829,-2.75149
4,-1.323219,,-2.75149
5,-1.090989,,-2.75149


In [47]:
data = pd.Series([1., NA, 3.5, NA, 7])
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

In [49]:
print(df)
df.iloc[:, 1]

          0         1         2
0 -0.408528  0.970634  0.897874
1  0.062746 -0.458290  0.998983
2  1.903408       NaN  0.257876
3 -2.525647       NaN -2.751490
4 -1.323219       NaN       NaN
5 -1.090989       NaN       NaN


0    0.970634
1   -0.458290
2         NaN
3         NaN
4         NaN
5         NaN
Name: 1, dtype: float64

# Data Transformation

# Removing Duplicates

In [50]:
data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'],
                     'k2': [1, 1, 2, 3, 3, 4, 4]})
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [52]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [54]:
data[data.duplicated()]

Unnamed: 0,k1,k2
6,two,4


In [55]:
data[~data.duplicated()]

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [56]:
# dropduplicate based on specific column or columns
data.drop_duplicates(subset = "k1")

Unnamed: 0,k1,k2
0,one,1
1,two,1


In [57]:
# dropduplicate based on specific column or columns
data.drop_duplicates(subset = "k1", keep = 'last')

Unnamed: 0,k1,k2
4,one,3
6,two,4


In [58]:
# dropduplicate based on specific column or columns
data.drop_duplicates(subset = "k2")

Unnamed: 0,k1,k2
0,one,1
2,one,2
3,two,3
5,two,4


In [59]:
# dropduplicate based on specific column or columns
data.drop_duplicates(subset = "k2", keep = 'last')

Unnamed: 0,k1,k2
1,two,1
2,one,2
4,one,3
6,two,4


In [60]:
data.drop_duplicates(['k1', 'k2'], keep='last')

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
6,two,4


# Transforming Data Using a Function or Mapping

In [61]:
data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon',
                     'Pastrami', 'corned beef', 'Bacon',
                     'pastrami', 'honey ham', 'nova lox'],
                     'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})

data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [62]:
meat_to_animal = {
'bacon': 'pig',
'pulled pork': 'pig',
'pastrami': 'cow',
'corned beef': 'cow',
'honey ham': 'pig',
'nova lox': 'salmon'
}

In [64]:
data['animal'] = data.food.map(meat_to_animal)
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,
4,corned beef,7.5,cow
5,Bacon,8.0,
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [65]:
lowercased = data['food'].str.lower()

lowercased

0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object

In [66]:
data['animal'] = lowercased.map(meat_to_animal)

data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [67]:
data['food'].map(lambda x: meat_to_animal[x.lower()])

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

# Replacing Values

In [68]:
data = pd.Series([1., -999., 2., -999., -1000., 3.])

data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [69]:
data.replace(-999, np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [70]:
data.replace([-999, -1000], np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [71]:
data.replace([-999, -1000], [np.nan, 0])

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [72]:
data.replace({-999: np.nan, -1000: 0})

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

# Renaming Axis Indexes

In [78]:
data = pd.DataFrame(np.arange(12).reshape((3, 4)),
                    index=['Ohio', 'Colorado', 'New York'],
                    columns=['one', 'two', 'three', 'four'])

data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [82]:
transform = lambda x: x[:4].upper()

In [84]:
data.index.map(transform)
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [86]:
data.index = data.index.map(transform)
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [87]:
data.rename(index=str.title, columns=str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colo,4,5,6,7
New,8,9,10,11


In [88]:
data.rename(index={'OHIO': 'INDIANA'},
            columns={'three': 'peekaboo'})

Unnamed: 0,one,two,peekaboo,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [89]:
data.rename(index={'OHIO': 'INDIANA'}, inplace=True)

data

Unnamed: 0,one,two,three,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


# Discretization and Binning

In [90]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

In [91]:
bins = [18, 25, 35, 60, 100]

In [92]:
cats = pd.cut(ages, bins)

cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [93]:
cats.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [95]:
cats.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]],
              closed='right',
              dtype='interval[int64]')

In [96]:
pd.value_counts(cats)

(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64

In [97]:
pd.cut(ages, [18, 26, 36, 61, 100], right=False)

[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]

In [98]:
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']

In [99]:
pd.cut(ages, bins, labels=group_names)

[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 12
Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]

In [101]:
data = np.random.rand(20)
data

array([0.27081696, 0.53133454, 0.18046478, 0.6173507 , 0.99847508,
       0.50669737, 0.95336384, 0.210791  , 0.19979748, 0.69937695,
       0.32053108, 0.73906587, 0.84014965, 0.69668135, 0.81654916,
       0.3244397 , 0.69322651, 0.38974041, 0.58793429, 0.89836309])

In [102]:
pd.cut(data, 4, precision=2)

[(0.18, 0.38], (0.38, 0.59], (0.18, 0.38], (0.59, 0.79], (0.79, 1.0], ..., (0.18, 0.38], (0.59, 0.79], (0.38, 0.59], (0.38, 0.59], (0.79, 1.0]]
Length: 20
Categories (4, interval[float64]): [(0.18, 0.38] < (0.38, 0.59] < (0.59, 0.79] < (0.79, 1.0]]

In [104]:
data = np.random.randn(1000) # Normally distributed
data

array([ 1.31482027e+00, -6.50318774e-01,  3.49580160e-02,  4.70826252e-01,
        1.36583537e-01, -2.26592150e+00,  6.71848980e-01,  8.47569737e-01,
       -1.05738852e+00, -1.04778954e+00, -4.64043267e-01,  8.27663504e-01,
        3.10712420e-01, -1.04165494e-01, -3.56636945e-01,  1.01357741e+00,
        6.09193769e-01, -2.86696044e+00,  1.00136713e+00,  1.69777909e-01,
       -4.06609916e-01, -5.61524295e-01, -8.28550716e-01, -1.40839509e-01,
        1.48545201e+00,  1.51486991e-01,  3.77329248e-01,  3.47419413e-01,
        1.15784826e+00,  1.78555443e+00, -2.00645975e-02,  1.29681456e+00,
       -1.28277429e+00, -1.00929912e+00, -1.15189083e+00, -6.38829862e-01,
        2.11858419e+00,  9.18435021e-01, -4.52538986e-01, -1.72081507e-01,
       -2.24763374e+00,  6.93653867e-01,  8.27689600e-02, -1.68724422e+00,
        1.02967305e+00, -7.60434669e-01, -3.55225942e-01,  1.78404341e-01,
        6.52190260e-01, -1.33362102e+00, -1.24543670e+00,  3.43649787e-01,
        1.05006120e+00,  

In [105]:
cats = pd.qcut(data, 4) # Cut into quartiles

cats

[(0.642, 3.399], (-0.662, -0.0252], (-0.0252, 0.642], (-0.0252, 0.642], (-0.0252, 0.642], ..., (-0.0252, 0.642], (-0.662, -0.0252], (0.642, 3.399], (0.642, 3.399], (0.642, 3.399]]
Length: 1000
Categories (4, interval[float64]): [(-3.374, -0.662] < (-0.662, -0.0252] < (-0.0252, 0.642] < (0.642, 3.399]]

In [106]:
pd.value_counts(cats)

(0.642, 3.399]       250
(-0.0252, 0.642]     250
(-0.662, -0.0252]    250
(-3.374, -0.662]     250
dtype: int64

In [107]:
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])

[(1.254, 3.399], (-1.309, -0.0252], (-0.0252, 1.254], (-0.0252, 1.254], (-0.0252, 1.254], ..., (-0.0252, 1.254], (-1.309, -0.0252], (-0.0252, 1.254], (1.254, 3.399], (-0.0252, 1.254]]
Length: 1000
Categories (4, interval[float64]): [(-3.374, -1.309] < (-1.309, -0.0252] < (-0.0252, 1.254] < (1.254, 3.399]]

In [126]:
#pd.qcut(data, [0, 0.1, .5, .75, 1]).value_counts()

# Detecting and Filtering Outliers

In [108]:
data = pd.DataFrame(np.random.randn(1000, 4))

data

Unnamed: 0,0,1,2,3
0,0.566559,0.534897,-0.155819,0.964928
1,0.870055,-0.973266,0.645954,-1.259124
2,0.614331,1.278345,-1.151889,2.678617
3,1.477248,2.132959,0.301303,-1.134538
4,-2.047969,0.023393,0.996029,1.856201
...,...,...,...,...
995,-0.493630,1.513193,1.327980,-0.240104
996,1.071681,0.313604,-0.689844,-0.326424
997,-0.349773,0.263213,0.457165,0.125471
998,-0.275843,1.543439,0.419522,0.735591


In [109]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.000346,-0.025835,0.024996,0.01541
std,0.983106,0.998489,1.020113,1.016653
min,-2.903001,-2.966199,-3.248157,-2.632604
25%,-0.684547,-0.708893,-0.655505,-0.673582
50%,-0.026576,-0.027183,0.0149,-0.021315
75%,0.689995,0.673514,0.669147,0.71378
max,3.088498,2.979134,3.083746,3.045594


In [111]:
col = data[2]
col

0     -0.155819
1      0.645954
2     -1.151889
3      0.301303
4      0.996029
         ...   
995    1.327980
996   -0.689844
997    0.457165
998    0.419522
999   -1.800358
Name: 2, Length: 1000, dtype: float64

In [112]:
col[np.abs(col) > 3]

131   -3.248157
450    3.030949
665    3.064259
991    3.083746
Name: 2, dtype: float64

In [113]:
data[(np.abs(data) > 3).any(1)]

Unnamed: 0,0,1,2,3
48,-0.74718,1.032582,1.101519,3.045594
131,-0.131135,-0.209751,-3.248157,-0.679119
450,-1.392792,1.030455,3.030949,-0.853681
665,0.365338,-0.141474,3.064259,-0.41892
874,3.088498,-0.793558,-0.92925,1.309089
991,-0.572753,1.382217,3.083746,0.963626


In [129]:
np.array([True, True, False]).any()

True

In [130]:
np.array([True, True, False]).all()

False

In [115]:
data[np.abs(data) > 3] = np.sign(data) * 3

data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.000257,-0.025835,0.025065,0.015364
std,0.982831,0.998489,1.018817,1.016518
min,-2.903001,-2.966199,-3.0,-2.632604
25%,-0.684547,-0.708893,-0.655505,-0.673582
50%,-0.026576,-0.027183,0.0149,-0.021315
75%,0.689995,0.673514,0.669147,0.71378
max,3.0,2.979134,3.0,3.0


In [116]:
np.sign(data).head()

Unnamed: 0,0,1,2,3
0,1.0,1.0,-1.0,1.0
1,1.0,-1.0,1.0,-1.0
2,1.0,1.0,-1.0,1.0
3,1.0,1.0,1.0,-1.0
4,-1.0,1.0,1.0,1.0


# Permutation and Random Sampling

In [117]:
df = pd.DataFrame(np.arange(5 * 4).reshape((5, 4)))

df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [118]:
sampler = np.random.permutation(5)

sampler

array([4, 3, 1, 2, 0])

In [119]:
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [120]:
df.take(sampler)

Unnamed: 0,0,1,2,3
4,16,17,18,19
3,12,13,14,15
1,4,5,6,7
2,8,9,10,11
0,0,1,2,3


In [121]:
df.sample(n=3)

Unnamed: 0,0,1,2,3
4,16,17,18,19
3,12,13,14,15
0,0,1,2,3


In [123]:
choices = pd.Series([5, 7, -1, 6, 4])

choices

0    5
1    7
2   -1
3    6
4    4
dtype: int64

In [124]:
draws = choices.sample(n=10, replace=True)

draws

4    4
0    5
1    7
4    4
1    7
2   -1
4    4
4    4
1    7
2   -1
dtype: int64

# Computing Indicator/Dummy Variables

In [131]:
df = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                   'data1': range(6)})
df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [132]:
pd.get_dummies(df['key'])

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [133]:
dummies = pd.get_dummies(df['key'], prefix='key')

In [134]:
df_with_dummy = df[['data1']].join(dummies)

df_with_dummy

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


In [136]:
mnames = ['movie_id', 'title', 'genres']

mnames

['movie_id', 'title', 'genres']

In [139]:
movies = pd.read_table('datasets/one_hote_encode.xlsx', sep='::',
                        header=None, names=mnames)

  


In [142]:
a = pd.Series(['A','B','C','A','B','C'])
pd.get_dummies(a)

Unnamed: 0,A,B,C
0,1,0,0
1,0,1,0
2,0,0,1
3,1,0,0
4,0,1,0
5,0,0,1
