In [58]:
import numpy as np
import pandas as pd

In [7]:
# 7.1 Handling Missing Data
# A small Series of strings in which one entry is missing (np.nan).
string_data = pd.Series(
    data=['aardvark', 'artichoke', np.nan, 'avocado'],
)
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [8]:
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [10]:
# Assigning None also marks the entry as missing: isnull() reports True for it.
string_data[0] = None
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

In [11]:
# Filtering Out Missing Data
# NA is a short alias for np.nan that the following cells use.
from numpy import nan as NA
data = pd.Series([1, NA, 3.5, NA, 7])
# dropna() returns only the non-missing entries, keeping their original index labels.
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [12]:
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [13]:
data = pd.DataFrame([[1., 6.5, 3.], [1., NA, NA],
                     [NA, NA, NA], [NA, 6.5, 3.]])
# With no arguments, dropna() drops every row that contains at least one NA.
cleaned = data.dropna()

In [14]:
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [15]:
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [16]:
# how='all' drops only the rows in which every value is NA.
data.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [17]:
# Dropping columns (axis=1): first add a column that is entirely NA.
data[4] = NA
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [19]:
# axis=1 applies the rule to columns: only the all-NA column 4 is dropped.
data.dropna(axis=1, how="all")

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [20]:
# thresh parameter: a row is kept only if it has at least `thresh` non-NA values.
# NOTE(review): no random seed is set before this cell in the visible notebook,
# so the generated values differ on every run.
df = pd.DataFrame(np.random.randn(7, 3))
df.iloc[:4, 1] = NA
df.iloc[:2, 2] = NA
df

Unnamed: 0,0,1,2
0,0.929348,,
1,-0.412974,,
2,0.663032,,1.403204
3,-0.968264,,0.077795
4,-1.805809,-0.248428,-0.154059
5,0.808424,-0.587241,0.094168
6,0.229486,0.12394,1.45261


In [21]:
df.dropna()

Unnamed: 0,0,1,2
4,-1.805809,-0.248428,-0.154059
5,0.808424,-0.587241,0.094168
6,0.229486,0.12394,1.45261


In [22]:
df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,0.663032,,1.403204
3,-0.968264,,0.077795
4,-1.805809,-0.248428,-0.154059
5,0.808424,-0.587241,0.094168
6,0.229486,0.12394,1.45261


In [23]:
df.dropna(thresh=3)

Unnamed: 0,0,1,2
4,-1.805809,-0.248428,-0.154059
5,0.808424,-0.587241,0.094168
6,0.229486,0.12394,1.45261


In [24]:
df.dropna(thresh=4)

Unnamed: 0,0,1,2


In [25]:
# Filling In Missing Data
df.fillna(0)

Unnamed: 0,0,1,2
0,0.929348,0.0,0.0
1,-0.412974,0.0,0.0
2,0.663032,0.0,1.403204
3,-0.968264,0.0,0.077795
4,-1.805809,-0.248428,-0.154059
5,0.808424,-0.587241,0.094168
6,0.229486,0.12394,1.45261


In [26]:
df.fillna({1: 0.5, 2: 0})

Unnamed: 0,0,1,2
0,0.929348,0.5,0.0
1,-0.412974,0.5,0.0
2,0.663032,0.5,1.403204
3,-0.968264,0.5,0.077795
4,-1.805809,-0.248428,-0.154059
5,0.808424,-0.587241,0.094168
6,0.229486,0.12394,1.45261


In [27]:
# With inplace=True, fillna modifies df itself and returns None, so `_` is bound to None here.
_ = df.fillna(0, inplace=True)

In [28]:
df

Unnamed: 0,0,1,2
0,0.929348,0.0,0.0
1,-0.412974,0.0,0.0
2,0.663032,0.0,1.403204
3,-0.968264,0.0,0.077795
4,-1.805809,-0.248428,-0.154059
5,0.808424,-0.587241,0.094168
6,0.229486,0.12394,1.45261


In [29]:
_

In [30]:
print(_)

None


In [31]:
df = pd.DataFrame(np.random.randn(7, 3))
df.iloc[2:, 1] = NA
df.iloc[4:, 2] = NA
df

Unnamed: 0,0,1,2
0,-0.190691,1.216468,0.357943
1,1.253739,0.408205,1.537336
2,-0.113966,,0.049973
3,-1.276673,,0.550789
4,0.701288,,
5,-0.120101,,
6,-0.498196,,


In [32]:
# Forward-fill: propagate the last valid value down each column.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill') spelling.
df.ffill()

Unnamed: 0,0,1,2
0,-0.190691,1.216468,0.357943
1,1.253739,0.408205,1.537336
2,-0.113966,0.408205,0.049973
3,-1.276673,0.408205,0.550789
4,0.701288,0.408205,0.550789
5,-0.120101,0.408205,0.550789
6,-0.498196,0.408205,0.550789


In [33]:
# Forward-fill, but fill at most two consecutive NA values per gap (limit=2).
# DataFrame.ffill() replaces the deprecated fillna(method='ffill') spelling.
df.ffill(limit=2)

Unnamed: 0,0,1,2
0,-0.190691,1.216468,0.357943
1,1.253739,0.408205,1.537336
2,-0.113966,0.408205,0.049973
3,-1.276673,0.408205,0.550789
4,0.701288,,0.550789
5,-0.120101,,0.550789
6,-0.498196,,


In [35]:
data = pd.Series([1., NA, 3.5, NA, 7])
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

In [36]:
# 7.2 Data Transformation
# Removing Duplicates
data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'],
                     'k2': [1, 1, 2, 3, 3, 4, 4]})
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [37]:
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [38]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [39]:
data[~data.duplicated()]

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [40]:
data['v1'] = range(7)
data.drop_duplicates(['k1'])

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


In [41]:
data.drop_duplicates(['k1', 'k2'], keep='last')  # keep the last occurrence: of the duplicate rows 5 and 6, row 6 is kept

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


In [42]:
# Transforming Data Using a Function or Mapping
data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon',
                              'Pastrami', 'corned beef', 'Bacon',
                              'pastrami', 'honey ham', 'nova lox'],
                     'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [43]:
"""use map method including dict or function"""
# Lookup table from lower-case meat name to the animal it comes from.
meat_to_animal = {
  'bacon': 'pig',
  'pulled pork': 'pig',
  'pastrami': 'cow',
  'corned beef': 'cow',
  'honey ham': 'pig',
  'nova lox': 'salmon'
}
# Lower-case the food names first: the data mixes capitalisations
# ('Pastrami', 'Bacon') while the dictionary keys are all lower case.
lowercased = data['food'].str.lower()
lowercased

0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object

In [44]:
data['animal'] = lowercased.map(meat_to_animal)
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [45]:
# Same mapping in a single step: the lambda lower-cases each name before the dict lookup.
data['food'].map(lambda x: meat_to_animal[x.lower()])

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

In [46]:
# Replacing Values
# fillna is a special case of the more general replace method.
# replace accepts a scalar, a list, or a dict of values to replace.

'use map method including dict or function'

In [47]:
data = pd.Series([1., -999., 2., -999., -1000., 3.])
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [48]:
data.replace(-999, np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [49]:
data.replace([-999, -1000], np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [50]:
data.replace({-999: np.nan, -1000: 0})

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [51]:
# Renaming Axis Indexes
data = pd.DataFrame(np.arange(12).reshape((3, 4)),
                    index=['Ohio', 'Colorado', 'New York'],
                    columns=['one', 'two', 'three', 'four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [52]:
def transform(x):
    """Return the first four characters of x, upper-cased."""
    return x[:4].upper()
data.index.map(transform)

Index(['OHIO', 'COLO', 'NEW '], dtype='object')

In [60]:
data.index = data.index.map(transform)
data.index

Index(['OHIO', 'COLO', 'NEW '], dtype='object')

In [55]:
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [56]:
data.rename(index=str.title, columns=str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colo,4,5,6,7
New,8,9,10,11


In [57]:
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [58]:
data.rename(index={'OHIO': 'INDIANA'},
            columns={'three': 'peekaboo'})

Unnamed: 0,one,two,peekaboo,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [61]:
data.rename(index={'OHIO': 'INDIANA'}, inplace=True)
data

Unnamed: 0,one,two,three,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [62]:
# Discretization and Binning
# pd.cut bins values using explicit bin edges; pd.qcut (used further below)
# bins by sample quantiles instead.
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]
cats = pd.cut(x=ages, bins=bins)
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [63]:
cats.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [64]:
cats.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]],
              closed='right',
              dtype='interval[int64]')

In [65]:
# Count how many values fall into each bin.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
pd.Series(cats).value_counts()

(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64

In [66]:
pd.cut(ages, [18, 26, 36, 61, 100], right=False)

[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]

In [67]:
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
pd.cut(ages, bins, labels=group_names)

[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 12
Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]

In [68]:
data = np.random.rand(20)
pd.cut(data, 4, precision=2)  # 4 equal-width bins; bin-edge labels limited to 2 digits of precision

[(0.21, 0.43], (0.21, 0.43], (0.21, 0.43], (0.0025, 0.21], (0.21, 0.43], ..., (0.43, 0.64], (0.64, 0.85], (0.43, 0.64], (0.0025, 0.21], (0.21, 0.43]]
Length: 20
Categories (4, interval[float64]): [(0.0025, 0.21] < (0.21, 0.43] < (0.43, 0.64] < (0.64, 0.85]]

In [71]:
data = np.random.randn(1000) # Normally distributed
cats = pd.qcut(data, 4) # Cut into quartiles
cats

[(0.65, 3.392], (-0.0164, 0.65], (-2.794, -0.714], (0.65, 3.392], (-2.794, -0.714], ..., (0.65, 3.392], (-0.0164, 0.65], (-0.714, -0.0164], (-0.0164, 0.65], (0.65, 3.392]]
Length: 1000
Categories (4, interval[float64]): [(-2.794, -0.714] < (-0.714, -0.0164] < (-0.0164, 0.65] < (0.65, 3.392]]

In [72]:
cats = pd.qcut(data, 4, precision=2) # Cut into quartiles
cats

[(0.65, 3.39], (-0.016, 0.65], (-2.8, -0.71], (0.65, 3.39], (-2.8, -0.71], ..., (0.65, 3.39], (-0.016, 0.65], (-0.71, -0.016], (-0.016, 0.65], (0.65, 3.39]]
Length: 1000
Categories (4, interval[float64]): [(-2.8, -0.71] < (-0.71, -0.016] < (-0.016, 0.65] < (0.65, 3.39]]

In [73]:
# Each quartile bin holds the same number of observations.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
pd.Series(cats).value_counts()

(0.65, 3.39]       250
(-0.016, 0.65]     250
(-0.71, -0.016]    250
(-2.8, -0.71]      250
dtype: int64

In [76]:
temp = pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])

In [79]:
# Bin sizes follow the custom quantile edges passed to qcut.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
pd.Series(temp).value_counts()

(-0.0164, 1.253]    400
(-1.29, -0.0164]    400
(1.253, 3.392]      100
(-2.794, -1.29]     100
dtype: int64

In [93]:
# Detecting and Filtering Outliers
data = pd.DataFrame(np.random.randn(1000, 4))
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.030662,-0.001993,-0.035144,-0.011087
std,0.99839,0.996799,1.003187,0.980791
min,-3.246394,-3.004019,-2.716265,-3.494682
25%,-0.712566,-0.657302,-0.699172,-0.655572
50%,-0.058537,0.002183,-0.059497,-0.02513
75%,0.629251,0.673454,0.589309,0.669318
max,3.293703,3.092972,3.751103,3.199166


In [94]:
col = data[2]
col[np.abs(col) > 3]

167    3.751103
214    3.161517
Name: 2, dtype: float64

In [82]:
# Select every row that has at least one value whose magnitude exceeds 3.
# axis is passed by keyword: the positional form any(1) is rejected by pandas 2.x.
data[(np.abs(data) > 3).any(axis=1)]

Unnamed: 0,0,1,2,3
48,2.134366,-0.208422,3.355546,-0.778349
141,-4.566741,-1.25727,-0.391685,1.105109
152,-3.738247,-0.631972,-1.114753,-0.12377
284,-0.19138,3.461361,0.825381,0.559939
358,1.495551,-1.665682,-3.196522,-0.677205
403,1.236493,-0.446016,-3.261178,0.086757
409,-0.267117,0.202546,-3.667574,-0.954698
522,3.639041,0.73817,-0.230025,-1.532283
766,0.481101,-1.37493,-0.743152,-3.183425
843,0.202309,0.711334,-0.985346,-3.177326


In [96]:
# data[(np.abs(data) > 3)]

In [83]:
# Boolean mask with one entry per row: True where any column exceeds 3 in magnitude.
# axis is passed by keyword: the positional form any(1) is rejected by pandas 2.x.
(np.abs(data) > 3).any(axis=1)

0      False
1      False
2      False
3      False
4      False
5      False
6      False
7      False
8      False
9      False
10     False
11     False
12     False
13     False
14     False
15     False
16     False
17     False
18     False
19     False
20     False
21     False
22     False
23     False
24     False
25     False
26     False
27     False
28     False
29     False
       ...  
970    False
971    False
972    False
973    False
974    False
975    False
976    False
977    False
978    False
979    False
980    False
981    False
982    False
983    False
984    False
985    False
986    False
987    False
988    False
989    False
990    False
991    False
992    False
993    False
994    False
995    False
996    False
997    False
998    False
999    False
Length: 1000, dtype: bool

In [84]:
# Cap values outside [-3, 3]: np.sign(data) is -1 or 1 per element, so each
# outlier is replaced by -3 or 3 with the same sign as the original value.
data[np.abs(data) > 3] = np.sign(data) * 3
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.000803,0.013525,-0.010937,-0.032231
std,0.988994,0.976364,0.983886,0.991482
min,-3.0,-2.941584,-3.0,-3.0
25%,-0.622456,-0.587342,-0.691719,-0.699446
50%,-0.036124,0.012557,-0.006348,-0.036464
75%,0.663704,0.666601,0.652224,0.607638
max,3.0,3.0,3.0,2.901844


In [97]:
print((np.sign(data) * 3).shape)
print((data[np.abs(data) > 3]).shape)
# print(data[np.abs(data) > 3])

(1000, 4)
(1000, 4)


In [88]:
data

Unnamed: 0,0,1,2,3
0,-0.273198,-0.192183,-0.559510,-0.790570
1,-1.253808,-0.784671,0.316433,-0.208100
2,0.776127,-0.499586,-0.082065,-0.324389
3,1.469419,-0.345910,0.651658,0.042106
4,-0.061625,0.216802,-1.875108,-0.357697
5,-0.419913,0.392506,1.179708,-0.688715
6,0.255296,1.134508,0.427461,1.712952
7,1.070815,0.985812,1.158594,0.792437
8,0.052171,1.384533,0.292502,0.356570
9,-1.381999,0.150412,-1.188431,-0.404949


In [102]:
# Select every row that has at least one value whose magnitude exceeds 3.
# axis is passed by keyword: the positional form any(1) is rejected by pandas 2.x.
data[(np.abs(data) > 3).any(axis=1)]

Unnamed: 0,0,1,2,3
70,1.364042,-0.994601,0.183229,-3.298376
167,0.306301,0.094748,3.751103,0.943994
214,-0.4503,0.016445,3.161517,-0.010312
227,-3.246394,1.330542,-0.321512,0.244616
295,-2.089174,3.092972,0.587514,-1.118587
309,-3.238205,-0.719524,-1.307524,-1.032742
567,0.478694,3.076449,-0.583812,1.375362
590,-0.434591,0.769585,-0.555546,-3.228588
633,-0.988605,-2.700658,-0.158713,3.199166
634,1.516062,0.272923,1.364555,-3.494682


In [99]:
# data[np.abs(data) > 3]

In [103]:
np.sign(data).head()

Unnamed: 0,0,1,2,3
0,1.0,-1.0,1.0,1.0
1,-1.0,1.0,1.0,1.0
2,-1.0,-1.0,-1.0,1.0
3,-1.0,1.0,1.0,-1.0
4,-1.0,1.0,-1.0,-1.0


In [2]:
# Permutation and Random Sampling
df = pd.DataFrame(np.arange(5 * 4).reshape((5, 4)))
sampler = np.random.permutation(5)
sampler

array([3, 1, 2, 4, 0])

In [3]:
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [4]:
df.take(sampler)

Unnamed: 0,0,1,2,3
3,12,13,14,15
1,4,5,6,7
2,8,9,10,11
4,16,17,18,19
0,0,1,2,3


In [5]:
df.sample(n=3)

Unnamed: 0,0,1,2,3
2,8,9,10,11
3,12,13,14,15
0,0,1,2,3


In [6]:
# sample(replace=True) draws with replacement, so n may exceed len(choices)
# and the same value can appear more than once.
choices = pd.Series([5, 7, -1, 6, 4])
draws = choices.sample(n=10, replace=True)
draws

1    7
0    5
4    4
2   -1
2   -1
4    4
4    4
3    6
0    5
3    6
dtype: int64

In [8]:
# Computing Indicator/Dummy Variables
# get_dummies(): one-hot encodes a single column
df = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                   'data1': range(6)})
df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [9]:
pd.get_dummies(df['key'])

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [10]:
dummies = pd.get_dummies(df['key'], prefix='key')
dummies

Unnamed: 0,key_a,key_b,key_c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [11]:
df_with_dummy = df[['data1']].join(dummies)
df_with_dummy

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


In [3]:
mnames = ['movie_id', 'title', 'genres']
# '::' is a multi-character separator, which only the Python parsing engine
# supports. Requesting engine='python' explicitly avoids the ParserWarning that
# pandas emits when it has to fall back from the default C engine.
movies = pd.read_table('datasets/movielens/movies.dat', sep='::',
                       header=None, names=mnames, engine='python')
movies[:10]

  This is separate from the ipykernel package so we can avoid doing imports until
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children's
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [4]:
# Flatten the '|'-separated genre strings into one list (duplicates included).
all_genres = [genre for entry in movies.genres for genre in entry.split('|')]
# Distinct genres in order of first appearance. The list is wrapped in a Series
# because passing a plain list to pd.unique is deprecated.
genres = pd.unique(pd.Series(all_genres))
genres

array(['Animation', "Children's", 'Comedy', 'Adventure', 'Fantasy',
       'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
       'Sci-Fi', 'Documentary', 'War', 'Musical', 'Mystery', 'Film-Noir',
       'Western'], dtype=object)

In [5]:
# One row per movie and one column per genre, initialised to all zeros.
zero_matrix = np.zeros((len(movies), len(genres)))
dummies = pd.DataFrame(zero_matrix, columns=genres)

In [6]:
gen = movies.genres[0]
gen.split("|")

['Animation', "Children's", 'Comedy']

In [7]:
dummies.columns.get_indexer(gen.split('|'))

array([0, 1, 2])

In [8]:
# For each movie, look up the column positions of its genres and set those
# entries of the movie's row to 1.
for i, gen in enumerate(movies.genres):
    indices = dummies.columns.get_indexer(gen.split('|'))
    dummies.iloc[i,indices] = 1
dummies

Unnamed: 0,Animation,Children's,Comedy,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
movies_windic = movies.join(dummies.add_prefix('Genre_'))
movies_windic.iloc[0]

movie_id                                       1
title                           Toy Story (1995)
genres               Animation|Children's|Comedy
Genre_Animation                                1
Genre_Children's                               1
Genre_Comedy                                   1
Genre_Adventure                                0
Genre_Fantasy                                  0
Genre_Romance                                  0
Genre_Drama                                    0
Genre_Action                                   0
Genre_Crime                                    0
Genre_Thriller                                 0
Genre_Horror                                   0
Genre_Sci-Fi                                   0
Genre_Documentary                              0
Genre_War                                      0
Genre_Musical                                  0
Genre_Mystery                                  0
Genre_Film-Noir                                0
Genre_Western       

In [10]:
movies_windic.iloc[:10]

Unnamed: 0,movie_id,title,genres,Genre_Animation,Genre_Children's,Genre_Comedy,Genre_Adventure,Genre_Fantasy,Genre_Romance,Genre_Drama,...,Genre_Crime,Genre_Thriller,Genre_Horror,Genre_Sci-Fi,Genre_Documentary,Genre_War,Genre_Musical,Genre_Mystery,Genre_Film-Noir,Genre_Western
0,1,Toy Story (1995),Animation|Children's|Comedy,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji (1995),Adventure|Children's|Fantasy,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men (1995),Comedy|Romance,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale (1995),Comedy|Drama,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II (1995),Comedy,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,6,Heat (1995),Action|Crime|Thriller,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,7,Sabrina (1995),Comedy|Romance,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,8,Tom and Huck (1995),Adventure|Children's,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,9,Sudden Death (1995),Action,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,10,GoldenEye (1995),Action|Adventure|Thriller,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
np.random.seed(12345)
values = np.random.rand(10)
values

array([0.92961609, 0.31637555, 0.18391881, 0.20456028, 0.56772503,
       0.5955447 , 0.96451452, 0.6531771 , 0.74890664, 0.65356987])

In [12]:
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]
pd.get_dummies(pd.cut(values, bins))

   (0.0, 0.2]  (0.2, 0.4]  (0.4, 0.6]  (0.6, 0.8]  (0.8, 1.0]
0           0           0           0           0           1
1           0           1           0           0           0
2           1           0           0           0           0
3           0           1           0           0           0
4           0           0           1           0           0
5           0           0           1           0           0
6           0           0           0           0           1
7           0           0           0           1           0
8           0           0           0           1           0
9           0           0           0           1           0

In [14]:
pd.cut(values, bins)

[(0.8, 1.0], (0.2, 0.4], (0.0, 0.2], (0.2, 0.4], (0.4, 0.6], (0.4, 0.6], (0.8, 1.0], (0.6, 0.8], (0.6, 0.8], (0.6, 0.8]]
Categories (5, interval[float64]): [(0.0, 0.2] < (0.2, 0.4] < (0.4, 0.6] < (0.6, 0.8] < (0.8, 1.0]]

In [15]:
# 7.3 String Manipulation
# String Object Methods
# split(), strip()
# join(), in, index(), find()
# Note: the difference between index() and find() is that index() raises
# ValueError ("substring not found") if the substring isn't found,
# whereas find() returns -1.
# count()
val = 'a,b,  guido'
val.split(',')

['a', 'b', '  guido']

In [16]:
# Split on commas, then trim the surrounding whitespace from every piece.
pieces = list(map(str.strip, val.split(',')))
pieces

['a', 'b', 'guido']

In [17]:
first, second, third = pieces
first + '::' + second + '::' + third

'a::b::guido'

In [19]:
'::'.join(pieces)

'a::b::guido'

In [20]:
'guido' in val

True

In [27]:
val.index(',')

1

In [30]:
val.index(':')

ValueError: substring not found

In [28]:
val.find(',')

1

In [31]:
val.find(':')

-1

In [33]:
val.count(',')

2

In [34]:
val.replace(',', '::')

'a::b::  guido'

In [35]:
val.replace(',', '')

'ab  guido'

In [36]:
# Regular Expressions
# import re
# Three main uses: pattern matching, substitution, and splitting.
# First define the pattern, then use it to match, search, substitute, or split.
# Use raw string literals like r'C:\x' instead of the equivalent 'C:\\x'.
# re.compile(), re.match(), re.search(), re.findall(), re.sub(), re.split()

In [53]:
import re
text = "foo bar\t baz \tqux"
# Raw string so that \s reaches the regex engine intact: '\s' in a plain string
# literal is an invalid escape sequence and triggers a warning on modern Python.
re.split(r'\s+', text)

['foo', 'bar', 'baz', 'qux']

In [38]:
# Compile once for reuse. The raw string keeps \s from being read as an
# (invalid) string escape sequence.
regex = re.compile(r'\s+')
regex.split(text)

['foo', 'bar', 'baz', 'qux']

In [39]:
regex.findall(text)

[' ', '\t ', ' \t']

In [43]:
regex.search(text)

<re.Match object; span=(3, 4), match=' '>

In [54]:
text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'

In [45]:
# re.IGNORECASE makes the regex case-insensitive
regex = re.compile(pattern, flags=re.IGNORECASE)

In [46]:
regex.findall(text)

['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']

In [47]:
m = regex.search(text)
m

<re.Match object; span=(5, 20), match='dave@google.com'>

In [49]:
m.span()  # (start, end) offsets of the match within text

(5, 20)

In [50]:
text[m.start():m.end()]

'dave@google.com'

In [51]:
print(regex.match(text))  # None: match() only matches at the start of the string, and text starts with 'Dave ' (the space is not part of the pattern)

None


In [55]:
print(regex.sub('REDACTED', text))

Dave REDACTED
Steve REDACTED
Rob REDACTED
Ryan REDACTED



In [56]:
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
# Three capture groups: username, domain name, and domain suffix.
regex = re.compile(pattern, flags=re.IGNORECASE)

In [59]:
m = regex.match('wesm@bright.net')
m.group()

'wesm@bright.net'

In [60]:
m.groups()

('wesm', 'bright', 'net')

In [61]:
regex.findall(text)

[('dave', 'google', 'com'),
 ('steve', 'gmail', 'com'),
 ('rob', 'gmail', 'com'),
 ('ryan', 'yahoo', 'com')]

In [62]:
print(regex.sub(r'Username: \1, Domain: \2, Suffix: \3', text))

Dave Username: dave, Domain: google, Suffix: com
Steve Username: steve, Domain: gmail, Suffix: com
Rob Username: rob, Domain: gmail, Suffix: com
Ryan Username: ryan, Domain: yahoo, Suffix: com



In [63]:
regex.sub(r'Username: \1, Domain: \2, Suffix: \3', text)

'Dave Username: dave, Domain: google, Suffix: com\nSteve Username: steve, Domain: gmail, Suffix: com\nRob Username: rob, Domain: gmail, Suffix: com\nRyan Username: ryan, Domain: yahoo, Suffix: com\n'

In [65]:
# Vectorized String Functions in pandas
# Series of e-mail addresses keyed by first name; 'Wes' has no address (NaN).
data = pd.Series({
    'Dave': 'dave@google.com',
    'Steve': 'steve@gmail.com',
    'Rob': 'rob@gmail.com',
    'Wes': np.nan,
})
data

Dave     dave@google.com
Steve    steve@gmail.com
Rob        rob@gmail.com
Wes                  NaN
dtype: object

In [66]:
data.isnull()

Dave     False
Steve    False
Rob      False
Wes       True
dtype: bool

In [67]:
# String methods are reached through the Series.str accessor; the missing
# entry stays NaN in the result instead of raising.
data.str.contains('gmail')

Dave     False
Steve     True
Rob       True
Wes        NaN
dtype: object

In [68]:
pattern

'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\\.([A-Z]{2,4})'

In [69]:
data.str.findall(pattern, flags=re.IGNORECASE)

Dave     [(dave, google, com)]
Steve    [(steve, gmail, com)]
Rob        [(rob, gmail, com)]
Wes                        NaN
dtype: object

In [70]:
matches = data.str.match(pattern, flags=re.IGNORECASE)
matches

Dave     True
Steve    True
Rob      True
Wes       NaN
dtype: object

In [71]:
# The .str accessor is only meaningful on string values.
# NOTE(review): `matches` holds booleans rather than strings (see its output
# above), so .str.get(1) has nothing to index into and every row is NaN.
matches.str.get(1)

Dave    NaN
Steve   NaN
Rob     NaN
Wes     NaN
dtype: float64

In [72]:
# Same situation with indexing syntax: the boolean values in `matches` cannot
# be indexed as strings, so the result is NaN for every row.
matches.str[0]

Dave    NaN
Steve   NaN
Rob     NaN
Wes     NaN
dtype: float64

In [73]:
data.str[:5]

Dave     dave@
Steve    steve
Rob      rob@g
Wes        NaN
dtype: object

In [76]:
data.str[0]

Dave       d
Steve      s
Rob        r
Wes      NaN
dtype: object

Dave    NaN
Steve   NaN
Rob     NaN
Wes     NaN
dtype: float64