The **time.time()** function shows the time (in seconds) since a pre-defined time, which in Unix-based systems

In [44]:
import time

### Formulas, loops and list comprehensions

In [45]:
def sum_sqr(N):
    res = 0
    for i in range(1, N+1):
        res += i**2
    return res

In [5]:
def formula(N):
    return N*(N+1)*(2*N+1)/6

In [21]:
N = 1000000

fm_start_time = time.time()
first_method = formula(N)
print("Time using formula: {} sec".format(time.time() - fm_start_time))

sm_start_time = time.time()
second_method = sum_sqr(N)
print("Time using the sum_sqr: {} sec".format(time.time() - sm_start_time))

Time using formula: 0.0 sec
Time using the sum_sqr: 0.9517776966094971 sec


In [35]:
for_loop_start_time= time.time()
result=[]

for i in range(0,1000000):
    result.append(i*i)

for_loop_end_time= time.time()
print("Time using the for loop: {} sec".format(for_loop_end_time - for_loop_start_time))


list_comp_start_time = time.time()

result = [i*i for i in range(0,1000000)]

list_comp_end_time = time.time()
print("Time using the list_comprehension: {} sec".format(list_comp_end_time - list_comp_start_time))

Time using the for loop: 0.41243910789489746 sec
Time using the list_comprehension: 0.21397638320922852 sec


### Selecting columns and rows

In [29]:
poker_hands = pd.read_csv('poker_hand.csv')
poker_hands

Unnamed: 0,S1,R1,S2,R2,S3,R3,S4,R4,S5,R5,Class
0,1,10,1,11,1,13,1,12,1,1,9
1,2,11,2,13,2,10,2,12,2,1,9
2,3,12,3,11,3,13,3,10,3,1,9
3,4,10,4,11,4,1,4,13,4,12,9
4,4,1,4,13,4,12,4,11,4,10,9
...,...,...,...,...,...,...,...,...,...,...,...
25005,3,9,2,6,4,11,4,12,2,4,0
25006,4,1,4,10,3,13,3,4,1,10,1
25007,2,1,2,10,4,4,4,1,4,13,1
25008,2,12,4,3,1,10,1,12,4,9,1


**loc** is faster for selecting columns and **iloc** for selecting rows.

In [31]:
row_nums = range(0, 1000)

loc_start_time = time.time()
rows = poker_hands.loc[row_nums]
loc_end_time = time.time()
print("Time using .loc[]: {} sec".format(loc_end_time - loc_start_time))


iloc_start_time = time.time()
rows = poker_hands.iloc[row_nums]
iloc_end_time = time.time()
print("Time using .iloc[]: {} sec".format(iloc_end_time-iloc_start_time))

Time using .loc[]: 0.005436897277832031 sec
Time using .iloc[]: 0.002387523651123047 sec


In [36]:
iloc_start_time = time.time()
cols = poker_hands.iloc[:,[0,3,4,6,7]]
iloc_end_time = time.time()
print("Time using .iloc[] : {} sec".format(iloc_end_time - iloc_start_time))


names_start_time = time.time()
cols = poker_hands[['S1', 'S2', 'R2', 'R3', 'S4']]
names_end_time = time.time()
print("Time using selection by name : {} sec".format(names_end_time-names_start_time))

Time using .iloc[] : 0.008677482604980469 sec
Time using selection by name : 0.0027141571044921875 sec


In [37]:
import numpy as np

We will randomly sample the 75% percent of all the played poker hands available.

In [38]:
N=poker_hands.shape[0]

rand_start_time = time.time()
poker_hands.iloc[np.random.randint(low=0, high=N, size=int(0.75 * N))]
print("Time using Numpy: {} sec".format(time.time() - rand_start_time))

samp_start_time = time.time()
poker_hands.sample(int(0.75 * N), axis=0, replace = True)
print("Time using .sample: {} sec".format(time.time() - samp_start_time))

Time using Numpy: 0.08395671844482422 sec
Time using .sample: 0.019383907318115234 sec


In [46]:
D=poker_hands.shape[1]

np_start_time = time.time()
poker_hands.iloc[:,np.random.randint(low=0, high=D, size=6)]
print("Time using NymPy's random.randint(): {} sec".format(time.time() - np_start_time))

pd_start_time = time.time()
poker_hands.sample(6, axis=1)
print("Time using panda's .sample(): {} sec".format(time.time() - pd_start_time))

Time using NymPy's random.randint(): 0.03698229789733887 sec
Time using panda's .sample(): 0.12745070457458496 sec


**.sample** method is faster and efficient than **np.random.randint**

### Replacing values

In [66]:
names = pd.read_csv('Popular_Baby_Names.csv')

start_time = time.time()
names['Gender'].loc[names['Gender'] == 'FEMALE'] = 'GIRL'
print("Time using .loc[]: {} sec".format(time.time() - start_time))

Time using .loc[]: 0.0050487518310546875 sec


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [69]:
names = pd.read_csv('Popular_Baby_Names.csv')

start_time = time.time()
names['Gender'].replace('FEMALE', 'GIRL', inplace=True)
print("Time using .replace(): {} sec".format(time.time() - start_time))

Time using .replace(): 0.0010426044464111328 sec


In [72]:
names = pd.read_csv('Popular_Baby_Names.csv')

start_time = time.time()
names['Ethnicity'].loc[(names["Ethnicity"] == 'BLACK NON HISP') | 
                      (names["Ethnicity"] == 'BLACK NON HISPANIC') | 
                      (names['Ethnicity'] == 'WHITE NON HISP') | 
                      (names['Ethnicity'] == 'WHITE NON HISPANIC')] = 'NON HISPANIC'
print("Time using .loc[]: {} sec".format(time.time() - start_time))

Time using .loc[]: 0.022618532180786133 sec


In [73]:
names = pd.read_csv('Popular_Baby_Names.csv')

start_time = time.time()
names['Ethnicity'].replace(['BLACK NON HISP', 'BLACK NON HISPANIC', 'WHITE NON HISP' , 'WHITE NON HISPANIC'], 
                           'NON HISPANIC', inplace=True)
print("Time using .replace(): {} sec".format(time.time() - start_time))

Time using .replace(): 0.0053369998931884766 sec


In [76]:
names = pd.read_csv('Popular_Baby_Names.csv')

start_time = time.time()
names['Ethnicity'].replace(['ASIAN AND PACI','BLACK NON HISP', 'WHITE NON HISP'], 
                           ['ASIAN AND PACIFIC ISLANDER','BLACK NON HISPANIC','WHITE NON HISPANIC'], inplace=True)
print("Time using .replace(): {} sec".format(time.time() - start_time))

Time using .replace(): 0.004573822021484375 sec


In [83]:
names = pd.read_csv('Popular_Baby_Names.csv')

start_time = time.time()
names['Gender'].replace({'MALE': 'BOY', 'FEMALE': 'GIRL'}, inplace = True)
print("Time using .replace() with dictionary: {} sec".format(time.time() - start_time))

Time using .replace() with dictionary: 0.0021009445190429688 sec


In [84]:
names = pd.read_csv('Popular_Baby_Names.csv')

start_time = time.time()
names['Gender'].replace('MALE', 'BOY', inplace = True)
names['Gender'].replace('FEMALE', 'GIRL', inplace = True)

print("Time using multiple .replace(): {} sec".format(time.time() - start_time))

Time using multiple .replace(): 0.007302284240722656 sec


In [91]:
names = pd.read_csv('Popular_Baby_Names.csv')

start_time = time.time()
names['Gender'].replace(['MALE','FEMALE'], ['BOY', 'GIRL'], inplace = True)

print("Time using multiple .replace() list: {} sec".format(time.time() - start_time))

Time using multiple .replace() list: 0.0035784244537353516 sec


In [92]:
names = pd.read_csv('Popular_Baby_Names.csv')

names.replace({'Rank': {1:'MEDAL', 2:'MEDAL', 3:'MEDAL'}}, inplace=True)
names.replace({'Rank': {4:'ALMOST MEDAL', 5:'ALMOST MEDAL'}}, inplace=True)
print(names.head())

   Year of Birth  Gender                   Ethnicity Child's First Name  \
0           2011  FEMALE  ASIAN AND PACIFIC ISLANDER             SOPHIA   
1           2011  FEMALE  ASIAN AND PACIFIC ISLANDER              CHLOE   
2           2011  FEMALE  ASIAN AND PACIFIC ISLANDER              EMILY   
3           2011  FEMALE  ASIAN AND PACIFIC ISLANDER             OLIVIA   
4           2011  FEMALE  ASIAN AND PACIFIC ISLANDER               EMMA   

   Count          Rank  
0    119         MEDAL  
1    106         MEDAL  
2     93         MEDAL  
3     89  ALMOST MEDAL  
4     75  ALMOST MEDAL  


### Iterating through a dataframe

In [93]:
def city_names_generator():
    yield('Xalapa')
    yield('Veracruz')
    yield('CDMX')
    yield('Puebla')
    
city_names = city_names_generator()

In [94]:
next(city_names)
next(city_names)

'Veracruz'

In [95]:
gen = poker_hands.iterrows()
first_element = next(gen)

In [97]:
first_element

(0,
 S1        1
 R1       10
 S2        1
 R2       11
 S3        1
 R3       13
 S4        1
 R4       12
 S5        1
 R5        1
 Class     9
 Name: 0, dtype: int64)

In [99]:
first_element[1]

S1        1
R1       10
S2        1
R2       11
S3        1
R3       13
S4        1
R4       12
S5        1
R5        1
Class     9
Name: 0, dtype: int64

In [100]:
generator = poker_hands.iterrows()

first_element = next(generator)
second_element = next(generator)
print(first_element, second_element)

(0, S1        1
R1       10
S2        1
R2       11
S3        1
R3       13
S4        1
R4       12
S5        1
R5        1
Class     9
Name: 0, dtype: int64) (1, S1        2
R1       11
S2        2
R2       13
S3        2
R3       10
S4        2
R4       12
S5        2
R5        1
Class     9
Name: 1, dtype: int64)


In [101]:
data_generator = poker_hands.iterrows()

for index, values in data_generator:
    if index % 2 != 0:
        hand_sum = sum([values[1], values[3], values[5], values[7], values[9]])

In [102]:
poker_hands

Unnamed: 0,S1,R1,S2,R2,S3,R3,S4,R4,S5,R5,Class
0,1,10,1,11,1,13,1,12,1,1,9
1,2,11,2,13,2,10,2,12,2,1,9
2,3,12,3,11,3,13,3,10,3,1,9
3,4,10,4,11,4,1,4,13,4,12,9
4,4,1,4,13,4,12,4,11,4,10,9
...,...,...,...,...,...,...,...,...,...,...,...
25005,3,9,2,6,4,11,4,12,2,4,0
25006,4,1,4,10,3,13,3,4,1,10,1
25007,2,1,2,10,4,4,4,1,4,13,1
25008,2,12,4,3,1,10,1,12,4,9,1


**apply** faster when we want to iterate through all the **rows** but slower through a column. 

In [103]:
get_square = lambda x: x**2

data_sum = poker_hands.apply(get_square)
print(data_sum.head())

   S1   R1  S2   R2  S3   R3  S4   R4  S5   R5  Class
0   1  100   1  121   1  169   1  144   1    1     81
1   4  121   4  169   4  100   4  144   4    1     81
2   9  144   9  121   9  169   9  100   9    1     81
3  16  100  16  121  16    1  16  169  16  144     81
4  16    1  16  169  16  144  16  121  16  100     81


In [104]:
get_variance = lambda x: np.var(x)

data_tr = poker_hands[['R1', 'R2', 'R3', 'R4', 'R5']].apply(get_variance, axis = 1)
print(data_tr.head())

0    18.64
1    18.64
2    18.64
3    18.64
4    18.64
dtype: float64


In [105]:
data_tr = poker_hands[['R1', 'R2', 'R3', 'R4', 'R5']].apply(get_variance, axis=0)
print(data_tr.head())

R1    14.0604731622
R2    14.1895226316
R3    14.0242698666
R4    14.0405519504
R5    13.9988512168
dtype: float64


### Vectorization

<img src = 'vectorization.png'>

In [106]:
row_start_time = time.time()
mean_r = poker_hands[['R1', 'R2', 'R3', 'R4', 'R5']].mean(axis=1)
print("Time using pandas vectorization for rows: {} sec".format(time.time() - row_start_time))
print(mean_r.head())

col_start_time = time.time()
mean_c = poker_hands[['R1', 'R2', 'R3', 'R4', 'R5']].mean(axis=0)
print("Time using pandas vectorization for columns: {} sec".format(time.time() - col_start_time))
print(mean_c.head())

Time using pandas vectorization for rows: 0.03026294708251953 sec
0    9.4
1    9.4
2    9.4
3    9.4
4    9.4
dtype: float64
Time using pandas vectorization for columns: 0.0009975433349609375 sec
R1    6.9952419032
R2    7.0141943223
R3    7.0141543383
R4    6.9424630148
R5    6.9627349060
dtype: float64


In [107]:
row_start_time = time.time()
mean_r = poker_hands[['R1', 'R2', 'R3', 'R4', 'R5']].apply(lambda x: x.mean(),axis=1)
print("Time using pandas apply for rows: {} sec".format(time.time() - row_start_time))
print(mean_r.head())

Time using pandas apply for rows: 1.1069567203521729 sec
0    9.4
1    9.4
2    9.4
3    9.4
4    9.4
dtype: float64


In [108]:
row_start_time = time.time()
mean_r = poker_hands[['R1', 'R2', 'R3', 'R4', 'R5']].apply(lambda x: x.mean(), axis=0)
print("Time using pandas apply for columns: {} sec".format(time.time() - row_start_time))
print(mean_r.head())

Time using pandas apply for columns: 0.0069408416748046875 sec
R1    6.9952419032
R2    7.0141943223
R3    7.0141543383
R4    6.9424630148
R5    6.9627349060
dtype: float64


#### Vectorization with NumPy

In [110]:
row_start_time = time.time()
mean_rows = poker_hands[['R1', 'R2', 'R3', 'R4', 'R5']].values.mean(axis=1)
print("Time using pandas vectorization for rows: {} sec".format(time.time() - row_start_time))
print(mean_rows)

Time using pandas vectorization for rows: 0.002992868423461914 sec
[9.4 9.4 9.4 ... 5.8 9.2 7.2]


### Grouping data

In [120]:
restaurant = pd.read_csv('restaurant_data.csv')
restaurant

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [121]:
restaurant_grouped = restaurant.groupby('smoker')
restaurant_grouped.count()

Unnamed: 0_level_0,total_bill,tip,sex,day,time,size
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
No,151,151,151,151,151,151
Yes,93,93,93,93,93,93


**Transform**
<img src = 'transform.png'>

In [122]:
z_score = lambda x: (x - x.mean())/x.std()

In [124]:
restaurant_grouped = restaurant.groupby('time')
restaurant_grouped.transform(z_score)

Unnamed: 0,total_bill,tip,size
0,-0.4164457395,-1.4570450379,-0.6928733275
1,-1.1438553648,-1.0044753213,0.4057366332
2,0.0232815828,0.2766451074,0.4057366332
3,0.3153392820,0.1443554979,-0.6928733275
4,0.4148795465,0.3532338287,1.5043465939
...,...,...,...
239,0.9005485294,1.9615969755,0.4057366332
240,0.6981864532,-0.7677465464,-0.6928733275
241,0.2048605269,-0.7677465464,-0.6928733275
242,-0.3256562675,-0.9418118221,-0.6928733275


In [125]:
restaurant_grouped = restaurant.groupby('time').mean()
restaurant_grouped

Unnamed: 0_level_0,total_bill,tip,size
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Dinner,20.7971590909,3.1026704545,2.6306818182
Lunch,17.1686764706,2.7280882353,2.4117647059


In [126]:
restaurant_grouped = restaurant.groupby('time').std()
restaurant_grouped

Unnamed: 0_level_0,total_bill,tip,size
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Dinner,9.1420291525,1.4362428065,0.9102411554
Lunch,7.7138818275,1.2053453798,1.0400243125


In [136]:
#IMPROVEMENT
row_start_time = time.time()

restaurant.groupby('sex').transform(z_score)

print("Time using .groupby(): {} sec".format(time.time() - row_start_time))

Time using .groupby(): 0.014930963516235352 sec


In [144]:
row_start_time = time.time()

mean_female = restaurant.groupby('sex')['total_bill'].mean()['Female'] 
#==restaurant.groupby('sex').mean()['total_bill']['Female']
mean_male = restaurant.groupby('sex')['total_bill'].mean()['Male'] 
std_female = restaurant.groupby('sex')['total_bill'].std()['Female'] 
std_male = restaurant.groupby('sex')['total_bill'].std()['Male'] 

for i in range(len(restaurant)):
    if restaurant.iloc[i][2] == 'Female':
        restaurant.iloc[i][0] = (restaurant.iloc[i][0]-mean_female)/std_female
    else:
        restaurant.iloc[i][0] = (restaurant.iloc[i][0]-mean_male)/std_male

print("Time using native pandas: {} sec".format(time.time() - row_start_time))

Time using native pandas: 0.18157076835632324 sec


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


In [147]:
100*(0.18157076835632324 - 0.014930963516235352)/0.014930963516235352

1116.0686626746508

In [149]:
min_max_tr = lambda x: (x - x.min()) / (x.max() - x.min())
restaurant_grouped = restaurant.groupby('time')
restaurant_min_max_group = restaurant_grouped.transform(min_max_tr)
print(restaurant_min_max_group.head())

     total_bill           tip  size
0  0.2915793884  0.0011111111   0.2
1  0.1522832007  0.0733333333   0.4
2  0.3757855048  0.2777777778   0.4
3  0.4317134478  0.2566666667   0.2
4  0.4507750314  0.2900000000   0.6


In [150]:
exp_tr = lambda x: np.exp(-x.mean()*x) * x.mean()
restaurant_grouped = restaurant.groupby('time')
restaurant_exp_group = restaurant_grouped['tip'].transform(exp_tr)
print(restaurant_exp_group.head())

0    0.1351413540
1    0.0179858530
2    0.0000596460
3    0.0001075477
4    0.0000423994
Name: tip, dtype: float64


#### Filter

In [155]:
restaurant_grouped = restaurant.groupby('day')
filter_trans = lambda x: x['total_bill'].mean() > 20
restaurant_filtered = restaurant_grouped.filter(filter_trans)
restaurant_filtered

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
238,35.83,4.67,Female,No,Sat,Dinner,3
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2


In [156]:
restaurant_filtered['day'].value_counts()

Sat    87
Sun    76
Name: day, dtype: int64

In [154]:
restaurant.groupby('day').mean()

Unnamed: 0_level_0,total_bill,tip,size
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Fri,17.1515789474,2.7347368421,2.1052631579
Sat,20.4413793103,2.9931034483,2.5172413793
Sun,21.41,3.2551315789,2.8421052632
Thur,17.6827419355,2.7714516129,2.4516129032


In [157]:
# Filter the days where the count of total_bill is greater than $40
total_bill_40 = restaurant.groupby('day').filter(lambda x: x['total_bill'].count() > 40)
print('Number of tables where total_bill is greater than $40:', total_bill_40.shape[0])

Number of tables where total_bill is greater than $40: 225


In [158]:
# Select only the entries that have a mean total_bill greater than $20
total_bill_20 = total_bill_40.groupby('day').filter(lambda x : x['total_bill'].mean() > 20)
print('Days of the week that have a mean total_bill greater than $20:', total_bill_20.day.unique())

Days of the week that have a mean total_bill greater than $20: ['Sun' 'Sat']


In [159]:
total_bill_20.shape

(163, 7)