## Useful methods

In [1]:
import numpy as np

In [2]:
import pandas as pd

In [3]:
# Load the tips dataset; the relative path is resolved against the current working directory
df = pd.read_csv('tips.csv')

In [4]:
str(122345678)[-4:]  # extracting last 4 digits using slicing in python

'5678'

In [5]:
# Series.apply lets the user pass a function that is then called on every single value of the Series.
def round_num(num):
    """Return ``num`` rounded with the built-in ``round`` (no digits argument)."""
    nearest = round(num)
    return nearest

In [6]:
# The applied function must return a single value per element, since each result fills one cell of the new column
df['roundOff'] = df['total_bill'].apply(round_num)

In [7]:
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,roundOff
0,16.99,1.01,Female,No,Sun,Dinner,2,17
1,10.34,1.66,Male,No,Sun,Dinner,3,10
2,21.01,3.50,Male,No,Sun,Dinner,3,21
3,23.68,3.31,Male,No,Sun,Dinner,2,24
4,24.59,3.61,Female,No,Sun,Dinner,4,25
...,...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3,29
240,27.18,2.00,Female,Yes,Sat,Dinner,2,27
241,22.67,2.00,Male,Yes,Sat,Dinner,2,23
242,17.82,1.75,Male,No,Sat,Dinner,2,18


In [8]:
# Rate how expensive a bill is with a price-tier label
def review(price):
    """Map a bill amount to a price-tier label.

    Returns '$' for amounts below 10, '$$' for amounts from 10 up to (but not
    including) 30, and '$$$' for everything else.
    """
    if price < 10:
        return '$'
    if 10 <= price < 30:
        return '$$'
    return '$$$'

In [9]:
# Store the price tier of every bill in a new column
df['costly'] = df['total_bill'].apply(review)

In [10]:
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,roundOff,costly
0,16.99,1.01,Female,No,Sun,Dinner,2,17,$$
1,10.34,1.66,Male,No,Sun,Dinner,3,10,$$
2,21.01,3.50,Male,No,Sun,Dinner,3,21,$$
3,23.68,3.31,Male,No,Sun,Dinner,2,24,$$
4,24.59,3.61,Female,No,Sun,Dinner,4,25,$$
...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3,29,$$
240,27.18,2.00,Female,Yes,Sat,Dinner,2,27,$$
241,22.67,2.00,Male,Yes,Sat,Dinner,2,23,$$
242,17.82,1.75,Male,No,Sat,Dinner,2,18,$$


In [11]:
df[df['costly']=='$$$']

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,roundOff,costly
11,35.26,5.0,Female,No,Sun,Dinner,4,35,$$$
23,39.42,7.58,Male,No,Sat,Dinner,4,39,$$$
39,31.27,5.0,Male,No,Sat,Dinner,3,31,$$$
44,30.4,5.6,Male,No,Sun,Dinner,4,30,$$$
47,32.4,6.0,Male,No,Sun,Dinner,4,32,$$$
52,34.81,5.2,Female,No,Sun,Dinner,4,35,$$$
56,38.01,3.0,Male,Yes,Sat,Dinner,4,38,$$$
59,48.27,6.73,Male,No,Sat,Dinner,4,48,$$$
83,32.68,5.0,Male,Yes,Thur,Lunch,2,33,$$$
85,34.83,5.17,Female,No,Thur,Lunch,4,35,$$$


In [12]:
#  custom functions using multiple columns as inputs

In [13]:
# using lambda function
df['total_bill'].apply(lambda bill: bill*2)

0      33.98
1      20.68
2      42.02
3      47.36
4      49.18
       ...  
239    58.06
240    54.36
241    45.34
242    35.64
243    37.56
Name: total_bill, Length: 244, dtype: float64

In [14]:
def quality(total_bill, tip):
    """Classify a tip by its share of the bill.

    Returns "Generous" when ``tip / total_bill`` is strictly greater than
    0.25, otherwise "Other". A zero ``total_bill`` is not handled specially.
    """
    tip_share = tip / total_bill
    return "Generous" if tip_share > 0.25 else "Other"

In [15]:
quality(16.99,1.01)

'Other'

In [16]:
# axis=1 passes each row to the lambda as a Series, so both columns can be fed to quality()
df['Quality'] = df[['total_bill', 'tip']].apply(
    lambda row: quality(row['total_bill'], row['tip']),
    axis=1,
)

In [17]:
df['Quality']=np.vectorize(quality)(df['total_bill'],df['tip']) 
# np.vectorize wraps the scalar function quality so it can be called with whole columns and is applied element by element; this assignment overwrites the 'Quality' column with the result

In [18]:
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,roundOff,costly,Quality
0,16.99,1.01,Female,No,Sun,Dinner,2,17,$$,Other
1,10.34,1.66,Male,No,Sun,Dinner,3,10,$$,Other
2,21.01,3.50,Male,No,Sun,Dinner,3,21,$$,Other
3,23.68,3.31,Male,No,Sun,Dinner,2,24,$$,Other
4,24.59,3.61,Female,No,Sun,Dinner,4,25,$$,Other
...,...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3,29,$$,Other
240,27.18,2.00,Female,Yes,Sat,Dinner,2,27,$$,Other
241,22.67,2.00,Male,Yes,Sat,Dinner,2,23,$$,Other
242,17.82,1.75,Male,No,Sat,Dinner,2,18,$$,Other


In [19]:
# comparing efficiency
import timeit

In [20]:
# setup code handed to timeit: it runs once before the timed statement and is not part of the measurement;
# the imports, the data load and quality() are repeated inside the string so the timed statements can use them
setup = '''
import numpy as np
import pandas as pd
df=pd.read_csv('tips.csv')
def quality(total_bill,tip):
   if tip/total_bill > 0.25:
        return "Generous"
   else:
        return "Other"
'''

In [21]:
#  code snippets whose execution time is to be measured
# stmt_one: row-wise DataFrame.apply with axis=1
stmt_one='''
df['tip_quality']=df[['total_bill','tip']].apply(lambda df:quality(df['total_bill'],df['tip']),axis=1)
'''

# stmt_two: the same column computed through np.vectorize
stmt_two='''
df['tip_quality']=np.vectorize(quality)(df['total_bill'],df['tip'])
'''

In [22]:
timeit.timeit(stmt=stmt_two,setup=setup,number=1000)

0.15499359999995477

In [23]:
timeit.timeit(stmt=stmt_one,setup=setup,number=1000)

1.4373065000000338

## Statistical information & sorting

In [24]:
#  sorting 
df.sort_values('tip')

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,roundOff,costly,Quality
67,3.07,1.00,Female,Yes,Sat,Dinner,1,3,$,Generous
236,12.60,1.00,Male,Yes,Sat,Dinner,2,13,$$,Other
92,5.75,1.00,Female,Yes,Fri,Dinner,2,6,$,Other
111,7.25,1.00,Female,No,Sat,Dinner,1,7,$,Other
0,16.99,1.01,Female,No,Sun,Dinner,2,17,$$,Other
...,...,...,...,...,...,...,...,...,...,...
141,34.30,6.70,Male,No,Thur,Lunch,6,34,$$$,Other
59,48.27,6.73,Male,No,Sat,Dinner,4,48,$$$,Other
23,39.42,7.58,Male,No,Sat,Dinner,4,39,$$$,Other
212,48.33,9.00,Male,No,Sat,Dinner,4,48,$$$,Other


In [25]:
df.sort_values('tip',ascending=False)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,roundOff,costly,Quality
170,50.81,10.00,Male,Yes,Sat,Dinner,3,51,$$$,Other
212,48.33,9.00,Male,No,Sat,Dinner,4,48,$$$,Other
23,39.42,7.58,Male,No,Sat,Dinner,4,39,$$$,Other
59,48.27,6.73,Male,No,Sat,Dinner,4,48,$$$,Other
141,34.30,6.70,Male,No,Thur,Lunch,6,34,$$$,Other
...,...,...,...,...,...,...,...,...,...,...
0,16.99,1.01,Female,No,Sun,Dinner,2,17,$$,Other
236,12.60,1.00,Male,Yes,Sat,Dinner,2,13,$$,Other
111,7.25,1.00,Female,No,Sat,Dinner,1,7,$,Other
67,3.07,1.00,Female,Yes,Sat,Dinner,1,3,$,Generous


In [26]:
df.sort_values(['tip','size']) # firstly by tip , then by size

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,roundOff,costly,Quality
67,3.07,1.00,Female,Yes,Sat,Dinner,1,3,$,Generous
111,7.25,1.00,Female,No,Sat,Dinner,1,7,$,Other
92,5.75,1.00,Female,Yes,Fri,Dinner,2,6,$,Other
236,12.60,1.00,Male,Yes,Sat,Dinner,2,13,$$,Other
0,16.99,1.01,Female,No,Sun,Dinner,2,17,$$,Other
...,...,...,...,...,...,...,...,...,...,...
141,34.30,6.70,Male,No,Thur,Lunch,6,34,$$$,Other
59,48.27,6.73,Male,No,Sat,Dinner,4,48,$$$,Other
23,39.42,7.58,Male,No,Sat,Dinner,4,39,$$$,Other
212,48.33,9.00,Male,No,Sat,Dinner,4,48,$$$,Other


In [27]:
# index of min,max value
df['total_bill'].max()

50.81

In [28]:
df['total_bill'].idxmax()

170

In [29]:
# idxmax() returns an index *label*, so the row is fetched with the label-based .loc;
# computing the label here also avoids hard-coding the row number
df.loc[df['total_bill'].idxmax()]

total_bill     50.81
tip             10.0
sex             Male
smoker           Yes
day              Sat
time          Dinner
size               3
roundOff          51
costly           $$$
Quality        Other
Name: 170, dtype: object

In [30]:
# idxmin() returns an index label, not a position: .loc stays correct even when the index is not 0..n-1
df.loc[df['tip'].idxmin()]

total_bill        3.07
tip                1.0
sex             Female
smoker             Yes
day                Sat
time            Dinner
size                 1
roundOff             3
costly               $
Quality       Generous
Name: 67, dtype: object

In [31]:
# Read the CSV again so df holds only the columns stored in the file
df = pd.read_csv('tips.csv')

In [34]:
#  correlation of columns    (pearson correlation coefficient)
df.corr(numeric_only=True)

Unnamed: 0,total_bill,tip,size
total_bill,1.0,0.675734,0.598315
tip,0.675734,1.0,0.489299
size,0.598315,0.489299,1.0


In [35]:
# counts of unique values
df['sex'].value_counts()

sex
Male      157
Female     87
Name: count, dtype: int64

In [36]:
# unique values
df['day'].unique()

array(['Sun', 'Sat', 'Thur', 'Fri'], dtype=object)

In [37]:
# no. of unique values
df['day'].nunique()

4

In [38]:
df['day'].value_counts()

day
Sat     87
Sun     76
Thur    62
Fri     19
Name: count, dtype: int64

In [39]:
# Replacing values: every key found in the column is swapped for its mapped value
df['sex'].replace({'Female': 'F', 'Male': 'M'})

0      F
1      M
2      M
3      M
4      F
      ..
239    M
240    F
241    M
242    M
243    F
Name: sex, Length: 244, dtype: object

In [42]:
# Changing values through a mapping: a lookup table is easy to maintain on a large scale
my_map = dict(Female='F', Male='M')

In [43]:
df['sex'].map(my_map)

0      F
1      M
2      M
3      M
4      F
      ..
239    M
240    F
241    M
242    M
243    F
Name: sex, Length: 244, dtype: object

In [44]:
#  to know if rows are duplicated
df.duplicated()

0      False
1      False
2      False
3      False
4      False
       ...  
239    False
240    False
241    False
242    False
243    False
Length: 244, dtype: bool

In [61]:
# Tiny frame whose last two rows are identical, used to demonstrate duplicate detection
simple_df = pd.DataFrame(
    [[1, 1], [2, 3], [2, 3]],
    columns=['a', 'b'],
)

In [62]:
simple_df

Unnamed: 0,a,b
0,1,1
1,2,3
2,2,3


In [63]:
simple_df.duplicated()

0    False
1    False
2     True
dtype: bool

In [64]:
#  dropping duplicate rows: the first occurrence of each row is kept, later repeats are removed
simple_df.drop_duplicates()

Unnamed: 0,a,b
0,1,1
1,2,3


In [67]:
df[df['total_bill'].between(10,20,inclusive='both')]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
8,15.04,1.96,Male,No,Sun,Dinner,2
9,14.78,3.23,Male,No,Sun,Dinner,2
10,10.27,1.71,Male,No,Sun,Dinner,2
...,...,...,...,...,...,...,...
234,15.53,3.00,Male,Yes,Sat,Dinner,2
235,10.07,1.25,Male,No,Sat,Dinner,2
236,12.60,1.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [68]:
#  n largest value rows
df.nlargest(10,'tip')

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
170,50.81,10.0,Male,Yes,Sat,Dinner,3
212,48.33,9.0,Male,No,Sat,Dinner,4
23,39.42,7.58,Male,No,Sat,Dinner,4
59,48.27,6.73,Male,No,Sat,Dinner,4
141,34.3,6.7,Male,No,Thur,Lunch,6
183,23.17,6.5,Male,Yes,Sun,Dinner,4
214,28.17,6.5,Female,Yes,Sat,Dinner,3
47,32.4,6.0,Male,No,Sun,Dinner,4
239,29.03,5.92,Male,No,Sat,Dinner,3
88,24.71,5.85,Male,No,Thur,Lunch,2


In [70]:
# Same top-10 rows through a full sort; rows with tied tips can come out in a different order than with nlargest
df.sort_values('tip', ascending=False).head(10)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
170,50.81,10.0,Male,Yes,Sat,Dinner,3
212,48.33,9.0,Male,No,Sat,Dinner,4
23,39.42,7.58,Male,No,Sat,Dinner,4
59,48.27,6.73,Male,No,Sat,Dinner,4
141,34.3,6.7,Male,No,Thur,Lunch,6
214,28.17,6.5,Female,Yes,Sat,Dinner,3
183,23.17,6.5,Male,Yes,Sun,Dinner,4
47,32.4,6.0,Male,No,Sun,Dinner,4
239,29.03,5.92,Male,No,Sat,Dinner,3
88,24.71,5.85,Male,No,Thur,Lunch,2


In [71]:
# picking up random samples; random_state pins the draw so re-running the cell shows the same rows
df.sample(5, random_state=42)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
202,13.0,2.0,Female,Yes,Thur,Lunch,2
134,18.26,3.25,Female,No,Thur,Lunch,2
165,24.52,3.48,Male,No,Sun,Dinner,3
173,31.85,3.18,Male,Yes,Sun,Dinner,2
217,11.59,1.5,Male,Yes,Sat,Dinner,2


In [72]:
# 10% of all rows (useful for picking a test data set); seeded so the selection is reproducible
df.sample(frac=0.1, random_state=42)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
141,34.3,6.7,Male,No,Thur,Lunch,6
13,18.43,3.0,Male,No,Sun,Dinner,4
138,16.0,2.0,Male,Yes,Thur,Lunch,2
176,17.89,2.0,Male,Yes,Sun,Dinner,2
80,19.44,3.0,Male,Yes,Thur,Lunch,2
155,29.85,5.14,Female,No,Sun,Dinner,5
232,11.61,3.39,Male,No,Sat,Dinner,2
154,19.77,2.0,Male,No,Sun,Dinner,4
30,9.55,1.45,Male,No,Sat,Dinner,2
43,9.68,1.32,Male,No,Sun,Dinner,2
