## **1. Reading the csv file:**

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('/content/sample_data/tips.csv')

**a) To check name of columns in dataframe: ITS NOT A METHOD BUT ATTRIBUTE**

In [3]:
df.columns

Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size',
       'price_per_person', 'Payer Name', 'CC Number', 'Payment ID'],
      dtype='object')

**b) To check index range of data in dataframe: ITS NOT A METHOD BUT ATTRIBUTE**

In [4]:
df.index

RangeIndex(start=0, stop=244, step=1)

*So, Number of rows = stop - start = 244 - 0 = 244*

**c) To understand the general structure of dataframe: use head() function**

In [5]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
0,16.99,1.01,Female,No,Sun,Dinner,2,8.49,Christy Cunningham,3560325168603410,Sun2959
1,10.34,1.66,Male,No,Sun,Dinner,3,3.45,Douglas Tucker,4478071379779230,Sun4608
2,21.01,3.5,Male,No,Sun,Dinner,3,7.0,Travis Walters,6011812112971322,Sun4458
3,23.68,3.31,Male,No,Sun,Dinner,2,11.84,Nathaniel Harris,4676137647685994,Sun5260
4,24.59,3.61,Female,No,Sun,Dinner,4,6.15,Tonya Carter,4832732618637221,Sun2251


*To check 10 top rows, use **head(10)**. To check last 5 rows, use **tail()**. To check last 10 rows, use **tail(10)**.*

**d) To get statistical information of the dataframe:**

In [6]:
df.describe()

Unnamed: 0,total_bill,tip,size,price_per_person,CC Number
count,244.0,244.0,244.0,244.0,244.0
mean,19.785943,2.998279,2.569672,7.888197,2563496000000000.0
std,8.902412,1.383638,0.9511,2.914234,2369340000000000.0
min,3.07,1.0,1.0,2.88,60406790000.0
25%,13.3475,2.0,2.0,5.8,30407310000000.0
50%,17.795,2.9,2.0,7.255,3525318000000000.0
75%,24.1275,3.5625,3.0,9.39,4553675000000000.0
max,50.81,10.0,6.0,20.27,6596454000000000.0


In [7]:
# you can transpose to see the other side of view

(df.describe()).transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
total_bill,244.0,19.78594,8.902412,3.07,13.3475,17.795,24.1275,50.81
tip,244.0,2.998279,1.383638,1.0,2.0,2.9,3.5625,10.0
size,244.0,2.569672,0.9510998,1.0,2.0,2.0,3.0,6.0
price_per_person,244.0,7.888197,2.914234,2.88,5.8,7.255,9.39,20.27
CC Number,244.0,2563496000000000.0,2369340000000000.0,60406790000.0,30407310000000.0,3525318000000000.0,4553675000000000.0,6596454000000000.0


## **2. Working with Columns:**

In [8]:
type(df['total_bill'])

pandas.core.series.Series

**a) Grabbing multiple columns:**

In [9]:
# METHOD - 1

columnstograb = ['total_bill', 'tip']
df[columnstograb].head()

Unnamed: 0,total_bill,tip
0,16.99,1.01
1,10.34,1.66
2,21.01,3.5
3,23.68,3.31
4,24.59,3.61


In [10]:
# METHOD - 2

df[['total_bill', 'tip']].head()

Unnamed: 0,total_bill,tip
0,16.99,1.01
1,10.34,1.66
2,21.01,3.5
3,23.68,3.31
4,24.59,3.61


**b) Creating a new column:**

In [11]:
# This new column will be created at end
# If such name column already exists, that will be overwritten

df['tip_prec'] = 100*df['total_bill'] / df['tip']
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID,tip_prec
0,16.99,1.01,Female,No,Sun,Dinner,2,8.49,Christy Cunningham,3560325168603410,Sun2959,1682.178218
1,10.34,1.66,Male,No,Sun,Dinner,3,3.45,Douglas Tucker,4478071379779230,Sun4608,622.891566
2,21.01,3.5,Male,No,Sun,Dinner,3,7.0,Travis Walters,6011812112971322,Sun4458,600.285714
3,23.68,3.31,Male,No,Sun,Dinner,2,11.84,Nathaniel Harris,4676137647685994,Sun5260,715.407855
4,24.59,3.61,Female,No,Sun,Dinner,4,6.15,Tonya Carter,4832732618637221,Sun2251,681.163435


**c) Controlling precision of any column:**

In [12]:
df['tip_prec'] = df['tip_prec'].round(2)
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID,tip_prec
0,16.99,1.01,Female,No,Sun,Dinner,2,8.49,Christy Cunningham,3560325168603410,Sun2959,1682.18
1,10.34,1.66,Male,No,Sun,Dinner,3,3.45,Douglas Tucker,4478071379779230,Sun4608,622.89
2,21.01,3.5,Male,No,Sun,Dinner,3,7.0,Travis Walters,6011812112971322,Sun4458,600.29
3,23.68,3.31,Male,No,Sun,Dinner,2,11.84,Nathaniel Harris,4676137647685994,Sun5260,715.41
4,24.59,3.61,Female,No,Sun,Dinner,4,6.15,Tonya Carter,4832732618637221,Sun2251,681.16


**d) Removing columns:**


*   To drop a column, must write axis = 1. 
*   To drop a row, no need to write axis = 0 as it is set as default.
*   To make changes permanent, either use **df =**, or use **inplace = True**




In [13]:
df = df.drop('tip_prec', axis=1)

## **3. Working with Rows:**

The index in dataframe is considered as **Primary key** which has the values as unique.

If we want to set some column of data as index, make sure that it is having unique value for each row.

In [14]:
df.set_index('Payment ID', inplace = True)

In [15]:
df.head()

Unnamed: 0_level_0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number
Payment ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Sun2959,16.99,1.01,Female,No,Sun,Dinner,2,8.49,Christy Cunningham,3560325168603410
Sun4608,10.34,1.66,Male,No,Sun,Dinner,3,3.45,Douglas Tucker,4478071379779230
Sun4458,21.01,3.5,Male,No,Sun,Dinner,3,7.0,Travis Walters,6011812112971322
Sun5260,23.68,3.31,Male,No,Sun,Dinner,2,11.84,Nathaniel Harris,4676137647685994
Sun2251,24.59,3.61,Female,No,Sun,Dinner,4,6.15,Tonya Carter,4832732618637221


In [34]:
df.reset_index(inplace = True)

**a) To grab a row using integer index: use iloc[ ]**

> Here **i** in **iloc** means integer

In [16]:
df.iloc[0]

total_bill                       16.99
tip                               1.01
sex                             Female
smoker                              No
day                                Sun
time                            Dinner
size                                 2
price_per_person                  8.49
Payer Name          Christy Cunningham
CC Number             3560325168603410
Name: Sun2959, dtype: object

In [17]:
# to grap multiple rows

df.iloc[0:4]

Unnamed: 0_level_0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number
Payment ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Sun2959,16.99,1.01,Female,No,Sun,Dinner,2,8.49,Christy Cunningham,3560325168603410
Sun4608,10.34,1.66,Male,No,Sun,Dinner,3,3.45,Douglas Tucker,4478071379779230
Sun4458,21.01,3.5,Male,No,Sun,Dinner,3,7.0,Travis Walters,6011812112971322
Sun5260,23.68,3.31,Male,No,Sun,Dinner,2,11.84,Nathaniel Harris,4676137647685994


**b) To grab a row using real named index: use loc[ ]**

In [18]:
# we are assuming here we have named index of Payment ID

df.loc['Sun5260']

total_bill                     23.68
tip                             3.31
sex                             Male
smoker                            No
day                              Sun
time                          Dinner
size                               2
price_per_person               11.84
Payer Name          Nathaniel Harris
CC Number           4676137647685994
Name: Sun5260, dtype: object

In [20]:
# to grap multiple rows for named index: same like multiple columns grabbing

# Method - 1
rowstograb = ['Sun5260', 'Sun4608']
df.loc[rowstograb]

Unnamed: 0_level_0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number
Payment ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Sun5260,23.68,3.31,Male,No,Sun,Dinner,2,11.84,Nathaniel Harris,4676137647685994
Sun4608,10.34,1.66,Male,No,Sun,Dinner,3,3.45,Douglas Tucker,4478071379779230


In [21]:
# Method - 2
 
df.loc[['Sun5260', 'Sun4608']]

Unnamed: 0_level_0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number
Payment ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Sun5260,23.68,3.31,Male,No,Sun,Dinner,2,11.84,Nathaniel Harris,4676137647685994
Sun4608,10.34,1.66,Male,No,Sun,Dinner,3,3.45,Douglas Tucker,4478071379779230


**c) To drop a row:**

In [22]:
# deleting using named index

df.drop('Sun4608', inplace = True)

In [23]:
# to delete using numeric index, must use slicing with iloc

df = df.iloc[1:] 

**d) Inserting a new row:**

In [24]:
# copy the first row and append it. It will be appended at last automatically. It will create duplication but pandas is Ok with that!

new_row = df.iloc[0]
df = df.append(new_row)

In [25]:
df

Unnamed: 0_level_0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number
Payment ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Sun4458,21.01,3.50,Male,No,Sun,Dinner,3,7.00,Travis Walters,6011812112971322
Sun5260,23.68,3.31,Male,No,Sun,Dinner,2,11.84,Nathaniel Harris,4676137647685994
Sun2251,24.59,3.61,Female,No,Sun,Dinner,4,6.15,Tonya Carter,4832732618637221
Sun9679,25.29,4.71,Male,No,Sun,Dinner,4,6.32,Erik Smith,213140353657882
Sun5985,8.77,2.00,Male,No,Sun,Dinner,2,4.38,Kristopher Johnson,2223727524230344
...,...,...,...,...,...,...,...,...,...,...
Sat1766,27.18,2.00,Female,Yes,Sat,Dinner,2,13.59,Monica Sanders,3506806155565404
Sat3880,22.67,2.00,Male,Yes,Sat,Dinner,2,11.34,Keith Wong,6011891618747196
Sat17,17.82,1.75,Male,No,Sat,Dinner,2,8.91,Dennis Dixon,4375220550950
Thur672,18.78,3.00,Female,No,Thur,Dinner,2,9.39,Michelle Hardin,3511451626698139


## **4. Conditional Filtering:**

In [26]:
df['size'] > 4

Payment ID
Sun4458    False
Sun5260    False
Sun2251    False
Sun9679    False
Sun5985    False
           ...  
Sat1766    False
Sat3880    False
Sat17      False
Thur672    False
Sun4458    False
Name: size, Length: 243, dtype: bool

In [27]:
df[df['size'] > 4]

Unnamed: 0_level_0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number
Payment ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Thur3948,29.8,4.2,Female,No,Thur,Lunch,6,4.97,Angela Sanchez,503857080488
Thur1025,34.3,6.7,Male,No,Thur,Lunch,6,5.72,Steven Carlson,3526515703718508
Thur3621,41.19,5.0,Male,No,Thur,Lunch,5,8.24,Eric Andrews,4356531761046453
Thur6179,27.05,5.0,Female,No,Thur,Lunch,6,4.51,Regina Jones,4311048695487
Sun9176,29.85,5.14,Female,No,Sun,Dinner,5,5.97,Madison Wilson,4210875236164664
Sun7518,48.17,5.0,Male,No,Sun,Dinner,6,8.03,Ryan Gonzales,3523151482063321
Sun5842,20.69,5.0,Male,No,Sun,Dinner,5,4.14,Joseph Howell,30362407455623
Sun9987,30.46,2.0,Male,Yes,Sun,Dinner,5,6.09,David Barrett,4792882899700988
Sat7320,28.15,3.0,Male,Yes,Sat,Dinner,5,5.63,Shawn Barnett PhD,4590982568244


***To apply multiple conditions, must use & and |. Never use |AND, OR because they apply condition as a whole but & and | apply condition by reading one by one row.***

Syntax: 
*   **df[ ( condition2)  & (condition2)]**
*   **df[ ( condition2)  | (condition2)]**

In [28]:
df[(df['size'] > 4) & (df['tip'] > 2.00)]

Unnamed: 0_level_0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number
Payment ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Thur3948,29.8,4.2,Female,No,Thur,Lunch,6,4.97,Angela Sanchez,503857080488
Thur1025,34.3,6.7,Male,No,Thur,Lunch,6,5.72,Steven Carlson,3526515703718508
Thur3621,41.19,5.0,Male,No,Thur,Lunch,5,8.24,Eric Andrews,4356531761046453
Thur6179,27.05,5.0,Female,No,Thur,Lunch,6,4.51,Regina Jones,4311048695487
Sun9176,29.85,5.14,Female,No,Sun,Dinner,5,5.97,Madison Wilson,4210875236164664
Sun7518,48.17,5.0,Male,No,Sun,Dinner,6,8.03,Ryan Gonzales,3523151482063321
Sun5842,20.69,5.0,Male,No,Sun,Dinner,5,4.14,Joseph Howell,30362407455623
Sat7320,28.15,3.0,Male,Yes,Sat,Dinner,5,5.63,Shawn Barnett PhD,4590982568244


In [29]:
df[(df['day'] == 'Sat') | (df['day'] == 'Sun')]

Unnamed: 0_level_0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number
Payment ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Sun4458,21.01,3.50,Male,No,Sun,Dinner,3,7.00,Travis Walters,6011812112971322
Sun5260,23.68,3.31,Male,No,Sun,Dinner,2,11.84,Nathaniel Harris,4676137647685994
Sun2251,24.59,3.61,Female,No,Sun,Dinner,4,6.15,Tonya Carter,4832732618637221
Sun9679,25.29,4.71,Male,No,Sun,Dinner,4,6.32,Erik Smith,213140353657882
Sun5985,8.77,2.00,Male,No,Sun,Dinner,2,4.38,Kristopher Johnson,2223727524230344
...,...,...,...,...,...,...,...,...,...,...
Sat2657,29.03,5.92,Male,No,Sat,Dinner,3,9.68,Michael Avila,5296068606052842
Sat1766,27.18,2.00,Female,Yes,Sat,Dinner,2,13.59,Monica Sanders,3506806155565404
Sat3880,22.67,2.00,Male,Yes,Sat,Dinner,2,11.34,Keith Wong,6011891618747196
Sat17,17.82,1.75,Male,No,Sat,Dinner,2,8.91,Dennis Dixon,4375220550950


In [30]:
# if we want to apply multiple conditions on same column
options = ['Sat', 'Sun']
df[df['day'].isin(options)]

Unnamed: 0_level_0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number
Payment ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Sun4458,21.01,3.50,Male,No,Sun,Dinner,3,7.00,Travis Walters,6011812112971322
Sun5260,23.68,3.31,Male,No,Sun,Dinner,2,11.84,Nathaniel Harris,4676137647685994
Sun2251,24.59,3.61,Female,No,Sun,Dinner,4,6.15,Tonya Carter,4832732618637221
Sun9679,25.29,4.71,Male,No,Sun,Dinner,4,6.32,Erik Smith,213140353657882
Sun5985,8.77,2.00,Male,No,Sun,Dinner,2,4.38,Kristopher Johnson,2223727524230344
...,...,...,...,...,...,...,...,...,...,...
Sat2657,29.03,5.92,Male,No,Sat,Dinner,3,9.68,Michael Avila,5296068606052842
Sat1766,27.18,2.00,Female,Yes,Sat,Dinner,2,13.59,Monica Sanders,3506806155565404
Sat3880,22.67,2.00,Male,Yes,Sat,Dinner,2,11.34,Keith Wong,6011891618747196
Sat17,17.82,1.75,Male,No,Sat,Dinner,2,8.91,Dennis Dixon,4375220550950


## **5. Applying Customized Functions on Columns:**

The customize function should return only single value in each calling. Otherwise it will give error.

In [31]:
def takelast4digits(num):
  return int(str(num)[:4])     

# we cannot do slicing with int, so must convert int to string then do slicing and then convert back to int

In [32]:
df['CC Number'].apply(takelast4digits)  # do not write the parameter brackets in it. Pandas will do it automatically by considering one row each time as data for the function

Payment ID
Sun4458    6011
Sun5260    4676
Sun2251    4832
Sun9679    2131
Sun5985    2223
           ... 
Sat1766    3506
Sat3880    6011
Sat17      4375
Thur672    3511
Sun4458    6011
Name: CC Number, Length: 243, dtype: int64

In [33]:
# Method 2
df['CC Number'].apply(lambda num:int(str(num)[:4]))

Payment ID
Sun4458    6011
Sun5260    4676
Sun2251    4832
Sun9679    2131
Sun5985    2223
           ... 
Sat1766    3506
Sat3880    6011
Sat17      4375
Thur672    3511
Sun4458    6011
Name: CC Number, Length: 243, dtype: int64

**If we want to apply customize function of multiple columns:**

In [34]:
def quality(total_bill, tip):
  if tip/total_bill > 0.25:
    return 'Generous'
  else:
    return 'Other'

In [35]:
df['Quality'] = np.vectorize(quality)(df['total_bill'], df['tip'])   # np.vectorize(functiona_name)(columns_names)
df.head()

Unnamed: 0_level_0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Quality
Payment ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Sun4458,21.01,3.5,Male,No,Sun,Dinner,3,7.0,Travis Walters,6011812112971322,Other
Sun5260,23.68,3.31,Male,No,Sun,Dinner,2,11.84,Nathaniel Harris,4676137647685994,Other
Sun2251,24.59,3.61,Female,No,Sun,Dinner,4,6.15,Tonya Carter,4832732618637221,Other
Sun9679,25.29,4.71,Male,No,Sun,Dinner,4,6.32,Erik Smith,213140353657882,Other
Sun5985,8.77,2.0,Male,No,Sun,Dinner,2,4.38,Kristopher Johnson,2223727524230344,Other


## **6. Statistical Analysis of the data**

In [36]:
# to sort the data according to a specific column

df.sort_values('tip')

Unnamed: 0_level_0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Quality
Payment ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Sat3455,3.07,1.00,Female,Yes,Sat,Dinner,1,3.07,Tiffany Brock,4359488526995267,Generous
Fri3780,5.75,1.00,Female,Yes,Fri,Dinner,2,2.88,Leah Ramirez,3508911676966392,Other
Sat5032,12.60,1.00,Male,Yes,Sat,Dinner,2,6.30,Matthew Myers,3543676378973965,Other
Sat4801,7.25,1.00,Female,No,Sat,Dinner,1,7.25,Terri Jones,3559221007826887,Other
Sat6983,12.90,1.10,Female,Yes,Sat,Dinner,2,6.45,Jessica Owen,4726904879471,Other
...,...,...,...,...,...,...,...,...,...,...,...
Thur1025,34.30,6.70,Male,No,Thur,Lunch,6,5.72,Steven Carlson,3526515703718508,Other
Sat8139,48.27,6.73,Male,No,Sat,Dinner,4,12.07,Brian Ortiz,6596453823950595,Other
Sat239,39.42,7.58,Male,No,Sat,Dinner,4,9.86,Lance Peterson,3542584061609808,Other
Sat4590,48.33,9.00,Male,No,Sat,Dinner,4,12.08,Alex Williamson,676218815212,Other


> By default, the sorting order is ascending. If we want to sort it in descending way, we need to write **ascending = False**.

In [37]:
df.sort_values('tip', ascending=False)

Unnamed: 0_level_0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Quality
Payment ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Sat1954,50.81,10.00,Male,Yes,Sat,Dinner,3,16.94,Gregory Clark,5473850968388236,Other
Sat4590,48.33,9.00,Male,No,Sat,Dinner,4,12.08,Alex Williamson,676218815212,Other
Sat239,39.42,7.58,Male,No,Sat,Dinner,4,9.86,Lance Peterson,3542584061609808,Other
Sat8139,48.27,6.73,Male,No,Sat,Dinner,4,12.07,Brian Ortiz,6596453823950595,Other
Thur1025,34.30,6.70,Male,No,Thur,Lunch,6,5.72,Steven Carlson,3526515703718508,Other
...,...,...,...,...,...,...,...,...,...,...,...
Sat6983,12.90,1.10,Female,Yes,Sat,Dinner,2,6.45,Jessica Owen,4726904879471,Other
Sat4801,7.25,1.00,Female,No,Sat,Dinner,1,7.25,Terri Jones,3559221007826887,Other
Sat5032,12.60,1.00,Male,Yes,Sat,Dinner,2,6.30,Matthew Myers,3543676378973965,Other
Fri3780,5.75,1.00,Female,Yes,Fri,Dinner,2,2.88,Leah Ramirez,3508911676966392,Other


In [38]:
# to sort according to multiple columns, firstly values will be sorted according to first column and then will be sorted according to second column

df.sort_values(['tip', 'size'])

Unnamed: 0_level_0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Quality
Payment ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Sat3455,3.07,1.00,Female,Yes,Sat,Dinner,1,3.07,Tiffany Brock,4359488526995267,Generous
Sat4801,7.25,1.00,Female,No,Sat,Dinner,1,7.25,Terri Jones,3559221007826887,Other
Fri3780,5.75,1.00,Female,Yes,Fri,Dinner,2,2.88,Leah Ramirez,3508911676966392,Other
Sat5032,12.60,1.00,Male,Yes,Sat,Dinner,2,6.30,Matthew Myers,3543676378973965,Other
Sat6983,12.90,1.10,Female,Yes,Sat,Dinner,2,6.45,Jessica Owen,4726904879471,Other
...,...,...,...,...,...,...,...,...,...,...,...
Thur1025,34.30,6.70,Male,No,Thur,Lunch,6,5.72,Steven Carlson,3526515703718508,Other
Sat8139,48.27,6.73,Male,No,Sat,Dinner,4,12.07,Brian Ortiz,6596453823950595,Other
Sat239,39.42,7.58,Male,No,Sat,Dinner,4,9.86,Lance Peterson,3542584061609808,Other
Sat4590,48.33,9.00,Male,No,Sat,Dinner,4,12.08,Alex Williamson,676218815212,Other


In [39]:
# to find the max of a column
df['tip'].max()

10.0

In [40]:
df['tip'].min()

1.0

In [42]:
# to get index of the maximum value in a column
df['tip'].idxmax()

'Sat1954'

In [43]:
df['tip'].idxmin()

'Sat3455'

In [44]:
# to get row with highest tip value
df.loc[df['tip'].idxmax()]

total_bill                     50.81
tip                             10.0
sex                             Male
smoker                           Yes
day                              Sat
time                          Dinner
size                               3
price_per_person               16.94
Payer Name             Gregory Clark
CC Number           5473850968388236
Quality                        Other
Name: Sat1954, dtype: object

In [45]:
# to get how much the columns are correlated to each other
df.corr()

Unnamed: 0,total_bill,tip,size,price_per_person,CC Number
total_bill,1.0,0.675512,0.601708,0.645608,0.109451
tip,0.675512,1.0,0.491509,0.345743,0.119122
size,0.601708,0.491509,1.0,-0.173534,-0.027946
price_per_person,0.645608,0.345743,-0.173534,1.0,0.138402
CC Number,0.109451,0.119122,-0.027946,0.138402,1.0


In [47]:
df[['total_bill', 'size']].corr()

Unnamed: 0,total_bill,size
total_bill,1.0,0.601708
size,0.601708,1.0


In [48]:
df['sex'].value_counts()

Male      157
Female     86
Name: sex, dtype: int64

In [49]:
df['sex'].unique()    # All names of unique enteries in column

array(['Male', 'Female'], dtype=object)

In [51]:
df['tip'].nunique()    # total number of unique enteries in column

121

In [52]:
# Alternative of df['tip'].nunique()
len(df['tip'].unique())

121

### **Replacing values:**

In [53]:
# Method 1
df['sex'].replace('Male', 'M')

Payment ID
Sun4458         M
Sun5260         M
Sun2251    Female
Sun9679         M
Sun5985         M
            ...  
Sat1766    Female
Sat3880         M
Sat17           M
Thur672    Female
Sun4458         M
Name: sex, Length: 243, dtype: object

In [55]:
# Method 2

replace_vals = {'Female': 'F', 'Male': 'M'}
df['sex'].replace(replace_vals)

Payment ID
Sun4458    M
Sun5260    M
Sun2251    F
Sun9679    M
Sun5985    M
          ..
Sat1766    F
Sat3880    M
Sat17      M
Thur672    F
Sun4458    M
Name: sex, Length: 243, dtype: object

### **Dealing with duplicated rows:**

It return True for all duplicated row and False for unique rows.

In [56]:
df.duplicated()

Payment ID
Sun4458    False
Sun5260    False
Sun2251    False
Sun9679    False
Sun5985    False
           ...  
Sat1766    False
Sat3880    False
Sat17      False
Thur672    False
Sun4458     True
Length: 243, dtype: bool

In [71]:
# to drop duplicate rows
df = df.drop_duplicates()

In [72]:
df.duplicated()

Payment ID
Sun4458    False
Sun5260    False
Sun2251    False
Sun9679    False
Sun5985    False
           ...  
Sat2657    False
Sat1766    False
Sat3880    False
Sat17      False
Thur672    False
Length: 242, dtype: bool

Filter: To check values in a range, we can use **between method.**

In [74]:
df['total_bill'].between(10,20)

Payment ID
Sun4458    False
Sun5260    False
Sun2251    False
Sun9679    False
Sun5985    False
           ...  
Sat2657    False
Sat1766    False
Sat3880    False
Sat17       True
Thur672     True
Name: total_bill, Length: 242, dtype: bool

In [75]:
# to see actual rows with this filter

df[df['total_bill'].between(10,20)]

Unnamed: 0_level_0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Quality
Payment ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Sun6820,15.04,1.96,Male,No,Sun,Dinner,2,7.52,Joseph Mcdonald,3522866365840377,Other
Sun3775,14.78,3.23,Male,No,Sun,Dinner,2,7.39,Jerome Abbott,3532124519049786,Other
Sun2546,10.27,1.71,Male,No,Sun,Dinner,2,5.14,William Riley,566287581219,Other
Sun1300,15.42,1.57,Male,No,Sun,Dinner,2,7.71,Chad Harrington,577040572932,Other
Sun2971,18.43,3.00,Male,No,Sun,Dinner,4,4.61,Joshua Jones,6011163105616890,Other
...,...,...,...,...,...,...,...,...,...,...,...
Sat7220,15.53,3.00,Male,Yes,Sat,Dinner,2,7.76,Tracy Douglas,4097938155941930,Other
Sat4615,10.07,1.25,Male,No,Sat,Dinner,2,5.04,Sean Gonzalez,3534021246117605,Other
Sat5032,12.60,1.00,Male,Yes,Sat,Dinner,2,6.30,Matthew Myers,3543676378973965,Other
Sat17,17.82,1.75,Male,No,Sat,Dinner,2,8.91,Dennis Dixon,4375220550950,Other


To check largest 10 rows, use **nlargest()** method. It has **descending = True**.

In [78]:
df.nlargest(10, 'tip')

Unnamed: 0_level_0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Quality
Payment ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Sat1954,50.81,10.0,Male,Yes,Sat,Dinner,3,16.94,Gregory Clark,5473850968388236,Other
Sat4590,48.33,9.0,Male,No,Sat,Dinner,4,12.08,Alex Williamson,676218815212,Other
Sat239,39.42,7.58,Male,No,Sat,Dinner,4,9.86,Lance Peterson,3542584061609808,Other
Sat8139,48.27,6.73,Male,No,Sat,Dinner,4,12.07,Brian Ortiz,6596453823950595,Other
Thur1025,34.3,6.7,Male,No,Thur,Lunch,6,5.72,Steven Carlson,3526515703718508,Other
Sun6059,23.17,6.5,Male,Yes,Sun,Dinner,4,5.79,Dr. Michael James,4718501859162,Generous
Sat3374,28.17,6.5,Female,Yes,Sat,Dinner,3,9.39,Marissa Jackson,4922302538691962,Other
Sun9677,32.4,6.0,Male,No,Sun,Dinner,4,8.1,James Barnes,3552002592874186,Other
Sat2657,29.03,5.92,Male,No,Sat,Dinner,3,9.68,Michael Avila,5296068606052842,Other
Thur9003,24.71,5.85,Male,No,Thur,Lunch,2,12.36,Roger Taylor,4410248629955,Other


In [79]:
df.nsmallest(10, 'tip')

Unnamed: 0_level_0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Quality
Payment ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Sat3455,3.07,1.0,Female,Yes,Sat,Dinner,1,3.07,Tiffany Brock,4359488526995267,Generous
Fri3780,5.75,1.0,Female,Yes,Fri,Dinner,2,2.88,Leah Ramirez,3508911676966392,Other
Sat4801,7.25,1.0,Female,No,Sat,Dinner,1,7.25,Terri Jones,3559221007826887,Other
Sat5032,12.6,1.0,Male,Yes,Sat,Dinner,2,6.3,Matthew Myers,3543676378973965,Other
Sat6983,12.9,1.1,Female,Yes,Sat,Dinner,2,6.45,Jessica Owen,4726904879471,Other
Sat2929,32.83,1.17,Male,Yes,Sat,Dinner,2,16.42,Thomas Brown,4284722681265508,Other
Sat5056,10.51,1.25,Male,No,Sat,Dinner,2,5.26,Kenneth Hayes,213142079731108,Other
Thur6600,8.51,1.25,Female,No,Thur,Lunch,2,4.26,Rebecca Harris,4320272020376174,Other
Sat4615,10.07,1.25,Male,No,Sat,Dinner,2,5.04,Sean Gonzalez,3534021246117605,Other
Sun3279,9.68,1.32,Male,No,Sun,Dinner,2,4.84,Christopher Spears,4387671121369212,Other


In [80]:
# to get random 5 rows

df.sample(5)

Unnamed: 0_level_0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Quality
Payment ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Sat3697,21.7,4.3,Male,No,Sat,Dinner,2,10.85,David Collier,5529694315416009,Other
Sun9209,7.25,5.15,Male,Yes,Sun,Dinner,2,3.62,Larry White,30432617123103,Generous
Sun9774,18.04,3.0,Male,No,Sun,Dinner,2,9.02,William Roth,6573923967142503,Other
Fri5959,13.42,1.58,Male,Yes,Fri,Lunch,2,6.71,Ronald Vaughn DVM,341503466406403,Other
Sun5205,17.26,2.74,Male,No,Sun,Dinner,3,5.75,Gregory Smith,4292362333741,Other


In [81]:
# to get 10% rows of dataframe

df.sample(frac=0.1)

Unnamed: 0_level_0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Quality
Payment ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Sun2127,13.13,2.0,Male,No,Sun,Dinner,2,6.56,Jason Arnold,3571825125296106,Other
Sun5260,23.68,3.31,Male,No,Sun,Dinner,2,11.84,Nathaniel Harris,4676137647685994,Other
Sat5056,10.51,1.25,Male,No,Sat,Dinner,2,5.26,Kenneth Hayes,213142079731108,Other
Sat6240,44.3,2.5,Female,Yes,Sat,Dinner,3,14.77,Heather Cohen,379771118886604,Other
Fri5700,22.49,3.5,Male,No,Fri,Dinner,2,11.24,Earl Horn,6011849326227398,Other
Sun444,17.51,3.0,Female,Yes,Sun,Dinner,2,8.76,Audrey Griffin,3500853929693258,Other
Sat8124,17.78,3.27,Male,No,Sat,Dinner,2,8.89,Jacob Castillo,3551492000704805,Other
Sat6983,12.9,1.1,Female,Yes,Sat,Dinner,2,6.45,Jessica Owen,4726904879471,Other
Sun5814,19.77,2.0,Male,No,Sun,Dinner,4,4.94,James Smith,213169731428229,Other
Sat9213,20.65,3.35,Male,No,Sat,Dinner,3,6.88,Timothy Oneal,6568069240986485,Other
