In [36]:
from pydataset import data
import numpy as np 
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.model_selection import train_test_split
import sklearn.preprocessing

## 1. Load the tips dataset

In [2]:
tips = data('tips')

In [3]:
tips.head(3)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3


In [4]:
tips.shape

(244, 7)

In [5]:
tips.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 244 entries, 1 to 244
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  244 non-null    float64
 1   tip         244 non-null    float64
 2   sex         244 non-null    object 
 3   smoker      244 non-null    object 
 4   day         244 non-null    object 
 5   time        244 non-null    object 
 6   size        244 non-null    int64  
dtypes: float64(2), int64(1), object(4)
memory usage: 15.2+ KB


In [6]:
tips.sex = np.where(tips['sex'] == 'Male', 1, 0)

In [7]:
tips.sex

1      0
2      1
3      1
4      1
5      0
      ..
240    1
241    0
242    1
243    1
244    0
Name: sex, Length: 244, dtype: int64

In [8]:
tips.smoker = np.where(tips['smoker'] == "Yes", 1, 0)

In [9]:
tips.day.value_counts()

Sat     87
Sun     76
Thur    62
Fri     19
Name: day, dtype: int64

In [10]:
#sat:87, sun:76, thur:62, fri:19

In [11]:
tips.day = np.where(tips['day'] == 'Sat', 4, tips['day'])

In [12]:
tips.day = np.where(tips['day'] == 'Sun', 3, tips['day'])

In [13]:
tips.day = np.where(tips['day'] == 'Thur', 2, tips['day'])

In [14]:
tips.day = np.where(tips['day'] == 'Fri', 1, tips['day'])

In [15]:
tips.day.value_counts()

4    87
3    76
2    62
1    19
Name: day, dtype: int64

In [16]:
tips.day = tips.day.astype('int')

In [17]:
tips.time.value_counts()

Dinner    176
Lunch      68
Name: time, dtype: int64

In [18]:
tips.time = np.where(tips['time'] == 'Dinner', 1, 0)

In [19]:
tips.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 244 entries, 1 to 244
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  244 non-null    float64
 1   tip         244 non-null    float64
 2   sex         244 non-null    int64  
 3   smoker      244 non-null    int64  
 4   day         244 non-null    int64  
 5   time        244 non-null    int64  
 6   size        244 non-null    int64  
dtypes: float64(2), int64(5)
memory usage: 15.2 KB


### 1a. Create a column named price_per_person. This should be the total bill divided by the party size.


In [20]:
tips['price_per_person'] = tips.total_bill / tips.size

In [21]:
tips.head(3)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person
1,16.99,1.01,0,0,3,1,2,0.009947
2,10.34,1.66,1,0,3,1,3,0.006054
3,21.01,3.5,1,0,3,1,3,0.012301


In [22]:
train, test = train_test_split(tips, test_size = .3, random_state = 123)

In [23]:
y_train = train.tip

In [24]:
x_train = train.drop(columns = 'tip')

In [25]:
y_test = test.tip
x_test = test.drop(columns = 'tip')

In [26]:
x_train.head(3)

Unnamed: 0,total_bill,sex,smoker,day,time,size,price_per_person
149,9.78,1,0,2,0,2,0.005726
214,13.27,0,1,4,1,2,0.007769
179,9.6,0,1,3,1,2,0.005621


In [27]:
y_train.head(3)

149    1.73
214    2.50
179    4.00
Name: tip, dtype: float64

In [28]:
x_test.head(3)

Unnamed: 0,total_bill,sex,smoker,day,time,size,price_per_person
113,38.07,1,0,3,1,3,0.022289
20,20.65,1,0,4,1,3,0.01209
188,30.46,1,1,3,1,5,0.017834


In [29]:
y_test.head(3)

113    4.00
20     3.35
188    2.00
Name: tip, dtype: float64

In [30]:
# scale the X
scaler = sklearn.preprocessing.MinMaxScaler()

scaler.fit(x_train)

x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

### 1b. Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount?

- total_bill
- size
- price_per_person

### 1c. Use select k best to select the top 2 features for predicting tip amount. What are they?


In [32]:
f_selector = SelectKBest(f_regression, k=2)

f_selector.fit(x_train, y_train)

feature_mask = f_selector.get_support()

In [37]:
f_feature = x_train_scaled.iloc[:,feature_mask].columns.tolist()

AttributeError: 'numpy.ndarray' object has no attribute 'iloc'

In [39]:
feature_mask

array([ True, False, False, False, False, False,  True])

In [40]:
# total_bill and price_per_person