In [36]:
import seaborn as sb
import pandas as pd
from sklearn import preprocessing as pp
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split

In [68]:
# Fetch the "tips" dataset through seaborn's load_dataset and preview the first rows.
data = sb.load_dataset("tips")
data.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


Transformando 'day' e 'time' em novas features binárias (variáveis dummy, descartando a primeira categoria de cada coluna)

In [38]:
# Dummy-encode the two categorical columns as integer 0/1 indicators.
# drop_first=True omits the first category of each column.
categorical_cols = ['day', 'time']
days_times = pd.get_dummies(data[categorical_cols], drop_first=True, dtype=int)

# Swap the original categorical columns for their encoded counterparts.
# NOTE: `data` is overwritten here, so this cell is not idempotent; re-running
# it without reloading the dataset fails because 'day'/'time' are already gone.
data = pd.concat([data.drop(columns=categorical_cols), days_times], axis=1)
data.head()

Unnamed: 0,total_bill,tip,sex,smoker,size,day_Fri,day_Sat,day_Sun,time_Dinner
0,16.99,1.01,Female,No,2,0,0,1,1
1,10.34,1.66,Male,No,3,0,0,1,1
2,21.01,3.5,Male,No,3,0,0,1,1
3,23.68,3.31,Male,No,2,0,0,1,1
4,24.59,3.61,Female,No,4,0,0,1,1


Codificando valores qualitativos 'sex' e 'smoker'   

In [39]:
labels = ["sex", "smoker"]
le = pp.LabelEncoder()

# Replace each qualitative column with integer codes. The single encoder is
# refit on every column, so after the loop it only holds the classes of the
# last column processed.
for label in labels:
    data[label] = le.fit_transform(data[label])

data.head()

Unnamed: 0,total_bill,tip,sex,smoker,size,day_Fri,day_Sat,day_Sun,time_Dinner
0,16.99,1.01,0,0,2,0,0,1,1
1,10.34,1.66,1,0,3,0,0,1,1
2,21.01,3.5,1,0,3,0,0,1,1
3,23.68,3.31,1,0,2,0,0,1,1
4,24.59,3.61,0,0,4,0,0,1,1


### Discretizando valores de 'tip'
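Os valores serão discretizados conforme os quantis 35% e 70%: gorjetas abaixo do quantil 35% viram "small", as que ficam entre os dois quantis viram "average" e as demais (a partir do quantil 70%) viram "high".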

In [40]:
# Cut points for the tip classes: the 35% and 70% quantiles of "tip".
qt = data["tip"].quantile([0.35, 0.70])

print(f"For the quantiles {qt.array[0]} and {qt.array[1]}")


def _tip_class(value):
    """Label a tip as "small", "average" or "high" relative to the two cut points in `qt`."""
    if value < qt.array[0]:
        return "small"
    if value < qt.array[1]:
        return "average"
    return "high"


# Work on a copy so the numeric "tip" column in `data` is left untouched.
discrete_tips = data.copy()
discrete_tips["tip"] = discrete_tips["tip"].apply(_tip_class)
discrete_tips.head()


For the quantiles 2.181 and 3.48


Unnamed: 0,total_bill,tip,sex,smoker,size,day_Fri,day_Sat,day_Sun,time_Dinner
0,16.99,small,0,0,2,0,0,1,1
1,10.34,small,1,0,3,0,0,1,1
2,21.01,high,1,0,3,0,0,1,1
3,23.68,average,1,0,2,0,0,1,1
4,24.59,high,0,0,4,0,0,1,1


In [41]:
# Disabled CSV export, kept for reference.
# NOTE(review): `FinalData` is not defined in any cell visible here — confirm
# which frame was meant before re-enabling this line.
# FinalData.to_csv("Final_Data_atv1.csv", index=False)

In [42]:
min_max_scaler = MinMaxScaler()
std_scaler = StandardScaler()

# Only these two columns are rescaled; every other column is carried over as-is.
scaled_cols = ["total_bill", "tip"]

# Each variant is built on its own copy so `data` itself stays unscaled.
# NOTE(review): both scalers are fit on every row, i.e. before the train/test
# split performed later in the notebook — confirm this is intended.
min_max_data = data.copy()
min_max_data[scaled_cols] = min_max_scaler.fit_transform(min_max_data[scaled_cols])

std_data = data.copy()
std_data[scaled_cols] = std_scaler.fit_transform(std_data[scaled_cols])

In [51]:
# Shared split settings: 10% of the rows held out, with a fixed seed.
split_kwargs = dict(test_size=0.10, random_state=42)

# Min-max scaled data: "tip" is the regression target, the rest are features.
min_max_Y = min_max_data["tip"]
min_max_X = min_max_data.drop(columns="tip")
min_max_X_train, min_max_X_test, min_max_Y_train, min_max_Y_test = train_test_split(
    min_max_X, min_max_Y, **split_kwargs
)

# Standard-scaled data, split the same way.
std_Y = std_data["tip"]
std_X = std_data.drop(columns="tip")
std_X_train, std_X_test, std_Y_train, std_Y_test = train_test_split(
    std_X, std_Y, **split_kwargs
)

# Discretised data: "tip" holds the class label used by the classifier.
discrete_tips_Y = discrete_tips["tip"]
discrete_tips_X = discrete_tips.drop(columns="tip")
discrete_tips_X_train, discrete_tips_X_test, discrete_tips_Y_train, discrete_tips_Y_test = train_test_split(
    discrete_tips_X, discrete_tips_Y, **split_kwargs
)

In [66]:
# Random forests draw bootstrap samples and feature subsets at random, so an
# unseeded estimator yields different fitted trees (and different scores) on
# every run. Seed all three so the results are reproducible; 42 matches the
# seed already used for train_test_split in this notebook.
rfRegressor_min_max = RandomForestRegressor(random_state=42)
rfREgressor_std = RandomForestRegressor(random_state=42)
rfClassifier = RandomForestClassifier(random_state=42)

# Two regressors predict the scaled numeric tip (min-max and standard scaling);
# the classifier predicts the discretised tip label.
rfRegressor_min_max.fit(min_max_X_train, min_max_Y_train)
rfREgressor_std.fit(std_X_train, std_Y_train)
rfClassifier.fit(discrete_tips_X_train, discrete_tips_Y_train)


In [67]:
# Evaluate every fitted model on its held-out split using the estimator's
# default `score` (R^2 for the regressors, mean accuracy for the classifier).
evaluations = [
    ("Min-Max Scaled Regressor Score:", rfRegressor_min_max, min_max_X_test, min_max_Y_test),
    ("Standard Scaled Regressor Score:", rfREgressor_std, std_X_test, std_Y_test),
    ("Classifier Score:", rfClassifier, discrete_tips_X_test, discrete_tips_Y_test),
]

for title, estimator, X_eval, Y_eval in evaluations:
    print(title, estimator.score(X_eval, Y_eval))


Min-Max Scaled Regressor Score: 0.24115137602031567
Standard Scaled Regressor Score: 0.2844678476328679
Classifier Score: 0.6
