## Librairies

In [414]:
import seaborn as sns
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder,  OneHotEncoder, LabelBinarizer
from sklearn import set_config
set_config(transform_output= "pandas")

### 1- Data tips

In [415]:
df = sns.load_dataset('tips')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [416]:
df.nunique()

total_bill    229
tip           123
sex             2
smoker          2
day             4
time            2
size            6
dtype: int64

In [417]:
df['day'].unique(), df['smoker'].unique(), df['time'].unique(), df['sex'].unique()

(['Sun', 'Sat', 'Thur', 'Fri']
 Categories (4, object): ['Thur', 'Fri', 'Sat', 'Sun'],
 ['No', 'Yes']
 Categories (2, object): ['Yes', 'No'],
 ['Dinner', 'Lunch']
 Categories (2, object): ['Lunch', 'Dinner'],
 ['Female', 'Male']
 Categories (2, object): ['Male', 'Female'])

In [418]:
# OneHot encoding sur 'sex', 'smoker', 'time'
encoder = OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore')
df_encoder = encoder.fit_transform(df[['sex','smoker','time', 'day']])

df_tips = pd.concat((df.drop(columns = ['sex','smoker','time', 'day']), df_encoder), axis=1)
df_tips.head()

Unnamed: 0,total_bill,tip,size,sex_Male,smoker_Yes,time_Lunch,day_Sat,day_Sun,day_Thur
0,16.99,1.01,2,0.0,0.0,0.0,0.0,1.0,0.0
1,10.34,1.66,3,1.0,0.0,0.0,0.0,1.0,0.0
2,21.01,3.5,3,1.0,0.0,0.0,0.0,1.0,0.0
3,23.68,3.31,2,1.0,0.0,0.0,0.0,1.0,0.0
4,24.59,3.61,4,0.0,0.0,0.0,0.0,1.0,0.0


### 2- Data Penguins

In [419]:
df = sns.load_dataset('penguins')
# target : species
df.dropna(inplace=True)
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,Male


In [420]:
df.nunique()
df['island'].unique()

array(['Torgersen', 'Biscoe', 'Dream'], dtype=object)

In [421]:
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore', drop='first')
df_encoder = encoder.fit_transform(df[['island','sex']])
df_penguins = pd.concat((df.drop(columns=['island','sex']), df_encoder), axis=1)
df_penguins.head()


Unnamed: 0,species,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,island_Dream,island_Torgersen,sex_Male
0,Adelie,39.1,18.7,181.0,3750.0,0.0,1.0,1.0
1,Adelie,39.5,17.4,186.0,3800.0,0.0,1.0,0.0
2,Adelie,40.3,18.0,195.0,3250.0,0.0,1.0,0.0
4,Adelie,36.7,19.3,193.0,3450.0,0.0,1.0,0.0
5,Adelie,39.3,20.6,190.0,3650.0,0.0,1.0,1.0


In [422]:
# Encoding ordinal de la  target species
LabelEncoder().fit_transform(df[['species']])


  y = column_or_1d(y, warn=True)


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

In [423]:
# Binarizer Encoding  de la  target species
LabelBinarizer().fit_transform(df[['species']])

array([[1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1,

### 3-  Data Flighs 

In [424]:
df = sns.load_dataset('flights')
# target : passengers
df.head()

Unnamed: 0,year,month,passengers
0,1949,Jan,112
1,1949,Feb,118
2,1949,Mar,132
3,1949,Apr,129
4,1949,May,121


In [425]:
df['month'].unique()

['Jan', 'Feb', 'Mar', 'Apr', 'May', ..., 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
Length: 12
Categories (12, object): ['Jan', 'Feb', 'Mar', 'Apr', ..., 'Sep', 'Oct', 'Nov', 'Dec']

In [426]:
month_order = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun','Jul','Aug', 'Sep', 'Oct', 'Nov', 'Dec']
encoder = OrdinalEncoder(categories=[month_order])
encoder.fit_transform(df[["month"]]).head()

Unnamed: 0,month
0,0.0
1,1.0
2,2.0
3,3.0
4,4.0


### 4- Data Exercises

In [427]:
df = sns.load_dataset('exercise')
# target : pulse
df.head()

Unnamed: 0.1,Unnamed: 0,id,diet,pulse,time,kind
0,0,1,low fat,85,1 min,rest
1,1,1,low fat,85,15 min,rest
2,2,1,low fat,88,30 min,rest
3,3,2,low fat,90,1 min,rest
4,4,2,low fat,92,15 min,rest


In [428]:
df['diet'].unique(), df['kind'].unique(), df['time'].unique()

(['low fat', 'no fat']
 Categories (2, object): ['no fat', 'low fat'],
 ['rest', 'walking', 'running']
 Categories (3, object): ['rest', 'walking', 'running'],
 ['1 min', '15 min', '30 min']
 Categories (3, object): ['1 min', '15 min', '30 min'])

In [429]:
order_kind = ['rest', 'walking', 'running']
order_diet = ['no fat','low fat']
encoder = OrdinalEncoder(categories=[order_kind, order_diet])
encoder.fit_transform(df[['kind','diet',]])

Unnamed: 0,kind,diet
0,0.0,1.0
1,0.0,1.0
2,0.0,1.0
3,0.0,1.0
4,0.0,1.0
...,...,...
85,2.0,0.0
86,2.0,0.0
87,2.0,0.0
88,2.0,0.0


In [430]:
# premiere maniere pour encoder time
df['time'].str.replace('min', '').astype(float)

0      1.0
1     15.0
2     30.0
3      1.0
4     15.0
      ... 
85    15.0
86    30.0
87     1.0
88    15.0
89    30.0
Name: time, Length: 90, dtype: float64

In [431]:
# deuxieme  maniere pour encoder time
df["time"].replace({'1 min':1,'15 min':15,'30 min':30}).astype(float)


  df["time"].replace({'1 min':1,'15 min':15,'30 min':30}).astype(float)
  df["time"].replace({'1 min':1,'15 min':15,'30 min':30}).astype(float)


0      1.0
1     15.0
2     30.0
3      1.0
4     15.0
      ... 
85    15.0
86    30.0
87     1.0
88    15.0
89    30.0
Name: time, Length: 90, dtype: float64

### -5 data taxis

In [436]:
df = sns.load_dataset('taxis')
# target : total
df.dropna(inplace=True)
df.head()

Unnamed: 0,pickup,dropoff,passengers,distance,fare,tip,tolls,total,color,payment,pickup_zone,dropoff_zone,pickup_borough,dropoff_borough
0,2019-03-23 20:21:09,2019-03-23 20:27:24,1,1.6,7.0,2.15,0.0,12.95,yellow,credit card,Lenox Hill West,UN/Turtle Bay South,Manhattan,Manhattan
1,2019-03-04 16:11:55,2019-03-04 16:19:00,1,0.79,5.0,0.0,0.0,9.3,yellow,cash,Upper West Side South,Upper West Side South,Manhattan,Manhattan
2,2019-03-27 17:53:01,2019-03-27 18:00:25,1,1.37,7.5,2.36,0.0,14.16,yellow,credit card,Alphabet City,West Village,Manhattan,Manhattan
3,2019-03-10 01:23:59,2019-03-10 01:49:51,1,7.7,27.0,6.15,0.0,36.95,yellow,credit card,Hudson Sq,Yorkville West,Manhattan,Manhattan
4,2019-03-30 13:27:42,2019-03-30 13:37:14,3,2.16,9.0,1.1,0.0,13.4,yellow,credit card,Midtown East,Yorkville West,Manhattan,Manhattan


In [433]:
df.nunique()

pickup             6322
dropoff            6333
passengers            7
distance           1064
fare                206
tip                 481
tolls                14
total               877
color                 2
payment               2
pickup_zone         194
dropoff_zone        203
pickup_borough        4
dropoff_borough       5
dtype: int64

In [434]:
df['payment'].unique()

array(['credit card', 'cash'], dtype=object)

In [435]:
encoder = OneHotEncoder(sparse_output = False, drop  ='first', handle_unknown = 'ignore')
encoder.fit(df[['pickup_borough', 'dropoff_borough', 'color', 'payment']])
encoder.transform(df[['pickup_borough', 'dropoff_borough', 'color', 'payment']])

Unnamed: 0,pickup_borough_Brooklyn,pickup_borough_Manhattan,pickup_borough_Queens,dropoff_borough_Brooklyn,dropoff_borough_Manhattan,dropoff_borough_Queens,dropoff_borough_Staten Island,color_yellow,payment_credit card
0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
1,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
3,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
4,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...
6428,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
6429,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
6430,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
6431,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
