In [5]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session



/kaggle/input/fastfood-nutrition/fastfood.csv


In [15]:
# The same CSV is read twice: once as the working frame and once as a
# separate frame that is later used as "test" data.
fastfood_csv = '/kaggle/input/fastfood-nutrition/fastfood.csv'
kentucky_data = pd.read_csv(fastfood_csv)
X_test_full = pd.read_csv(fastfood_csv)

# Show the available columns as the cell output
kentucky_data.columns

Index(['restaurant', 'item', 'calories', 'cal_fat', 'total_fat', 'sat_fat',
       'trans_fat', 'cholesterol', 'sodium', 'total_carb', 'fiber', 'sugar',
       'protein', 'vit_a', 'vit_c', 'calcium', 'salad'],
      dtype='object')

In [7]:
# Target: the 'restaurant' column of the loaded data
y = kentucky_data['restaurant']

In [18]:
# Creating features (After completing the exercise, you can return to modify this line!)
# The target column 'restaurant' is intentionally not listed: it must not be a predictor.
features = ['cal_fat', 'total_fat', 'sugar', 'cholesterol', 'sodium']

# Remove rows with missing target, separate target from predictors.
# dropna and column selection return new objects, so kentucky_data keeps its
# 'restaurant' column and this cell can be re-run without a KeyError.
labelled_data = kentucky_data.dropna(axis=0, subset=['restaurant'])
y = labelled_data['restaurant']
X = labelled_data[features]

# To keep things simple, we'll use only numerical predictors
X = X.select_dtypes(exclude=['object'])
X_test = X_test_full.select_dtypes(exclude=['object'])

X.head()

Unnamed: 0,cal_fat,total_fat,sugar,cholesterol,sodium
0,60,7,11,95,1110
1,410,45,18,130,1580
2,600,67,18,220,1920
3,280,31,18,155,1940
4,410,45,18,120,1980


In [19]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible
train_X, valid_X, train_y, valid_y = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Function for comparing different approaches
# A really useful helper: call it to score each approach tried for dealing with the entries (e.g. missing values)
def score_dataset(train_X, valid_X, y_train, y_valid):
    """Score one prepared version of the data with a small random forest.

    A RandomForestRegressor (10 trees, random_state=0) is fitted on the
    training predictors/targets, then used to predict the validation rows.

    Parameters:
        train_X: predictors used for fitting.
        valid_X: predictors used for prediction.
        y_train: targets matching train_X.
        y_valid: targets matching valid_X.

    Returns:
        The mean absolute error between y_valid and the predictions.

    NOTE(review): elsewhere in this notebook the target is the 'restaurant'
    column, which is filtered out as an object column; a regressor scored
    with MAE expects numeric targets - confirm before calling with that y.
    """
    forest = RandomForestRegressor(n_estimators=10, random_state=0)
    predictions = forest.fit(train_X, y_train).predict(valid_X)
    return mean_absolute_error(y_valid, predictions)

In [24]:
# Dealing with missing values:
# first report the shape of the training data as (rows, columns)
print(train_X.shape)

# Then count the missing values per training column and show only the
# columns that actually have some
missing_val_count_by_column = train_X.isna().sum()
has_missing = missing_val_count_by_column.gt(0)
print(missing_val_count_by_column.loc[has_missing])

(412, 5)
Series([], dtype: int64)


In [None]:
# Although there are no missing values here, this cell practises the two
# standard ways of dealing with them (in case they did exist).

# Approach 1 - drop every column that contains at least one missing value.
# The column list is computed on the training data only and applied to both sets.
cols_with_missing = [col for col in train_X.columns
                     if train_X[col].isnull().any()]

reduced_train_X = train_X.drop(cols_with_missing, axis=1)
reduced_X_valid = valid_X.drop(cols_with_missing, axis=1)

# Approach 2 - imputation.
# The imputer is fitted on the training data and only applied to the validation data.
imputer = SimpleImputer()

# The frames are rebuilt from the imputer's output, so both the column names
# AND the original row index are put back explicitly; keeping the index means
# the rows still line up with train_y / valid_y.
imputed_train_X = pd.DataFrame(imputer.fit_transform(train_X),
                               columns=train_X.columns,
                               index=train_X.index)
imputed_X_valid = pd.DataFrame(imputer.transform(valid_X),
                               columns=valid_X.columns,
                               index=valid_X.index)


In [23]:
# Collect the names of the categorical (object dtype) columns - obviously none here
s = train_X.dtypes.eq('object')
object_cols = s[s].index.tolist()

print("Categorical variables:", object_cols, sep="\n")

Categorical variables:
[]


In [1]:
# For practice, here is the code to deal with categorical columns anyway:
# drop them, ordinal-encode them, or one-hot encode them.

# I - Drop: keep only the columns whose dtype is not 'object'
drop_train_X, drop_valid_X = (
    frame.select_dtypes(exclude=['object']) for frame in (train_X, valid_X)
)


# II - Ordinal Encoding: assign an integer id to each category value

from sklearn.preprocessing import OrdinalEncoder

# Categorical columns in the training data
# (the training frame in this notebook is named train_X)
object_cols = [col for col in train_X.columns if train_X[col].dtype == "object"]

# 2.0 Problem: if the categorical values differ between the training and the
#     validation data, the validation values cannot be mapped to an id.
#     The columns are therefore split into "good" ones (kept and encoded)
#     and "bad" ones (dropped).

# 2.1 Columns that can be safely ordinal encoded: every validation value
#     also appears in the training data
good_label_cols = [col for col in object_cols if
                   set(valid_X[col]).issubset(set(train_X[col]))]

# 2.2 Problematic columns that will be dropped from the dataset
#     (built in column order so the result is deterministic)
bad_label_cols = [col for col in object_cols if col not in good_label_cols]

# 2.3 Drop categorical columns that will not be encoded.
#     drop() returns new frames, so train_X / valid_X are left unchanged and
#     no separate copy is needed.
label_train_X = train_X.drop(bad_label_cols, axis=1)
label_valid_X = valid_X.drop(bad_label_cols, axis=1)

# 2.4 Apply ordinal encoder to each column with categorical data.
#     The encoder is fitted on the training data only. When no column
#     qualifies (as in this dataset, which has no categorical column)
#     there is nothing to encode and the step is skipped.
ordinal_encoder = OrdinalEncoder()
if good_label_cols:
    label_train_X[good_label_cols] = ordinal_encoder.fit_transform(train_X[good_label_cols])
    label_valid_X[good_label_cols] = ordinal_encoder.transform(valid_X[good_label_cols])


# III - One-hot encoding

from sklearn.preprocessing import OneHotEncoder

# 3.0 Columns that will be one-hot encoded: categorical columns with fewer
#     than 10 unique values in the training data
low_cardinality_cols = [col for col in object_cols if train_X[col].nunique() < 10]
# 3.0 Columns that will be dropped from the dataset (too many unique values)
high_cardinality_cols = [col for col in object_cols if col not in low_cardinality_cols]

print(low_cardinality_cols)

# 3.1 Apply one-hot encoder to the LOW-cardinality columns only; encoding all
#     of object_cols would make the cardinality filter above pointless.
# 3.2 One-hot encoding removes the index, so it is put back when building the frames.
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
if low_cardinality_cols:
    OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(train_X[low_cardinality_cols]),
                                 index=train_X.index)
    OH_cols_valid = pd.DataFrame(OH_encoder.transform(valid_X[low_cardinality_cols]),
                                 index=valid_X.index)
else:
    # Nothing to encode: use empty frames with the right index so the
    # concatenation below simply returns the numerical columns.
    OH_cols_train = pd.DataFrame(index=train_X.index)
    OH_cols_valid = pd.DataFrame(index=valid_X.index)

# 3.3 Remove ALL categorical columns: the low-cardinality ones are replaced by
#     their one-hot encoding, the high-cardinality ones are dropped.
num_train_X = train_X.drop(object_cols, axis=1)
num_valid_X = valid_X.drop(object_cols, axis=1)

# 3.4 Add one-hot encoded columns to numerical features
OH_train_X = pd.concat([num_train_X, OH_cols_train], axis=1)
OH_valid_X = pd.concat([num_valid_X, OH_cols_valid], axis=1)

# 3.5 Ensure all column names have the same type (str here): the encoded
#     columns are labelled with integers while the original ones are strings
OH_train_X.columns = OH_train_X.columns.astype(str)
OH_valid_X.columns = OH_valid_X.columns.astype(str)

NameError: name 'train_X' is not defined