In [None]:
import numpy as np
import pandas as pd

# Encoding
#### Label Encoding (the Basic Encoding)
-   Each category has a code as an integer from `0` through `n-1`, where n is the number of categories
-   A `-1` code is reserved for any missing values
-   Can save on memory
-   Often used in surveys
#### How to create Label Encoding?
-   convert to "category" `dtype`
-   use `Series.cat.codes`, which will convert the values to integers
	-   If the column is not ordinal, the codes will be assigned in alphabetical order. e.g., 0 for 'a', 1 for 'b', ...
#### Why do we encode categories?
Encoding can save on memory and improve performance. Reading and writing files that use codes instead of strings can save a lot of time.
#### Drawbacks of Label Encoding
-   It's not the best encoding method for machine learning (one-hot encoding is better in this case)
	-   With label encoding, a machine learning model might misinterpret the integer codes as an order or magnitude: categories starting with 'a', which have a code of 0, might be given less weight than categories starting with 'b', which have a code of 1 or more; however, the codes carry no such meaning!

In [None]:
# Load the used-cars listings from the Pandas-Notes repository and preview the first rows.
CARS_CSV_URL = "https://raw.githubusercontent.com/MohamedMostafa259/Pandas-Notes/refs/heads/main/Data/cars.csv"

used_cars = pd.read_csv(CARS_CSV_URL)
used_cars.head()

Unnamed: 0,manufacturer_name,model_name,transmission,color,odometer_value,year_produced,engine_fuel,engine_has_gas,engine_type,engine_capacity,...,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,duration_listed
0,Subaru,Outback,automatic,silver,190000,2010,gasoline,False,gasoline,2.5,...,True,True,True,False,True,False,True,True,True,16
1,Subaru,Outback,automatic,blue,290000,2002,gasoline,False,gasoline,3.0,...,True,False,False,True,True,False,False,False,True,83
2,Subaru,Forester,automatic,red,402000,2001,gasoline,False,gasoline,2.5,...,True,False,False,False,False,False,False,True,True,151
3,Subaru,Impreza,mechanical,blue,10000,1999,gasoline,False,gasoline,3.0,...,False,False,False,False,False,False,False,False,False,86
4,Subaru,Legacy,automatic,black,280000,2001,gasoline,False,gasoline,2.5,...,True,False,True,True,False,False,False,False,True,7


In [None]:
# Label-encode the manufacturer: cast the column to the "category" dtype,
# then take the integer code pandas assigned to each category.
# Subaru is the first manufacturer name in the dataset, but it is the 46th name
# in alphabetical order, which is why it has been assigned a code of 45.
manufacturer_as_category = used_cars["manufacturer_name"].astype("category")
used_cars["manufacturer_code"] = manufacturer_as_category.cat.codes

# Show the code next to the name it encodes.
used_cars[["manufacturer_code", "manufacturer_name"]]

Unnamed: 0,manufacturer_code,manufacturer_name
0,45,Subaru
1,45,Subaru
2,45,Subaru
3,45,Subaru
4,45,Subaru
...,...,...
38526,8,Chrysler
38527,8,Chrysler
38528,8,Chrysler
38529,8,Chrysler


#### Creating a Code Book

In [None]:
# The original manufacturer names and, row for row, the integer code of each one.
categories = used_cars["manufacturer_name"]
codes = categories.astype("category").cat.codes

# Pair every row's code with its name. A dictionary keeps one entry per key,
# so only the unique code -> name combinations remain in the code book.
name_map = {code: name for code, name in zip(codes, categories)}
name_map

{45: 'Subaru',
 24: 'LADA',
 12: 'Dodge',
 54: 'УАЗ',
 23: 'Kia',
 35: 'Opel',
 53: 'Москвич',
 1: 'Alfa Romeo',
 0: 'Acura',
 10: 'Dacia',
 27: 'Lexus',
 33: 'Mitsubishi',
 25: 'Lancia',
 9: 'Citroen',
 32: 'Mini',
 21: 'Jaguar',
 38: 'Porsche',
 44: 'SsangYong',
 11: 'Daewoo',
 15: 'Geely',
 50: 'ВАЗ',
 13: 'Fiat',
 14: 'Ford',
 39: 'Renault',
 42: 'Seat',
 40: 'Rover',
 48: 'Volkswagen',
 28: 'Lifan',
 22: 'Jeep',
 5: 'Cadillac',
 2: 'Audi',
 52: 'ЗАЗ',
 47: 'Toyota',
 51: 'ГАЗ',
 49: 'Volvo',
 7: 'Chevrolet',
 16: 'Great Wall',
 4: 'Buick',
 37: 'Pontiac',
 29: 'Lincoln',
 18: 'Hyundai',
 34: 'Nissan',
 46: 'Suzuki',
 3: 'BMW',
 30: 'Mazda',
 26: 'Land Rover',
 20: 'Iveco',
 43: 'Skoda',
 41: 'Saab',
 19: 'Infiniti',
 6: 'Chery',
 17: 'Honda',
 31: 'Mercedes-Benz',
 36: 'Peugeot',
 8: 'Chrysler'}

In [None]:
# The code book works in reverse too: mapping each integer code through
# `name_map` gives back the original categorical value.
decoded_manufacturer_names = used_cars["manufacturer_code"].map(name_map)
decoded_manufacturer_names

0          Subaru
1          Subaru
2          Subaru
3          Subaru
4          Subaru
           ...   
38526    Chrysler
38527    Chrysler
38528    Chrysler
38529    Chrysler
38530    Chrysler
Name: manufacturer_code, Length: 38531, dtype: object

### Boolean Encoding
When creating a label encoding for a categorical column, it is common to create a Boolean code that represents a group of categories.

In [None]:
# For example, say we wanted to create a boolean code for all cars that were vans,
# i.e. every car whose body type name contains "van" (both "van" and "minivan").
# We can use the NumPy function `where()` to say anytime this statement is true, we want to have a value of `1`,
# and anytime this statement is false we want to have a value of `0`.

print(used_cars["body_type"].value_counts(dropna=False), '\n')

# Substring match rather than `== "van"`: an exact comparison would flag only the
# plain "van" rows and miss "minivan". `regex=False` treats "van" as literal text,
# and `na=False` makes a missing body type count as "not a van" instead of NaN.
is_van_mask = used_cars["body_type"].str.contains("van", regex=False, na=False)

# This assignment must actually run: without it the `is_van` column does not exist
# and the lookup below raises a KeyError on a fresh kernel.
used_cars["is_van"] = np.where(is_van_mask, 1, 0)

used_cars["is_van"].value_counts(dropna=False)
# Looking at the output, only about 4,400 of the 38,000 used cars have van in their body type name
# (3,608 minivans + 808 vans = 4,416).

body_type
sedan        13011
hatchback     7644
universal     5507
suv           5164
minivan       3608
minibus       1369
van            808
coupe          652
liftback       552
pickup         129
cabriolet       75
limousine       12
Name: count, dtype: int64 



is_van
0    34115
1     4416
Name: count, dtype: int64

## One-hot Encoding
It's better than label encoding when we want to train a machine learning model on our encoded data, because one-hot encoding avoids ordinality: it prevents the algorithm from assuming any ordinal relationship between the categories.
#### What is One-hot Encoding?
In the boolean encoding cell, we created a zero-one column, `used_cars["is_van"]`, for a single value, "van", of a single column, `used_cars["body_type"]`. Fortunately, we can do this for all values of a single column, or even all columns at one time using `pd.get_dummies()`. One-hot encoding is the process of creating dummy variables.
#### How to create One-hot Encoding?
`pd.get_dummies()`
-   `data`: a pandas DataFrame
-   `columns`: list of column names that will be one-hot encoded
-   `prefix`: a string to be added to the beginning of each new column name (e.g., with the prefix `color`, the category `black` becomes the column `color_black`)
#### A Few Quick Notes
-   There are a few things to consider when using one-hot encoding. First, if your columns have a lot of unique values, an equal number of new columns will be created. **Training machine learning models on a lot of columns may lead to a problem known as overfitting**, something we would like to avoid.

		-  Although you won't be able to use categorical columns that have not been encoded, limiting which categorical columns are included in your algorithm may prevent overfitting and allow you to predict future events quickly

-   NaN values do not get their own column. This is OK though: for a given row, if all of the created columns are 0, this indicates that the original cell value was missing. There is no need to have a column for missing values.
___
### Overfitting
##### Key Characteristics of Overfitting
1. **High Training Accuracy, Low Test Accuracy**:
	- The model shows very high accuracy on training data but significantly lower accuracy on test data, indicating it hasn't generalized well.
2. **Complexity**:
	- Overfitted models are often too complex, with too many parameters relative to the amount of training data. This complexity allows the model to fit the training data very closely, including its noise.
3. **Variance**:
	- Overfitting leads to high variance in the model’s predictions. This means the model’s performance is highly sensitive to small changes in the training data.

##### Causes of Overfitting
1. **Too Much Model Complexity**:
	- Using a model that is too complex (e.g., too many features, layers, or parameters) relative to the amount of data can cause overfitting.
2. **Insufficient Training Data**:
	- Having too little data for the model to learn from can lead to overfitting, as the model may capture noise in the data rather than general patterns.
3. **Noisy Data**:
	- If the training data contains a lot of noise (random errors or outliers), the model may learn these anomalies instead of the actual underlying patterns (trending, general, **real** patterns).

##### Preventing Overfitting
1. **Simplifying the Model**:
	- Use a less complex model with fewer parameters to prevent it from capturing noise in the training data.
2. **Regularization**:
	- Techniques like L1 (Lasso) and L2 (Ridge) regularization add a penalty to the model's complexity, discouraging it from fitting the noise.
3. **Cross-Validation**:
	- Use cross-validation to ensure the model performs well on different subsets of the data, helping to assess its ability to generalize.
4. **Early Stopping**:
	- During training, monitor the model's performance on a validation set and stop training when the performance starts to degrade, indicating potential overfitting.
5. **Increasing Training Data**:
	- More data can help the model learn more generalizable patterns and reduce the impact of noise.
6. **Data Augmentation**:
	- For certain types of data (e.g., images), augmenting the data (e.g., through rotations, translations) can effectively increase the size of the training dataset.

##### Example
Suppose you are training a polynomial regression model to fit a set of data points. A high-degree polynomial might fit the training data perfectly, capturing every little fluctuation, but it might fail to predict new data points accurately because it has learned the noise as well. In contrast, a lower-degree polynomial might capture the general trend of the data without overfitting.

In [None]:
# Work on just two columns: one numeric (odometer_value) and one categorical (color).
odometer_and_color = used_cars[["odometer_value", "color"]]
print(odometer_and_color.shape)
print(odometer_and_color.head(), '\n\n')

# With no `columns` argument, `pd.get_dummies()` one-hot encodes every object and
# categorical column and leaves numeric columns as they are.
# Color has twelve unique values, so it becomes twelve columns, one per color,
# where each value says whether the car was that color.
# The result has 13 columns in total: the odometer value plus the 12 color columns.
# The original color column is dropped.
used_cars_onehot = pd.get_dummies(data=odometer_and_color)
print(used_cars_onehot.shape)
used_cars_onehot.head()

(38531, 2)
   odometer_value   color
0          190000  silver
1          290000    blue
2          402000     red
3           10000    blue
4          280000   black 


(38531, 13)


Unnamed: 0,odometer_value,color_black,color_blue,color_brown,color_green,color_grey,color_orange,color_other,color_red,color_silver,color_violet,color_white,color_yellow
0,190000,False,False,False,False,False,False,False,False,True,False,False,False
1,290000,False,True,False,False,False,False,False,False,False,False,False,False
2,402000,False,False,False,False,False,False,False,True,False,False,False,False
3,10000,False,True,False,False,False,False,False,False,False,False,False,False
4,280000,True,False,False,False,False,False,False,False,False,False,False,False


In [None]:
print(used_cars.shape, '\n\n')

# Specifying columns to use
# We may not want to encode every object or categorical variable, so `columns`
# restricts `pd.get_dummies()` to a chosen subset.
# Only the color column is encoded here: it is replaced by twelve columns, one per color,
# while all other columns in the used cars dataset are left alone.
# Since we are only encoding one column, the prefix is set to blank; the "_" separator
# is still added, so the new columns are named "_black", "_blue", and so on.
columns_to_encode = ["color"]
used_cars_onehot = pd.get_dummies(data=used_cars, columns=columns_to_encode, prefix="")
print(used_cars_onehot.shape)
used_cars_onehot.head()
# N.B. We may also set the prefix of all new columns to "dummy", so that we can easily filter newly created columns

(38531, 32) 


(38531, 43)


Unnamed: 0,manufacturer_name,model_name,transmission,odometer_value,year_produced,engine_fuel,engine_has_gas,engine_type,engine_capacity,body_type,...,_brown,_green,_grey,_orange,_other,_red,_silver,_violet,_white,_yellow
0,Subaru,Outback,automatic,190000,2010,gasoline,False,gasoline,2.5,universal,...,False,False,False,False,False,False,True,False,False,False
1,Subaru,Outback,automatic,290000,2002,gasoline,False,gasoline,3.0,universal,...,False,False,False,False,False,False,False,False,False,False
2,Subaru,Forester,automatic,402000,2001,gasoline,False,gasoline,2.5,suv,...,False,False,False,False,False,True,False,False,False,False
3,Subaru,Impreza,mechanical,10000,1999,gasoline,False,gasoline,3.0,sedan,...,False,False,False,False,False,False,False,False,False,False
4,Subaru,Legacy,automatic,280000,2001,gasoline,False,gasoline,2.5,universal,...,False,False,False,False,False,False,False,False,False,False
