# Label Encoding Tutorial


### Pandas is used for working with data in tables (DataFrames)

In [30]:
import pandas as pd

#### Create a simple DataFrame with names to demonstrate label encoding
#### This is our first example, using a small mix of people's names and everyday words

In [None]:
# Ten sample strings held in a single "Names" column (default 0..9 index).
ds = pd.DataFrame(
    [
        "Zafran",
        "Hilal",
        "Cat",
        "Banana",
        "Wood",
        "Apple",
        "Hasnain",
        "Carrot",
        "Cricket",
        "Pakistan",
    ],
    columns=["Names"],
)

In [32]:
# Display the DataFrame; as the cell's last expression it is rendered as a table.
ds

Unnamed: 0,Names
0,Zafran
1,Hilal
2,Cat
3,Banana
4,Wood
5,Apple
6,Hasnain
7,Carrot
8,Cricket
9,Pakistan


### LabelEncoder converts text categories into numbers (0, 1, 2, 3, ...)

In [33]:
from sklearn.preprocessing import LabelEncoder

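### fit_transform: Learns the categories and converts them to numbers in one step
### Output shows: each unique name gets a unique number, assigned in sorted (alphabetical) order
### Example: "Apple" becomes 0, "Banana" becomes 1, ..., "Zafran" becomes 9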

In [34]:
# Build a fresh encoder and, in a single call, learn the distinct names and
# return their integer codes. The result is only displayed, not stored.
le = LabelEncoder()
le.fit_transform(ds["Names"])

array([9, 6, 3, 1, 8, 0, 5, 2, 4, 7])

#### fit_transform: Same as above, but now we save the results in a new column called "en_names"
#### en_names = encoded names

In [35]:
# Fit a new encoder on "Names" and keep the integer codes in a new "en_names"
# column, so each original string sits next to its code.
le = LabelEncoder()
ds["en_names"]= le.fit_transform(ds["Names"])


In [36]:
# Display the DataFrame again to compare "Names" with the new "en_names" column.
ds

Unnamed: 0,Names,en_names
0,Zafran,9
1,Hilal,6
2,Cat,3
3,Banana,1
4,Wood,8
5,Apple,0
6,Hasnain,5
7,Carrot,2
8,Cricket,4
9,Pakistan,7


### Read a CSV file called "Hitters.csv" and display first 10 rows
### This dataset contains baseball player information

In [42]:
# Load the CSV (path is relative to the working directory), then preview
# the first ten rows.
dataset = pd.read_csv(filepath_or_buffer="Hitters.csv")
dataset.head(n=10)

Unnamed: 0.1,Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,...,CRuns,CRBI,CWalks,League,Division,PutOuts,Assists,Errors,Salary,NewLeague
0,-Andy Allanson,293,66,1,30,29,14,1,293,66,...,30,29,14,A,E,446,33,20,,A
1,-Alan Ashby,315,81,7,24,38,39,14,3449,835,...,321,414,375,N,W,632,43,10,475.0,N
2,-Alvin Davis,479,130,18,66,72,76,3,1624,457,...,224,266,263,A,W,880,82,14,480.0,A
3,-Andre Dawson,496,141,20,65,78,37,11,5628,1575,...,828,838,354,N,E,200,11,3,500.0,N
4,-Andres Galarraga,321,87,10,39,42,30,2,396,101,...,48,46,33,N,E,805,40,4,91.5,N
5,-Alfredo Griffin,594,169,4,74,51,35,11,4408,1133,...,501,336,194,A,W,282,421,25,750.0,A
6,-Al Newman,185,37,1,23,8,21,2,214,42,...,30,9,24,N,E,76,127,7,70.0,A
7,-Argenis Salazar,298,73,0,24,24,7,3,509,108,...,41,37,12,A,W,121,283,9,100.0,A
8,-Andres Thomas,323,81,6,26,32,8,2,341,86,...,32,34,8,N,W,143,290,19,75.0,N
9,-Andre Thornton,401,92,17,49,66,65,13,5206,1332,...,784,890,866,A,E,0,0,0,1100.0,A


### Create a new LabelEncoder object
### fit(): Learn all unique values in the "League" column (here "A" and "N")

In [53]:
# Create an encoder and let it learn the distinct values of the "League" column.
# fit() only learns the values; nothing is assigned back to the DataFrame here.
la = LabelEncoder()

la.fit(dataset["League"])

#### Same as the previous cell - learning the unique league values before encoding
#### Note: Use square brackets [] to select a column, not curly braces {}

In [55]:
# Re-create the encoder and fit it on "League" again; this rebinds `la`
# to a new, freshly fitted encoder.
la = LabelEncoder()
la.fit(dataset["League"])  # Select the column with square brackets, not curly braces

### transform(): Convert the league labels to numbers
### Example: "A" becomes 0 and "N" becomes 1
### Note: This only displays the result; the DataFrame is not changed yet

In [45]:
# Encode "League" with the encoder held in `la`; the resulting array is only
# displayed, not assigned to anything.
la.transform(dataset["League"])

array([0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0,
       0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1,
       1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1,
       1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1,

### Finding unique values
#### unique(): Show all different league values in the original data
#### Before encoding: you see the actual text values

In [47]:
# List the distinct values currently stored in the "League" column.
dataset["League"].unique()

array(['A', 'N'], dtype=object)

### Now we save the encoded numbers back to the "League" column
### The text values are replaced with their numerical equivalents

In [56]:
# Overwrite the text labels in "League" with their integer codes.
# The guard keeps the cell safe to re-run: once the column already holds
# numbers, la.transform() would raise ValueError, because those numbers are
# not among the labels the encoder learned in fit().
if not pd.api.types.is_numeric_dtype(dataset["League"]):
    dataset["League"] = la.transform(dataset["League"])

### Unique values after encoding
### Now the column contains numbers instead of text
### After encoding: you see the numerical values (0 and 1)

In [60]:
# List the distinct values in "League" again, now that the column was overwritten.
dataset["League"].unique()

array([0, 1])