# One Hot Encoding

#### Creating a sinthetic dataset for testing purposes

In [1]:
import pandas as pd

In [None]:


data = [
    ("I love this product so much!", "positive"),
    ("The service was absolutely terrible.", "negative"),
    ("It's okay, not the best but not the worst.", "neutral"),
    ("I’m really happy with my experience.", "positive"),
    ("This is the worst thing I’ve ever bought.", "negative"),
    ("Totally satisfied with the purchase!", "positive"),
    ("I wouldn’t recommend this to anyone.", "negative"),
    ("It was fine, just not what I expected.", "neutral"),
    ("Amazing quality and fast shipping.", "positive"),
    ("I hate how slow the response was.", "negative"),
    ("Everything went smoothly and easily.", "positive"),
    ("It didn’t meet my expectations.", "negative"),
    ("Meh, it’s alright I guess.", "neutral"),
    ("Fantastic support from the team!", "positive"),
    ("Terrible experience, never again.", "negative"),
    ("I feel indifferent about the results.", "neutral"),
    ("Exceeded all my expectations!", "positive"),
    ("Very poor performance for the price.", "negative"),
    ("It works, but nothing special.", "neutral"),
    ("Absolutely love the new update!", "positive"),
]

df = pd.DataFrame(data, columns=["text", "sentiment"])
df.to_csv("sentiment_data.csv", index=False)


In [2]:
sentiment_data = pd.read_csv("C:\\GEN AI Course\\Gen-AI-Learning\\ML for NLP\\Vectorization\\sentiment_data.csv")

In [3]:
sentiment_data

Unnamed: 0,text,sentiment
0,I love this product so much!,positive
1,The service was absolutely terrible.,negative
2,"It's okay, not the best but not the worst.",neutral
3,I’m really happy with my experience.,positive
4,This is the worst thing I’ve ever bought.,negative
5,Totally satisfied with the purchase!,positive
6,I wouldn’t recommend this to anyone.,negative
7,"It was fine, just not what I expected.",neutral
8,Amazing quality and fast shipping.,positive
9,I hate how slow the response was.,negative


#### Creating the vocabulary from the text data

In [33]:
import nltk
from nltk.tokenize import TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()

In [76]:
my_vocab = set()
for sentence in sentiment_data['text']:
    words = tokenizer.tokenize(sentence.lower())
    # print(f"Tokenized words: {words}")
 
    my_vocab.update(words)
    # print(f"Current vocabulary size: {len(my_vocab)}")
    # print(f"Current vocabulary: {my_vocab}")


In [77]:
len(my_vocab)     # our vocabulary size

85

#### Tokenizing the vocabulary

In [82]:
print(my_vocab)

{'my', 'exceeded', 'thing', 'the', 'worst', 'guess', 'happy', 'just', 'slow', 'it', 'hate', ',', 'amazing', 'response', 'new', 'i’ve', 'feel', "'s", 'terrible', 'quality', '.', 'everything', 'special', 'not', 'it’s', 'meh', 'nothing', 'bought', 'so', 'went', 'was', 'easily', 'but', 'performance', 'how', 'again', 'meet', 'team', 'product', 'best', 'purchase', 'service', 'satisfied', 'totally', 'for', 'is', 'shipping', 'all', 'expected', 'i', 'about', 'support', 'ever', 'anyone', 'alright', 'update', 'indifferent', 'works', 'experience', 'with', 'price', 'much', 'fine', 'this', 'smoothly', 'results', 'fantastic', 'poor', 'never', 'love', 'from', 'fast', 'very', 'and', 'didn’t', '!', 'wouldn’t', 'recommend', 'okay', 'i’m', 'expectations', 'to', 'what', 'absolutely', 'really'}


In [81]:
token = [ [word] for word in my_vocab]      #Converting each word into a list so that we can use it in the multidimensional binarizer/encoding
print(len(token))
print(token)

85
[['my'], ['exceeded'], ['thing'], ['the'], ['worst'], ['guess'], ['happy'], ['just'], ['slow'], ['it'], ['hate'], [','], ['amazing'], ['response'], ['new'], ['i’ve'], ['feel'], ["'s"], ['terrible'], ['quality'], ['.'], ['everything'], ['special'], ['not'], ['it’s'], ['meh'], ['nothing'], ['bought'], ['so'], ['went'], ['was'], ['easily'], ['but'], ['performance'], ['how'], ['again'], ['meet'], ['team'], ['product'], ['best'], ['purchase'], ['service'], ['satisfied'], ['totally'], ['for'], ['is'], ['shipping'], ['all'], ['expected'], ['i'], ['about'], ['support'], ['ever'], ['anyone'], ['alright'], ['update'], ['indifferent'], ['works'], ['experience'], ['with'], ['price'], ['much'], ['fine'], ['this'], ['smoothly'], ['results'], ['fantastic'], ['poor'], ['never'], ['love'], ['from'], ['fast'], ['very'], ['and'], ['didn’t'], ['!'], ['wouldn’t'], ['recommend'], ['okay'], ['i’m'], ['expectations'], ['to'], ['what'], ['absolutely'], ['really']]


#### Using MultiLabelBinarizer to create a multilabel binarizer/encoding for the sentiment data

In [110]:
from sklearn.preprocessing import OneHotEncoder

mlb = OneHotEncoder()
encoded = mlb.fit(token)

print(mlb.categories_[0])  
print(len(mlb.categories_))  # number of unique words in the vocabulary
encoded

['!' "'s" ',' '.' 'about' 'absolutely' 'again' 'all' 'alright' 'amazing'
 'and' 'anyone' 'best' 'bought' 'but' 'didn’t' 'easily' 'ever'
 'everything' 'exceeded' 'expectations' 'expected' 'experience'
 'fantastic' 'fast' 'feel' 'fine' 'for' 'from' 'guess' 'happy' 'hate'
 'how' 'i' 'indifferent' 'is' 'it' 'it’s' 'i’m' 'i’ve' 'just' 'love'
 'meet' 'meh' 'much' 'my' 'never' 'new' 'not' 'nothing' 'okay'
 'performance' 'poor' 'price' 'product' 'purchase' 'quality' 'really'
 'recommend' 'response' 'results' 'satisfied' 'service' 'shipping' 'slow'
 'smoothly' 'so' 'special' 'support' 'team' 'terrible' 'the' 'thing'
 'this' 'to' 'totally' 'update' 'very' 'was' 'went' 'what' 'with' 'works'
 'worst' 'wouldn’t']
1


0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


#### Creating a DataFrame with the encoded vocabulary

In [126]:
token

[['my'],
 ['exceeded'],
 ['thing'],
 ['the'],
 ['worst'],
 ['guess'],
 ['happy'],
 ['just'],
 ['slow'],
 ['it'],
 ['hate'],
 [','],
 ['amazing'],
 ['response'],
 ['new'],
 ['i’ve'],
 ['feel'],
 ["'s"],
 ['terrible'],
 ['quality'],
 ['.'],
 ['everything'],
 ['special'],
 ['not'],
 ['it’s'],
 ['meh'],
 ['nothing'],
 ['bought'],
 ['so'],
 ['went'],
 ['was'],
 ['easily'],
 ['but'],
 ['performance'],
 ['how'],
 ['again'],
 ['meet'],
 ['team'],
 ['product'],
 ['best'],
 ['purchase'],
 ['service'],
 ['satisfied'],
 ['totally'],
 ['for'],
 ['is'],
 ['shipping'],
 ['all'],
 ['expected'],
 ['i'],
 ['about'],
 ['support'],
 ['ever'],
 ['anyone'],
 ['alright'],
 ['update'],
 ['indifferent'],
 ['works'],
 ['experience'],
 ['with'],
 ['price'],
 ['much'],
 ['fine'],
 ['this'],
 ['smoothly'],
 ['results'],
 ['fantastic'],
 ['poor'],
 ['never'],
 ['love'],
 ['from'],
 ['fast'],
 ['very'],
 ['and'],
 ['didn’t'],
 ['!'],
 ['wouldn’t'],
 ['recommend'],
 ['okay'],
 ['i’m'],
 ['expectations'],
 ['to'],
 ['

In [129]:
encoded_df = pd.DataFrame( columns=mlb.categories_[0])   # Initialize an empty DataFrame with the classes as columns and columns are the unique words in the vocabulary
for sentence in sentiment_data['text']:
    words = sentence.lower().split()
    print(f"Current sentence's words: {words}")
    
    # Create a new row with the tokenized words
    new_row = pd.Series([1 if word[0] in words else 0 for word in token], index=encoded_df.columns )
    # print(f"New row: {new_row}")
    encoded_df = pd.concat([encoded_df, new_row.to_frame().T], ignore_index=True)

    

Current sentence's words: ['i', 'love', 'this', 'product', 'so', 'much!']
Current sentence's words: ['the', 'service', 'was', 'absolutely', 'terrible.']
Current sentence's words: ["it's", 'okay,', 'not', 'the', 'best', 'but', 'not', 'the', 'worst.']
Current sentence's words: ['i’m', 'really', 'happy', 'with', 'my', 'experience.']
Current sentence's words: ['this', 'is', 'the', 'worst', 'thing', 'i’ve', 'ever', 'bought.']
Current sentence's words: ['totally', 'satisfied', 'with', 'the', 'purchase!']
Current sentence's words: ['i', 'wouldn’t', 'recommend', 'this', 'to', 'anyone.']
Current sentence's words: ['it', 'was', 'fine,', 'just', 'not', 'what', 'i', 'expected.']
Current sentence's words: ['amazing', 'quality', 'and', 'fast', 'shipping.']
Current sentence's words: ['i', 'hate', 'how', 'slow', 'the', 'response', 'was.']
Current sentence's words: ['everything', 'went', 'smoothly', 'and', 'easily.']
Current sentence's words: ['it', 'didn’t', 'meet', 'my', 'expectations.']
Current sent

In [130]:
encoded_df

Unnamed: 0,!,'s,",",.,about,absolutely,again,all,alright,amazing,...,totally,update,very,was,went,what,with,works,worst,wouldn’t
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1
4,0,0,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,1,1,0,0,0,1,0,0,0
7,0,0,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,1,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
