# 1. One-Hot Encoding

One-Hot Encoding: One-Hot Encoding converts categorical variables into binary indicators, allowing them to be used by machine learning models.

In [21]:
import pandas as pd

data = {
    'Color': ['Red', 'Blue', 'Green', 'Blue']
}
df = pd.DataFrame(data)

In [22]:
df

Unnamed: 0,Color
0,Red
1,Blue
2,Green
3,Blue


In [23]:
df_encoded = pd.get_dummies(
    df,
    columns=['Color'],
    prefix='Color'
)
df_encoded

Unnamed: 0,Color_Blue,Color_Green,Color_Red
0,False,False,True
1,True,False,False
2,False,True,False
3,True,False,False


# 2. Binning

Binning: Binning transforms continuous variables into discrete bins, making them categorical for easier analysis.

In [24]:
import pandas as pd

data = {
    'Age': [23, 45, 18, 34, 67, 50, 21]
}
df = pd.DataFrame(data)

In [25]:
df

Unnamed: 0,Age
0,23
1,45
2,18
3,34
4,67
5,50
6,21


In [26]:
bins = [0, 20, 40, 60, 100]
labels = ['0-20', '21-40', '41-60', '61+']

In [27]:
df['Age_Group'] = pd.cut(
    df['Age'],
    bins=bins,
    labels=labels,
    right=False
)
df

Unnamed: 0,Age,Age_Group
0,23,21-40
1,45,41-60
2,18,0-20
3,34,21-40
4,67,61+
5,50,41-60
6,21,21-40


# 3. Text Data Preprocessing

Text Data Preprocessing: Involves removing stop-words, stemming and vectorizing text data to prepare it for machine learning models.

In [28]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer

In [29]:
texts = [
    "This is a sample sentence.",
    "Text data preprocessing is important."
]

In [30]:
stop_words = set(stopwords.words('english'))
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 "he's",
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 "i'll",
 "i'm",
 "i've",
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [31]:
stemmer = PorterStemmer()
stemmer

<PorterStemmer>

In [32]:
vectorizer = CountVectorizer()
vectorizer

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'
,ngram_range,"(1, ...)"


In [33]:
def preprocess_text(text):
    words = text.split()
    words = [
        stemmer.stem(word) for word in words if word.lower() not in stop_words
    ]
    return  " ".join(words)

In [34]:
cleaned_texts = [
    preprocess_text(text) for text in texts
]

cleaned_texts

['sampl sentence.', 'text data preprocess important.']

In [35]:
X = vectorizer.fit_transform(cleaned_texts)

X.toarray()

array([[0, 0, 0, 1, 1, 0],
       [1, 1, 1, 0, 0, 1]])

# 4. Feature Splitting

Feature Splitting: Divides a single feature into multiple sub-features, uncovering valuable insights and improving model performance.

In [36]:
import pandas as pd

data = {
    'Full_Address': [
        '123 Elm St, Springfield, 12345',
        '456 Oak Rd, Shelbyville, 67890'
    ]
}

df = pd.DataFrame(data)

In [37]:
df[['Street', 'City', 'Zipcode']] = df['Full_Address'].str.extract(
    r'([0-9]+\s[\w\s]+),\s([\w\s]+),\s(\d+)'
)

df

Unnamed: 0,Full_Address,Street,City,Zipcode
0,"123 Elm St, Springfield, 12345",123 Elm St,Springfield,12345
1,"456 Oak Rd, Shelbyville, 67890",456 Oak Rd,Shelbyville,67890
