# Implementation: Advanced Tricks by Data Type

We will implement specific tricks for Tabular, Text, Image, and Time Series data.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer

import warnings
warnings.filterwarnings('ignore')

## 1. Tabular: Memory Optimization

In [None]:
# Build a one-column frame of 1,000,000 random integers drawn from [0, 100).
df = pd.DataFrame({'nums': np.random.randint(0, 100, 1000000)})
print(f"Original Memory Usage: {df.memory_usage().sum() / 1024**2:.2f} MB")

# Downcast to the smallest signed integer dtype that can hold every value
# (int8 for this 0-99 range). Unlike a hard-coded astype('int8'), this cannot
# silently wrap around if the data ever exceeds the int8 range.
df['nums'] = pd.to_numeric(df['nums'], downcast='integer')
print(f"Optimized Memory Usage: {df.memory_usage().sum() / 1024**2:.2f} MB")

## 2. Text: Bag of Words (Tokenization)

In [None]:
# Toy corpus: three short sentences, one document per list entry.
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
]

# Count-based bag-of-words encoder that drops scikit-learn's built-in
# English stop words before building the vocabulary.
# NOTE(review): the built-in list may remove more words than expected on a
# corpus this small -- check the printed vocabulary.
vectorizer = CountVectorizer(stop_words='english')

# Learn the vocabulary, then encode each document as a row of term counts.
vectorizer.fit(corpus)
X = vectorizer.transform(corpus)

vocabulary = vectorizer.get_feature_names_out()
dense_counts = X.toarray()  # sparse -> dense purely for readable printing
print("Vocabulary:", vocabulary)
print("Encoded Matrix:\n", dense_counts)

## 3. Image: Normalization & Flattening

In [None]:
# Simulate an 8x8 grayscale image. randint's upper bound is exclusive, so it
# must be 256 for pixel values to cover the full 0-255 range.
img = np.random.randint(0, 256, (8, 8))
# Pin the colour scale to 0-255 so the picture is not auto-stretched to the
# minimum/maximum of this particular random sample.
plt.imshow(img, cmap='gray', vmin=0, vmax=255)
plt.show()

# Normalize: scale pixel intensities from [0, 255] into [0.0, 1.0].
img_norm = img / 255.0
# Flatten: collapse the 2-D grid into a 1-D feature vector.
img_flat = img_norm.flatten()
print(f"Original Shape: {img.shape}, Flattened Shape: {img_flat.shape}")

## 4. Time Series: Lag Features

In [None]:
# Five consecutive sales observations, oldest first.
sales_history = [100, 110, 115, 120, 125]
ts = pd.DataFrame({'Sales': sales_history})

# Lag_1 holds the previous row's Sales value; the first row has no
# predecessor, so shift(1) leaves it as NaN.
ts = ts.assign(Lag_1=ts['Sales'].shift(1))

print("Lagged DataFrame (Use yesterday's sales to predict today):")
display(ts)