<a href="https://colab.research.google.com/github/LorraineWong/WQD7005-Data-Mining-S2152880/blob/main/Lab_Practice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **🧪 Lab 1: Running LLMs on Free GPUs using Google Colab & Hugging Face Spaces**

In [None]:
# 3.Enable GPU Click Runtime → Change Runtime Type and select GPU and click Save
# 4.Install Required Libraries Open a new code cell and run
!pip install torch torchvision torchaudio transformers accelerate
!pip install bitsandbytes

In [None]:
import os
import torch
from huggingface_hub import notebook_login

# Interactive Hugging Face login: the access token is entered at the prompt
# rather than written into a notebook cell.
notebook_login()
# Non-interactive alternative (left disabled): provide the token through an
# environment variable, e.g. os.environ["HUGGINGFACE_TOKEN"] = "your token"

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Checkpoint to download from the Hugging Face Hub.
model_name = "mistralai/Mistral-7B-Instruct-v0.1"

# 4-bit quantization settings handed to the model loader.
quantization_config = BitsAndBytesConfig(load_in_4bit=True)

# No token argument is passed to from_pretrained; presumably the credentials
# stored by notebook_login() in the previous cell are picked up automatically.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
)
# Unquantized alternative (disabled): load the same checkpoint with
# torch_dtype=torch.float16 and device_map="auto" instead of quantization_config.

In [None]:
# Pick the device that the input tensors are sent to in the cells below.
device = "cuda" if torch.cuda.is_available() else "cpu"

# A bitsandbytes-quantized model is already placed on its device by
# from_pretrained, and calling .to() on it raises ValueError in many
# transformers releases, so only move models that are not quantized.
model_is_quantized = (
    getattr(model, "is_quantized", False)
    or getattr(model, "is_loaded_in_4bit", False)
    or getattr(model, "is_loaded_in_8bit", False)
)
if not model_is_quantized:
    model.to(device)

In [None]:
# Define prompt: the question that is tokenized and sent to the model in the
# following cells.
prompt = "What is artificial intelligence?"

In [None]:
# Encode the prompt as PyTorch tensors, then place them on the same device as
# the model.
encoded_prompt = tokenizer(prompt, return_tensors="pt")
inputs = encoded_prompt.to(device)

In [None]:
# Generate response. Without an explicit budget, generate() falls back to the
# default total length of 20 tokens (prompt included), which cuts the answer
# off after a few words; 300 new tokens matches the later labs.
outputs = model.generate(**inputs, max_new_tokens=300)

# Decode the first returned sequence, dropping special tokens such as <s>/</s>.
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

# **🧪 Lab 2: Generative Models for Data Augmentation**

In [None]:
# Lab 2: Applying Generative Models for Data Augmentation in Data Mining

#Step 1: Install Latest SDV
!pip install sdv

In [None]:
# Step 2: Use the correct import for CTGAN (in SDV 1.x)
from sdv.single_table import CTGANSynthesizer
from sdv.metadata import SingleTableMetadata
import pandas as pd
!wget https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv

In [None]:
# Step 3: Generate synthetic data using updated CTGAN usage
# Load the dataset, keep the modelling columns and drop incomplete rows.
FEATURE_COLUMNS = ['Pclass', 'Sex', 'Age', 'Fare', 'SibSp', 'Parch']
df = pd.read_csv('titanic.csv')
df = df[FEATURE_COLUMNS].dropna()
df['Sex'] = df['Sex'].astype(str)

# Define metadata by letting SDV detect the column types from the frame.
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(df)

# Train CTGAN. The synthesizer gets its own name so it does not overwrite the
# `model` variable that holds the language model in the other labs.
synthesizer = CTGANSynthesizer(metadata)
synthesizer.fit(df)

# Generate synthetic rows and preview the first few (rich table display).
synthetic_data = synthesizer.sample(1000)
synthetic_data.head()

# **🧪 Lab 3: Real-Time Data Processing with Small Language Models (SLMs)**

In [None]:
# Step 1: Load a Small Language Model
from transformers import pipeline

# Checkpoint name of the lightweight sentiment-analysis model used in this lab.
SENTIMENT_MODEL = "distilbert-base-uncased-finetuned-sst-2-english"
slm_pipeline = pipeline("sentiment-analysis", model=SENTIMENT_MODEL)

In [None]:
# Step 2: Simulate Real-Time Data Stream
import time

# Pause between messages, in seconds, to mimic data arriving over time.
STREAM_DELAY_SECONDS = 2

# Sample customer messages standing in for a live feed.
data_stream = [
    "The delivery was really fast and efficient!",
    "I am not happy with the product quality.",
    "Excellent customer service.",
    "The website crashed multiple times.",
    "Totally worth the price!"
]

# Score each message as it "arrives". The sleep belongs inside the loop so
# that every message is followed by a delay, not just the whole batch.
for entry in data_stream:
    result = slm_pipeline(entry)
    print(f"Input: {entry}\nSentiment: {result[0]['label']} (Score: {result[0]['score']:.2f})\n")
    time.sleep(STREAM_DELAY_SECONDS)  # Simulate streaming delay

In [None]:
# Step 3: Aggregate Sentiments in Real-Time
# Classify every message once and keep only the predicted label.
labels = [slm_pipeline(message)[0]['label'] for message in data_stream]

# Anything that is not labelled POSITIVE is counted as negative.
positive = sum(1 for label in labels if label == 'POSITIVE')
negative = len(labels) - positive

print(f"Positive: {positive}, Negative: {negative}")

# **🧪 Lab 4: Extracting Business Insights with Quantized Mistral 7B LLM**

In [None]:
# STEP 1: Install Required Libraries
!pip install torch torchvision torchaudio transformers accelerate bitsandbytes --quiet

In [None]:
# STEP 2: Setup Hugging Face Token
import os
import torch

# Authentication options:
#   * Interactive (used here): notebook_login() asks for the access token.
#   * Non-interactive (disabled): store the token in an environment variable
#     and log in with it, replacing "your_token_here" with your own token:
#         os.environ["HUGGINGFACE_TOKEN"] = "your_token_here"

#         from huggingface_hub import login
#         login(token=os.environ["HUGGINGFACE_TOKEN"])

from huggingface_hub import notebook_login
notebook_login()

In [None]:
# STEP 3: Load Mistral 7B with 4-bit Quantization
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Checkpoint to download from the Hugging Face Hub.
model_name = "mistralai/Mistral-7B-Instruct-v0.1"

# 4-bit quantization settings handed to the model loader.
quantization_config = BitsAndBytesConfig(load_in_4bit=True)

# No token argument is passed to from_pretrained; presumably the credentials
# stored by notebook_login() in the previous cell are picked up automatically.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
)

In [None]:
# STEP 4: Move Model to GPU (if available)
import torch

# Device that the input tensors are sent to in STEP 7.
device = "cuda" if torch.cuda.is_available() else "cpu"

# A bitsandbytes-quantized model is already placed on its device by
# from_pretrained, and calling .to() on it raises ValueError in many
# transformers releases, so only move models that are not quantized.
model_is_quantized = (
    getattr(model, "is_quantized", False)
    or getattr(model, "is_loaded_in_4bit", False)
    or getattr(model, "is_loaded_in_8bit", False)
)
if not model_is_quantized:
    model.to(device)

In [None]:
# STEP 5: Define Business Case
# Free-text description of the business scenario; it is interpolated into the
# prompt built in STEP 6.
business_case = """
Our retail company is expanding rapidly and we need better visibility into sales performance across different regions.
We are currently using separate systems in each region and struggling to generate consolidated reports.
Our main goal is to improve decision-making by integrating all sales data into one centralized warehouse.
We also want to track KPIs like regional sales growth, top-performing products, and customer retention rates.
This project is sponsored by the Sales Director and supported by the Regional Managers.
"""

In [None]:
# STEP 6: Craft the Prompt
# Instruction text followed by the business case; asks the model for three
# numbered items (goals, stakeholders, KPIs).
prompt = f"""
You are a business analyst assistant. Given the following business case, extract:

1. Business Goals
2. Key Stakeholders
3. Important KPIs

Business Case:

{business_case}
"""

In [None]:
# STEP 7: Generate LLM Response
# Encode the prompt (truncation enabled) and move the tensors to the device.
encoded_prompt = tokenizer(prompt, return_tensors="pt", truncation=True)
inputs = encoded_prompt.to(device)

# Allow up to 300 newly generated tokens.
outputs = model.generate(**inputs, max_new_tokens=300)

# Decode the first returned sequence without special tokens.
first_sequence = outputs[0]
response = tokenizer.decode(first_sequence, skip_special_tokens=True)

In [None]:
# STEP 8: Print the Response
# Heading, a blank line, then the decoded model output.
print("LLM Extracted Output:\n", response, sep="\n")

# **🧪 Lab 5: Using Hugging Face Transformers to Generate SQL for Star Schema Design**

In [None]:
# Steps 1-4 reuse the setup cells from Lab 4; run them before continuing:
# Step 1: Install Required Libraries
# Step 2: Setup Hugging Face Token
# Step 3: Load Mistral 7B with 4-bit Quantization
# Step 4: Move Model to GPU (Optional)

In [None]:
# Step 5: Create a Pipeline for Text Generation
# Imported here so this lab does not depend on the Lab 3 cell being run first
# (that cell is the only other place `pipeline` is imported).
from transformers import pipeline

# Wrap the already-loaded model and tokenizer in a text-generation pipeline.
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

In [None]:
# STEP 6: Prompt the Model
# Request for PostgreSQL DDL covering one fact table and four dimension tables.
prompt = """
Design a star schema for a sales analytics data warehouse.
Include SQL DDL statements to create the tables: sales_fact, product_dim, customer_dim, time_dim, and store_dim.
Use PostgreSQL syntax.
"""

In [None]:
# STEP 7: Generate and Print the Output
# Sampling settings: up to 512 new tokens, sampling enabled at temperature 0.5.
generation_kwargs = dict(max_new_tokens=512, temperature=0.5, do_sample=True)
output = generator(prompt, **generation_kwargs)

# Show the generated text of the first entry in the pipeline's result.
sql_text = output[0]['generated_text']
print(" Star Schema SQL Output:\n")
print(sql_text)

# **🧪 Lab 6: Using Mistral 7B to Detect Data Quality Issues**

In [None]:
# STEP 1: Install Required Libraries
!pip install torch torchvision torchaudio transformers accelerate bitsandbytes --quiet
!pip install -U bitsandbytes

In [None]:
# Steps 2-3 reuse the setup cells from Lab 4; run them before continuing:
# Step 2: Setup Hugging Face Token
# Step 3: Load Mistral 7B with 4-bit Quantization

In [None]:
# STEP 4: Create Sample Dataset
import pandas as pd

# Small customer table containing deliberately messy values: missing entries,
# malformed e-mail addresses, inconsistent date and country formats, and
# revenue stored as text.
data = dict(
    CustomerID=[1001, 1002, 1003, None, 1005],
    Name=['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    Email=['alice@gmail', 'bob@yahoo.com', 'charlie@@mail.com', None, 'eve@gmail.com'],
    JoinDate=['2021-01-01', '2021-02-30', '2021-03-15', 'bad_date', '2021/04/01'],
    Country=['MY', 'Malaysia', 'MY', 'Singapore', None],
    Revenue=['1000', 'Two Thousand', '3000', '-500', '4000'],
)
df = pd.DataFrame(data)

In [None]:
# STEP 5: Convert Schema into Prompt
# str(df.dtypes) lists each column name with its pandas dtype; the cell values
# themselves are not included in the prompt.
schema = str(df.dtypes)

# Instruction text followed by the schema listing.
prompt = f"""
You are a data quality expert. Analyze the schema below and:

1. Identify 3 potential data quality issues.
2. Suggest data profiling steps.
3. Recommend data standardization or validation actions.

Schema:

{schema}
"""

In [None]:
# STEP 6: Tokenize Input and Run Model
# Send the inputs to whichever device holds the model instead of assuming
# that it is "cuda".
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=300)

# generate() returns the prompt tokens followed by the continuation. Slice by
# token count so only the newly generated part is decoded; cutting the decoded
# string by len(prompt) breaks whenever decoding does not reproduce the prompt
# character for character.
prompt_token_count = inputs["input_ids"].shape[1]
response = tokenizer.decode(outputs[0][prompt_token_count:], skip_special_tokens=True)
print(response.strip())

# **🧪 Lab 7: Handling Missing Text Data with Transformer-based Small Language Models (SLMs)**

In [None]:
# Step 1: Setup Colab Environment
!pip install transformers missingno

In [None]:
# Step 2: Create a Dataset with Missing Values
import pandas as pd
import numpy as np

# Toy review table with gaps (np.nan) in both the text and the label column.
reviews = ['This place is wonderful!', np.nan, 'I feel', 'Service was', np.nan]
sentiments = ['positive', 'neutral', np.nan, 'negative', 'neutral']
data = {'Review': reviews, 'Sentiment': sentiments}

df = pd.DataFrame(data)
df

In [None]:
# Step 3: Visualize Missing Data
import missingno as msno
# Draw missingno's matrix plot for the frame to show where values are absent.
msno.matrix(df)

In [None]:
# Step 4: Load a Small Language Model (SLM)
from transformers import pipeline

# Build a fill-mask pipeline on the 'prajjwal1/bert-tiny' checkpoint; it is
# used below to predict the token hidden behind [MASK].
fill_mask = pipeline('fill-mask', model='prajjwal1/bert-tiny')

In [None]:
# Step 5: Demonstrate Simple Masked Prediction
# The sentence contains one [MASK] placeholder for the pipeline to fill in.
text = "The service here is [MASK]."
predictions = fill_mask(text)
# Display the pipeline's predictions as the cell output.
predictions

In [None]:
# Step 6: Impute Missing Values using the SLMs
def slm_impute(text, sentiment, slm_pipeline):
    """Return ``text`` unchanged, or a model-predicted word when it is missing.

    Args:
        text: Value to check; treated as missing when ``pd.isna(text)`` is true.
        sentiment: Value interpolated into the masked prompt.
        slm_pipeline: Callable that takes the prompt string and returns a
            sequence of predictions, each a mapping with a ``'token_str'`` entry.

    Returns:
        ``text`` when it is present; otherwise the ``'token_str'`` of the first
        prediction returned for the masked prompt.
    """
    # Guard clause: nothing to impute when a value is already there.
    if not pd.isna(text):
        return text

    masked_prompt = f"The sentiment of this review is {sentiment}, it is [MASK]."
    candidates = slm_pipeline(masked_prompt)
    return candidates[0]['token_str']

# Fill the missing Review entries row by row; values that are already present
# pass through slm_impute unchanged.
def _impute_review(row):
    return slm_impute(row['Review'], row['Sentiment'], fill_mask)

df['Review'] = df.apply(_impute_review, axis=1)
df

In [None]:
# Fill the missing Sentiment entries with the same helper.
# NOTE(review): here the Sentiment value is passed as `text` and the Review as
# `sentiment`, so the review text is what gets interpolated into the masked
# prompt. Confirm this prompt wording is intended.
df['Sentiment'] = df.apply(lambda row: slm_impute(row['Sentiment'], row['Review'], fill_mask), axis=1)
df