In [3]:
import pandas as pd
import re

# Load the CSV file
df = pd.read_csv('/kaggle/input/data-science-interview-q-and-a-treasury/dataset.csv')

def clean_text(text):
    if pd.isna(text):
        return text
    # Remove non-ASCII characters
    text = text.encode('ascii', 'ignore').decode('ascii')
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Apply cleaning to 'question' and 'answer' columns
df['question_clean'] = df['question'].apply(clean_text)
df['answer_clean'] = df['answer'].apply(clean_text)

# Filter out rows where answer is empty or contains placeholder text
placeholder_pattern = r'^(Answer here)?$'
df_cleaned = df[
    df['answer_clean'].notna() &
    ~df['answer_clean'].str.match(placeholder_pattern, case=False)
].copy()

# Drop the original columns and rename cleaned ones
df_cleaned = df_cleaned[['question_clean', 'answer_clean']]
df_cleaned.columns = ['question', 'answer']


In [5]:
df_cleaned.head(100)

Unnamed: 0,question,answer
0,What is supervised machine learning?,Supervised learning is a type of machine learn...
1,What is regression? Which models can you use t...,Regression is a part of supervised ML. Regress...
2,What is linear regression? When do we use it?,Linear regression is a model that assumes a li...
3,What are the main assumptions of linear regres...,There are several assumptions of linear regres...
4,Whats the normal distribution? Why do we care ...,The normal distribution is a continuous probab...
...,...,...
99,What is Adam? Whats the main difference betwee...,Adam (Adaptive Moment Estimation) is a optimiz...
100,When would you use Adam and when SGD?,"Adam tends to converge faster, while SGD often..."
101,Do we want to have a constant learning rate or...,"Generally, it is recommended to start learning..."
102,How do we decide when to stop training a neura...,Simply stop training when the validation error...


In [6]:
df_cleaned.to_csv('questions_answers_cleaned.csv', index=False)


In [7]:
import pandas as pd
import re
from bs4 import BeautifulSoup

# 1. Load your data
df = pd.read_csv('/kaggle/working/questions_answers_cleaned.csv')  # or wherever you saved

# 2. Define a super‐cleaner function
def deep_clean(text):
    if pd.isna(text):
        return ''
    # 2a. Remove HTML tags
    text = BeautifulSoup(text, "html.parser").get_text(separator=" ")
    # 2b. Remove Markdown links and images: ![alt](url) and [text](url)
    text = re.sub(r'!\[.*?\]\(.*?\)', '', text)
    text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text)
    # 2c. Strip code fences and inline code
    text = re.sub(r'```.*?```', '', text, flags=re.DOTALL)
    text = re.sub(r'`([^`]+)`', r'\1', text)
    # 2d. Remove bold/italic markers
    text = re.sub(r'\*\*(.*?)\*\*', r'\1', text)
    text = re.sub(r'\*(.*?)\*', r'\1', text)
    # 2e. Remove list bullets and numeric prefixes
    text = re.sub(r'^[\-\*\d]+\.\s+', '', text, flags=re.MULTILINE)
    # 2f. Collapse whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# 3. Apply to the answer column
df['answer_final'] = df['answer'].apply(deep_clean)

# 4. (Optional) Do the same for questions if needed
df['question_final'] = df['question'].apply(deep_clean)

# 5. Keep only non‐empty answers
df = df[df['answer_final'] != '']

# 6. Select your final columns
df = df[['question_final', 'answer_final']].rename(
    columns={'question_final':'question','answer_final':'answer'}
)

# 7. Save out
df.to_csv('qa_preprocessed.csv', index=False)
