In [None]:
!pip install underthesea

In [None]:
import pandas as pd

# Read the two raw inputs from the Kaggle dataset.
#   corpus.csv -> columns 'cid' and 'text'
#   train.csv  -> its 'cid' column holds list-like strings such as "[31682 31677]"
corpus_df = pd.read_csv('/kaggle/input/datanlpnew/corpus.csv')
train_df = pd.read_csv('/kaggle/input/datanlpnew/train.csv')

# Map each corpus id to its passage text so later lookups are a dict access
cid_to_text = {
    corpus_cid: passage
    for corpus_cid, passage in zip(corpus_df['cid'], corpus_df['text'])
}

# Parse cid string like "[31682 31677 12345]"
def parse_cid_list(cid_str):
    """Parse one 'cid' cell into a list of integer ids.

    Args:
        cid_str: Either a list-like string such as "[31682 31677 12345]"
            (ids may be separated by whitespace and/or commas), a single
            numeric value, or a missing value (None / NaN).

    Returns:
        list[int]: The ids in their original order. An empty list is returned
        for "[]" and for missing values.

    Raises:
        ValueError: If a token inside the string is not an integer literal.
    """
    # Missing cells (None, or the float NaN pandas uses for empty fields)
    # reference no passages; NaN is the only float that is not equal to itself.
    if cid_str is None or (isinstance(cid_str, float) and cid_str != cid_str):
        return []

    if isinstance(cid_str, str):
        # Treat brackets and commas as plain separators so that "[1 2]",
        # "[1, 2]" and " [1 2] " all parse the same way.
        normalized = cid_str.replace('[', ' ').replace(']', ' ').replace(',', ' ')
        return [int(token) for token in normalized.split()]

    # A bare numeric cell holds exactly one id.
    return [int(cid_str)]

# Gather every cid referenced by the training rows, keeping only ids that
# exist in the corpus. Repeated references are kept at this stage.
all_cids = [
    cid
    for raw_cid in train_df['cid']
    for cid in parse_cid_list(raw_cid)
    if cid in cid_to_text
]

# Reduce to the distinct ids, in ascending order
unique_valid_cids = sorted(set(all_cids))

# Assemble the (cid, context) table for the referenced passages
train_corpus_df = pd.DataFrame({
    'cid': unique_valid_cids,
    'context': [cid_to_text[valid_cid] for valid_cid in unique_valid_cids],
})

# Write the table to the current working directory
train_corpus_df.to_csv('train_corpus.csv', index=False)

# Summary statistics
print(f"Total questions in train.csv: {len(train_df)}")
print(f"Total cid references (including duplicates): {len(all_cids)}")
print(f"Unique valid cids: {len(unique_valid_cids)}")


In [None]:
#Cleaning
import regex as re
import pandas as pd

def clean_text(text):
    """Lower-case ``text`` and strip layout noise from it.

    Steps, in order: line breaks become spaces, the text is lower-cased,
    non-breaking spaces / soft hyphens become spaces, runs of dots collapse
    to a single dot, ellipsis characters and the symbols ``@#%^&*`` are
    removed, runs of two or more hyphens and four or more underscores are
    removed, every colon (plus any dots directly after it) is removed, and
    finally runs of spaces are collapsed and the result is trimmed.

    Args:
        text: The raw text. Any non-string value is treated as missing.

    Returns:
        str: The cleaned text, or "" when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""

    # Handle both "\n" and "\r\n" line endings so no stray "\r" survives.
    text = re.sub(r'[\r\n]+', ' ', text)
    text = text.lower()
    text = re.sub(r'[\xa0\xad]+', ' ', text)
    text = re.sub(r'\.{2,}', '.', text)
    text = re.sub(r'…+', '', text)
    text = re.sub(r'[@#%^&*]+', '', text)
    text = re.sub(r'-{2,}', '', text)
    text = re.sub(r'_{4,}', '', text)
    text = re.sub(r':\.*', '', text)
    # Collapse spaces last: the removals above can leave two spaces side by
    # side (e.g. "a -- b"), so normalizing earlier would miss them.
    text = re.sub(r' {2,}', ' ', text)
    return text.strip()

def clean_id(cid):
    """Normalize a 'cid' / 'qid' cell to a comma-separated id string.

    Square brackets are dropped and the remaining tokens, which may be
    separated by whitespace and/or commas, are joined with single commas,
    e.g. "[31682 31677]" -> "31682,31677".

    Args:
        cid: The raw cell value; it is converted with ``str`` first, so
            numeric cells are accepted as well.

    Returns:
        str: The ids joined by "," with no brackets or spaces ("" if the
        cell contains no tokens).
    """
    without_brackets = re.sub(r'[\[\]]', '', str(cid)).strip()
    # Split on whitespace *and* commas so "[1, 2]" does not become "1,,2".
    tokens = re.split(r'[\s,]+', without_brackets)
    return ','.join(token for token in tokens if token)

def remove_punctuation_numbers(text):
    """Remove everything except letters and whitespace from ``text``.

    The text is first normalized to Unicode NFC, then every character that is
    neither a letter (``\\p{L}``), a combining mark (``\\p{M}``) nor whitespace
    is deleted, and runs of spaces are collapsed.

    Combining marks are kept on purpose: in decomposed text a diacritic is a
    separate ``\\p{M}`` code point, so deleting it would silently turn an
    accented letter into its bare base letter.

    Note: the ``\\p{...}`` classes require ``re`` to be the third-party
    ``regex`` module, as imported in this notebook.

    Args:
        text: The text to filter. Any non-string value is treated as missing.

    Returns:
        str: The filtered, trimmed text, or "" when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""

    # Local import keeps this function self-contained (stdlib only).
    import unicodedata

    # Compose base letter + combining marks into single code points where possible.
    text = unicodedata.normalize('NFC', text)
    text = re.sub(r'[^\p{L}\p{M}\s]', '', text)
    text = re.sub(r' {2,}', ' ', text).strip()
    return text

def filter_words(text, min_len=2, max_len=7):
    """Keep only the whitespace-separated words that pass two checks.

    A word is kept when its length is between ``min_len`` and ``max_len``
    (inclusive) and it contains no character immediately repeated
    (e.g. "aa").

    Args:
        text: The text to filter. Any non-string value is treated as missing.
        min_len: Minimum accepted word length (default 2).
        max_len: Maximum accepted word length (default 7).
            NOTE(review): 7 presumably reflects the longest expected
            syllable in the target language — confirm.

    Returns:
        str: The surviving words joined by single spaces, or "" when
        ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""

    kept = [
        word
        for word in text.split()
        # (.)\1 matches any character directly followed by itself.
        if min_len <= len(word) <= max_len and not re.search(r'(.)\1', word)
    ]
    return ' '.join(kept)
def clean_csv(input_file, output_file):
    """Clean the text and id columns of a CSV file and write the result.

    Text columns ('context', 'question', 'text') that are present are passed
    through ``clean_text`` -> ``remove_punctuation_numbers`` ->
    ``filter_words``. Id columns ('cid', 'qid') that are present are passed
    through ``clean_id``. Columns that are absent are skipped.

    Args:
        input_file: Path of the CSV to read (UTF-8).
        output_file: Path of the cleaned CSV to write (UTF-8, no index).
    """
    def _clean_cell(value):
        # Missing cells become "" here; converting them with str() would
        # produce the literal word "nan", which passes every later filter.
        if pd.isna(value):
            return ""
        return filter_words(remove_punctuation_numbers(clean_text(str(value))))

    df = pd.read_csv(input_file, encoding='utf-8')

    # Clean text columns if they exist
    for column in ('context', 'question', 'text'):
        if column in df.columns:
            df[column] = df[column].apply(_clean_cell)

    # Clean ID columns if they exist
    for column in ('cid', 'qid'):
        if column in df.columns:
            df[column] = df[column].astype(str).apply(clean_id)

    # Save cleaned file
    df.to_csv(output_file, index=False, encoding='utf-8')
    print(f"✅ Cleaned file saved: {output_file}")

# Clean the referenced-passage corpus, then the training questions.
# NOTE(review): the corpus is read from the /kaggle/input dataset, while an
# earlier cell writes 'train_corpus.csv' to the working directory — confirm
# the dataset copy is the intended input.
clean_csv(
    input_file='/kaggle/input/datanlpnew/train_corpus.csv',
    output_file='/kaggle/working/corpus_cleaned.csv',
)
clean_csv(
    input_file='/kaggle/input/datanlpnew/train.csv',
    output_file='/kaggle/working/train_cleaned.csv',
)


In [None]:
#Tokenize
import pandas as pd
from underthesea import word_tokenize, pos_tag

# Tokenizer and tagger
def tokenize_text(text):
    """Run underthesea's ``word_tokenize`` on ``text`` in "text" format.

    The input is coerced with ``str`` before tokenizing.
    NOTE(review): with ``format="text"`` the result is presumably a single
    string rather than a token list — confirm against the underthesea docs.
    """
    as_string = str(text)
    return word_tokenize(as_string, format="text")
    
def process_train(train_input, train_output):
    """Tokenize the 'question' and 'context' columns of the cleaned train CSV.

    Adds 'question_tokenized' and 'context_tokenized' columns and writes the
    frame to ``train_output`` without the index. Only tokenization is
    performed here, even though the progress messages mention tagging.

    Args:
        train_input: Path of the cleaned train CSV; it must contain
            'question' and 'context' columns.
        train_output: Path of the tokenized CSV to write.
    """
    def _tokenize_column(column):
        # Empty fields come back from read_csv as NaN; str(NaN) is "nan",
        # which would be tokenized as a real word. Map missing/empty cells
        # to "" and skip the tokenizer for them.
        return column.fillna("").astype(str).apply(
            lambda cell: tokenize_text(cell) if cell else ""
        )

    df = pd.read_csv(train_input)
    print("🔄 Tokenizing and tagging 'question'...")
    df["question_tokenized"] = _tokenize_column(df["question"])
    print("🔄 Tokenizing and tagging 'context'...")
    df["context_tokenized"] = _tokenize_column(df["context"])
    df.to_csv(train_output, index=False)
    print(f"✅ Tokenized train saved to {train_output}")

def process_corpus(corpus_input, corpus_output):
    """Tokenize the 'context' column of the cleaned corpus CSV.

    Adds a 'context_tokenized' column and writes the frame to
    ``corpus_output`` without the index. Only tokenization is performed
    here, even though the progress message mentions tagging.

    Args:
        corpus_input: Path of the cleaned corpus CSV; it must contain a
            'context' column.
        corpus_output: Path of the tokenized CSV to write.
    """
    df = pd.read_csv(corpus_input)
    print("🔄 Tokenizing and tagging 'context'...")
    # Empty fields come back from read_csv as NaN; str(NaN) is "nan", which
    # would be tokenized as a real word. Map missing/empty cells to "" and
    # skip the tokenizer for them.
    df["context_tokenized"] = df["context"].fillna("").astype(str).apply(
        lambda cell: tokenize_text(cell) if cell else ""
    )
    df.to_csv(corpus_output, index=False)
    print(f"✅ Tokenized corpus saved to {corpus_output}")
# Run
# Tokenize the cleaned train set, then the cleaned corpus.
# Outputs use relative paths, so they land in the current working directory.
process_train(
    train_input="/kaggle/working/train_cleaned.csv",
    train_output="train_tokenized.csv",
)
process_corpus(
    corpus_input="/kaggle/working/corpus_cleaned.csv",
    corpus_output="corpus_tokenized.csv",
)
    

In [None]:
import pandas as pd
import math
from collections import defaultdict
from underthesea import pos_tag

def compute_idf(tokenized_series):
    """Compute BM25-style IDF scores from whitespace-tokenized documents.

    For a word appearing in ``n`` of the ``N`` documents the score is
    ``log((N - n + 0.5) / (n + 0.5) + 1)``.

    Args:
        tokenized_series: A sized iterable of strings (e.g. a pandas Series),
            one per document, with tokens separated by whitespace.

    Returns:
        dict[str, float]: Word -> IDF score, with keys in order of first
        appearance in the documents.
    """
    total_docs = len(tokenized_series)
    doc_freq = defaultdict(int)

    for doc in tokenized_series:
        # dict.fromkeys de-duplicates while keeping first-seen order; a set
        # would make the key order (and any file written from it) depend on
        # per-process string hash randomization.
        for word in dict.fromkeys(doc.split()):
            doc_freq[word] += 1

    return {
        word: math.log((total_docs - freq + 0.5) / (freq + 0.5) + 1)
        for word, freq in doc_freq.items()
    }

def build_vocab_with_pos(corpus_csv, output_csv):
    """Build a vocabulary table (word, idf, pos) from a tokenized corpus CSV.

    IDF scores come from ``compute_idf`` over the 'context_tokenized' column.
    Each vocabulary word is then passed on its own to underthesea's
    ``pos_tag`` and the tag of the first returned item is stored; the tag is
    "" when nothing is returned or tagging fails.

    Args:
        corpus_csv: Path of the tokenized corpus CSV; it must contain a
            'context_tokenized' column.
        output_csv: Path of the vocabulary CSV to write (columns
            'word', 'idf', 'pos'; no index).
    """
    df = pd.read_csv(corpus_csv)

    # Compute IDF from 'context_tokenized'. Empty cells are read back as NaN;
    # fill them first so str(NaN) does not add a bogus "nan" vocabulary word.
    idf_scores = compute_idf(df["context_tokenized"].fillna("").astype(str))

    # POS tag only vocabulary words (one word at a time)
    vocab_data = []
    for word, idf in idf_scores.items():
        try:
            tagged = pos_tag(word)
            pos = tagged[0][1] if tagged else ""
        except Exception:
            # Best effort: a tagging failure leaves the tag empty. Catching
            # Exception (not a bare except) lets Ctrl-C / kernel interrupts
            # stop this potentially long loop.
            pos = ""
        vocab_data.append({"word": word, "idf": idf, "pos": pos})

    # Explicit columns keep the header row even when the vocabulary is empty.
    vocab_df = pd.DataFrame(vocab_data, columns=["word", "idf", "pos"])
    vocab_df.to_csv(output_csv, index=False)
    print(f"✅ Vocabulary saved to {output_csv}")

# Run
# Build the vocabulary from the tokenized corpus; the output path is relative,
# so the file is written to the current working directory.
build_vocab_with_pos(
    corpus_csv="/kaggle/working/corpus_tokenized.csv",
    output_csv="vocabulary.csv",
)


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the tokenized training data produced by the tokenization step
df = pd.read_csv("train_tokenized.csv")

# Shuffle and split 80:20; the fixed random_state makes the split repeatable
train_set, test_set = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

# Write both partitions without the index column
train_set.to_csv("train_set.csv", index=False)
test_set.to_csv("test_set.csv", index=False)

print("✅ train_set.csv and test_set.csv created with 80:20 split")
