In [1]:
import pandas as pd
import re
import os

# Data location. Defaults to the original local folder, but can be overridden
# with the NOTES_DATA_DIR environment variable so the notebook is runnable on
# machines that do not have this exact Windows path.
data_folder = os.environ.get("NOTES_DATA_DIR", r"C:\Users\asus\Downloads\health api")
file_path = os.path.join(data_folder, 'note.csv.gz')

# Load the notes file
nlp_df = pd.read_csv(file_path, compression='gzip')
print(f"Loaded file shape: {nlp_df.shape}")

# Combine text columns into a single field (adjust column names if needed).
# Missing values are replaced with '' *before* the string conversion:
# otherwise a missing cell is rendered as the literal word "nan" (or breaks
# the join), and that junk token would end up in the text sent for translation.
nlp_df['combined_text'] = (
    nlp_df[['notetext', 'notevalue']]
    .fillna('')
    .astype(str)
    .agg(' '.join, axis=1)
    .str.strip()  # drop the dangling separator when one side was empty
)

# Text cleaning function
def clean_text(text):
    """Normalise a free-text note for downstream processing.

    Steps, in order: lowercase, delete digit runs, delete every character
    that is neither a word character nor whitespace, collapse whitespace
    runs to a single space and trim both ends.

    Note that ``\\w`` includes the underscore, so underscores are kept, and
    that punctuation is deleted (not replaced by a space), so tokens joined
    only by punctuation are merged (e.g. ``"a-b"`` becomes ``"ab"``).

    Args:
        text: The note text. ``None`` and NaN are treated as an empty note;
            any other non-string value is converted with ``str()``.

    Returns:
        The cleaned string (possibly empty).
    """
    # Missing values (None, or float NaN which is the only float != itself)
    # would otherwise crash on .lower(); treat them as empty text.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()                          # lowercase
    text = re.sub(r'\d+', '', text)             # remove numbers
    text = re.sub(r'[^\w\s]', '', text)         # remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()    # remove extra spaces
    return text

# Run every combined note through clean_text and keep the result alongside
# the raw text in a new 'cleaned_text' column.
nlp_df['cleaned_text'] = nlp_df['combined_text'].map(clean_text)

# Preview the first few cleaned notes.
print(nlp_df[['cleaned_text']].head())


Loaded file shape: (24758, 8)
                                        cleaned_text
0  include past medical history include past medi...
1                             systemview system view
2                              include rx include rx
3                                             copies
4                             systemview system view


In [2]:
# Build the translation inputs.
# No question/answer extraction is attempted here: each cleaned note is
# treated as one independent piece of text to translate. If the notes contain
# explicit Q&A pairs, extract them before this step instead.
faq_inputs = list(nlp_df['cleaned_text'])
print(f"Total entries for translation: {len(faq_inputs)}")


Total entries for translation: 24758


In [3]:
!pip install sentencepiece




Tokenization / Model Input

In [4]:
from transformers import MarianMTModel, MarianTokenizer

# Pretrained MarianMT checkpoint used for English -> Hindi translation.
model_name = 'Helsinki-NLP/opus-mt-en-hi'

# Fetch (or load from the local cache) the tokenizer and model weights.
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# Encode only the first five notes as a quick smoke test. Sequences are padded
# to a common length and truncated if too long, and returned as PyTorch tensors.
inputs = tokenizer(
    faq_inputs[:5],
    return_tensors="pt",
    padding=True,
    truncation=True,
)


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/812k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/1.07M [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]



pytorch_model.bin:   0%|          | 0.00/306M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/306M [00:00<?, ?B/s]

Translation

In [5]:
# Generate Hindi token ids for the tokenized batch.
translated = model.generate(**inputs)

# Turn every generated sequence back into text, dropping special tokens
# such as padding and end-of-sequence markers.
translated_texts = tokenizer.batch_decode(translated, skip_special_tokens=True)

# Show each English source note next to its Hindi translation.
for en, hi in zip(faq_inputs[:5], translated_texts):
    print(f"EN: {en}\nHI: {hi}\n")


EN: include past medical history include past medical history
HI: चिकित्सा क्षेत्र में पिछले इतिहास में चिकित्सा क्षेत्र शामिल है

EN: systemview system view
HI: तंत्र दृश्य

EN: include rx include rx
HI: आरएक्स मेक्स शामिल करें (x)

EN: copies
HI: प्रतियों की नक़ल करें

EN: systemview system view
HI: तंत्र दृश्य

