In [None]:
!pip install openai transformers torch pandas scikit-learn


In [None]:
import os

import openai
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer, T5ForConditionalGeneration

# **Step 1: Dataset Generation with OpenAI API**

# Prefer the OPENAI_API_KEY environment variable so the secret does not have
# to be written into the notebook; the placeholder remains only as a fallback
# for quick local edits.
openai.api_key = os.environ.get('OPENAI_API_KEY', 'your_openai_api_key_here')

# Function to generate informal dates using OpenAI's API
def generate_informal_dates(prompts, *, engine="text-davinci-003",
                            max_tokens=30, temperature=0.5):
    """Return one stripped completion per prompt, in prompt order.

    Args:
        prompts: Iterable of prompt strings; each is sent as its own request.
        engine: Completion model name, forwarded as ``engine``.
        max_tokens: Upper bound on the number of generated tokens per prompt.
        temperature: Sampling temperature; raise it for more randomness.

    Returns:
        A list with the first choice's text for every prompt, with leading
        and trailing whitespace removed. An empty iterable yields ``[]``
        without contacting the API.

    Note:
        This goes through the legacy ``openai.Completion`` interface.
        NOTE(review): confirm the default engine is still served by the API;
        pass ``engine=...`` to switch models without editing this function.
    """
    def _complete(prompt):
        # One request per prompt; only a single choice is requested (n=1).
        response = openai.Completion.create(
            engine=engine,
            prompt=prompt,
            max_tokens=max_tokens,
            n=1,
            stop=None,
            temperature=temperature,
        )
        return response['choices'][0]['text'].strip()

    return [_complete(prompt) for prompt in prompts]

# One generation prompt per target date, built from a shared template.
# The first date appears twice, exactly as in the original hand-written list.
prompts = [
    f"Generate informal date for: {target}"
    for target in (
        "1403/07/01",
        "1403/07/01",
        "1403/08/05",
        "1403/09/25",
        "1403/04/30",
        "1403/01/01",
        "1403/10/14",
        "1403/11/08",
    )
]

# Ask the API for one informal rendering per prompt
informal_dates = generate_informal_dates(prompts)

# Ground-truth formal dates, aligned index-by-index with the prompts
formal_dates = [
    "1403/07/01",
    "1403/07/01",
    "1403/08/05",
    "1403/09/25",
    "1403/04/30",
    "1403/01/01",
    "1403/10/14",
    "1403/11/08",
]

# Pair every generated informal date with its formal counterpart
data = dict(informal_date=informal_dates, formal_date=formal_dates)
df = pd.DataFrame(data)

# Persist the dataset as CSV (without the index column) and echo it
df.to_csv('date_conversion_dataset.csv', index=False)
print(df)

# **Step 2: Data Preparation**
# Load the dataset. Every column is read as text so pandas does not coerce
# numeric-looking values, and rows with a blank field are dropped: an empty
# completion is written as an empty CSV cell, comes back as NaN, and cannot
# be tokenized.
df = pd.read_csv('date_conversion_dataset.csv', dtype=str).dropna().reset_index(drop=True)

# Split the dataset into training and testing sets (fixed seed for a
# reproducible split)
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

# Load the T5 tokenizer and model
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Tokenize the data: informal dates are the model inputs, formal dates the
# targets. Each call pads to the longest sequence in its own batch.
train_encodings = tokenizer(train_data['informal_date'].tolist(), truncation=True, padding=True, return_tensors='pt')
train_labels = tokenizer(train_data['formal_date'].tolist(), truncation=True, padding=True, return_tensors='pt')
test_encodings = tokenizer(test_data['informal_date'].tolist(), truncation=True, padding=True, return_tensors='pt')
test_labels = tokenizer(test_data['formal_date'].tolist(), truncation=True, padding=True, return_tensors='pt')

# **Step 3: Model Training**
# Mask padding in the targets: label positions set to -100 are ignored by the
# loss, so the model is not trained to predict pad tokens.
train_label_ids = train_labels['input_ids'].clone()
train_label_ids[train_label_ids == tokenizer.pad_token_id] = -100

# Create data loaders. The attention mask travels with each example so the
# padded input positions can be excluded from attention.
train_dataset = torch.utils.data.TensorDataset(
    train_encodings['input_ids'],
    train_encodings['attention_mask'],
    train_label_ids,
)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=2, shuffle=True)

# Training settings
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
model.train()

# Train the model
for epoch in range(5):  # Increase the number of epochs for better learning
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids, attention_mask, labels = batch
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        print(f'Epoch: {epoch}, Loss: {loss.item()}')

# **Step 4: Model Evaluation**
# Evaluate the model
model.eval()
# Grab the padding mask before `test_encodings` is rebound to the bare id
# tensor, and pass it to generate() explicitly so padded positions in the
# batch are excluded from attention.
test_attention_mask = test_encodings['attention_mask']
test_encodings = test_encodings['input_ids']
with torch.no_grad():  # inference only, no gradients needed
    outputs = model.generate(test_encodings, attention_mask=test_attention_mask)

# Decode tokens to text
decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)

# Function to ensure correct year, month, and format in predictions
def correct_date_format(predicted, informal):
    """Rebuild a ``year/month/day`` date from the informal text and prediction.

    The year and month are read from ``informal``; only the day is taken from
    ``predicted``. If either the year or the month cannot be found in
    ``informal``, the prediction is returned untouched.

    Args:
        predicted: Model output, expected to look like ``year/month/day``.
        informal: Informal date text. It is searched for a whitespace-separated
            4-digit token (the year) and for one of the month names listed in
            ``month_keywords`` (case-sensitive substring match).

    Returns:
        ``"{year}/{month}/{day}"`` with a two-digit day when both year and
        month were found, otherwise ``predicted`` unchanged. The day falls
        back to ``"01"`` when ``predicted`` does not have exactly three
        ``/``-separated parts or its third part is not a plain number.
    """
    # Define month keywords and their corresponding numbers
    month_keywords = {
        "Farvardin": "01",
        "Ordibehesht": "02",
        "Khordad": "03",
        "Tir": "04",
        "Mordad": "05",
        "Shahrivar": "06",
        "Mehr": "07",
        "Aban": "08",
        "Azar": "09",
        "Dey": "10",
        "Bahman": "11",
        "Esfand": "12",
    }

    # Year: if several 4-digit tokens are present, the last one wins.
    year = None
    for part in informal.split():
        if part.isdigit() and len(part) == 4:
            year = part

    # Month: the first entry of the table (in table order) found in the text.
    month_number = next(
        (num for name, num in month_keywords.items() if name in informal),
        None,
    )

    if not (year and month_number):
        return predicted  # Return the original if no month or year is found

    # Day: third '/'-separated field of the prediction. Surrounding whitespace
    # is stripped and anything that is not a plain number (including an empty
    # field) is replaced by the default, so the result never contains "00"
    # from an empty field or stray non-numeric text.
    parts = predicted.split('/')
    day_text = parts[2].strip() if len(parts) == 3 else ''
    day = f"{int(day_text):02d}" if day_text.isdecimal() else '01'

    return f"{year}/{month_number}/{day}"

# Show each test example next to its expected date and the post-processed
# model prediction
for informal, formal, decoded in zip(
    test_data['informal_date'],
    test_data['formal_date'],
    decoded_outputs,
):
    corrected_decoded = correct_date_format(decoded, informal)
    print(f'Informal: {informal}, Expected: {formal}, Predicted: {corrected_decoded}')
