In [None]:
# Mount Google Drive at /content/drive (Colab only). The dataset path used
# further down in this notebook points below this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import torch

# Confirm that the GPU is detected: stop with an AssertionError right away
# if PyTorch cannot see a CUDA device.
# NOTE(review): none of the cells visible in this section use the GPU —
# confirm this check is still needed here.
assert torch.cuda.is_available()

In [None]:
import pandas as pd
import re

# Location of the raw CSV on the mounted Google Drive.
file_path = '/content/drive/My Drive/256/Project/dataset.csv'

# header=None: the first line of the file is read as data rather than as
# column names, so the four column names are assigned explicitly below.
data = pd.read_csv(file_path, header=None)
data.columns = ['Problem ID', 'Problem Description', 'Rating', 'Tags']

def clean_data(data):
  """Clean the raw problem table and return the result as a new DataFrame.

  The frame passed in is never modified. Processing steps:
    1. Strip surrounding quote characters and whitespace from
       'Problem Description'.
    2. Convert empty strings to missing values and drop every row that has
       a missing value in any column.
    3. Split the '|'-separated 'Tags' string into a list of tag names.
    4. Scrub links, Markdown artifacts, escaped control sequences and
       boilerplate from each description (see the nested ``clean_text``).

  Args:
    data: DataFrame with string columns 'Problem Description' and 'Tags'.
      Any other columns only take part in the missing-value filter.

  Returns:
    A new DataFrame containing the surviving rows, with cleaned
    descriptions and 'Tags' as lists of strings. The original index labels
    of the surviving rows are kept.
  """
  # Work on a copy so the column assignments below never touch the
  # caller's frame.
  cleaned = data.copy()
  cleaned['Problem Description'] = cleaned['Problem Description'].str.strip("'\"")
  cleaned['Problem Description'] = cleaned['Problem Description'].str.strip()
  cleaned = cleaned.replace('', pd.NA)
  cleaned = cleaned[cleaned['Problem Description'].str.strip() != '']
  # .copy() detaches the filtered rows from their parent frame, so adding
  # or replacing columns afterwards cannot raise SettingWithCopyWarning.
  cleaned = cleaned.dropna().copy()
  cleaned["Tags"] = cleaned["Tags"].apply(lambda x: x.split('|'))

  def clean_text(text):
    """Return ``text`` with links, markup artifacts and boilerplate removed."""
    # Remove URLs
    text = re.sub(r'https?://\S+', '', text)
    # Remove Markdown-style links (e.g., [text](link))
    text = re.sub(r'\[.*?\]\(.*?\)', '', text)
    # Remove Markdown list bullets
    text = re.sub(r'\* ', '', text)
    # Collapse real whitespace (newlines, tabs, non-breaking spaces, ...)
    # into single spaces
    text = re.sub(r'\s+', ' ', text).strip()
    # Literal backslash-n / backslash-r / backslash-t sequences left over
    # from escaping are turned into spaces
    text = re.sub(r'(\\n|\\r|\\t)+', ' ', text)
    # Remove pipes and stray backslashes
    text = re.sub(r'[|\\\n\xa0]', '', text)
    # The two substitutions above can leave runs of spaces (or a leading /
    # trailing space) behind, so normalise the spacing once more.
    text = re.sub(r' {2,}', ' ', text).strip()

    # Drop the copyright footer and everything after it
    text = re.sub(rf'{re.escape("Copyright 2010-2024")}.*', '', text).strip()
    # Drop everything up to and including the "CUSTOM TESTn" marker
    text = re.sub(rf'.*?{re.escape("CUSTOM TESTn")}', '', text).strip()
    return text

  cleaned['Problem Description'] = cleaned['Problem Description'].apply(clean_text)

  return cleaned

# Rebind `data` to the cleaned frame.
# NOTE(review): clean_data splits 'Tags' strings into lists, so running this
# line a second time without re-loading the CSV fails (lists have no .split).
data = clean_data(data)

In [None]:
# Give every tag of a problem its own row so tags can be counted one by one
exploded_tags = data.explode('Tags')

# Count occurrences of each tag (value_counts lists the most frequent first)
tag_counts = exploded_tags['Tags'].value_counts()
print(tag_counts)

In [None]:
important_tags = ['math','greedy','implementation','dp','data structures','constructive algorithms','brute force','binary search','sortings','graphs']

# Built once for fast membership tests instead of re-creating a set per row
_important_tag_set = set(important_tags)

# Filter rows that have at least one important tag.
# .copy() turns the filtered rows into an independent frame, so the column
# assignment below does not raise SettingWithCopyWarning.
data = data[data['Tags'].apply(lambda x: not _important_tag_set.isdisjoint(x))].copy()

# Remove non-important tags from the rows (original tag order is kept)
data['Tags'] = data['Tags'].apply(lambda x: [tag for tag in x if tag in _important_tag_set])

print(data)

In [None]:
# One boolean indicator column per important tag: True when the problem
# carries that tag. The columns are added with assign(), which returns a new
# frame, instead of being written one by one into a frame that originates
# from a filtered slice (that pattern raises SettingWithCopyWarning).
tag_indicators = {
    tag: data['Tags'].apply(lambda tags, tag=tag: tag in tags)
    for tag in important_tags
}
# The list-valued 'Tags' column is fully represented by the indicators now
data = data.assign(**tag_indicators).drop('Tags', axis=1)
data

In [None]:
pip install datasets huggingface_hub

In [None]:
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

# 80 % of the rows go to train; the remaining 20 % are split evenly into
# validation and test. The fixed random_state keeps the split reproducible.
train_data, temp_data = train_test_split(data, train_size=0.8, random_state=42)
val_data, test_data = train_test_split(temp_data, train_size=0.5, random_state=42)

# preserve_index=False: after filtering and shuffling, the frames no longer
# carry a default RangeIndex, and from_pandas would otherwise store that
# index as an extra '__index_level_0__' column in the published dataset.
train_dataset = Dataset.from_pandas(train_data, preserve_index=False)
val_dataset = Dataset.from_pandas(val_data, preserve_index=False)
test_dataset = Dataset.from_pandas(test_data, preserve_index=False)
hf_dataset = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset,
    "test": test_dataset,
})
# Sanity check of the split sizes
print(len(train_data),len(val_data),len(test_data))
hf_dataset

In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# Never paste the access token into the notebook source, where it would be
# saved and shared with the file. Use the HF_TOKEN environment variable when
# it is set; otherwise prompt for the token without echoing it.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
login(hf_token)

# Upload all three splits to the Hugging Face Hub dataset repository
hf_dataset.push_to_hub("mrfire15/CodeforcesProblems")