# Imports

In [None]:
import requests
import pandas as pd
import re
from tqdm import tqdm
import os
import csv


# LeetCode Scraping

## Solved problems

In [None]:
# Clone the JuliaCN/LeetCode.jl repository into the current working directory;
# the scraping cells below read its src/problems and test/problems folders.
!git clone https://github.com/JuliaCN/LeetCode.jl

In [None]:
# Directory holding the cloned problem files.
# NOTE: assumes the repository was cloned into /content (the default working
# directory on Google Colab) -- adjust when running elsewhere.
directory = "/content/LeetCode.jl/src/problems"

# Output CSV: one row per problem, holding its statement and its Julia solution.
csv_file_path = "Julia_leet.csv"

# Markers that delimit the solution code inside each problem file.
CODE_START = "## @lc code=start"
CODE_END = "## @lc code=end"

with open(csv_file_path, mode='w', newline='', encoding='utf-8') as csv_file:
    fieldnames = ['statement', 'solution']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()

    # sorted() makes the row order reproducible; os.listdir() order is arbitrary.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".jl"):
            continue

        with open(os.path.join(directory, filename), encoding='utf-8') as file:
            content = file.read()

        # The title is everything after "title:" up to the next '#'.
        title_match = re.search(r'title:\s*([^#]+)', content)
        title = title_match.group(1) if title_match else ""

        # Drop everything up to and including the whole "hidden: ..." line.
        # Cutting at the end of the line (instead of a fixed character count)
        # handles both "hidden: true" and "hidden: false".
        hidden_index = content.find("hidden:")
        if hidden_index != -1:
            line_end = content.find("\n", hidden_index)
            content = content[line_end:].strip() if line_end != -1 else ""
        content = title + content

        # Split into the statement (before the marker) and the solution (after it).
        parts = content.split(CODE_START)
        if len(parts) != 2:
            # Without exactly one code-start marker the file cannot be split
            # unambiguously, so it is skipped.
            continue

        # Strip the Julia comment markers from the statement text.
        statement = parts[0].replace("#", "").strip()
        solution = parts[1].replace(CODE_END, "").replace("using LeetCode", "").strip()

        writer.writerow({'statement': statement, 'solution': solution})

print(f"CSV file '{csv_file_path}' has been created successfully.")


## Test cases
This section may raise errors when run.

Some cases must be addressed specifically.



In [None]:
def extract_test_cases(s):
    """Parse Julia ``@testset`` blocks and collect their ``@test lhs == rhs`` pairs.

    A testset is recognised when it starts at the beginning of a line as
    ``@testset "<number>.<name>" begin`` and is closed by an ``end`` at the
    same indentation. Anchoring the closing ``end`` this way keeps an ``end``
    inside a test expression (e.g. ``"friend"``) or a nested, deeper-indented
    block from cutting the testset short.

    Args:
        s: Text of a Julia test file (may contain several testsets).

    Returns:
        A list with one dict per testset:
        ``'title'`` is the testset name without the leading ``<number>.`` and
        without a trailing ``.jl`` (if present); ``'test_cases'`` is a list of
        ``(expression, expected)`` string tuples, both stripped of surrounding
        whitespace.
    """
    testset_pattern = re.compile(
        r'^([ \t]*)@testset "(?:\d+\.)(.*?)" begin(.*?)^\1end\b',
        re.DOTALL | re.MULTILINE,
    )
    # `$` with MULTILINE ends each capture at the end of its line.
    test_pattern = re.compile(r'@test\s(.*?)==\s(.*?)$', re.MULTILINE)

    results = []
    for _indent, raw_title, body in testset_pattern.findall(s):
        # Only drop the extension when it is really there, instead of blindly
        # chopping the last three characters.
        title = raw_title[:-3] if raw_title.endswith(".jl") else raw_title
        test_cases = [
            (expression.strip(), expected.strip())
            for expression, expected in test_pattern.findall(body)
        ]
        results.append({'title': title, 'test_cases': test_cases})
    return results

In [None]:
# Directory holding the cloned test files (named so it does not shadow the
# builtin `dir`).
# NOTE: assumes the repository was cloned into /content (the default working
# directory on Google Colab) -- adjust when running elsewhere.
test_directory = "/content/LeetCode.jl/test/problems"

# Output CSV: one row per individual `@test lhs == rhs` assertion.
csv_file_path = "Julia_leet_test.csv"
with open(csv_file_path, mode='w', newline='', encoding='utf-8') as csv_file:
    fieldnames = ['title', 'test_case', 'expected_solution']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()

    # sorted() makes the row order reproducible; os.listdir() order is arbitrary.
    for filename in sorted(os.listdir(test_directory)):
        if not filename.endswith(".jl"):
            continue

        with open(os.path.join(test_directory, filename), encoding='utf-8') as file:
            content = file.read()

        # extract_test_cases returns one dict per testset; flatten its
        # (expression, expected) pairs into one CSV row each.
        for result in extract_test_cases(content):
            for test_case, expected_solution in result['test_cases']:
                writer.writerow({
                    'title': result['title'],
                    'test_case': test_case.strip(),
                    'expected_solution': expected_solution.strip(),
                })

print(f"CSV file '{csv_file_path}' has been created successfully.")



In [None]:
# Two sample testsets used to sanity-check extract_test_cases
s1 = '''@testset "10.regular-expression-matching.jl" begin
    @test is_match("aa", "a") == false
    @test is_match("aa", "a*") == true
    @test is_match("ab", ".*") == true
    @test is_match("aab", "c*a*b") == true
    @test is_match("mississippi", "mis*is*p*.") == false
    @test is_match("aab", "c.") == false
    @test is_match("", "c") == false
end'''

s2 = '''@testset "1.two-sum.jl" begin
    @test two_sum([2, 7, 11, 15], 9) == (1, 2)
    @test two_sum([3, 2, 4], 6) == (2, 3)
    @test two_sum([3, 3], 6) == (1, 2)
end'''

# Parse both samples up front, then print them with one shared loop.
results1 = extract_test_cases(s1)
results2 = extract_test_cases(s2)

for heading, parsed in (("Test set 1:", results1), ("\nTest set 2:", results2)):
    print(heading)
    for entry in parsed:
        print("Title:", entry['title'])
        print("Test cases:")
        for pair in entry['test_cases']:
            print(pair)


In [None]:
# Show the parsed structure of the first sample as this cell's output.
results1

# Data Augmentation

In [None]:
# Hugging Face Inference API endpoint of the summarisation model.
API_URL = "https://api-inference.huggingface.co/models/philschmid/bart-large-cnn-samsum"

# Read the access token from the environment instead of hardcoding it in the
# notebook. Any token that was ever committed in plain text should be treated
# as compromised and revoked.
HF_TOKEN = os.environ.get("HF_TOKEN", "")

# Only send an Authorization header when a token is actually configured.
headers = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}

In [None]:
def summarize(input, api_url=None, token=None, timeout=60):
    """Summarise a text with the Hugging Face Inference API.

    Posts ``{"inputs": input}`` to the summarisation endpoint and returns the
    ``summary_text`` field of the first element of the JSON response.

    Args:
        input: Text to summarise.
        api_url: Endpoint to call; defaults to the
            ``philschmid/bart-large-cnn-samsum`` model endpoint.
        token: API token. When omitted it is read from the ``HF_TOKEN``
            environment variable; if none is available the request is sent
            without an Authorization header.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The summary text, or ``""`` when ``input`` is an empty/blank string
        (no request is made in that case).

    Raises:
        RuntimeError: If the API answers with an error status or with a
            payload that does not contain a summary (for example an
            ``{"error": ...}`` object).
    """
    # Nothing to summarise: avoid a pointless network round trip.
    if isinstance(input, str) and not input.strip():
        return ""

    if api_url is None:
        api_url = "https://api-inference.huggingface.co/models/philschmid/bart-large-cnn-samsum"
    if token is None:
        # Never hardcode credentials in the notebook; take them from the environment.
        token = os.environ.get("HF_TOKEN")
    request_headers = {"Authorization": f"Bearer {token}"} if token else {}

    response = requests.post(
        api_url,
        headers=request_headers,
        json={"inputs": input},
        timeout=timeout,
    )
    if not response.ok:
        raise RuntimeError(
            f"Summarization API request failed ({response.status_code}): {response.text}"
        )

    output = response.json()
    try:
        return output[0]['summary_text']
    except (KeyError, IndexError, TypeError) as exc:
        # Report the unexpected payload instead of a bare lookup error.
        raise RuntimeError(
            f"Unexpected response from summarization API: {output!r}"
        ) from exc

In [None]:
# Load the scraped problems (statement / solution columns).
df = pd.read_csv('Julia_leet.csv')

# First standalone occurrence of "example" (case-insensitive); the statement is
# cut there so that only the text before the examples is summarised.
example_pattern = re.compile(r'\bexample\b', flags=re.IGNORECASE)

processed_statements = []
# Empty CSV cells are read back as NaN (a float); normalise them to "" so the
# regex search below does not raise TypeError.
for statement in tqdm(df['statement'].fillna("").astype(str)):
    match = example_pattern.search(statement)
    processed_statement = (statement[:match.start()] if match else statement).strip()

    if not processed_statement:
        # Nothing to summarise: skip the API call but keep the row aligned.
        processed_statements.append("")
        continue

    processed_statements.append(summarize(processed_statement))

# Add the summaries as a new column, leaving the original columns untouched.
df['summarized_statement'] = processed_statements

# Write the augmented table to a new CSV file.
df.to_csv('summarized_data.csv', index=False)