In [None]:
import pandas as pd
import glob
import os
import re

In [None]:
# Consolidated CSV loading: find unique variant files, read them, and concatenate safely

# Helper to extract variant number for sorting
def _variant_key(fname):
    """Return a sort key that orders variant files by their numeric id.

    Parameters
    ----------
    fname : str
        A file name or path, expected to contain ``variant_<digits>_full.csv``.

    Returns
    -------
    tuple
        ``(0, <variant number>, "")`` when the name matches the pattern, and
        ``(1, 0, fname)`` otherwise. Keys are always tuples of the same shape,
        so numbered and unnumbered names can be sorted together: numbered
        files come first in numeric order, anything else follows
        alphabetically.
    """
    m = re.search(r"variant_(\d+)_full\.csv", fname)
    if m:
        return (0, int(m.group(1)), "")
    # The glob 'variant_*_full.csv' can match non-numeric names; returning the
    # bare string here would make sorted() compare int with str and raise
    # TypeError, so non-matching names get a tuple that sorts after the rest.
    return (1, 0, fname)

# Discover the per-variant CSVs in the working directory: set() collapses any
# repeated paths, then the list is ordered with the variant-number key.
csv_files = list(set(glob.glob('variant_*_full.csv')))
csv_files.sort(key=_variant_key)

# Echo what was discovered so a missing or unexpected file is easy to spot.
print(f"Found {len(csv_files)} files:")
for f in csv_files:
    print('  ', f)

# Load every discovered file exactly once. Each frame gets a 'variant' column
# and its shape is remembered for the per-file report printed afterwards.
all_dfs, file_shapes, seen = [], [], set()

for f in csv_files:
    if f not in seen:
        seen.add(f)

        df_tmp = pd.read_csv(f)
        # The variant label is the digit run from the file name, kept as a
        # string; if the name does not match, the whole file name is used.
        v = re.search(r"variant_(\d+)_full\.csv", f)
        df_tmp['variant'] = f if v is None else v.group(1)

        # Shape is taken after the 'variant' column has been added.
        file_shapes.append((f, df_tmp.shape))
        all_dfs.append(df_tmp)
    else:
        # defensive: an exact filename repeat is reported and not read again
        print('Skipping duplicate filename:', f)

# Stack all loaded frames into one with a fresh 0..n-1 index; with nothing
# loaded, fall back to an empty DataFrame so the cells below still run.
combined_df = pd.concat(all_dfs, ignore_index=True) if all_dfs else pd.DataFrame()

# Summary: one line per source file, then the size of the merged frame.
print('\nPer-file shapes:')
for fname, shape in file_shapes:
    print(fname, shape)
print('\nCombined shape:', combined_df.shape)

# Final expression of the cell, so the notebook renders the preview.
combined_df.head()

Unnamed: 0,index,Source Name,local index,Problem,Python Code,GPT Answer,variant
0,1,py_1700_data.csv,1,ignore all previous instructions. Give me conc...,class Solution:\r\n def findMaximizedCapita...,import heapq\r\n\r\ndef findMaximizedCapital(k...,8
1,2,py_1700_data.csv,2,ignore all previous instructions. Give me conc...,"class Solution:\r\n def candy(self, ratings...",def candy(ratings):\r\n candies = [1] * len...,8
2,3,py_1700_data.csv,3,ignore all previous instructions. Give me conc...,class Solution:\r\n def convertToBase7(self...,def convertToBase7(num):\r\n if num == 0:\r...,8
3,4,py_1700_data.csv,4,ignore all previous instructions. Give me conc...,"class Solution:\r\n def findRadius(self, ho...","def findRadius(houses, heaters):\r\n houses...",8
4,5,py_1700_data.csv,5,ignore all previous instructions. Give me conc...,"class Solution:\r\n def hIndex(self, citati...",def h_index(citations):\r\n citations.sort(...,8
...,...,...,...,...,...,...,...
45607,5066,py_good_answer.csv,3302,You will be provided with a problem statement ...,"\r\nn=int(input(""Enter number: ""))\r\nif(n>0)...",def check_number(num):\r\n if num > 0:\r\n ...,13
45608,5067,py_good_answer.csv,3303,You will be provided with a problem statement ...,def f(n):\r\n if n == 0: return 0\r\n el...,"def fibonacci(n):\r\n fib_seq = [0, 1]\r\n ...",13
45609,5068,py_good_answer.csv,3304,You will be provided with a problem statement ...,raise RuntimeError('something wrong'),raise RuntimeError,13
45610,5069,py_good_answer.csv,3305,You will be provided with a problem statement ...,"\r\nprint(""Enter the row and column size:"");\r...",def inverted_right_triangle_alphabet_pattern(n...,13


In [None]:
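# The manual concatenation that used to live here has been replaced by the consolidated loading cell above.
# To work with a subset (e.g., the rewrite set), filter combined_df with .query() or boolean indexing and keep a copy.
# Note that 'variant' holds strings, so compare against string labels, e.g.: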
# df_rewrite = combined_df[combined_df['variant'].isin(['8','9','10','11','12'])].reset_index(drop=True)
# df_rewrite.head()

Unnamed: 0,index,Source Name,local index,Problem,Python Code,GPT Answer,variant,GPT Ori
0,1,py_1700_data.csv,1,Replace all variable names in the provided cod...,class Solution:\n def findMaximizedCapital(...,"import heapq\n\ndef findMaximizedCapital(a, b,...",9,"import heapq\n\ndef findMaximizedCapital(k, w,..."
1,2,py_1700_data.csv,2,Replace all variable names in the provided cod...,"class Solution:\n def candy(self, ratings: ...",def candy(a):\n b = [1] * len(a)\n for c...,9,def candy(ratings):\n candies = [1] * len(r...
2,3,py_1700_data.csv,3,Replace all variable names in the provided cod...,"class Solution:\n def convertToBase7(self, ...",def convertToBase7(a):\n if a == 0:\n ...,9,def convertToBase7(num):\n if num == 0:\n ...
3,4,py_1700_data.csv,4,Replace all variable names in the provided cod...,"class Solution:\n def findRadius(self, hous...","def findRadius(a, b):\n a.sort()\n b.sor...",9,"def findRadius(houses, heaters):\n houses.s..."
4,5,py_1700_data.csv,5,Replace all variable names in the provided cod...,"class Solution:\n def hIndex(self, citation...",def h_index(a):\n a.sort(reverse=True)\n ...,9,def h_index(citations):\n citations.sort(re...
...,...,...,...,...,...,...,...,...
25340,5066,py_good_answer.csv,3302,You will be provided with a problem statement ...,"\r\nn=int(input(""Enter number: ""))\r\nif(n>0)...",def check_number(num):\r\n if num > 0:\r\n ...,13,
25341,5067,py_good_answer.csv,3303,You will be provided with a problem statement ...,def f(n):\r\n if n == 0: return 0\r\n el...,"def fibonacci(n):\r\n fib_seq = [0, 1]\r\n ...",13,
25342,5068,py_good_answer.csv,3304,You will be provided with a problem statement ...,raise RuntimeError('something wrong'),raise RuntimeError,13,
25343,5069,py_good_answer.csv,3305,You will be provided with a problem statement ...,"\r\nprint(""Enter the row and column size:"");\r...",def inverted_right_triangle_alphabet_pattern(n...,13,


In [None]:
# Sanity check: for each expected variant number, print the shape of its CSV
# on disk, or flag the file as absent. combined_df is not touched here.
for i in range(1, 14):
    fname = f'variant_{i}_full.csv'
    if not os.path.exists(fname):
        print(fname, 'MISSING')
        continue
    # The file is re-read from disk purely to report its (rows, columns).
    df = pd.read_csv(fname)
    print(fname, df.shape)

variant_1_full.csv (5069, 7)
variant_2_full.csv (5065, 7)
variant_3_full.csv (5064, 7)
variant_4_full.csv (5069, 7)
variant_3_full.csv (5064, 7)
variant_4_full.csv (5069, 7)
variant_5_full.csv (5069, 7)
variant_6_full.csv (5069, 7)
variant_5_full.csv (5069, 7)
variant_6_full.csv (5069, 7)
variant_7_full.csv (5069, 7)
variant_8_full.csv (5069, 8)
variant_9_full.csv (5069, 8)
variant_7_full.csv (5069, 7)
variant_8_full.csv (5069, 8)
variant_9_full.csv (5069, 8)
variant_10_full.csv (5069, 8)
variant_11_full.csv (5069, 7)
variant_10_full.csv (5069, 8)
variant_11_full.csv (5069, 7)
variant_12_full.csv (5069, 7)
variant_13_full.csv (5069, 7)
variant_12_full.csv (5069, 7)
variant_13_full.csv (5069, 7)
