In [None]:
!pip install sdv --quiet

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/197.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m197.0/197.0 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/139.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.3/139.3 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/14.3 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/14.3 MB[0m [31m212.7 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m14.3/14.3 MB[0m [31m223.1 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m14.3/14.3 MB[0m [31m223.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━

In [None]:
import pandas as pd
from sdv.metadata import SingleTableMetadata
from sdv.single_table import CTGANSynthesizer

# Load the real dataset that the synthesizers in this notebook are fitted on.
# NOTE(review): absolute Colab path — the CSV must exist at /content for this cell to run.
df = pd.read_csv("/content/real_0.6.csv")

# Let SDV infer the column types from the raw (un-normalised) frame.
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(df)

# Number of synthetic rows requested via `sample(num_rows=lens)` in later cells.
# NOTE(review): presumably the row count required by the submission — confirm.
lens = 3322

In [None]:
def submit(data, subname, fill_symbol="ADANIPORTS"):
    """Write `data` to ``<subname>.csv`` in the submission column layout.

    The input frame is not modified. Missing values are filled, rows are
    sorted by ``t`` and numbered from 1 in ``row_id_column_name``, and only
    the submission columns are written (any other column, e.g. ``Series``,
    is dropped by the final column selection).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing every submission column except
        ``row_id_column_name``, which is generated here.
    subname : str
        Output file name without the ``.csv`` extension.
    fill_symbol : str, optional
        Value used to fill missing entries in object/category columns.
        Defaults to ``"ADANIPORTS"``.

    Raises
    ------
    KeyError
        If `data` lacks one of the required submission columns.
    """
    submission_cols = [
        "row_id_column_name", "Symbol", "Prev Close", "Open", "High", "Low",
        "Last", "Close", "VWAP", "Volume", "Turnover", "Trades",
        "Deliverable Volume", "%Deliverble", "t"
    ]

    df = data.copy()

    # Fail early with a readable message instead of midway through processing.
    missing = [col for col in submission_cols[1:] if col not in df.columns]
    if missing:
        raise KeyError(f"submit(): input is missing required columns: {missing}")

    # ---- Fill missing values ----
    # Categorical (object or category) columns get `fill_symbol`.
    for col in df.select_dtypes(include=["object", "category"]).columns:
        # fillna() on a categorical column raises unless the value is already
        # a known category, so register it first.
        if (isinstance(df[col].dtype, pd.CategoricalDtype)
                and fill_symbol not in df[col].cat.categories):
            df[col] = df[col].cat.add_categories([fill_symbol])
        df[col] = df[col].fillna(fill_symbol)

    # Numerical columns get their own column mean.
    for col in df.select_dtypes(include=["number"]).columns:
        df[col] = df[col].fillna(df[col].mean())

    # ---- Build the submission frame ----
    # A stable sort keeps the input order of rows that share the same `t`,
    # so the generated row ids are reproducible between runs.
    df = df.sort_values("t", kind="stable").reset_index(drop=True)
    df["row_id_column_name"] = range(1, len(df) + 1)

    df = df[submission_cols]
    df.to_csv(f"{subname}.csv", index=False)

    print("Done! Matched EXACT submission format.")
    print(f"Done! Saved as {subname}.csv - file submission done.")


In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import r2_score, mean_squared_error

def evaluate_predictions(original_df, synthetic_df, min_samples=10):
    """Compute per-column similarity metrics between real and synthetic data.

    Only numeric columns of `original_df` that also exist in `synthetic_df`
    are evaluated. Rows where either side is NaN are dropped for that column;
    the two columns are aligned on their index labels.

    Parameters
    ----------
    original_df : pandas.DataFrame
        The real data.
    synthetic_df : pandas.DataFrame
        The generated data, indexed compatibly with `original_df`.
    min_samples : int, optional
        A column is evaluated only when more than this many paired rows
        remain after NaN removal. Defaults to 10.

    Returns
    -------
    dict
        ``{column: metrics}`` where ``metrics`` has the keys ``rmse``,
        ``mae``, ``wasserstein_distance``, ``ks_statistic``, ``mean_ratio``,
        ``std_ratio`` and ``r2_score``.

    Notes
    -----
    ``rmse``, ``mae`` and ``r2_score`` compare rows pairwise, while the
    Wasserstein/KS values and the mean/std ratios compare whole
    distributions.
    NOTE(review): for independently sampled synthetic rows the pairwise
    metrics are presumably not meaningful — confirm before relying on them.
    """
    # Imported once per call rather than once per evaluated column.
    from scipy.stats import wasserstein_distance, ks_2samp

    results = {}
    numerical_cols = original_df.select_dtypes(include=[np.number]).columns

    for col in numerical_cols:
        if col not in synthetic_df.columns:
            continue

        # Keep only rows that are present (non-NaN) on both sides.
        mask = original_df[col].notna() & synthetic_df[col].notna()
        orig = original_df[col][mask]
        synth = synthetic_df[col][mask]

        if len(orig) <= min_samples:
            continue

        results[col] = {
            # Basic error metrics (row-wise)
            'rmse': np.sqrt(mean_squared_error(orig, synth)),
            'mae': np.mean(np.abs(orig - synth)),

            # Statistical similarity (distribution-wise)
            'wasserstein_distance': wasserstein_distance(orig, synth),
            'ks_statistic': ks_2samp(orig, synth).statistic,

            # Distribution preservation
            'mean_ratio': synth.mean() / orig.mean(),
            'std_ratio': synth.std() / orig.std(),

            # Row-wise goodness of fit
            'r2_score': r2_score(orig, synth)
        }

    return results

def print_summary(results):
    """Print a readable report of the metrics produced by the evaluation step.

    `results` maps a column name to a metrics dict; for every column the
    Wasserstein distance, KS statistic, mean ratio, std ratio and R2 score
    are printed with four decimals.
    """
    print("COMPREHENSIVE SYNTHETIC DATA EVALUATION")
    print("=" * 60)

    # (label, metrics key, trailing hint) for each printed row, in print order.
    layout = (
        ("Wasserstein Distance", "wasserstein_distance", " (lower = better)"),
        ("KS Statistic", "ks_statistic", " (lower = better)"),
        ("Mean Ratio", "mean_ratio", " (closer to 1.0 = better)"),
        ("Std Ratio", "std_ratio", " (closer to 1.0 = better)"),
        ("R2 Score", "r2_score", ""),
    )

    for name in results:
        values = results[name]
        print(f"\n{name}:")
        for label, key, hint in layout:
            print(f"  {label}: {values[key]:.4f}{hint}")


In [None]:
def assess_synthetic_quality(results):
    """Print a pass/fail line per column and return the fraction that passed.

    A column passes when all of the following hold:

    * ``wasserstein_distance`` is at most 0.3,
    * ``std_ratio`` is within 0.2 of 1.0,
    * ``mean_ratio`` is within 0.1 of 1.0.

    A NaN metric counts as a failure of the corresponding check.

    Parameters
    ----------
    results : dict
        ``{column: metrics}`` where each metrics dict provides the three
        keys listed above.

    Returns
    -------
    float
        ``passed / total``; ``0.0`` when `results` is empty.

    Notes
    -----
    NOTE(review): the Wasserstein limit is an absolute number, so it looks
    scale-dependent (data normalised to [0, 1] vs. raw values) — confirm the
    intended scale of the inputs.
    """
    print("QUALITY ASSESSMENT:")
    print("=" * 40)

    good_columns = 0
    total_columns = len(results)

    for col, metrics in results.items():
        quality_issues = []

        # Checks are phrased as `not (value <= limit)` so that a NaN metric
        # (for which every comparison is False) is reported as an issue
        # instead of silently passing.
        if not (metrics['wasserstein_distance'] <= 0.3):
            quality_issues.append("poor distribution match")
        if not (abs(metrics['std_ratio'] - 1.0) <= 0.2):
            quality_issues.append("variance not preserved")
        if not (abs(metrics['mean_ratio'] - 1.0) <= 0.1):
            quality_issues.append("mean shifted")

        if not quality_issues:
            print(f"✓ {col}: GOOD")
            good_columns += 1
        else:
            print(f"✗ {col}: Issues - {', '.join(quality_issues)}")

    print(f"\nOverall Quality: {good_columns}/{total_columns} columns passed")

    # Nothing was evaluated: report 0.0 rather than dividing by zero.
    if total_columns == 0:
        return 0.0
    return good_columns / total_columns

# CTGAN Model

In [None]:
def preprocess_for_ctgan(df,
                         categorical_cols=('Symbol', 'Series'),
                         exclude_cols=('t',),
                         skew_threshold=10):
    """Scale the numeric columns of `df` to [0, 1], log-compressing skewed ones.

    The input frame is not modified. For each numeric column that is neither
    categorical nor excluded:

    1. if its skewness exceeds `skew_threshold` and every value is greater
       than -1, ``np.log1p`` is applied;
    2. if the (possibly log-transformed) column is not constant, it is
       min-max scaled to [0, 1].

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to transform.
    categorical_cols : iterable of str, optional
        Columns to leave untouched because they are categorical.
    exclude_cols : iterable of str, optional
        Other columns to leave untouched.
    skew_threshold : float, optional
        Skewness above which the log transform is applied. Defaults to 10.

    Returns
    -------
    (pandas.DataFrame, dict)
        The transformed copy and ``{column: {'log', 'min', 'max'}}``, where
        ``min``/``max`` are taken after the optional log transform. These are
        the values needed to invert the transformation.
    """
    df_processed = df.copy()

    # Built once; accepts lists, tuples or sets for either argument.
    skipped = set(categorical_cols) | set(exclude_cols)

    numeric_cols = [
        col for col in df.columns
        if col not in skipped
        and np.issubdtype(df[col].dtype, np.number)
    ]

    # Store log flag and min/max per column for reversing
    stats = {}

    for col in numeric_cols:
        values = df_processed[col]

        # log1p is undefined for values <= -1 (it yields NaN / -inf, which
        # would then corrupt min/max), so only log-transform columns where it
        # is safe, and record the decision so it can be inverted consistently.
        use_log = bool(df[col].skew() > skew_threshold) and bool(values.min() > -1)
        if use_log:
            values = np.log1p(values)

        # Normalize; a constant column is left as-is to avoid dividing by zero.
        min_val = values.min()
        max_val = values.max()
        if max_val > min_val:
            values = (values - min_val) / (max_val - min_val)

        df_processed[col] = values
        stats[col] = {'log': use_log, 'min': min_val, 'max': max_val}

    return df_processed, stats

def postprocess_synthetic_data(synthetic_df, stats,
                               categorical_cols=['Symbol','Series']):
    """Map normalised synthetic columns back to their original scale.

    For every column listed in `stats` the values are clipped to [0, 1],
    rescaled with the stored ``min``/``max`` and, when the stored ``log``
    flag is set, passed through ``np.expm1``. A transformed copy is returned;
    `synthetic_df` itself is left untouched. `categorical_cols` is accepted
    but not used by the body.
    """
    restored = synthetic_df.copy()

    for name in stats:
        info = stats[name]
        lower = info['min']
        upper = info['max']

        # Generated values can fall outside the unit interval; pin them first.
        unit = restored[name].clip(0, 1)

        # Undo the min-max scaling.
        values = unit * (upper - lower) + lower

        # Undo the log1p compression where it was applied.
        if info['log']:
            values = np.expm1(values)

        restored[name] = values

    return restored

# Columns passed to preprocess_for_ctgan as categorical (left un-normalised).
categorical = ["Symbol", "Series"]

# Normalise the numeric columns of the real data; `stats` holds the per-column
# log flag and min/max needed to invert the transformation later.
df_prep, stats = preprocess_for_ctgan(df, categorical_cols=categorical, exclude_cols=['t'])

# Separate copy used as the CTGAN training frame, so `df_prep` stays untouched.
df2 = df_prep.copy()

In [None]:
# Configure CTGAN. `metadata` was detected from the raw frame `df`, while the
# model below is fitted on the normalised frame `df2`.
# NOTE(review): no random seed is set in this notebook, so results will vary between runs.
ctgan = CTGANSynthesizer(
     metadata=metadata,
        epochs=100,
        generator_dim=(512, 512, 512, 512),
        discriminator_dim=(512, 512, 512, 512),
        generator_lr=2e-4,
        discriminator_lr=2e-4,
        pac=10,
        cuda=True,
)
# Train on the normalised data produced by preprocess_for_ctgan.
ctgan.fit(df2)

# Persist the fitted synthesizer to the working directory.
ctgan.save("CTGAN_model.pkl")

# Draw `lens` synthetic rows; values are still on the normalised scale here.
df_syn = ctgan.sample(num_rows=lens)

df_syn.to_csv("synthetic_CTGAN.csv", index=False)

print("CTGAN training completed and files saved.")
#submit(df_syn, "CTGAN_submission_100")
# Evaluate in normalised space; the synthetic frame is truncated to the number
# of real rows so both sides have the same length.
results = evaluate_predictions(df2, df_syn.iloc[0:len(df2)])
assess_synthetic_quality(results)



CTGAN training completed and files saved.
QUALITY ASSESSMENT:
✗ Prev Close: Issues - mean shifted
✗ Open: Issues - variance not preserved, mean shifted
✗ High: Issues - variance not preserved
✗ Low: Issues - variance not preserved, mean shifted
✓ Last: GOOD
✗ Close: Issues - mean shifted
✗ VWAP: Issues - variance not preserved, mean shifted
✗ Volume: Issues - poor distribution match, variance not preserved, mean shifted
✗ Turnover: Issues - variance not preserved, mean shifted
✗ Trades: Issues - variance not preserved, mean shifted
✓ Deliverable Volume: GOOD
✗ %Deliverble: Issues - variance not preserved, mean shifted
✗ t: Issues - poor distribution match, mean shifted

Overall Quality: 2/13 columns passed


0.15384615384615385

In [None]:
# Map the CTGAN samples back to the original value scale using the stored stats.
test = postprocess_synthetic_data(df_syn, stats)
# Evaluate against the raw real data, truncating the synthetic rows to len(df).
# NOTE(review): assess_synthetic_quality uses an absolute Wasserstein limit (0.3),
# which presumably explains why nearly every column is flagged on the raw scale — confirm.
results = evaluate_predictions(df, test.iloc[0:len(df)])
assess_synthetic_quality(results)
# Write the de-normalised synthetic data as CTGAN.csv in the submission layout.
submit(test, "CTGAN")

QUALITY ASSESSMENT:
✗ Prev Close: Issues - poor distribution match, mean shifted
✗ Open: Issues - poor distribution match, variance not preserved, mean shifted
✗ High: Issues - poor distribution match, variance not preserved
✗ Low: Issues - poor distribution match, variance not preserved, mean shifted
✗ Last: Issues - poor distribution match
✗ Close: Issues - poor distribution match, mean shifted
✗ VWAP: Issues - poor distribution match, variance not preserved, mean shifted
✗ Volume: Issues - poor distribution match, variance not preserved, mean shifted
✗ Turnover: Issues - poor distribution match, variance not preserved, mean shifted
✗ Trades: Issues - poor distribution match, variance not preserved, mean shifted
✗ Deliverable Volume: Issues - poor distribution match
✗ %Deliverble: Issues - variance not preserved, mean shifted
✗ t: Issues - poor distribution match, mean shifted

Overall Quality: 0/13 columns passed
Done! Matched EXACT submission format.
Done! Saved as CTGAN .csv file 

In [None]:
# Inspect the `t` column of the real data (cell output shows it for review).
df['t']

Unnamed: 0,t
0,0
1,2
2,8
3,9
4,10
...,...
1988,4896
1989,4899
1990,4900
1991,4901


In [None]:
def debug_data_issues(original_df, synthetic_df):
    """Print side-by-side summary statistics for real and synthetic columns.

    For every numeric column of `original_df` that also exists in
    `synthetic_df`, prints the mean, standard deviation and value range of
    both sides (computed on the non-NaN values) followed by the number of
    NaNs each side contained.

    Parameters
    ----------
    original_df : pandas.DataFrame
        The real data.
    synthetic_df : pandas.DataFrame
        The generated data to compare against it.
    """
    print("DEBUG INFO:")
    print("=" * 40)

    numerical_cols = original_df.select_dtypes(include=[np.number]).columns

    for col in numerical_cols:
        if col not in synthetic_df.columns:
            continue

        # Count NaNs BEFORE dropping them; counting on the already-cleaned
        # series would always report zero.
        orig_nans = int(original_df[col].isna().sum())
        synth_nans = int(synthetic_df[col].isna().sum())

        orig = original_df[col].dropna()
        synth = synthetic_df[col].dropna()

        print(f"\n{col}:")
        print(f"Original stats: mean={orig.mean():.2f}, std={orig.std():.2f}")
        print(f"Synthetic stats: mean={synth.mean():.2f}, std={synth.std():.2f}")
        print(f"Original range: [{orig.min():.2f}, {orig.max():.2f}]")
        print(f"Synthetic range: [{synth.min():.2f}, {synth.max():.2f}]")

        # Missing-value counts of the raw columns
        print(f"Original NaNs: {orig_nans}, Synthetic NaNs: {synth_nans}")

# Run the diagnostic on the real frame `df` against the post-processed CTGAN sample `test`.
debug_data_issues(df, test)

DEBUG INFO:

Prev Close:
Original stats: mean=343.48, std=193.10
Synthetic stats: mean=355.42, std=176.64
Original range: [108.00, 1307.45]
Synthetic range: [108.00, 1307.45]
Original NaNs: 0, Synthetic NaNs: 0

Open:
Original stats: mean=343.95, std=193.71
Synthetic stats: mean=391.33, std=314.28
Original range: [108.00, 1305.00]
Synthetic range: [108.00, 1305.00]
Original NaNs: 0, Synthetic NaNs: 0

High:
Original stats: mean=350.95, std=198.65
Synthetic stats: mean=354.99, std=208.74
Original range: [110.45, 1319.00]
Synthetic range: [110.45, 1319.00]
Original NaNs: 0, Synthetic NaNs: 0

Low:
Original stats: mean=336.76, std=188.80
Synthetic stats: mean=451.68, std=199.46
Original range: [105.65, 1263.70]
Synthetic range: [105.65, 1263.70]
Original NaNs: 0, Synthetic NaNs: 0

Last:
Original stats: mean=343.69, std=193.47
Synthetic stats: mean=460.01, std=233.51
Original range: [108.00, 1308.00]
Synthetic range: [108.00, 1308.00]
Original NaNs: 0, Synthetic NaNs: 0

Close:
Original s

# TVAE - TRAIN & SAVE

In [None]:
import pandas as pd
from sdv.metadata import SingleTableMetadata
from sdv.single_table import TVAESynthesizer

# Fit TVAE directly on the raw frame `df` (no preprocess_for_ctgan normalisation here).
tvae = TVAESynthesizer(metadata, epochs=100)
tvae.fit(df)

# Persist the fitted synthesizer to the working directory.
tvae.save("TVAE_model.pkl")

# Sample as many rows as the real data has, so the slice below is a no-op in length.
df_syn2 = tvae.sample(num_rows=len(df))
results = evaluate_predictions(df, df_syn2.iloc[0:len(df)])
assess_synthetic_quality(results)
#submit(df_syn2, "TVAE_submission_50s")



QUALITY ASSESSMENT:
✗ Prev Close: Issues - poor distribution match, variance not preserved
✗ Open: Issues - poor distribution match, variance not preserved
✗ High: Issues - poor distribution match
✗ Low: Issues - poor distribution match, variance not preserved
✗ Last: Issues - poor distribution match, variance not preserved
✗ Close: Issues - poor distribution match
✗ VWAP: Issues - poor distribution match, variance not preserved
✗ Volume: Issues - poor distribution match, variance not preserved, mean shifted
✗ Turnover: Issues - poor distribution match, variance not preserved, mean shifted
✗ Trades: Issues - poor distribution match, variance not preserved
✗ Deliverable Volume: Issues - poor distribution match, variance not preserved, mean shifted
✓ %Deliverble: GOOD
✗ t: Issues - poor distribution match

Overall Quality: 1/13 columns passed


0.07692307692307693

# Gaussian Copula

In [None]:
import pandas as pd
from sdv.metadata import SingleTableMetadata
from sdv.single_table import GaussianCopulaSynthesizer


# Fit a Gaussian copula synthesizer on the raw frame `df` with default settings.
# NOTE(review): the name `gc` would shadow the standard-library `gc` module if
# that were ever imported in this notebook.
gc = GaussianCopulaSynthesizer(
    metadata
    )
gc.fit(df)

# Persist the fitted synthesizer to the working directory.
gc.save("GaussianCopula_model.pkl")

# Draw `lens` rows, then evaluate only the first len(df) of them against the real data.
df_syn3 = gc.sample(num_rows=lens)
results = evaluate_predictions(df, df_syn3.iloc[0:len(df)])
assess_synthetic_quality(results)
#submit(df_syn3, "Gaussian_submission")



QUALITY ASSESSMENT:
✗ Prev Close: Issues - poor distribution match
✗ Open: Issues - poor distribution match
✗ High: Issues - poor distribution match
✗ Low: Issues - poor distribution match
✗ Last: Issues - poor distribution match
✗ Close: Issues - poor distribution match
✗ VWAP: Issues - poor distribution match
✗ Volume: Issues - poor distribution match, variance not preserved, mean shifted
✗ Turnover: Issues - poor distribution match, variance not preserved, mean shifted
✗ Trades: Issues - poor distribution match, variance not preserved
✗ Deliverable Volume: Issues - poor distribution match
✓ %Deliverble: GOOD
✗ t: Issues - poor distribution match

Overall Quality: 1/13 columns passed


0.07692307692307693

# CopulaGAN

In [None]:
import pandas as pd
from sdv.metadata import SingleTableMetadata
from sdv.single_table import CopulaGANSynthesizer

# Fit CopulaGAN on the raw frame `df`; verbose=True shows the training progress bar.
cgan = CopulaGANSynthesizer(
    metadata, epochs=200,verbose=True,
    )
cgan.fit(df)

# Persist the fitted synthesizer to the working directory.
cgan.save("CopulaGAN_model.pkl")

# Generate `lens` synthetic rows
df_syn4 = cgan.sample(num_rows=lens)

# Evaluate only the first len(df) synthetic rows against the real data.
results = evaluate_predictions(df, df_syn4.iloc[0:len(df)])
assess_synthetic_quality(results)
#submit(df_syn4, "CopulaGaussian_submission_100")

Gen. (-0.08) | Discrim. (-0.55): 100%|██████████| 200/200 [01:04<00:00,  3.08it/s]


QUALITY ASSESSMENT:
✗ Prev Close: Issues - poor distribution match, variance not preserved, mean shifted
✗ Open: Issues - poor distribution match
✗ High: Issues - poor distribution match
✗ Low: Issues - poor distribution match
✗ Last: Issues - poor distribution match, mean shifted
✗ Close: Issues - poor distribution match, variance not preserved, mean shifted
✗ VWAP: Issues - poor distribution match
✗ Volume: Issues - poor distribution match, variance not preserved, mean shifted
✗ Turnover: Issues - poor distribution match, variance not preserved, mean shifted
✗ Trades: Issues - poor distribution match
✗ Deliverable Volume: Issues - poor distribution match, variance not preserved, mean shifted
✗ %Deliverble: Issues - mean shifted
✗ t: Issues - poor distribution match, variance not preserved

Overall Quality: 0/13 columns passed


0.0

# Evaluation Metrics