In [1]:
# ============================================
# Sampling Assignment â€“ FINAL FULL CODE
# ============================================

# 1. IMPORT LIBRARIES
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

from imblearn.over_sampling import RandomOverSampler
from sklearn.utils import resample
from sklearn.cluster import KMeans


# 2. LOAD DATASET
data = pd.read_csv(
    "https://raw.githubusercontent.com/AnjulaMehto/Sampling_Assignment/main/Creditcard_data.csv"
)

X = data.drop("Class", axis=1)
y = data["Class"]


# 3. BALANCE DATASET
ros = RandomOverSampler(random_state=42)
X_bal, y_bal = ros.fit_resample(X, y)

balanced_df = pd.concat([X_bal, y_bal], axis=1)


# 4. SAMPLING FUNCTIONS
def simple_random_sampling(df, frac=0.3, seed=42):
    return df.sample(frac=frac, random_state=seed)

def stratified_sampling(df, frac=0.3, seed=42):
    return df.groupby("Class", group_keys=False).apply(
        lambda x: x.sample(frac=frac, random_state=seed)
    )

def systematic_sampling(df, step=3):
    return df.iloc[::step]

def cluster_sampling(df, n_clusters=10, clusters_to_pick=3, seed=42):
    features = df.drop("Class", axis=1)
    kmeans = KMeans(n_clusters=n_clusters, random_state=seed)
    df = df.copy()
    df["cluster"] = kmeans.fit_predict(features)

    chosen_clusters = np.random.RandomState(seed).choice(
        n_clusters, clusters_to_pick, replace=False
    )
    sampled_df = df[df["cluster"].isin(chosen_clusters)]
    return sampled_df.drop("cluster", axis=1)

def bootstrap_sampling(df, n_samples, seed=42):
    return df.sample(n=n_samples, replace=True, random_state=seed)


# 5. CREATE SAMPLES USING REQUIRED TECHNIQUES
samples = {
    "Sampling1_SimpleRandom": simple_random_sampling(balanced_df, frac=0.3, seed=42),
    "Sampling2_Stratified": stratified_sampling(balanced_df, frac=0.3, seed=42),
    "Sampling3_Systematic": systematic_sampling(balanced_df, step=3),
    "Sampling4_Cluster": cluster_sampling(balanced_df, n_clusters=10, clusters_to_pick=3, seed=42),
    "Sampling5_Bootstrap": bootstrap_sampling(
        balanced_df, n_samples=int(0.3 * len(balanced_df)), seed=42
    )
}


# 6. MACHINE LEARNING MODELS
models = {
    "M1": LogisticRegression(max_iter=5000),
    "M2": DecisionTreeClassifier(),
    "M3": RandomForestClassifier(),
    "M4": GaussianNB(),
    "M5": SVC()
}


# 7. APPLY ALL SAMPLINGS ON ALL MODELS
results = pd.DataFrame(
    index=models.keys(),
    columns=samples.keys()
)

for samp_key, samp_df in samples.items():

    X = samp_df.drop("Class", axis=1)
    y = samp_df["Class"]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42
    )

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    for model_key, model in models.items():

        if model_key in ["M1", "M5"]:
            model.fit(X_train_scaled, y_train)
            y_pred = model.predict(X_test_scaled)
        else:
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

        acc = accuracy_score(y_test, y_pred) * 100
        results.loc[model_key, samp_key] = round(acc, 2)


# 8. DISPLAY RESULTS
print("Accuracy Table:")
print(results)


# 9. BEST SAMPLING PER MODEL
best_sampling = results.astype(float).idxmax(axis=1)
print("\nBest Sampling Technique for Each Model:")
print(best_sampling)


  return df.groupby("Class", group_keys=False).apply(


Accuracy Table:
   Sampling1_SimpleRandom Sampling2_Stratified Sampling3_Systematic  \
M1                  89.86                92.03                88.89   
M2                  96.38                97.83                98.69   
M3                  97.83                100.0                100.0   
M4                  67.39                70.29                 71.9   
M5                  94.93                99.28                95.42   

   Sampling4_Cluster Sampling5_Bootstrap  
M1             96.83               92.03  
M2             99.21               97.83  
M3             100.0               100.0  
M4             100.0               77.54  
M5             99.21                97.1  

Best Sampling Technique for Each Model:
M1       Sampling4_Cluster
M2       Sampling4_Cluster
M3    Sampling2_Stratified
M4       Sampling4_Cluster
M5    Sampling2_Stratified
dtype: object
