In [1]:
!pip install pandas numpy scikit-learn imbalanced-learn




In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import RandomOverSampler
import warnings
warnings.filterwarnings('ignore')

print("SAMPLING ASSIGNMENT\n")

#LOAD DATA
url = "https://raw.githubusercontent.com/AnjulaMehto/Sampling_Assignment/main/Creditcard_data.csv"
df = pd.read_csv(url)

print("\nOriginal Class Distribution:")
print(df['Class'].value_counts())

#BALANCE DATA
X = df.drop('Class', axis=1)
y = df['Class']

ros = RandomOverSampler(random_state=42)
X_balanced, y_balanced = ros.fit_resample(X, y)

X_bal_df = pd.DataFrame(X_balanced)
y_bal_series = pd.Series(y_balanced)

print("\nBalanced Class Distribution:")
print(y_bal_series.value_counts())

#SAMPLE SIZE
Z = 1.96
p = 0.5
E = 0.05
n = int((Z**2 * p * (1-p)) / E**2)

sample_size = min(n, len(X_bal_df))
print("\nSample Size:", sample_size)

dataset = pd.concat([X_bal_df, y_bal_series.rename("Class")], axis=1)

samples = {}

# 1. Simple Random Sampling
samples["Sample1"] = dataset.sample(n=sample_size, random_state=42)

# 2. Systematic Sampling
step = max(1, len(dataset)//sample_size)
samples["Sample2"] = dataset.iloc[::step].head(sample_size)

# 3. Stratified Sampling
_, stratified = train_test_split(dataset,
                                 test_size=sample_size/len(dataset),
                                 stratify=dataset["Class"],
                                 random_state=42)
samples["Sample3"] = stratified

# 4. Cluster Sampling (true cluster sampling)
dataset["Cluster"] = pd.cut(dataset.index, bins=10, labels=False)
chosen_clusters = np.random.choice(dataset["Cluster"].unique(), size=3, replace=False)
cluster_sample = dataset[dataset["Cluster"].isin(chosen_clusters)]
samples["Sample4"] = cluster_sample.drop("Cluster", axis=1).sample(n=sample_size, random_state=42)

# 5. Bootstrap Sampling
samples["Sample5"] = dataset.sample(n=sample_size, replace=True, random_state=42)

print("\nSamples Created Successfully")

#MODELS
models = {
    "M1": LogisticRegression(max_iter=1000, solver='liblinear'),
    "M2": DecisionTreeClassifier(max_depth=10, random_state=42),
    "M3": RandomForestClassifier(n_estimators=50, max_depth=10, random_state=42),
    "M4": SVC(kernel='linear', random_state=42),
    "M5": KNeighborsClassifier(n_neighbors=5)
}

#TRAIN & EVALUATE
results = {}

for sample_name, sample_data in samples.items():
    X_s = sample_data.drop("Class", axis=1)
    y_s = sample_data["Class"]

    X_train, X_test, y_train, y_test = train_test_split(
        X_s, y_s, test_size=0.3, random_state=42, stratify=y_s
    )

    results[sample_name] = {}

    for model_name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        acc = round(accuracy_score(y_test, y_pred)*100, 2)
        results[sample_name][model_name] = acc
        print(f"{sample_name} - {model_name}: {acc}%")

#RESULT TABLE
results_df = pd.DataFrame(results).T
print("FINAL ACCURACY TABLE (%)")
print(results_df)

#BEST COMBINATIONS
print("\nBEST SAMPLING FOR EACH MODEL")
for model in results_df.columns:
    best_sample = results_df[model].idxmax()
    print(f"{model} -> {best_sample} ({results_df[model].max()}%)")

print("\nBEST MODEL FOR EACH SAMPLING")
for sample in results_df.index:
    best_model = results_df.loc[sample].idxmax()
    print(f"{sample} -> {best_model} ({results_df.loc[sample].max()}%)")

# Save CSV
results_df.to_csv("sampling_results.csv")
print("\nResults saved as sampling_results.csv")

print("\nASSIGNMENT COMPLETED SUCCESSFULLY")


SAMPLING ASSIGNMENT


Original Class Distribution:
Class
0    763
1      9
Name: count, dtype: int64

Balanced Class Distribution:
Class
0    763
1    763
Name: count, dtype: int64

Sample Size: 384

Samples Created Successfully
Sample1 - M1: 87.07%
Sample1 - M2: 96.55%
Sample1 - M3: 100.0%
Sample1 - M4: 87.07%
Sample1 - M5: 90.52%
Sample2 - M1: 87.07%
Sample2 - M2: 97.41%
Sample2 - M3: 100.0%
Sample2 - M4: 85.34%
Sample2 - M5: 95.69%
Sample3 - M1: 92.24%
Sample3 - M2: 98.28%
Sample3 - M3: 100.0%
Sample3 - M4: 93.1%
Sample3 - M5: 93.97%
Sample4 - M1: 92.24%
Sample4 - M2: 96.55%
Sample4 - M3: 98.28%
Sample4 - M4: 94.83%
Sample4 - M5: 96.55%
Sample5 - M1: 98.28%
Sample5 - M2: 97.41%
Sample5 - M3: 98.28%
Sample5 - M4: 98.28%
Sample5 - M5: 96.55%
FINAL ACCURACY TABLE (%)
            M1     M2      M3     M4     M5
Sample1  87.07  96.55  100.00  87.07  90.52
Sample2  87.07  97.41  100.00  85.34  95.69
Sample3  92.24  98.28  100.00  93.10  93.97
Sample4  92.24  96.55   98.28  94.83  96.55
Sa