In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE

url = "https://raw.githubusercontent.com/AnjulaMehto/Sampling_Assignment/main/Creditcard_data.csv"
data = pd.read_csv(url)

X = data.drop(columns=['Class'])
y = data['Class']

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

balanced_df = pd.concat([pd.DataFrame(X_resampled, columns=X.columns),
                         pd.DataFrame(y_resampled, columns=['Class'])], axis=1)

confidence_level = 0.95
margin_of_error = 0.05
p_hat = y_resampled.mean()
z_score = 1.96

random_sample_size = int((z_score * 2 * p_hat * (1 - p_hat)) / (margin_of_error * 2))

strata_variance = balanced_df['Class'].value_counts(normalize=True).std()
if strata_variance == 0:
    strata_variance = 1
stratified_sample_size = int((z_score * 2 * p_hat * (1 - p_hat)) / ((margin_of_error / strata_variance) * 2))

num_clusters = 5
cluster_sample_size = int((z_score * 2 * p_hat * (1 - p_hat)) / ((margin_of_error / num_clusters) * 2))

sampling_methods = {}

sampling_methods['Random'] = balanced_df.sample(n=random_sample_size, random_state=42)

sampling_methods['Stratified'] = balanced_df.groupby('Class').apply(
    lambda group: group.sample(
        int(stratified_sample_size * len(group) / len(balanced_df)),
        replace=True,
        random_state=42
    )
).reset_index(drop=True)

step_size = len(balanced_df) // random_sample_size
sampling_methods['Systematic'] = balanced_df.iloc[::step_size, :].reset_index(drop=True)

balanced_df['Cluster'] = pd.cut(balanced_df['Time'], bins=num_clusters, labels=False)
sampled_clusters = balanced_df['Cluster'].sample(num_clusters // 2, random_state=42).unique()
sampling_methods['Cluster'] = balanced_df[balanced_df['Cluster'].isin(sampled_clusters)].reset_index(drop=True)

sampling_methods['Bootstrap'] = balanced_df.sample(n=random_sample_size, replace=True, random_state=42)

models = {
    'LogReg': LogisticRegression(max_iter=1000),
    'DecTree': DecisionTreeClassifier(random_state=42),
    'RandForest': RandomForestClassifier(random_state=42),
    'XGBoost': XGBClassifier(eval_metric='logloss', random_state=42),
    'GradBoost': GradientBoostingClassifier(random_state=42),
}

model_performance = {}
for method, data_sample in sampling_methods.items():
    X_sample = data_sample.drop(columns=['Class', 'Cluster'], errors='ignore')
    y_sample = data_sample['Class']
    
    X_train, X_test, y_train, y_test = train_test_split(X_sample, y_sample, test_size=0.2, random_state=42)

    for model_name, model in models.items():
        model.fit(X_train, y_train)
        accuracy = model.score(X_test, y_test)
        
        if method not in model_performance:
            model_performance[method] = {}
        model_performance[method][model_name] = accuracy

for method, scores in model_performance.items():
    for model_name, score in scores.items():
        print(f"Sampling Method: {method} | Model: {model_name} | Accuracy: {score:.2f}")

# Identifying the best sampling method and model
best_sampling_method = ""
best_model_name = ""
best_accuracy = 0

for method, scores in model_performance.items():
    for model_name, score in scores.items():
        if score > best_accuracy:
            best_accuracy, best_sampling_method, best_model_name = score, method, model_name

print("\nBest Sampling Technique and Model:")
print(f"Sampling Method: {best_sampling_method}")
print(f"Model: {best_model_name}")
print(f"Accuracy: {best_accuracy:.2f}")


  sampling_methods['Stratified'] = balanced_df.groupby('Class').apply(


Sampling Method: Random | Model: LogReg | Accuracy: 0.50
Sampling Method: Random | Model: DecTree | Accuracy: 1.00
Sampling Method: Random | Model: RandForest | Accuracy: 0.50
Sampling Method: Random | Model: XGBoost | Accuracy: 0.50
Sampling Method: Random | Model: GradBoost | Accuracy: 0.50
Sampling Method: Stratified | Model: LogReg | Accuracy: 0.50
Sampling Method: Stratified | Model: DecTree | Accuracy: 0.50
Sampling Method: Stratified | Model: RandForest | Accuracy: 0.50
Sampling Method: Stratified | Model: XGBoost | Accuracy: 0.50
Sampling Method: Stratified | Model: GradBoost | Accuracy: 0.50
Sampling Method: Systematic | Model: LogReg | Accuracy: 1.00
Sampling Method: Systematic | Model: DecTree | Accuracy: 1.00
Sampling Method: Systematic | Model: RandForest | Accuracy: 1.00
Sampling Method: Systematic | Model: XGBoost | Accuracy: 1.00
Sampling Method: Systematic | Model: GradBoost | Accuracy: 1.00
Sampling Method: Cluster | Model: LogReg | Accuracy: 0.93
Sampling Method: Clu