In [4]:
import pandas as pd
import math
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Step 1: Download the dataset
url = "https://raw.githubusercontent.com/AnjulaMehto/Sampling_Assignment/main/Creditcard_data.csv"
data = pd.read_csv(url)

# Check for missing values
print("Missing Values in Dataset:")
print(data.isnull().sum())

# Handle missing values
# For numerical columns, fill with the mean
numerical_columns = data.select_dtypes(include=["float64", "int64"]).columns
data[numerical_columns] = data[numerical_columns].fillna(data[numerical_columns].mean())

# For categorical columns, fill with the mode
categorical_columns = data.select_dtypes(include=["object"]).columns
if len(categorical_columns) > 0:
    data[categorical_columns] = data[categorical_columns].apply(lambda x: x.fillna(x.mode()[0]))

# Verify no missing values remain
print("\nMissing Values After Handling:")
print(data.isnull().sum())

# Debugging: Display the column names of the dataset
print("\nDataset Columns:", data.columns)

# Step 2: Balance the dataset using SMOTE
X = data.iloc[:, :-1]  # Features
y = data.iloc[:, -1]   # Target

# Debugging: Check the structure of X and y
print("\nFeature Columns (X):", X.columns)
print("Target Column (y):", y.name)

# Ensure target has no NaN values
if y.isnull().sum() > 0:
    y = y.fillna(y.mode()[0])

smote = SMOTE(random_state=42)
X_balanced, y_balanced = smote.fit_resample(X, y)

# Ensure no NaN values in the balanced target
y_balanced = pd.Series(y_balanced, name="Target").fillna(y.mode()[0])

# Create a balanced dataset
balanced_data = pd.concat([pd.DataFrame(X_balanced, columns=X.columns), y_balanced], axis=1)

# Debugging: Check the structure of the balanced dataset
print("\nBalanced Dataset Columns:", balanced_data.columns)

# Re-identify numerical and categorical columns in balanced data
numerical_columns = balanced_data.select_dtypes(include=["float64", "int64"]).columns
categorical_columns = balanced_data.select_dtypes(include=["object"]).columns

# Check and handle NaN values in the balanced dataset
balanced_data[numerical_columns] = balanced_data[numerical_columns].fillna(balanced_data[numerical_columns].mean())  # Fill numeric NaNs
balanced_data[categorical_columns] = balanced_data[categorical_columns].apply(lambda x: x.fillna(x.mode()[0]))  # Fill with mode

print("\nBalanced Dataset Class Distribution:")
print(balanced_data['Target'].value_counts())

# Step 3: Sample size detection formula
def calculate_sample_size(z, p, e):
    numerator = (z ** 2) * p * (1 - p)
    denominator = e ** 2
    return math.ceil(numerator / denominator)

# Parameters
z = 1.96  # 95% confidence level
p = 0.5   # Proportion (assume 0.5 if unknown)
e_values = [0.05, 0.04, 0.03, 0.02, 0.01]  # Margins of error

# Calculate sample sizes for different margins of error
sample_sizes = [calculate_sample_size(z, p, e) for e in e_values]
print("\nSample Sizes for Different Margins of Error:", sample_sizes)

# Adjust sample sizes for finite population (optional)
def adjusted_sample_size(n, N):
    return math.ceil(n / (1 + (n - 1) / N))

N = len(balanced_data)
adjusted_sample_sizes = [adjusted_sample_size(n, N) for n in sample_sizes]
print("Adjusted Sample Sizes:", adjusted_sample_sizes)

# Step 4: Generate five samples using sampling techniques
samples = []

# Sampling techniques
# 1. Random Sampling
samples.append(balanced_data.sample(n=adjusted_sample_sizes[0], random_state=42))

# 2. Systematic Sampling
def systematic_sampling(data, step):
    indexes = range(0, len(data), step)
    return data.iloc[list(indexes)]

samples.append(systematic_sampling(balanced_data, step=N // adjusted_sample_sizes[1]))

# 3. Stratified Sampling
_, stratified_sample = train_test_split(
    balanced_data,
    test_size=adjusted_sample_sizes[2] / N,
    stratify=balanced_data['Target'],
    random_state=42
)
samples.append(stratified_sample)


# 4. Cluster Sampling
clusters = balanced_data.groupby('Target')
cluster_sample = pd.concat([
    clusters.get_group(1).sample(n=adjusted_sample_sizes[3] // 2, random_state=42),
    clusters.get_group(0).sample(n=adjusted_sample_sizes[3] // 2, random_state=42)
], axis=0)
samples.append(cluster_sample)


# 5. Bootstrap Sampling
samples.append(balanced_data.sample(n=adjusted_sample_sizes[4], replace=True, random_state=42))

# Step 5: Train models on the samples
def train_and_evaluate_model(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)

results = []

for i, sample in enumerate(samples):
    # Split data
    X_sample = sample.iloc[:, :-1]
    y_sample = sample.iloc[:, -1]
    X_train, X_test, y_train, y_test = train_test_split(X_sample, y_sample, test_size=0.2, random_state=42)

    # Models
    model_accuracies = {}

    # Model M1: Logistic Regression
    from sklearn.linear_model import LogisticRegression
    lr = LogisticRegression(max_iter=1000, random_state=42)
    model_accuracies['M1'] = train_and_evaluate_model(lr, X_train, X_test, y_train, y_test)

    # Model M2: Decision Tree
    from sklearn.tree import DecisionTreeClassifier
    dt = DecisionTreeClassifier(random_state=42)
    model_accuracies['M2'] = train_and_evaluate_model(dt, X_train, X_test, y_train, y_test)

    # Model M3: Random Forest
    rf = RandomForestClassifier(random_state=42)
    model_accuracies['M3'] = train_and_evaluate_model(rf, X_train, X_test, y_train, y_test)

    # Model M4: Support Vector Machine
    from sklearn.svm import SVC
    svm = SVC(random_state=42)
    model_accuracies['M4'] = train_and_evaluate_model(svm, X_train, X_test, y_train, y_test)

    # Model M5: K-Nearest Neighbors
    from sklearn.neighbors import KNeighborsClassifier
    knn = KNeighborsClassifier()
    model_accuracies['M5'] = train_and_evaluate_model(knn, X_train, X_test, y_train, y_test)

    results.append(model_accuracies)

# Step 6: Display results
print("\nModel Accuracies for Each Sampling Technique:")
for i, result in enumerate(results):
    print(f"Sampling Technique {i+1}: {result}")


Missing Values in Dataset:
Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

Missing Values After Handling:
Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

Dataset Columns: Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt


Model Accuracies for Each Sampling Technique:
Sampling Technique 1: {'M1': 0.9193548387096774, 'M2': 0.9032258064516129, 'M3': 0.9516129032258065, 'M4': 0.6451612903225806, 'M5': 0.8064516129032258}
Sampling Technique 2: {'M1': 0.8725490196078431, 'M2': 0.9901960784313726, 'M3': 1.0, 'M4': 0.7352941176470589, 'M5': 0.8235294117647058}
Sampling Technique 3: {'M1': 0.9365079365079365, 'M2': 0.9761904761904762, 'M3': 1.0, 'M4': 0.6587301587301587, 'M5': 0.873015873015873}
Sampling Technique 4: {'M1': 0.9518716577540107, 'M2': 0.9786096256684492, 'M3': 1.0, 'M4': 0.6470588235294118, 'M5': 0.8181818181818182}
Sampling Technique 5: {'M1': 0.9318181818181818, 'M2': 0.9848484848484849, 'M3': 1.0, 'M4': 0.6742424242424242, 'M5': 0.8712121212121212}
