<a href="https://colab.research.google.com/github/Kunal-code-u/Sampling_assignment/blob/main/Sampling_assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE

# Load dataset
url = "/content/Creditcard_data.csv"
data = pd.read_csv(url)

# Separate features and target variable
X = data.drop(columns=['Class'])
y = data['Class']

# Balance the dataset using SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Combine balanced dataset
balanced_data = pd.concat([pd.DataFrame(X_resampled, columns=X.columns), pd.DataFrame(y_resampled, columns=['Class'])], axis=1)

# Sample size calculations
confidence_level = 0.95
margin_of_error = 0.05
p = y_resampled.mean()
z = 1.96  # Z-score for 95% confidence

# Random Sampling sample size
random_sample_size = int((z*2 * p * (1 - p)) / (margin_of_error*2))

# Stratified Sampling sample size (proportional to strata variance)
strata_weight = balanced_data['Class'].value_counts(normalize=True).std()
# Avoid division by zero
if strata_weight == 0:
    strata_weight = 1  # Assign a default value (e.g., 1 to prevent division by zero)
stratified_sample_size = int((z*2 * p * (1 - p)) / ((margin_of_error / strata_weight)*2))

# Cluster Sampling sample size (based on clusters)
num_clusters = 5
cluster_sample_size = int((z*2 * p * (1 - p)) / ((margin_of_error / num_clusters)*2))

# Sampling Techniques
samples = {}

# Random Sampling
samples['Random'] = balanced_data.sample(n=random_sample_size, random_state=42)

# Stratified Sampling - Exclude the grouping column
samples['Stratified'] = balanced_data.groupby('Class').apply(
    lambda x: x.sample(int(stratified_sample_size * len(x) / len(balanced_data)), replace=True, random_state=42)
).reset_index(drop=True)


# Systematic Sampling
k = len(balanced_data) // random_sample_size
samples['Systematic'] = balanced_data.iloc[::k, :].reset_index(drop=True)

# Cluster Sampling
balanced_data['Cluster'] = pd.cut(balanced_data['Time'], bins=num_clusters, labels=False)
selected_clusters = np.random.choice(balanced_data['Cluster'].unique(), size=num_clusters // 2, replace=False)
samples['Cluster'] = balanced_data[balanced_data['Cluster'].isin(selected_clusters)].reset_index(drop=True)

# Bootstrap Sampling
samples['Bootstrap'] = balanced_data.sample(n=random_sample_size, replace=True, random_state=42)

# Machine Learning Models
models = {
    'LogisticRegression': LogisticRegression(max_iter=1000),
    'DecisionTree': DecisionTreeClassifier(random_state=42),
    'RandomForest': RandomForestClassifier(random_state=42),
    'SVM': SVC(probability=True, random_state=42),
    'GradientBoosting': GradientBoostingClassifier(random_state=42),
}

# Train and evaluate models
results = {}
for sample_name, sample_data in samples.items():
    X_sample = sample_data.drop(columns=['Class', 'Cluster'], errors='ignore')
    y_sample = sample_data['Class']
    X_train, X_test, y_train, y_test = train_test_split(X_sample, y_sample, test_size=0.2, random_state=42)

    for model_name, model in models.items():
        model.fit(X_train, y_train)
        accuracy = model.score(X_test, y_test)
        if sample_name not in results:
            results[sample_name] = {}
        results[sample_name][model_name] = accuracy

# Find the overall best sampling technique and model combination
highest_accuracy = 0
best_sampling_technique = ""
best_model = ""

for sample_name, model_results in results.items():
    for model_name, accuracy in model_results.items():
        if accuracy > highest_accuracy:
            highest_accuracy = accuracy
            best_sampling_technique = sample_name
            best_model = model_name

# Output the best combination
print("Sampling Technique with the Highest Accuracy:")
print(f"Sampling Technique: {best_sampling_technique}")
print(f"Model: {best_model}")
print(f"Accuracy: {highest_accuracy:.2f}")

  samples['Stratified'] = balanced_data.groupby('Class').apply(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Sampling Technique with the Highest Accuracy:
Sampling Technique: Random
Model: DecisionTree
Accuracy: 1.00
