<a href="https://colab.research.google.com/github/Kavish1504/Sampling-Techniques/blob/main/102317012_Sampling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from imblearn.over_sampling import RandomOverSampler
from sklearn.cluster import KMeans
data=pd.read_csv(
    "https://raw.githubusercontent.com/AnjulaMehto/Sampling_Assignment/main/Creditcard_data.csv"
)
X=data.drop("Class", axis=1)
y=data["Class"]
ros=RandomOverSampler(random_state=42)
X_bal, y_bal=ros.fit_resample(X, y)
balanced_df=pd.concat([X_bal, y_bal], axis=1)
def simple_random_sampling(df, frac=0.3, seed=42):
    return df.sample(frac=frac, random_state=seed)

def stratified_sampling(df, frac=0.3, seed=42):
    return df.groupby("Class", group_keys=False).apply(
        lambda x: x.sample(frac=frac, random_state=seed)
    )

def systematic_sampling(df, step=3):
    return df.iloc[::step]

def cluster_sampling(df, n_clusters=10, clusters_to_pick=3, seed=42):
    features=df.drop("Class", axis=1)
    kmeans=KMeans(n_clusters=n_clusters, random_state=seed)

    df = df.copy()
    df["cluster"]=kmeans.fit_predict(features)

    chosen_clusters=np.random.RandomState(seed).choice(
        n_clusters, clusters_to_pick, replace=False
    )

    return df[df["cluster"].isin(chosen_clusters)].drop("cluster", axis=1)

def bootstrap_sampling(df, n_samples, seed=42):
    return df.sample(n=n_samples, replace=True, random_state=seed)

samples={
    "Sampling1_SimpleRandom": simple_random_sampling(balanced_df, 0.3),
    "Sampling2_Stratified": stratified_sampling(balanced_df, 0.3),
    "Sampling3_Systematic": systematic_sampling(balanced_df, 3),
    "Sampling4_Cluster": cluster_sampling(balanced_df, 10, 3),
    "Sampling5_Bootstrap": bootstrap_sampling(
        balanced_df, int(0.2 * len(balanced_df))
    ),
}
models={
    "Model1": LogisticRegression(max_iter=5000),
    "Model2": DecisionTreeClassifier(),
    "Model3": RandomForestClassifier(),
    "Model4": GaussianNB(),
    "Model5": SVC(),
}

results=pd.DataFrame(index=models.keys(), columns=samples.keys())
for samp_key, samp_df in samples.items():

    X=samp_df.drop("Class", axis=1)
    y=samp_df["Class"]

    X_train, X_test, y_train, y_test=train_test_split(
        X, y, test_size=0.3, random_state=42
    )

    scaler=StandardScaler()
    X_train_scaled=scaler.fit_transform(X_train)
    X_test_scaled=scaler.transform(X_test)

    for model_key, model in models.items():
        if model_key in ["M1", "M5"]:
            model.fit(X_train_scaled, y_train)
            y_pred=model.predict(X_test_scaled)
        else:
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

        results.loc[model_key, samp_key]=round(
            accuracy_score(y_test, y_pred) * 100, 2
        )

print("Accuracy Table:")
print(results)
best_sampling=results.astype(float).idxmax(axis=1)
print("Best Sampling Technique for Each Model:")
print(best_sampling)


  return df.groupby("Class", group_keys=False).apply(


Accuracy Table:
       Sampling1_SimpleRandom Sampling2_Stratified Sampling3_Systematic  \
Model1                  88.41                92.75                87.58   
Model2                   97.1                97.83                98.69   
Model3                  97.83                100.0                100.0   
Model4                  67.39                70.29                 71.9   
Model5                  69.57                68.84                72.55   

       Sampling4_Cluster Sampling5_Bootstrap  
Model1             96.83               95.65  
Model2             100.0               98.91  
Model3             100.0               100.0  
Model4             100.0               77.17  
Model5             88.89               73.91  
Best Sampling Technique for Each Model:
Model1       Sampling4_Cluster
Model2       Sampling4_Cluster
Model3    Sampling2_Stratified
Model4       Sampling4_Cluster
Model5       Sampling4_Cluster
dtype: object
