In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.cluster import KMeans
from imblearn.over_sampling import RandomOverSampler

In [2]:
url = "https://raw.githubusercontent.com/AnjulaMehto/Sampling_Assignment/main/Creditcard_data.csv"
df = pd.read_csv(url)

X = df.drop("Class", axis=1)
y = df["Class"]
ros = RandomOverSampler(random_state=42)
X_bal, y_bal = ros.fit_resample(X, y)

data = pd.concat([X_bal, y_bal], axis=1)


In [3]:
def simple_random(data, n):
    return data.sample(n)

def systematic(data, n):
    k = len(data) // n
    return data.iloc[::k].head(n)

def stratified(data, n):
    return data.groupby("Class", group_keys=False).apply(lambda x: x.sample(n//2))

def cluster(data, n):
    kmeans = KMeans(n_clusters=5, random_state=42)
    data["cluster"] = kmeans.fit_predict(data.drop("Class", axis=1))
    sampled = data.groupby("cluster", group_keys=False).apply(lambda x: x.sample(min(len(x), n//5)))
    return sampled.drop("cluster", axis=1)

def bootstrap(data, n):
    return data.sample(n, replace=True)

In [4]:
models = {
    "Logistic": LogisticRegression(max_iter=1000),
    "KNN": KNeighborsClassifier(),
    "DecisionTree": DecisionTreeClassifier(),
    "RandomForest": RandomForestClassifier(),
    "SVM": SVC()
}
def cv_score(model, X, y):
    skf = StratifiedKFold(n_splits=5)
    scores = []
    for train, test in skf.split(X, y):
        model.fit(X.iloc[train], y.iloc[train])
        scores.append(model.score(X.iloc[test], y.iloc[test]))
        print("Fold accuracies:", scores)
    return np.mean(scores)
test_model = LogisticRegression(max_iter=1000)
score = cv_score(test_model, X_bal, y_bal)
print("Mean CV Accuracy:", score)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold accuracies: [0.8986928104575164]


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold accuracies: [0.8986928104575164, 0.9344262295081968]


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold accuracies: [0.8986928104575164, 0.9344262295081968, 0.9245901639344263]


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold accuracies: [0.8986928104575164, 0.9344262295081968, 0.9245901639344263, 0.9049180327868852]
Fold accuracies: [0.8986928104575164, 0.9344262295081968, 0.9245901639344263, 0.9049180327868852, 0.9147540983606557]
Mean CV Accuracy: 0.915476267009536


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [5]:
n = 1000
samplings = {
    "Random": simple_random(data, n),
    "Systematic": systematic(data, n),
    "Stratified": stratified(data, n),
    "Cluster": cluster(data.copy(), n),
    "Bootstrap": bootstrap(data, n)
}

results = {}

for model_name, model in models.items():
    results[model_name] = {}
    for samp_name, samp in samplings.items():
        Xs = samp.drop("Class", axis=1)
        ys = samp["Class"]
        results[model_name][samp_name] = cv_score(model, Xs, ys)


  return data.groupby("Class", group_keys=False).apply(lambda x: x.sample(n//2))
  sampled = data.groupby("cluster", group_keys=False).apply(lambda x: x.sample(min(len(x), n//5)))
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold accuracies: [0.955]


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold accuracies: [0.955, 0.92]


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold accuracies: [0.955, 0.92, 0.91]


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold accuracies: [0.955, 0.92, 0.91, 0.9]


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold accuracies: [0.955, 0.92, 0.91, 0.9, 0.95]


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold accuracies: [0.895]


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold accuracies: [0.895, 0.845]


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold accuracies: [0.895, 0.845, 0.9]


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold accuracies: [0.895, 0.845, 0.9, 0.825]


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold accuracies: [0.895, 0.845, 0.9, 0.825, 0.84]


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold accuracies: [0.945]


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold accuracies: [0.945, 0.935]


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold accuracies: [0.945, 0.935, 0.965]


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold accuracies: [0.945, 0.935, 0.965, 0.9]


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold accuracies: [0.945, 0.935, 0.965, 0.9, 0.915]


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold accuracies: [0.896551724137931]


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold accuracies: [0.896551724137931, 0.5862068965517241]


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold accuracies: [0.896551724137931, 0.5862068965517241, 0.8689655172413793]


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold accuracies: [0.896551724137931, 0.5862068965517241, 0.8689655172413793, 0.8206896551724138]


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold accuracies: [0.896551724137931, 0.5862068965517241, 0.8689655172413793, 0.8206896551724138, 0.8137931034482758]


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold accuracies: [0.93]


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold accuracies: [0.93, 0.905]


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold accuracies: [0.93, 0.905, 0.915]


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold accuracies: [0.93, 0.905, 0.915, 0.925]


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold accuracies: [0.93, 0.905, 0.915, 0.925, 0.925]
Fold accuracies: [0.96]
Fold accuracies: [0.96, 0.975]
Fold accuracies: [0.96, 0.975, 0.98]
Fold accuracies: [0.96, 0.975, 0.98, 0.975]
Fold accuracies: [0.96, 0.975, 0.98, 0.975, 0.965]
Fold accuracies: [0.7]
Fold accuracies: [0.7, 0.655]
Fold accuracies: [0.7, 0.655, 1.0]
Fold accuracies: [0.7, 0.655, 1.0, 0.675]
Fold accuracies: [0.7, 0.655, 1.0, 0.675, 0.445]
Fold accuracies: [0.99]
Fold accuracies: [0.99, 0.97]
Fold accuracies: [0.99, 0.97, 0.99]
Fold accuracies: [0.99, 0.97, 0.99, 0.975]
Fold accuracies: [0.99, 0.97, 0.99, 0.975, 0.95]
Fold accuracies: [0.7172413793103448]
Fold accuracies: [0.7172413793103448, 0.993103448275862]
Fold accuracies: [0.7172413793103448, 0.993103448275862, 0.9586206896551724]
Fold accuracies: [0.7172413793103448, 0.993103448275862, 0.9586206896551724, 0.7931034482758621]
Fold accuracies: [0.7172413793103448, 0.993103448275862, 0.9586206896551724, 0.7931034482758621, 0.6137931034482759]
Fold accuracie

In [6]:
result_df = pd.DataFrame(results).T
result_df


Unnamed: 0,Random,Systematic,Stratified,Cluster,Bootstrap
Logistic,0.927,0.861,0.932,0.797241,0.92
KNN,0.971,0.695,0.975,0.815172,0.979
DecisionTree,0.993,0.85,0.995,0.921379,0.995
RandomForest,1.0,0.999,1.0,0.94069,0.999
SVM,0.722,0.598,0.697,0.651034,0.724
