In [2]:
#OVERSAMPLING
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [3]:
# Load dataset
col_names = ['pregnant','glucose','bp','skin','insulin','bmi','pedigree','age','label']
pima = pd.read_csv("diabetes.csv", header=None, names=col_names)
pima = pima.tail(-1)

In [4]:
# Split dataset into features and target variable
feature_cols = ['pregnant', 'insulin', 'bmi', 'age','glucose','bp','pedigree']
X = pima[feature_cols] # Features
y = pima.label # Target variable

In [5]:
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) # 70% training and 30% test

In [6]:
# Duplicate instances from the minority class
minority_class = pd.concat([X_train, y_train], axis=1)[pd.concat([X_train, y_train], axis=1)['label'] == 1]

# Check if the number of available samples is greater than the desired sample size
if len(minority_class) < 500:
    oversampled_minority_class = minority_class.sample(n=len(minority_class), replace=True, random_state=42)
else:
    oversampled_minority_class = minority_class.sample(n=500, replace=True, random_state=42)

X_train_dup = pd.concat([X_train, oversampled_minority_class.drop('label', axis=1)])
y_train_dup = pd.concat([y_train, oversampled_minority_class['label']])

In [7]:
# Create Decision Tree classifier object
clf = DecisionTreeClassifier()

In [8]:
# Train Decision Tree classifier
clf = clf.fit(X_train_dup, y_train_dup)

In [9]:
# Predict the response for test dataset
y_pred = clf.predict(X_test)

In [10]:
# Model Accuracy
print("Duplication Accuracy:", metrics.accuracy_score(y_test, y_pred))
# Print classification report
print("Duplication Classification Report:")
print(metrics.classification_report(y_test, y_pred))

Duplication Accuracy: 0.658008658008658
Duplication Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.77      0.74       146
           1       0.54      0.46      0.50        85

    accuracy                           0.66       231
   macro avg       0.63      0.62      0.62       231
weighted avg       0.65      0.66      0.65       231

