In [1]:
from rsw import *
# data analysis and wrangling
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import scipy as sp
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

### Read data

In [2]:
data = pd.read_csv('./data/processed/pima_diabetes/diabetes.csv') 

In [3]:
data.head()  #displaying the head of dataset

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
#train_test_splitting of the dataset
x = data.drop(columns = 'Outcome')
# Getting Predicting Value
y = data['Outcome']

In [5]:
data.groupby('Outcome').mean()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.298,109.98,68.184,19.664,68.792,30.3042,0.429734,31.19
1,4.865672,141.257463,70.824627,22.164179,100.335821,35.142537,0.5505,37.067164


In [6]:
data['Outcome'].mean()

0.3489583333333333

### Match diabetes and non-diabetes means

In [7]:
losses_0 = [
    losses.EqualityLoss(3.30),
    losses.EqualityLoss(110.0),
    losses.EqualityLoss(68.2),
    losses.EqualityLoss(19.7),
    losses.EqualityLoss(68.8),
    losses.EqualityLoss(30.3),
    losses.EqualityLoss(0.429),
    losses.EqualityLoss(31.19)
]

losses_1 = [
    losses.EqualityLoss(4.87),
    losses.EqualityLoss(141.25),
    losses.EqualityLoss(70.82),
    losses.EqualityLoss(22.16),
    losses.EqualityLoss(100.3),
    losses.EqualityLoss(35.14),
    losses.EqualityLoss(0.55),
    losses.EqualityLoss(37.07)
]

### Re-weight

In [10]:
regularizer = regularizers.EntropyRegularizer(limit=None)
w_0, out_0, sol_0 = rsw(x, None, losses_0, regularizer,
                      1, verbose=False, rho=75, eps_abs=1e-6, eps_rel=1e-6)

In [12]:
regularizer = regularizers.EntropyRegularizer(limit=20)
w_1, out_1, sol_1 = rsw(x, None, losses_1, regularizer,
                      1, verbose=False, rho=50, eps_abs=1e-6, eps_rel=1e-6)

In [13]:
x_0 = x.copy()
x_0["weights"] = w_0
x_1 = x.copy()
x_1["weights"] = w_1


# Set theoretical outcome to train on reweighted datasets
x_0['Outcome'] = 0
x_1['Outcome'] = 1

In [14]:
# Combine both reweight datasets together
weighted_data = pd.concat([x_0,x_1])
#train_test_splitting of the dataset

x_w = weighted_data.drop(columns = ['Outcome'])

# Getting Predicting Value
y_w = weighted_data['Outcome']

# Test train split
from sklearn.model_selection import train_test_split 
x_train,x_test,y_train,y_test = train_test_split(x_w,y_w,test_size=0.2,random_state=0)
w_train = x_train["weights"]
w_test = x_test["weights"]
x_train = x_train.drop(columns = 'weights')
x_test = x_test.drop(columns = 'weights')

In [18]:
from sklearn.linear_model import LogisticRegression
reg = LogisticRegression(C=1000)
# Train on sythetic reweighted dataset
reg.fit(x_train,y_train,sample_weight=w_train)

# Predict on original dataset as a measure of performance
y_pred=reg.predict(x)
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
print("Classification Report is:\n",classification_report(y,y_pred))
print("Confusion Matrix:\n",confusion_matrix(y,y_pred))
print("Training Score:\n",reg.score(x_train,y_train,sample_weight=w_train)*100)
print("Mean Squared Error:\n",mean_squared_error(y,y_pred))
print("R2 score is:\n",r2_score(y,y_pred))

print(accuracy_score())
print(accuracy_score(y,y_pred)*100)

Classification Report is:
               precision    recall  f1-score   support

           0       0.85      0.75      0.80       500
           1       0.62      0.75      0.68       268

    accuracy                           0.75       768
   macro avg       0.74      0.75      0.74       768
weighted avg       0.77      0.75      0.76       768

Confusion Matrix:
 [[376 124]
 [ 66 202]]
Training Score:
 72.12123801441557
Mean Squared Error:
 0.24739583333333334
R2 score is:
 -0.08895522388059685
75.26041666666666


In [19]:
from sklearn.svm import SVC

svc = SVC(C=10000)
svc.fit(x_train, y_train, sample_weight=w_train)

y_pred=svc.predict(x)
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
print("Classification Report is:\n",classification_report(y,y_pred))
print("Confusion Matrix:\n",confusion_matrix(y,y_pred))
print("Training Score:\n",reg.score(x_train,y_train,sample_weight=w_train)*100)
print("Mean Squared Error:\n",mean_squared_error(y,y_pred))
print("R2 score is:\n",r2_score(y_pred,y))

print(accuracy_score(y_pred,y)*100)

Classification Report is:
               precision    recall  f1-score   support

           0       0.82      0.79      0.80       500
           1       0.63      0.68      0.66       268

    accuracy                           0.75       768
   macro avg       0.73      0.73      0.73       768
weighted avg       0.76      0.75      0.75       768

Confusion Matrix:
 [[393 107]
 [ 85 183]]
Training Score:
 72.12123801441557
Mean Squared Error:
 0.25
R2 score is:
 -0.06374260568460555
75.0
