# IMPORT DATA

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the dataset from data.csv (relative to the working directory) into `df`.
df = pd.read_csv('data.csv')

In [None]:
# Preview the first rows of the frame.
df.head()

In [None]:
# Summarize the columns: dtypes and non-null counts.
df.info()

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Show the (rows, columns) size of the frame.
print( df.shape )

# OUTLIER DETECTION

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np

# Read the CSV into its own frame (`data`), separate from `df`
data = pd.read_csv("data.csv")

# Columns whose spread is inspected for outliers
numerical_cols = [
    'BMI',
    'GenHlth',
    'MentHlth',
    'PhysHlth',
    'DiffWalk',
    'Age',
    'Education',
    'Income',
]

# One box per selected column, all drawn on the same axes
sns.boxplot(data=data.loc[:, numerical_cols])


In [None]:
import numpy as np

def detect_outliers_zscore(data, threshold=3):
    """Return the positions of values whose absolute z-score exceeds `threshold`.

    The z-score uses the population standard deviation (NumPy's default,
    ``ddof=0``).

    Args:
        data: One-dimensional sequence or array of numbers.
        threshold: Absolute z-score above which a value counts as an outlier.

    Returns:
        A 1-D integer array with the 0-based positions (offsets into `data`,
        not index labels) of the outlying values. The array is empty when
        `data` is empty or has no spread.
    """
    values = np.asarray(data, dtype=float)
    if values.size == 0:
        return np.array([], dtype=np.intp)

    std_dev = values.std()
    # Zero (or NaN) spread makes the z-score undefined; flag nothing instead
    # of dividing by zero.
    if not std_dev > 0:
        return np.array([], dtype=np.intp)

    # Vectorized z-scores: one array expression instead of a per-element loop
    z_scores = (values - values.mean()) / std_dev
    outlier_indices = np.where(np.abs(z_scores) > threshold)[0]
    return outlier_indices


# Blank out z-score outliers column by column; rows left with a NaN are the
# ones a later `df.dropna` removes.
# NOTE(review): the loop visits every column, including the `Diabetes_binary`
# target and any column holding only a few distinct values, where a rare value
# can exceed the threshold and be wiped out entirely — confirm this is intended
# or restrict the loop to continuous columns.
for col in df.columns:
    outliers_indices = detect_outliers_zscore(df[col].to_numpy())

    # `outliers_indices` are positions, not index labels. Build a positional
    # boolean mask rather than calling the label-based `Series.drop`, so the
    # result stays correct when `df` no longer has a default RangeIndex
    # (e.g. when this cell is re-run after rows were dropped).
    is_outlier = np.zeros(len(df), dtype=bool)
    is_outlier[outliers_indices] = True
    df[col] = df[col].mask(is_outlier)

    print("Indices of outliers:", outliers_indices , len(outliers_indices) )

In [None]:
# Re-check the non-null counts per column after the outlier step.
df.info()

In [None]:
# Remove every row that has a NaN in any column, modifying `df` in place.
df.dropna(inplace=True)

In [None]:
# # Calculate the min and max values of every column
# column_min_max = df.agg(['min', 'max'])

# column_min_max.to_csv('temo.csv' , index= False)

# print("Min and Max values of every column:")
# print(column_min_max)

# FEATURE ENGINEERING

In [None]:
import pandas as pd
from sklearn.feature_selection import SelectKBest, chi2, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Work on the current frame (`data` is an alias of `df`, not a copy)
data = df

# Separate features and target variable
x = data.drop('Diabetes_binary', axis=1)
y = data['Diabetes_binary']

# Sanity-check the target before any modelling work is done
if y.min() < 0:
    raise ValueError("Target variable contains negative values. Preprocessing required.")

# Split data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Min-max scaling fitted on the training split only; it also keeps the
# training features non-negative, which the chi2 score function requires
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Keep the 5 features with the highest chi2 score against the target
k_best = SelectKBest(score_func=chi2, k=5)
X_train_kbest = k_best.fit_transform(X_train_scaled, y_train)
X_test_kbest = k_best.transform(X_test_scaled)

# Report the selection both by position and by name. The positions are
# relative to `x` (target column removed), not to `df`, so the names are the
# unambiguous way to reuse the selection elsewhere.
selected_indices = k_best.get_support(indices=True)
print("Selected feature indices:", selected_indices)
print("Selected feature names:", list(x.columns[selected_indices]))

# Train a model with the selected features (e.g., Random Forest)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_kbest, y_train)

# Evaluate model performance on the held-out split
accuracy = model.score(X_test_kbest, y_test)
print("Accuracy with selected features:", accuracy)


# DATA VISUALIZATION AND DATA SELECTION

In [None]:
# Descriptive statistics for each column.
df.describe()

In [None]:
# Heatmap of the pairwise column correlations, each cell annotated with one decimal.
sns.heatmap(df.corr(),annot=True,fmt='0.1f',linewidth=.5)

In [None]:
# Bar chart of the number of rows per `Diabetes_binary` value.
# NOTE(review): recent seaborn releases warn when `palette` is passed without
# `hue` — confirm against the installed seaborn version.
sns.countplot(x='Diabetes_binary',data=df,palette=['g','r'])

In [None]:
# Distribution of the target column.
plt.figure(figsize=(20,5))

# Draw in the middle slot of a 1x3 grid (the other two slots stay empty)
plt.subplot(1,3,2)
plt.title('Distribution plot')
# `sns.distplot` is deprecated; `histplot` with a KDE overlay on a density
# scale is the documented replacement for the same picture.
sns.histplot(df["Diabetes_binary"], kde=True, stat="density", kde_kws=dict(cut=3))

plt.show()

In [None]:
# Compute the pairwise correlation matrix once, persist it, then display it.
correlation = df.corr()
# Keep the index when writing: the row labels identify each row of the matrix.
correlation.to_csv('correlation.csv')
correlation


In [None]:
# PHYSICAL ACTIVITY ON OUTPUT
# Prints the share of all rows in each (Diabetes_binary, PhysActivity) cell,
# then the diabetes rate inside the active group and inside the inactive group.

print(df.shape)

row_count = df.shape[0]

# Boolean row masks for the two values of each column
is_diabetic = df['Diabetes_binary'] == 1
is_not_diabetic = df['Diabetes_binary'] == 0
is_active = df['PhysActivity'] == 1
is_inactive = df['PhysActivity'] == 0

# Row counts of the four cells of the 2x2 table
not_diabetic_inactive = int((is_not_diabetic & is_inactive).sum())
not_diabetic_active = int((is_not_diabetic & is_active).sum())
diabetic_active = int((is_diabetic & is_active).sum())
diabetic_inactive = int((is_diabetic & is_inactive).sum())

# Group sizes: each group total is built from its own two cells only
count_phys = not_diabetic_active + diabetic_active
count_nonphy = not_diabetic_inactive + diabetic_inactive

print(not_diabetic_inactive/row_count)
print(not_diabetic_active/row_count)
print(diabetic_active/row_count)

print( "physics with diabates" , diabetic_active/count_phys   )

print(diabetic_inactive/row_count)

print( "non-physics with diabates" ,  diabetic_inactive/count_nonphy  )



In [None]:
# GENERAL HEALTH ON OUTPUT
# For Diabetes_binary == 1 and then == 0, print the share of all rows that
# fall in each GenHlth level 1..5; a dotted line separates the two groups.

print(df.shape)

row_count = df.shape[0]

for diabetes_flag in (1, 0):
    if diabetes_flag == 0:
        print(".....")

    for health_level in range(1, 6):
        in_cell = (df['Diabetes_binary'] == diabetes_flag) & (df['GenHlth'] == health_level)
        count_ones = df[in_cell].shape[0]
        print(count_ones/row_count)



# LOGISTIC REGRESSION

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Label vector is the target column; feature matrix is every other column
y = df['Diabetes_binary']
x = df.drop(columns='Diabetes_binary')

In [None]:
# splitting of data (70% train / 30% test, random_state=1)
# NOTE(review): the following cell splits x/y again with test_size=0.2 and
# random_state=42 and rebinds the same four names, so this split is
# overwritten before any model is trained on it.

from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(x,y,test_size=0.30,random_state=1)

In [None]:

# Hold out 20% of the rows for evaluation (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Learn the scaling statistics from the training split only, then apply them
# to both splits; X_train / X_test are rebound to the scaled arrays
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Logistic regression with default settings, fitted on the training split
logreg = LogisticRegression().fit(X_train, y_train)

# Predict the held-out rows and score the predictions
y_pred = logreg.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")

# RANDOM FOREST CLASSIFICATION

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Seeded random forest with otherwise default settings, fitted on the
# current training split
rf_classifier = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predict the held-out rows and score the predictions
y_pred = rf_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")

# XGBOOST

In [None]:
import xgboost as xgb

In [None]:

# Seeded XGBoost classifier with otherwise default settings
gbm_classifier = xgb.XGBClassifier(random_state=42)
gbm_classifier.fit(X_train, y_train)

# Predict the held-out rows and score the predictions
y_pred = gbm_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")

In [None]:


# Save the trained model to a file in the working directory
# NOTE(review): XGBoost chooses the on-disk format from the file extension and
# `.model` is not one of the documented ones (`.json` / `.ubj`) — confirm the
# resulting format against the installed XGBoost version before changing it,
# since anything that loads this file depends on the name.
filename = 'xgb_trained_model.model'
gbm_classifier.save_model(filename)

# LOGISTIC REGRESSION (SELECTED ATTRIBUTES)

In [None]:
# Hand-picked subset of predictor columns used by the cells that follow
selected_columns = [
    'HighBP',
    'HighChol',
    'BMI',
    'GenHlth',
    'DiffWalk',
    'HeartDiseaseorAttack',
    'PhysHlth',
    'Age',
    'AnyHealthcare',
]
x = df[selected_columns]
y = df['Diabetes_binary']

In [None]:
# splitting of data (70% train / 30% test, random_state=1)
# NOTE(review): the following cell splits x/y again with test_size=0.2 and
# random_state=42 and rebinds the same four names, so this split is
# overwritten before any model is trained on it.

from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(x,y,test_size=0.30,random_state=1)

In [None]:

# Hold out 20% of the rows for evaluation (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Learn the scaling statistics from the training split only, then apply them
# to both splits; X_train / X_test are rebound to the scaled arrays
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Logistic regression with default settings, fitted on the training split
logreg = LogisticRegression().fit(X_train, y_train)

# Predict the held-out rows and score the predictions
y_pred = logreg.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")

# XGBOOST (SELECTED ATTRIBUTES)

In [None]:

# Seeded XGBoost classifier with otherwise default settings
gbm_classifier = xgb.XGBClassifier(random_state=42)
gbm_classifier.fit(X_train, y_train)

# Predict the held-out rows and score the predictions
y_pred = gbm_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")

# XGBOOST WITH FEATURE ENGINEERING

In [None]:
# Features picked by column position in `df`.
# NOTE(review): presumably these positions come from the SelectKBest step; that
# step reports indices relative to the feature matrix (target column removed),
# so confirm they were shifted to `df` positions.
x = df.iloc[:, [1, 13, 15, 16]]
# A scalar position yields a 1-D Series, the label shape the other modelling
# cells pass to the estimators (a list such as [0] would give a one-column
# DataFrame). Presumably column 0 is `Diabetes_binary` — TODO confirm.
y = df.iloc[:, 0]

In [None]:
# 70% train / 30% test split with a fixed seed; features are used unscaled here.
X_train,X_test,y_train,y_test = train_test_split(x,y,test_size=0.30,random_state=42)

In [None]:

# Seeded XGBoost classifier with otherwise default settings
gbm_classifier = xgb.XGBClassifier(random_state=42)
gbm_classifier.fit(X_train, y_train)

# Predict the held-out rows and score the predictions
y_pred = gbm_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")