**> 1)Veri setini hazırlama**

# KÜTÜPHANELER

In [None]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import seaborn as sns

from matplotlib import pyplot as plt
import missingno as msno
from sklearn import preprocessing
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import *
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split

In [None]:
# Load the BRFSS 2021 diabetes health-indicators CSV into `df`.
# NOTE(review): absolute Kaggle input path -- this cell only runs where that path exists.
df=pd.read_csv('/kaggle/input/d/julnazz/diabetes-health-indicators-dataset/diabetes_012_health_indicators_BRFSS2021.csv')

In [None]:
# Per column: True if the column contains at least one missing value.
df.isnull().any()

In [None]:
# Veri setinde rasgele eksik veriler oluşturalım.

import random

def add_random_missing_values(dataframe: pd.DataFrame,
                              missing_rate: float = 0.05,
                              seed: "int | None" = 42) -> pd.DataFrame:
    """Return a copy of ``dataframe`` with randomly chosen cells set to NaN.

    To use this function, you need to import pandas, numpy and random libraries.

    ``int(dataframe.size * missing_rate)`` (row, column) positions are drawn
    uniformly *with replacement*. The same cell can therefore be drawn more
    than once, so the realised share of NaN cells can be slightly lower than
    ``missing_rate``.

    Args:
        dataframe (pd.DataFrame): DataFrame to be processed. It is not modified.
        missing_rate (float): Fraction of cells to draw, e.g. 0.05 for 5 %.
            Defaults to 0.05.
        seed (int | None): Seed for the random draws. Every integer, including
            0, gives a reproducible result. ``None`` leaves the draws unseeded.
            Defaults to 42.

    Returns:
        pd.DataFrame: A new DataFrame with the same shape, index and columns
        as ``dataframe`` in which the drawn cells are NaN.
    """
    # Work on a copy so the caller's frame stays intact
    df_missing = dataframe.copy()

    n_rows, n_cols = dataframe.shape
    num_missing = int(dataframe.size * missing_rate)

    # A private generator keeps the module-level `random` state untouched and,
    # unlike a truthiness check on the seed, also honours seed=0.
    rng = random.Random(seed)

    # Draw random (row, column) positions and blank them out
    for _ in range(num_missing):
        row_idx = rng.randint(0, n_rows - 1)
        col_idx = rng.randint(0, n_cols - 1)

        df_missing.iat[row_idx, col_idx] = np.nan

    return df_missing

# Rebind `df` to a copy in which up to 3 % of all cells (any column) are NaN;
# the seed is left at the function's default.
df = add_random_missing_values(dataframe = df,
                               missing_rate = 0.03)

# 2)veriyi inceleme

In [None]:
# (rows, columns) of the frame.
df.shape

In [None]:
df.head(10)  # first 10 rows

In [None]:
df.tail(10)  # last 10 rows

In [None]:
# Summary of the frame: column names, non-null counts and dtypes.
df.info()

In [None]:
# dtype of every column.
df.dtypes

In [None]:
# Number of distinct dtypes that occur among the columns.
df.dtypes.nunique()

In [None]:
# dtype of every column.
# NOTE(review): the same expression is already displayed two cells earlier.
df.dtypes

In [None]:
# Collect the column names by dtype family: numeric vs. object/category
numeric_cols = list(df.select_dtypes(include='number').columns)
categorical_cols = list(df.select_dtypes(include=['object', 'category']).columns)

for column_group in (numeric_cols, categorical_cols):
    print(column_group)

In [None]:
# Descriptive statistics, transposed so that each column of `df` becomes one row.
df.describe().T

# 3) Eksik verileri inceleme

In [None]:
# Number of missing values per column
df.isnull().sum()

In [None]:
# Number of non-missing values per column
df.notnull().sum()

In [None]:
# missingno bar chart summarising how complete each column is.
msno.bar(df)

In [None]:
# missingno heatmap of how strongly missingness in one column co-occurs with missingness in another.
msno.heatmap(df)

# eksik veri doldurma

# Veri seti kategorik değişkenleri float olarak tutuyor. Aşağıdaki kod (eksik değerleri diğer sütunları kullanarak tahmin etme) kategorik sütunlara da sürekli değerler atıyor; örneğin cinsiyet 0 veya 1 olması gerekirken 0 ile 1 arasında ondalıklı bir değer alıyor. Bu nedenle bu kodu burada kullanmayacağım; eksik değer içeren satırları sileceğim.
 
    from sklearn.experimental import enable_iterative_imputer
    from sklearn.impute import IterativeImputer

    imputer_iterative = IterativeImputer(max_iter=10, random_state=0) # 10 iterasyon ile doldurma
    df_iterative = pd.DataFrame(imputer_iterative.fit_transform(df), columns=df.columns)

    print("\nİteratif İmpütasyon ile Doldurulmuş Veri:\n", df_iterative)

In [None]:
# Keep only the rows that have no missing value in any column
df = df.dropna()

In [None]:
# (rows, columns) of the frame at this point in the notebook.
df.shape

**Eksik veriler silinince satır sayısı da düştü.**

# 4) veri görselleştirme

In [None]:
# One histogram per column, drawn on a 15x15 inch figure.
df.hist(figsize = (15,15))
plt.show()

In [None]:
# Pie chart of how often each Diabetes_012 value occurs.
plt.figure(figsize=(5, 5))  # set the figure size
df['Diabetes_012'].value_counts().plot.pie(autopct='%1.1f%%', startangle=90)
plt.title('Diabet dağılımı')
plt.axis('equal')  # equal axis scaling so the pie is drawn as a circle
plt.show()

# değişkenlerin birbirine göre durumları

> # yaş ve diabet

In [None]:
# Use the same background colour for the axes and the figure in seaborn plots.
sns.set(rc={"axes.facecolor":"#EAE7F9","figure.facecolor":"#EAE7F9"})
# Box plot of Age for each Diabetes_012 value.
p=sns.catplot(x="Diabetes_012",y="Age", data=df, kind='box')
# The trailing semicolon suppresses the repr of the returned title object.
plt.title("Age and Diabet ", size=20, y=1.0);

In [None]:
# Row counts for every (Age, Diabetes_012) combination.
pd.crosstab(df.Age,df.Diabetes_012)

> # yaş grubuna aralık vererek inceleyelim


In [None]:

plt.figure(figsize=(8, 6))
# `Age` is not an age in years. Per the BRFSS codebook it is the five-year age
# category code: 1 = 18-24, 2 = 25-29, 3 = 30-34, 4 = 35-39, 5 = 40-44, ...,
# 13 = 80 or older. With right-inclusive bins (0,4], (4,6], (6,8], (8,10],
# (10,inf) the groups cover 18-39, 40-49, 50-59, 60-69 and 70+, so the labels
# must say that.
df['AgeGroup'] = pd.cut(df['Age'],
                        bins=[0, 4, 6, 8, 10, np.inf],
                        labels=['18-39', '40-49', '50-59', '60-69', '70+'])
sns.countplot(x='AgeGroup', hue='Diabetes_012', data=df)
plt.title('yaş rubuna göre diabet')
plt.show()

# Mean of the Diabetes_012 code within each age group
yaş_diabet = df.groupby('AgeGroup')['Diabetes_012'].mean()
print(yaş_diabet)

# **Artan yaş ile diabet düzeyinde de yükselme görülüyor**

In [None]:
# Remove the temporary AgeGroup helper column again
df = df.drop(columns=["AgeGroup"])

> # kolestrol ve diabet

In [None]:
# Use the same background colour for the axes and the figure in seaborn plots.
sns.set(rc={"axes.facecolor":"#EAE7F9","figure.facecolor":"#EAE7F9"})
# Box plot of HighChol for each Diabetes_012 value.
# NOTE(review): HighChol looks like a 0/1 flag, for which a box plot says little -- confirm.
p=sns.catplot(x="Diabetes_012",y="HighChol", data=df, kind='box')
# The trailing semicolon suppresses the repr of the returned title object.
plt.title("Yüksek kolestrol and Diabet ", size=20, y=1.0);

In [None]:
# Row counts for every (HighChol, Diabetes_012) combination.
pd.crosstab(df.HighChol,df.Diabetes_012)

In [None]:

# Count of rows per HighChol value, split by Diabetes_012.
plt.figure(figsize=(8, 6))
sns.countplot(x='HighChol', hue='Diabetes_012', data=df)
plt.title('yüksek kolestrol ve diabet')
plt.show()

# Mean of the Diabetes_012 code for each HighChol value.
yüksekko_diabet = df.groupby('HighChol')['Diabetes_012'].mean()
print(yüksekko_diabet)

# **Yüksek kolesterolü olan kişilerin diabeti olma olasılığı daha yüksek**

# 5) outlier kontrolü

In [None]:
# Outliers
def outlier_thresholds(dataframe, col_name, th1=0.05, th3=0.95):
    """Compute the lower and upper outlier fences of one column.

    The fences sit 1.5 times the inter-quantile range outside the ``th1`` and
    ``th3`` quantiles of ``dataframe[col_name]``.

    Args:
        dataframe: Frame that holds the column.
        col_name: Name of the column to analyse.
        th1: Lower quantile level. Defaults to 0.05.
        th3: Upper quantile level. Defaults to 0.95.

    Returns:
        Tuple ``(low_limit, up_limit)``.
    """
    column = dataframe[col_name]
    lower_q = column.quantile(th1)
    upper_q = column.quantile(th3)
    margin = 1.5 * (upper_q - lower_q)
    return lower_q - margin, upper_q + margin


def check_outlier(dataframe, col_name):
    """Tell whether ``dataframe[col_name]`` has any value outside its outlier fences.

    The fences come from ``outlier_thresholds`` with its default quantile levels.

    Args:
        dataframe: Frame that holds the column.
        col_name: Name of the column to check.

    Returns:
        bool: True if at least one value lies above the upper or below the
        lower fence, otherwise False.
    """
    low_limit, up_limit = outlier_thresholds(dataframe, col_name)
    values = dataframe[col_name]
    # Test the mask itself. Reducing the *selected rows* with .any() asks
    # whether those rows contain a truthy value, which is False when the
    # outlier rows hold only zeros.
    outlier_mask = (values > up_limit) | (values < low_limit)
    return bool(outlier_mask.any())

def replace_with_thresholds(dataframe, col_name, th1=0.05, th3=0.95):
    """Cap the outliers of one column at its outlier fences, in place.

    Values below the lower fence are set to the lower fence and values above
    the upper fence are set to the upper fence. The fences come from
    ``outlier_thresholds(dataframe, col_name, th1, th3)``.

    Args:
        dataframe: Frame to modify. It is changed in place.
        col_name: Name of the column to cap.
        th1: Lower quantile level passed to ``outlier_thresholds``. Defaults to 0.05.
        th3: Upper quantile level passed to ``outlier_thresholds``. Defaults to 0.95.

    Returns:
        None.
    """
    low_limit, up_limit = outlier_thresholds(dataframe, col_name, th1, th3)
    # Always cap both sides. Skipping the lower side whenever the lower fence
    # is <= 0 would leave low outliers untouched in columns that can hold
    # negative values; for non-negative columns this assignment matches no rows.
    dataframe.loc[dataframe[col_name] < low_limit, col_name] = low_limit
    dataframe.loc[dataframe[col_name] > up_limit, col_name] = up_limit

In [None]:
# Check Outliers
for col in df.columns:
    print(check_outlier(df, col))

# 6) KORELASYON

In [None]:
#correlation matrix
# Pairwise correlation between all columns
correlation = df.corr()

# plt.subplots returns (figure, axes) in that order; draw the heatmap on that
# axes explicitly and pin the colour scale to the full [-1, 1] range.
fig, ax = plt.subplots(figsize=(15,10))
sns.heatmap(correlation, vmin=-1, vmax=1, cmap='RdYlBu', annot=True, ax=ax)
plt.show()

> # 1'e yakın korelasyon değerleri olsaydı çoklu doğrusal bağlantı (ve buna bağlı aşırı uyum) riski olurdu. Bu durumda 1'e yakın korelasyonlu değişkenlerden birini çıkarmamız gerekirdi.

# 7) sınıflandırma analizi (Random Forest)

In [None]:
# Target is the Diabetes_012 column; the features are all remaining columns
y = df['Diabetes_012']
X = df.drop(columns='Diabetes_012')

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25 % of the rows for testing. The split is shuffled (the default),
# so random_state actually takes effect; it is ignored when shuffle=False.
# stratify=y keeps the class proportions of the target the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0, stratify=y)

In [None]:
# Random forest classifier: fit on the training split, evaluate on the test split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.ensemble import RandomForestClassifier

# Fixed random_state so that re-running the notebook reproduces the same forest
# and therefore the same scores.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit the model on the training data
rf_classifier.fit(X_train, y_train)

# Make predictions on the test data
y_pred = rf_classifier.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

# Per-class precision / recall / f1 as a table
rf_report = pd.DataFrame(classification_report(y_test, y_pred, output_dict=True))
print(rf_report)

# Compute the confusion matrix once and reuse it for the printout and the plot
test_confusion = confusion_matrix(y_test, y_pred=y_pred)
print(f"Confusion Matrix (test): \n {test_confusion}\n")

disp = ConfusionMatrixDisplay(confusion_matrix=test_confusion)
disp.plot()

> # Accuracy değeri yüksek olsa da classification report'ta düşük değerler olduğunu görebiliyoruz.
> # farklı classifier modelleri ile denenebilir.
> # outlier değerler gözden geçirilip veriden çıkarılarak tekrar denenebilir.