In [72]:
import pandas as pd

csv_file_path = 'diabetes.csv'
diabetes_data = pd.read_csv(csv_file_path)


In [73]:
# Display the first few rows of the dataset
print(diabetes_data.head())

# Display column names
print("Column Names:")
print(diabetes_data.columns)


   Age Gender Polyuria Polydipsia sudden weight loss weakness Polyphagia  \
0   40   Male       No        Yes                 No      Yes         No   
1   58   Male       No         No                 No      Yes         No   
2   41   Male      Yes         No                 No      Yes        Yes   
3   45   Male       No         No                Yes      Yes        Yes   
4   60   Male      Yes        Yes                Yes      Yes        Yes   

  Genital thrush visual blurring Itching Irritability delayed healing  \
0             No              No     Yes           No             Yes   
1             No             Yes      No           No              No   
2             No              No     Yes           No             Yes   
3            Yes              No     Yes           No             Yes   
4             No             Yes     Yes          Yes             Yes   

  partial paresis muscle stiffness Alopecia Obesity     class  
0              No              Yes      

In [74]:
# Check for missing values
print("Missing Values:")
print(diabetes_data.isnull().sum())

Missing Values:
Age                   0
Gender                0
Polyuria              0
Polydipsia            0
sudden weight loss    0
weakness              0
Polyphagia            0
Genital thrush        0
visual blurring       0
Itching               0
Irritability          0
delayed healing       0
partial paresis       0
muscle stiffness      0
Alopecia              0
Obesity               0
class                 0
dtype: int64


In [75]:
# Check data types
print("Data Types:")
print(diabetes_data.dtypes)


Data Types:
Age                    int64
Gender                object
Polyuria              object
Polydipsia            object
sudden weight loss    object
weakness              object
Polyphagia            object
Genital thrush        object
visual blurring       object
Itching               object
Irritability          object
delayed healing       object
partial paresis       object
muscle stiffness      object
Alopecia              object
Obesity               object
class                 object
dtype: object


In [76]:
from sklearn.preprocessing import LabelEncoder

# Convert 'class' variable to numerical values (0 and 1)
label_encoder = LabelEncoder()
diabetes_data['class'] = label_encoder.fit_transform(diabetes_data['class'])


In [77]:
# Remove leading and trailing whitespaces from column names
diabetes_data.columns = diabetes_data.columns.str.strip()

# Convert binary categorical variables to numerical values
binary_columns = ['Polyuria', 'Polydipsia', 'sudden weight loss', 'weakness', 'Polyphagia', 'Genital thrush', 'visual blurring', 'Itching', 'Irritability', 'delayed healing', 'partial paresis', 'muscle stiffness', 'Alopecia', 'Obesity']
diabetes_data[binary_columns] = diabetes_data[binary_columns].apply(lambda x: x.map({'Yes': 1, 'No': 0}))


In [78]:
# Convert 'Gender' variable to one-hot encoding
diabetes_data = pd.get_dummies(diabetes_data, columns=['Gender'])


In [79]:
# Check updated data types
print("Updated Data Types:")
print(diabetes_data.dtypes)


Updated Data Types:
Age                   int64
Polyuria              int64
Polydipsia            int64
sudden weight loss    int64
weakness              int64
Polyphagia            int64
Genital thrush        int64
visual blurring       int64
Itching               int64
Irritability          int64
delayed healing       int64
partial paresis       int64
muscle stiffness      int64
Alopecia              int64
Obesity               int64
class                 int32
Gender_Female         uint8
Gender_Male           uint8
dtype: object


In [80]:
# Check class distribution
print("Class Distribution:")
print(diabetes_data['class'].value_counts())

Class Distribution:
1    320
0    200
Name: class, dtype: int64


In [81]:
# Separate features and target variable
X = diabetes_data.drop('class', axis=1)
y = diabetes_data['class']

In [82]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [83]:
# Correlation matrix
correlation_matrix = diabetes_data.corr()

# Correlation with the target variable
correlation_with_target = correlation_matrix['class'].sort_values(ascending=False)
selected_features_corr = correlation_with_target[abs(correlation_with_target) > 0.1].index

# Display selected features based on correlation
print("Selected Features based on Correlation:")
print(selected_features_corr)


Selected Features based on Correlation:
Index(['class', 'Polyuria', 'Polydipsia', 'Gender_Female',
       'sudden weight loss', 'partial paresis', 'Polyphagia', 'Irritability',
       'visual blurring', 'weakness', 'muscle stiffness', 'Genital thrush',
       'Age', 'Alopecia', 'Gender_Male'],
      dtype='object')


Recursive Feature Elimination (RFE)

In [87]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

# Create a RandomForestClassifier
rf_classifier = RandomForestClassifier()

# Separate features and target variable
X = diabetes_data.drop('class', axis=1)
y = diabetes_data['class']

# Perform RFE
rfe = RFE(rf_classifier, n_features_to_select=10)
# Fit RFE to the data
rfe.fit(X, y)

# Filter the dataset with selected features
selected_features_rfe = X.columns[rfe.support_]

# Display selected features based on RFE
print("Selected Features based on RFE:")
print(selected_features_rfe)


Selected Features based on RFE:
Index(['Age', 'Polyuria', 'Polydipsia', 'sudden weight loss', 'Irritability',
       'delayed healing', 'partial paresis', 'Alopecia', 'Gender_Female',
       'Gender_Male'],
      dtype='object')


Feature Engineering:
    Combine features to create interaction terms.

In [88]:
# Example: Interaction feature between 'Polyuria' and 'Polydipsia'
diabetes_data['Polyuria_Polydipsia'] = diabetes_data['Polyuria'] * diabetes_data['Polydipsia']


In [89]:
# Example: Age binning
bins = [0, 30, 50, 100]
labels = ['Young', 'Middle-Aged', 'Old']
diabetes_data['Age_Group'] = pd.cut(diabetes_data['Age'], bins=bins, labels=labels, right=False)


In [90]:
# Example: Creating a feature based on medical knowledge
# If a person has both 'Polyuria' and 'Polydipsia', it might indicate a higher risk
diabetes_data['Polyuria_Polydipsia'] = diabetes_data['Polyuria'] & diabetes_data['Polydipsia']


In [91]:
from sklearn.model_selection import train_test_split

# Separate features and target variable
X = diabetes_data.drop('class', axis=1)
y = diabetes_data['class']

# Split the data into training, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)


In [92]:
from sklearn.model_selection import cross_val_score

# Example: Cross-validation with a Random Forest classifier
rf_classifier = RandomForestClassifier()

# Perform cross-validation
cv_scores = cross_val_score(rf_classifier, X_train, y_train, cv=5, scoring='accuracy')

# Display cross-validation scores
print("Cross-Validation Scores:", cv_scores)
print("Mean Accuracy:", cv_scores.mean())


ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Global Village\AppData\Roaming\Python\Python310\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Global Village\AppData\Roaming\Python\Python310\site-packages\sklearn\base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\Global Village\AppData\Roaming\Python\Python310\site-packages\sklearn\ensemble\_forest.py", line 348, in fit
    X, y = self._validate_data(
  File "C:\Users\Global Village\AppData\Roaming\Python\Python310\site-packages\sklearn\base.py", line 622, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "C:\Users\Global Village\AppData\Roaming\Python\Python310\site-packages\sklearn\utils\validation.py", line 1146, in check_X_y
    X = check_array(
  File "C:\Users\Global Village\AppData\Roaming\Python\Python310\site-packages\sklearn\utils\validation.py", line 915, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
  File "C:\Users\Global Village\AppData\Roaming\Python\Python310\site-packages\sklearn\utils\_array_api.py", line 380, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
  File "d:\snakes\lib\site-packages\pandas\core\generic.py", line 2070, in __array__
    return np.asarray(self._values, dtype=dtype)
ValueError: could not convert string to float: 'Middle-Aged'

--------------------------------------------------------------------------------
4 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Global Village\AppData\Roaming\Python\Python310\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Global Village\AppData\Roaming\Python\Python310\site-packages\sklearn\base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\Global Village\AppData\Roaming\Python\Python310\site-packages\sklearn\ensemble\_forest.py", line 348, in fit
    X, y = self._validate_data(
  File "C:\Users\Global Village\AppData\Roaming\Python\Python310\site-packages\sklearn\base.py", line 622, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "C:\Users\Global Village\AppData\Roaming\Python\Python310\site-packages\sklearn\utils\validation.py", line 1146, in check_X_y
    X = check_array(
  File "C:\Users\Global Village\AppData\Roaming\Python\Python310\site-packages\sklearn\utils\validation.py", line 915, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
  File "C:\Users\Global Village\AppData\Roaming\Python\Python310\site-packages\sklearn\utils\_array_api.py", line 380, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
  File "d:\snakes\lib\site-packages\pandas\core\generic.py", line 2070, in __array__
    return np.asarray(self._values, dtype=dtype)
ValueError: could not convert string to float: 'Young'
