In [1]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.preprocessing import FunctionTransformer


In [2]:
import os

# Get the number of CPU cores
num_cores = os.cpu_count()

print(f"Number of CPU cores: {num_cores}")


Number of CPU cores: 4


In [3]:
train_df = pd.read_csv(r"artifacts\train.csv")
train_df.shape  
train_df.head() 

Unnamed: 0,age,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,query hypothyroid,...,TT4 measured,TT4,T4U measured,T4U,FTI measured,FTI,TBG measured,TBG,referral source,outcome
0,42,F,f,f,f,f,f,f,f,f,...,t,139.0,t,1.04,t,134.0,f,,other,P
1,63,M,f,f,f,f,f,f,f,f,...,t,102.0,t,1.18,t,87.0,f,,SVI,P
2,40,M,f,f,f,f,f,f,f,f,...,t,105.0,t,0.83,t,125.0,f,,SVI,P
3,34,F,f,f,f,f,f,f,f,f,...,t,104.0,t,1.13,t,92.0,f,,SVHC,P
4,50,F,f,f,f,f,f,f,f,t,...,t,98.0,t,0.8,t,122.0,f,,SVI,P


In [4]:
print("Columns before dropping:", train_df.columns)


Columns before dropping: Index(['age', 'sex', 'on thyroxine', 'query on thyroxine',
       'on antithyroid medication', 'sick', 'pregnant', 'thyroid surgery',
       'I131 treatment', 'query hypothyroid', 'query hyperthyroid', 'lithium',
       'goitre', 'tumor', 'hypopituitary', 'psych', 'TSH measured', 'TSH',
       'T3 measured', 'T3', 'TT4 measured', 'TT4', 'T4U measured', 'T4U',
       'FTI measured', 'FTI', 'TBG measured', 'TBG', 'referral source',
       'outcome'],
      dtype='object')


In [5]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2829 entries, 0 to 2828
Data columns (total 30 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   age                        2829 non-null   int64  
 1   sex                        2721 non-null   object 
 2   on thyroxine               2829 non-null   object 
 3   query on thyroxine         2829 non-null   object 
 4   on antithyroid medication  2829 non-null   object 
 5   sick                       2829 non-null   object 
 6   pregnant                   2829 non-null   object 
 7   thyroid surgery            2829 non-null   object 
 8   I131 treatment             2829 non-null   object 
 9   query hypothyroid          2829 non-null   object 
 10  query hyperthyroid         2829 non-null   object 
 11  lithium                    2829 non-null   object 
 12  goitre                     2829 non-null   object 
 13  tumor                      2829 non-null   objec

In [6]:
# Assuming train_df is your DataFrame
print(train_df[['TSH', 'T3', 'TT4', 'FTI']].info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2829 entries, 0 to 2828
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   TSH     2550 non-null   float64
 1   T3      2243 non-null   float64
 2   TT4     2654 non-null   float64
 3   FTI     2544 non-null   float64
dtypes: float64(4)
memory usage: 88.5 KB
None


In [7]:
test_df = pd.read_csv(r"artifacts\test.csv")
test_df
print(test_df[['TSH', 'T3', 'TT4', 'FTI']].info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 943 entries, 0 to 942
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   TSH     853 non-null    float64
 1   T3      760 non-null    float64
 2   TT4     887 non-null    float64
 3   FTI     843 non-null    float64
dtypes: float64(4)
memory usage: 29.6 KB
None


In [8]:
# Separate numerical and categorical columns
from sklearn.preprocessing import OneHotEncoder


numerical_columns = ['TSH', 'T3', 'TT4', 'FTI']
categorical_columns = ['sex', 'on thyroxine', 'query hypothyroid', 'psych', 'TSH measured', 'pregnant']

# Define numerical pipeline
num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ]
)

# Define categorical pipeline
cat_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ]
)

# Combine the numerical and categorical transformers using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, numerical_columns),
        ('cat', cat_pipeline, categorical_columns)
    ]
)



In [9]:
outcome="outcome"

In [10]:
# Drop specified columns
drop_columns = [
    "goitre", "referral source", "on antithyroid medication", "thyroid surgery",
    "T3 measured", "TT4 measured", "query hyperthyroid", "age", "query on thyroxine",
    "lithium", "T4U measured", "T4U", "FTI measured", "hypopituitary", "tumor",
    "I131 treatment", "sick", "TBG measured", "TBG","outcome"
]

In [11]:
if 'outcome' in train_df.columns:
    print("'outcome' is present in the DataFrame")
else:
    print("'outcome' is not present in the DataFrame")


'outcome' is present in the DataFrame


In [35]:
# Use the preprocessor to fit_transform directly on the DataFrame
input_feature_train_df = train_df.drop(columns=drop_columns,axis=1) 
input_feature_train_arr = preprocessor.fit_transform(input_feature_train_df)

# Extract column names after one-hot encoding
cat_feature_names = preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out(categorical_columns)

# Combine numerical and categorical feature names
feature_names = numerical_columns + list(cat_feature_names)

# Convert the transformed array back to DataFrame with column names
input_feature_train_df_transformed = pd.DataFrame(input_feature_train_arr, columns=feature_names)
input_feature_train_df_transformed.head()

#input_feature_train_arr

Unnamed: 0,TSH,T3,TT4,FTI,sex_F,sex_M,on thyroxine_f,on thyroxine_t,query hypothyroid_f,query hypothyroid_t,psych_f,psych_t,TSH measured_f,TSH measured_t,pregnant_f,pregnant_t
0,-0.133075,-0.558446,0.93141,0.785311,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
1,0.039602,-0.146112,-0.162566,-0.727672,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
2,-0.133075,0.541111,-0.073865,0.495591,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
3,-0.128963,-0.146112,-0.103432,-0.566716,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
4,-0.001511,-1.932892,-0.280834,0.399017,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0


In [13]:
input_feature_train_df

Unnamed: 0,sex,on thyroxine,pregnant,query hypothyroid,psych,TSH measured,TSH,T3,TT4,FTI
0,F,f,f,f,f,t,1.600,1.6,139.0,134.0
1,M,f,f,f,f,t,5.800,1.9,102.0,87.0
2,M,f,f,f,f,t,1.600,2.4,105.0,125.0
3,F,f,f,f,t,t,1.700,1.9,104.0,92.0
4,F,f,f,t,f,t,4.800,0.6,98.0,122.0
...,...,...,...,...,...,...,...,...,...,...
2824,M,f,f,f,f,t,0.300,2.1,122.0,150.0
2825,F,f,f,f,f,t,0.200,3.8,252.0,222.0
2826,F,f,f,f,f,t,2.300,2.5,121.0,94.0
2827,F,f,f,f,f,t,0.035,3.1,142.0,97.0


In [14]:
isinstance(input_feature_train_df,pd.DataFrame)

True

In [15]:
## target col  y train
# Encode the target column
label_encoder = LabelEncoder()
target_feature_train_df = label_encoder.fit_transform(train_df[outcome])  # y col
target_feature_train_df = pd.DataFrame(target_feature_train_df)
target_feature_train_df ## y train 

Unnamed: 0,0
0,1
1,1
2,1
3,1
4,1
...,...
2824,1
2825,1
2826,1
2827,1


In [16]:
target_feature_train_df.shape

(2829, 1)

In [36]:

## X test 
# Use the preprocessor to transform the DataFrame
input_feature_test_df = test_df.drop(columns=drop_columns, axis=1)
input_feature_test_arr = preprocessor.transform(input_feature_test_df)

# Convert the transformed array back to DataFrame with column names
input_feature_test_df_transformed = pd.DataFrame(input_feature_test_arr, columns=feature_names)
input_feature_test_df_transformed.shape
#input_feature_test_arr

(943, 16)

In [18]:
input_feature_test_arr.shape

(943, 16)

In [19]:
help(preprocessor)

Help on ColumnTransformer in module sklearn.compose._column_transformer object:

class ColumnTransformer(sklearn.base.TransformerMixin, sklearn.utils.metaestimators._BaseComposition)
 |  ColumnTransformer(transformers, *, remainder='drop', sparse_threshold=0.3, n_jobs=None, transformer_weights=None, verbose=False, verbose_feature_names_out=True)
 |  
 |  Applies transformers to columns of an array or pandas DataFrame.
 |  
 |  This estimator allows different columns or column subsets of the input
 |  to be transformed separately and the features generated by each transformer
 |  will be concatenated to form a single feature space.
 |  This is useful for heterogeneous or columnar data, to combine several
 |  feature extraction mechanisms or transformations into a single transformer.
 |  
 |  Read more in the :ref:`User Guide <column_transformer>`.
 |  
 |  .. versionadded:: 0.20
 |  
 |  Parameters
 |  ----------
 |  transformers : list of tuples
 |      List of (name, transformer, colu

In [20]:
## target col  y test
# Encode the target column

target_feature_test_df = target_feature_test_df=train_df[outcome]
label_encoder = LabelEncoder()
target_feature_test_df = label_encoder.fit_transform(test_df[outcome])  # y col
target_feature_test_df = pd.DataFrame(target_feature_test_df)
target_feature_test_df ## y test 


Unnamed: 0,0
0,1
1,1
2,1
3,0
4,1
...,...
938,1
939,1
940,1
941,1


In [21]:
target_feature_test_df.shape

(943, 1)

In [22]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2829 entries, 0 to 2828
Data columns (total 30 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   age                        2829 non-null   int64  
 1   sex                        2721 non-null   object 
 2   on thyroxine               2829 non-null   object 
 3   query on thyroxine         2829 non-null   object 
 4   on antithyroid medication  2829 non-null   object 
 5   sick                       2829 non-null   object 
 6   pregnant                   2829 non-null   object 
 7   thyroid surgery            2829 non-null   object 
 8   I131 treatment             2829 non-null   object 
 9   query hypothyroid          2829 non-null   object 
 10  query hyperthyroid         2829 non-null   object 
 11  lithium                    2829 non-null   object 
 12  goitre                     2829 non-null   object 
 13  tumor                      2829 non-null   objec

In [23]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 943 entries, 0 to 942
Data columns (total 30 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   age                        942 non-null    float64
 1   sex                        901 non-null    object 
 2   on thyroxine               943 non-null    object 
 3   query on thyroxine         943 non-null    object 
 4   on antithyroid medication  943 non-null    object 
 5   sick                       943 non-null    object 
 6   pregnant                   943 non-null    object 
 7   thyroid surgery            943 non-null    object 
 8   I131 treatment             943 non-null    object 
 9   query hypothyroid          943 non-null    object 
 10  query hyperthyroid         943 non-null    object 
 11  lithium                    943 non-null    object 
 12  goitre                     943 non-null    object 
 13  tumor                      943 non-null    object 

In [24]:
import os
os.environ['LOKY_MAX_CPU_COUNT'] = '4'  # Set the number of cores you want to use
from imblearn.over_sampling import SMOTE

# Your SMOTE code here
smote = SMOTE(sampling_strategy='auto')
X_smote, y_smote = smote.fit_resample(input_feature_train_arr, target_feature_train_df)


[WinError 2] The system cannot find the file specified
  File "e:\ineuron_2023\thyroiddieaseprediction\thyroid_disease_detection\env\lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
  File "e:\ineuron_2023\thyroiddieaseprediction\thyroid_disease_detection\env\lib\subprocess.py", line 489, in run
    with Popen(*popenargs, **kwargs) as process:
  File "e:\ineuron_2023\thyroiddieaseprediction\thyroid_disease_detection\env\lib\subprocess.py", line 854, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "e:\ineuron_2023\thyroiddieaseprediction\thyroid_disease_detection\env\lib\subprocess.py", line 1307, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,


In [25]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=11)

X_smote, y_smote = smote.fit_resample(input_feature_train_arr, target_feature_train_df)


In [26]:
print("Shape before the Oversampling : ",input_feature_train_arr.shape)
print("Shape after the Oversampling : ",X_smote.shape)

Shape before the Oversampling :  (2829, 16)
Shape after the Oversampling :  (5230, 16)


In [27]:
print("Shape before the Oversampling : ",target_feature_train_df.shape)
print("Shape after the Oversampling : ",y_smote.shape)

Shape before the Oversampling :  (2829, 1)
Shape after the Oversampling :  (5230, 1)


In [28]:

### combining Xtrain (x smote) and y train (y smote )

# train_arr = np.c_[X_smote, y_smote]
# train_arr  = pd.DataFrame(train_arr)

# train_arr


In [29]:
# test_arr = np.c_[input_feature_test_arr, target_feature_test_df] ## combining y and x test 

# test_arr = pd.DataFrame(test_arr)
# test_arr

In [30]:
# X_train, y_train, X_test, y_test = (
#     train_arr.iloc[:, :-1],
#     train_arr.iloc[:, -1],
#     test_arr.iloc[:, :-1],
#     test_arr.iloc[:, -1]
# )


In [31]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Assuming x_smote, y_smote, input_feature_test_arr, target_feature_test_df are defined elsewhere in your code

# Define the models dictionary
models = {
    'Logistic Regression': LogisticRegression(),
    'SVM': SVC(),
    'Random Forest': RandomForestClassifier(),
    'Naive Bayes': GaussianNB(),
    'XGBoost': XGBClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'KNN': KNeighborsClassifier()
}

# Initialize an empty list to store the results
results_list = []

# Evaluate and append metrics for each model
for model_name, model in models.items():
    # Train the model
    model.fit(X_smote, y_smote)
    
    # Predictions
    y_pred = model.predict(input_feature_test_arr)

    # Metrics
    accuracy = accuracy_score(target_feature_test_df, y_pred) * 100
    
    # Append results to the list
    results_list.append({
        'Model': model_name,
        'Accuracy': accuracy,
    })

# Convert the list to a DataFrame
results_df = pd.DataFrame(results_list)

# Sort the accuracy results DataFrame by the "Accuracy" column in descending order
sorted_accuracy_df = results_df.sort_values(by='Accuracy', ascending=False)

# Display the model with the maximum accuracy
max_accuracy_model = sorted_accuracy_df.iloc[0]

print(f"Model with Maximum Accuracy:\n{max_accuracy_model}")


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  return fit_method(estimator, *args, **kwargs)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


Model with Maximum Accuracy:
Model       Random Forest
Accuracy        99.469777
Name: 2, dtype: object


  return self._fit(X, y)
