In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('datasets/disease.csv')
df.head()

Unnamed: 0,gender,city,age,bp,cough,disease
0,Male,Lahore,60.0,low,Moderate,Yes
1,Male,Islamabad,27.0,low,Mild,No
2,Male,Islamabad,,normal,Strong,No
3,Female,Lahore,31.0,high,Moderate,Yes
4,Female,Karachi,65.0,high,Mild,No


In [26]:
df = df[['gender', 'city', 'bp', 'cough', 'age', 'disease']]
df.head()

Unnamed: 0,gender,city,bp,cough,age,disease
0,Male,Lahore,low,Moderate,60.0,Yes
1,Male,Islamabad,low,Mild,27.0,No
2,Male,Islamabad,normal,Strong,,No
3,Female,Lahore,high,Moderate,31.0,Yes
4,Female,Karachi,high,Mild,65.0,No


In [27]:
y = df['disease']
X = df.drop('disease', axis = 1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 54)
len(X_train), len(X_test), len(y_train), len(y_test)

(80, 20, 80, 20)

In [29]:
X_train.head()

Unnamed: 0,gender,city,bp,cough,age
90,Female,Islamabad,low,Moderate,59.0
31,Male,Lahore,normal,Mild,
58,Male,Karachi,low,Strong,23.0
74,Female,Islamabad,high,Strong,34.0
89,Male,,low,Strong,46.0


In [30]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 80 entries, 90 to 69
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   gender  79 non-null     object 
 1   city    78 non-null     object 
 2   bp      80 non-null     object 
 3   cough   80 non-null     object 
 4   age     69 non-null     float64
dtypes: float64(1), object(4)
memory usage: 3.8+ KB


In [31]:
si0 = SimpleImputer(missing_values = np.nan, strategy = 'most_frequent')
si1 = SimpleImputer(missing_values = np.nan, strategy = 'mean')
ohe = OneHotEncoder(drop = 'first', sparse_output = False, dtype = np.int32)
oe = OrdinalEncoder(categories = [['low', 'normal', 'high'],['Mild', 'Moderate', 'Strong']], dtype = np.int16)

In [32]:
transformer = ColumnTransformer(transformers = 
                               [   
                                 ('SiGC', si0, ['gender', 'city']),
                                 ('SiAge', si1, ['age']),
                                 ('ohe', ohe, )
                               ],
                               remainder = 'passthrough')
type(transformer)

sklearn.compose._column_transformer.ColumnTransformer

In [33]:
transformer.fit(X_train)
transformed_trained = transformer.transform(X_train)
transformed_tested = transformer.transform(X_test)

In [34]:
X_trained = pd.DataFrame(data = transformed_trained, columns = X_train.columns)
X_trained.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80 entries, 0 to 79
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   gender  80 non-null     object
 1   city    80 non-null     object
 2   bp      80 non-null     object
 3   cough   80 non-null     object
 4   age     80 non-null     object
dtypes: object(5)
memory usage: 3.3+ KB


In [35]:
X_trained.head()

Unnamed: 0,gender,city,bp,cough,age
0,Female,Islamabad,59.0,low,Moderate
1,Male,Lahore,41.942029,normal,Mild
2,Male,Karachi,23.0,low,Strong
3,Female,Islamabad,34.0,high,Strong
4,Male,Lahore,46.0,low,Strong


In [None]:
SimpleImputer(missing_values = np.nan, strategy = 'most_frequent')
siA = SimpleImputer(missing_values = np.nan, strategy = 'mean')
ohe = OneHotEncoder(drop = 'first', sparse_output = False, dtype = np.int32)
oe = OrdinalEncoder(categories = [['low', 'normal', 'high'],['Mild', 'Moderate', 'Strong']], dtype = np.int16)