In [4]:
import numpy as np

In [5]:
import pandas as pd

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

In [6]:
df = pd.read_csv('covid1.csv')

In [7]:
df

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No
...,...,...,...,...,...,...
95,12,Female,104.0,Mild,Bangalore,No
96,51,Female,101.0,Strong,Kolkata,Yes
97,20,Female,101.0,Mild,Bangalore,No
98,5,Female,98.0,Strong,Mumbai,No


In [8]:
df.isnull().sum()

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

In [9]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(df.drop( columns = ['has_covid']),df['has_covid'],test_size = 0.2)

In [10]:
x_train

Unnamed: 0,age,gender,fever,cough,city
25,23,Male,,Mild,Mumbai
49,44,Male,104.0,Mild,Mumbai
53,83,Male,98.0,Mild,Delhi
56,71,Male,,Strong,Kolkata
82,24,Male,98.0,Mild,Kolkata
...,...,...,...,...,...
32,34,Female,101.0,Strong,Delhi
87,47,Male,101.0,Strong,Bangalore
24,13,Female,100.0,Strong,Kolkata
12,25,Female,99.0,Strong,Kolkata


In [11]:
#adding simple imputer to fever column
si = SimpleImputer(strategy = 'mean')
x_train_fever = si.fit_transform(x_train[['fever']])

#also the test data
x_test_fever = si.fit_transform(x_test[['fever']])

x_train_fever.shape


(80, 1)

In [12]:
#ordinal encoding ---> cough

oe = OrdinalEncoder(categories = [['Mild','Strong']])
x_train_cough = oe.fit_transform(x_train[['cough']])

#also the test data
x_test_cough = oe.fit_transform(x_test[['cough']])

x_train_cough.shape

(80, 1)

In [13]:
#onehotencoding ---> gender,city

ohe = OneHotEncoder(drop = 'first',sparse_output = False)
x_train_gender_city = ohe.fit_transform(x_train [['gender','city']])

#also the test data
x_test_gender_city = ohe.fit_transform(x_test [['gender','city']])
                                       
x_train_gender_city.shape                                      
                                       

(80, 4)

In [14]:
#extracting age

x_train_age = x_train.drop(columns = ['gender','fever','cough','city']).values

#also the data
x_test_age = x_test.drop(columns = ['gender','fever','cough','city']).values

In [15]:
x_train_age.shape

(80, 1)

In [16]:
x_train_transformed = np.concatenate((x_train_age,x_train_fever,x_train_gender_city,x_train_cough),axis = 1)

In [17]:
x_train_transformed.shape

(80, 7)

In [18]:
#from sklearn.compose import ColumnTransformer #this is how to import ColumnTransformer
#transformer = ColumnTransformer(transformers = [('tnf1',SimpleImputer(),['fever']) # in a fever column column by the help of si we fill missing values by mean median mode
#('tnf2',OrdinalEncoder(categories = [['Mild','Strong']]),['cough']), #by this process we encode our data
#('tnf3',OneHotEncoder(sparse_output = False,drop='first'),['gender','city'])],remainder='passthrough')                                                                                                

In [19]:
from sklearn.compose import ColumnTransformer

transformer = ColumnTransformer(
    transformers=[
        ('tnf1', SimpleImputer(), ['fever']),  # Filling missing values in 'fever' column using SimpleImputer
        ('tnf2', OrdinalEncoder(categories=[['Mild', 'Strong']]), ['cough']),  # Encoding 'cough' column using OrdinalEncoder
        ('tnf3', OneHotEncoder(sparse_output=False, drop='first'), ['gender', 'city'])  # OneHotEncoding 'gender' and 'city' columns
    ],
    remainder='passthrough'
)

In [30]:
p = transformer.fit_transform(x_train)

In [21]:
transformer.transform(x_test).shape

(20, 7)

In [26]:
x_train


Unnamed: 0,age,gender,fever,cough,city
25,23,Male,,Mild,Mumbai
49,44,Male,104.0,Mild,Mumbai
53,83,Male,98.0,Mild,Delhi
56,71,Male,,Strong,Kolkata
82,24,Male,98.0,Mild,Kolkata
...,...,...,...,...,...
32,34,Female,101.0,Strong,Delhi
87,47,Male,101.0,Strong,Bangalore
24,13,Female,100.0,Strong,Kolkata
12,25,Female,99.0,Strong,Kolkata


In [27]:
transformer

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [31]:
p

array([[100.74285714,   0.        ,   1.        ,   0.        ,
          0.        ,   1.        ,  23.        ],
       [104.        ,   0.        ,   1.        ,   0.        ,
          0.        ,   1.        ,  44.        ],
       [ 98.        ,   0.        ,   1.        ,   1.        ,
          0.        ,   0.        ,  83.        ],
       [100.74285714,   1.        ,   1.        ,   0.        ,
          1.        ,   0.        ,  71.        ],
       [ 98.        ,   0.        ,   1.        ,   0.        ,
          1.        ,   0.        ,  24.        ],
       [103.        ,   1.        ,   1.        ,   0.        ,
          0.        ,   0.        ,  46.        ],
       [101.        ,   0.        ,   1.        ,   1.        ,
          0.        ,   0.        ,  42.        ],
       [104.        ,   0.        ,   1.        ,   0.        ,
          0.        ,   0.        ,  25.        ],
       [101.        ,   0.        ,   0.        ,   0.        ,
          0.    