In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

In [3]:
# Location of the toy COVID dataset. The default is the original local path;
# set the COVID_TOY_CSV environment variable to run the notebook on another
# machine without editing this cell.
DATA_PATH = os.environ.get(
    "COVID_TOY_CSV",
    r"C:\Users\KIIT\Desktop\100 Days of ML\Datasets\covid_toy.csv",
)
df = pd.read_csv(DATA_PATH)

In [4]:
# Preview the first rows of the loaded frame to check the columns and their values.
df.head()

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


In [5]:
from sklearn.model_selection import train_test_split

# Separate the feature columns from the 'has_covid' target and hold out 20% of
# the rows for testing. random_state pins the shuffle so that a
# Restart & Run All reproduces the same split (and therefore the same outputs).
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['has_covid']),
    df['has_covid'],
    test_size=0.2,
    random_state=42,
)

In [6]:
# Display the training features (the 'has_covid' target was dropped when splitting).
X_train

Unnamed: 0,age,gender,fever,cough,city
90,59,Female,99.0,Strong,Delhi
16,69,Female,103.0,Mild,Kolkata
14,51,Male,104.0,Mild,Bangalore
72,83,Female,101.0,Mild,Kolkata
22,71,Female,98.0,Strong,Kolkata
...,...,...,...,...,...
71,75,Female,104.0,Strong,Delhi
4,65,Female,101.0,Mild,Mumbai
97,20,Female,101.0,Mild,Bangalore
21,73,Male,98.0,Mild,Bangalore


In [7]:
# Impute the 'fever' column with a default-configured SimpleImputer.
# The imputer is fitted on the training rows only and then reused, unchanged,
# on the test rows.
si = SimpleImputer()
si.fit(X_train[['fever']])

X_train_fever = si.transform(X_train[['fever']])
X_test_fever = si.transform(X_test[['fever']])

# Sanity check: one output column, one row per training sample.
X_train_fever.shape

(80, 1)

In [8]:
# Encode 'cough' as an ordered category. The explicit category list fixes the
# order, so 'Mild' is encoded below 'Strong'.
oe = OrdinalEncoder(categories=[['Mild','Strong']])
oe.fit(X_train[['cough']])

X_train_cough = oe.transform(X_train[['cough']])
X_test_cough = oe.transform(X_test[['cough']])

# Sanity check: one output column, one row per training sample.
X_train_cough.shape

(80, 1)

In [9]:
# One-hot encode the nominal columns 'gender' and 'city'. drop='first' leaves
# out the first category of each column. The encoder is fitted on the training
# rows and then applied to both splits.
ohe = OneHotEncoder(drop='first')
ohe.fit(X_train[['gender','city']])

X_train_gender_city_ = ohe.transform(X_train[['gender','city']])
X_test_gender_city_ = ohe.transform(X_test[['gender','city']])

In [10]:
# Check the size of the one-hot encoded gender/city block for the training rows.
X_train_gender_city_.shape

(80, 4)

In [11]:
# 'age' needs no transformation; keep it as a one-column DataFrame (note the
# list selector) so it stays 2-D like the other feature blocks.
X_train_age = X_train.loc[:, ['age']]
X_test_age = X_test.loc[:, ['age']]

# Sanity check: one column, one row per training sample.
X_train_age.shape

(80, 1)

In [12]:
# Re-display the shape of the age block (same check as at the end of the previous cell).
X_train_age.shape

(80, 1)

In [None]:
# Stack the separately transformed pieces column-wise, in the order:
# age, one-hot gender/city, imputed fever, ordinal cough.
# Both splits use the same conversions so the two lines stay parallel.
X_train_transformed = np.concatenate((X_train_age.values, X_train_gender_city_.toarray(), X_train_fever, X_train_cough), axis=1)
X_test_transformed = np.concatenate((X_test_age.values, X_test_gender_city_.toarray(), X_test_fever, X_test_cough), axis=1)

# IMPORTANT NOTE: X_train_age is a pd.DataFrame and X_train_gender_city_ is a scipy sparse matrix,
# while the fever and cough pieces are already numpy arrays.
# Passing the pieces to np.concatenate without converting them first fails (see the ValueError in the next cell),
# so every piece is converted to a plain numpy array before stacking.
# Use "input.values" (or np.array(input)) for a DataFrame and "input.toarray()" for a sparse matrix.
# NumPy does not densify a sparse matrix by itself: the error in the next cell reports it as a 0-dimensional array.
# The next two cells elaborate on the points mentioned above.



In [23]:
# Demonstration: try to concatenate the pieces WITHOUT converting them to
# numpy arrays first. The shapes all agree on the row count, yet the call is
# expected to fail because the pieces are of different container types.
arrays_to_concat = [X_train_age, X_train_gender_city_, X_train_fever, X_train_cough]

for arr in arrays_to_concat:
    print(arr.shape)  # Check each one is (80, something)

# The failure is the point of this cell, so catch and display it instead of
# letting the exception stop a Restart & Run All.
try:
    X_train_transformed = np.concatenate(arrays_to_concat, axis=1)
except ValueError as err:
    print(f"{type(err).__name__}: {err}")


(80, 1)
(80, 4)
(80, 1)
(80, 1)


ValueError: all the input arrays must have same number of dimensions, but the array at index 0 has 2 dimension(s) and the array at index 1 has 0 dimension(s)

In [24]:
# Report the container type and shape of every piece, numbered from 1, to show
# why the direct concatenation fails even though the shapes line up.
for position, piece in enumerate(arrays_to_concat, start=1):
    piece_type = type(piece)
    piece_shape = piece.shape
    print(f"Array {position}: type = {piece_type}, shape = {piece_shape} ")

Array 1: type = <class 'pandas.core.frame.DataFrame'>, shape = (80, 1) 
Array 2: type = <class 'scipy.sparse._csr.csr_matrix'>, shape = (80, 4) 
Array 3: type = <class 'numpy.ndarray'>, shape = (80, 1) 
Array 4: type = <class 'numpy.ndarray'>, shape = (80, 1) 


In [25]:
from sklearn.compose import ColumnTransformer

In [None]:
# A single ColumnTransformer bundles the per-column steps that were applied by
# hand above. Each entry is (step name, transformer, columns it applies to):
#   tnf1 - impute 'fever'
#   tnf2 - ordinal-encode 'cough' with the order Mild < Strong
#   tnf3 - one-hot encode 'gender' and 'city', dropping the first category
# remainder='passthrough' keeps the columns not listed here instead of
# discarding them.
transformer = ColumnTransformer(
    transformers=[
        ('tnf1', SimpleImputer(), ['fever']),
        ('tnf2', OrdinalEncoder(categories=[['Mild', 'Strong']]), ['cough']),
        ('tnf3', OneHotEncoder(drop='first'), ['gender', 'city']),
    ],
    remainder='passthrough',
)

In [32]:
# Fit the ColumnTransformer on the training features and check the shape of the combined output.
transformer.fit_transform(X_train).shape

(80, 7)

In [35]:
# Apply the transformer to the held-out features (transform only; no fitting happens in this cell).
transformer.transform(X_test)

array([[104.        ,   0.        ,   1.        ,   0.        ,
          1.        ,   0.        ,  16.        ],
       [102.        ,   0.        ,   1.        ,   0.        ,
          0.        ,   1.        ,  74.        ],
       [101.        ,   1.        ,   0.        ,   0.        ,
          1.        ,   0.        ,  51.        ],
       [ 98.        ,   0.        ,   0.        ,   0.        ,
          0.        ,   0.        ,  64.        ],
       [104.        ,   0.        ,   0.        ,   0.        ,
          0.        ,   0.        ,  12.        ],
       [102.        ,   0.        ,   1.        ,   0.        ,
          1.        ,   0.        ,   5.        ],
       [ 99.        ,   0.        ,   0.        ,   0.        ,
          0.        ,   1.        ,  60.        ],
       [ 98.        ,   1.        ,   0.        ,   1.        ,
          0.        ,   0.        ,  40.        ],
       [ 98.        ,   0.        ,   1.        ,   1.        ,
          0.    