In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [2]:
file_path = '/content/drive/MyDrive/GenEd-official/Applied ML/14-July-Applied-ML/covid_toy.csv'
df = pd.read_csv(file_path)

# Preview the dataset
df.head()

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


In [3]:
df.isnull().sum()


Unnamed: 0,0
age,0
gender,0
fever,10
cough,0
city,0
has_covid,0


In [4]:
df['fever'] = df['fever'].fillna(df['fever'].mean())
# Repeat for others if needed


In [5]:
df.isnull().sum()


Unnamed: 0,0
age,0
gender,0
fever,0
cough,0
city,0
has_covid,0


In [8]:
from sklearn.preprocessing import LabelEncoder

# Label encode target variable
label_encoder = LabelEncoder()
df['has_covid'] = label_encoder.fit_transform(df['has_covid'])  # No=0, Yes=1

# One-hot encode gender, cough, city
df_encoded = pd.get_dummies(df, columns=['gender', 'cough', 'city'], drop_first=True)

In [9]:
X = df_encoded.drop('has_covid', axis=1)
y = df_encoded['has_covid']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [10]:
scaler = StandardScaler()
X_train[['age', 'fever']] = scaler.fit_transform(X_train[['age', 'fever']])
X_test[['age', 'fever']] = scaler.transform(X_test[['age', 'fever']])


In [11]:
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))


Accuracy: 0.5


In [12]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Original dataset again
df = pd.read_csv('/content/drive/MyDrive/GenEd-official/Applied ML/14-July-Applied-ML/covid_toy.csv')

# Target encoding
df['has_covid'] = label_encoder.fit_transform(df['has_covid'])

# Split features and target
X = df.drop('has_covid', axis=1)
y = df['has_covid']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define columns
numeric_features = ['age', 'fever']
categorical_features = ['gender', 'cough', 'city']

# Transformers
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(drop='first', handle_unknown='ignore')

# Preprocessor
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])


# Pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Fit
pipeline.fit(X_train, y_train)

# Predict
y_pred = pipeline.predict(X_test)
print("Pipeline Accuracy:", accuracy_score(y_test, y_pred))


Pipeline Accuracy: 0.45


In [15]:
from sklearn import set_config
set_config(display='diagram')

In [17]:
pipeline.named_steps

{'preprocessor': ColumnTransformer(transformers=[('num', StandardScaler(), ['age', 'fever']),
                                 ('cat',
                                  OneHotEncoder(drop='first',
                                                handle_unknown='ignore'),
                                  ['gender', 'cough', 'city'])]),
 'classifier': RandomForestClassifier(random_state=42)}

In [18]:
import pickle

In [19]:
with open('pipeline.pkl', 'wb') as file:
    pickle.dump(pipeline, file)