## Importing Libraries

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression

## Load the Dataset

In [3]:
file_path = 'nipah_dataset.csv'
nipah_data = pd.read_csv(file_path)

## Display the first few rows of the dataset

In [4]:
nipah_data.head()

Unnamed: 0,Age,Gender,Location,Contact_with_Infected_Animals,Fever,Respiratory_Symptoms,Neurological_Symptoms,Travel_History,Vaccination_Status,Occupation,Underlying_Health_Conditions,Blood_Test_Results,Infected
0,52,Male,Urban,1,1,0,1,0,0,0,0,0,1
1,15,Male,Urban,0,0,0,1,0,0,0,0,0,0
2,72,Female,Urban,0,0,1,0,0,1,1,0,0,1
3,61,Male,Rural,0,0,1,0,1,1,1,0,0,1
4,21,Male,Rural,0,0,0,0,0,1,0,0,0,0


In [5]:
nipah_data.tail()

Unnamed: 0,Age,Gender,Location,Contact_with_Infected_Animals,Fever,Respiratory_Symptoms,Neurological_Symptoms,Travel_History,Vaccination_Status,Occupation,Underlying_Health_Conditions,Blood_Test_Results,Infected
995,54,Female,Rural,0,0,0,0,0,0,0,0,0,0
996,57,Female,Rural,0,0,1,1,0,0,0,0,1,0
997,1,Female,Urban,0,0,0,0,0,0,0,0,0,0
998,63,Female,Rural,0,0,0,0,0,0,0,0,1,0
999,54,Male,Rural,0,0,0,0,0,0,0,1,0,0


In [6]:
nipah_data.sample(10)

Unnamed: 0,Age,Gender,Location,Contact_with_Infected_Animals,Fever,Respiratory_Symptoms,Neurological_Symptoms,Travel_History,Vaccination_Status,Occupation,Underlying_Health_Conditions,Blood_Test_Results,Infected
409,69,Male,Urban,0,0,0,0,0,1,0,1,0,0
127,15,Male,Rural,0,0,0,0,0,0,1,1,0,0
965,74,Female,Urban,0,0,0,0,0,0,0,0,1,1
967,6,Male,Urban,1,0,0,0,0,1,0,0,1,1
621,49,Male,Urban,0,0,0,0,0,0,0,0,0,0
838,14,Male,Urban,0,0,1,0,0,1,0,0,0,0
29,47,Female,Rural,1,0,0,0,1,1,0,0,0,0
643,17,Female,Urban,0,0,0,1,0,0,1,0,0,0
96,28,Female,Rural,1,0,0,0,1,1,0,0,1,1
119,15,Male,Rural,0,1,0,0,0,1,0,0,0,0


## Display the information

In [7]:
nipah_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   Age                            1000 non-null   int64 
 1   Gender                         1000 non-null   object
 2   Location                       1000 non-null   object
 3   Contact_with_Infected_Animals  1000 non-null   int64 
 4   Fever                          1000 non-null   int64 
 5   Respiratory_Symptoms           1000 non-null   int64 
 6   Neurological_Symptoms          1000 non-null   int64 
 7   Travel_History                 1000 non-null   int64 
 8   Vaccination_Status             1000 non-null   int64 
 9   Occupation                     1000 non-null   int64 
 10  Underlying_Health_Conditions   1000 non-null   int64 
 11  Blood_Test_Results             1000 non-null   int64 
 12  Infected                       1000 non-null   int64 
dtypes: i

## Display basic statistics of the dataset

In [8]:
nipah_data.describe()

Unnamed: 0,Age,Contact_with_Infected_Animals,Fever,Respiratory_Symptoms,Neurological_Symptoms,Travel_History,Vaccination_Status,Occupation,Underlying_Health_Conditions,Blood_Test_Results,Infected
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,38.809,0.145,0.297,0.197,0.111,0.146,0.269,0.083,0.136,0.258,0.195
std,22.86962,0.352277,0.457165,0.397931,0.314289,0.353283,0.443662,0.27602,0.34296,0.437753,0.396399
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,38.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,59.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
max,79.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# Handle missing values

In [9]:
nipah_data = nipah_data.dropna()

## Encode categorical variables

In [10]:
label_encoder = LabelEncoder()
for column in nipah_data.select_dtypes(include=['object']).columns:
    nipah_data[column] = label_encoder.fit_transform(nipah_data[column])

## Define features and target variable

In [11]:
X = nipah_data.drop('Infected', axis=1)
y = nipah_data['Infected']

## Split the data into training and testing sets

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Initialize the model

In [13]:
model = RandomForestClassifier(n_estimators=100, random_state=42)

## Train the model

In [14]:
model.fit(X_train, y_train)

## Make predictions

In [15]:
y_pred = model.predict(X_test)

## Evaluate the model


In [16]:
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

Accuracy: 77.00%


## Classification report

In [17]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.90      0.86       161
           1       0.36      0.23      0.28        39

    accuracy                           0.77       200
   macro avg       0.59      0.57      0.57       200
weighted avg       0.74      0.77      0.75       200



## Confusion matrix

In [18]:
print(confusion_matrix(y_test, y_pred))

[[145  16]
 [ 30   9]]


In [19]:
nipah_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
 #   Column                         Non-Null Count  Dtype
---  ------                         --------------  -----
 0   Age                            1000 non-null   int64
 1   Gender                         1000 non-null   int64
 2   Location                       1000 non-null   int64
 3   Contact_with_Infected_Animals  1000 non-null   int64
 4   Fever                          1000 non-null   int64
 5   Respiratory_Symptoms           1000 non-null   int64
 6   Neurological_Symptoms          1000 non-null   int64
 7   Travel_History                 1000 non-null   int64
 8   Vaccination_Status             1000 non-null   int64
 9   Occupation                     1000 non-null   int64
 10  Underlying_Health_Conditions   1000 non-null   int64
 11  Blood_Test_Results             1000 non-null   int64
 12  Infected                       1000 non-null   int64
dtypes: int64(13)
memory

## Encode new data

In [20]:
new_data = pd.DataFrame({
    'Age': [25, 60],
    'Gender': ['Male', 'Female'],
    'Location': ['1', '0'],
    'Contact_with_Infected_Animals': ['1', '0'],
    'Fever': ['1', '0'],
    'Respiratory_Symptoms': ['1', '0'],
    'Neurological_Symptoms': ['1', '0'],
    'Vaccination_Status': ['1', '0'],
    'Occupation': ['1', '0'],
    'Underlying_Health_Conditions': ['1', '0'],
    'Blood_Test_Results': ['1', '0']
})

## Prepare LabelEncoders for categorical columns

In [21]:
train_data = pd.DataFrame({
    'Age': [23, 45, 34, 50, 29],
    'Gender': ['Male', 'Female', 'Female', 'Male', 'Female'],
    'Location': ['0', '1', '1', '0', '1'],
    'Contact_with_Infected_Animals': ['0', '1', '0', '1', '1'],
    'Fever': ['1', '0', '1', '0', '1'],
    'Respiratory_Symptoms': ['0', '1', '1', '0', '0'],
    'Neurological_Symptoms': ['1', '0', '1', '1', '0'],
    'Vaccination_Status': ['0', '1', '0', '1', '1'],
    'Occupation': ['0', '1', '0', '1', '1'],
    'Underlying_Health_Conditions': ['1', '0', '1', '0', '1'],
    'Blood_Test_Results': ['0', '1', '0', '1', '1'],
    'Infected': [0, 1, 0, 1, 0]  # Target variable
})

label_encoders = {}
for column in train_data.select_dtypes(include=['object']).columns:
    if column != 'Infected':  # Skip the target column
        le = LabelEncoder()
        le.fit(train_data[column])
        label_encoders[column] = le

new_data_encoded = new_data.copy()
for column in new_data.select_dtypes(include=['object']).columns:
    if column in label_encoders:
        new_data_encoded[column] = label_encoders[column].transform(new_data[column])
    else:
        raise ValueError(f"No label encoder found for column: {column}")

## Encode training data

In [22]:
encoded_train_data = train_data.copy()
for column in label_encoders:
    encoded_train_data[column] = label_encoders[column].transform(train_data[column])

In [23]:
X = encoded_train_data.drop('Infected', axis=1)
y = encoded_train_data['Infected']

In [24]:
# Train a model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

In [25]:
y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")

Accuracy: 0.80


## Get user input and encode it

In [27]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Sample training data


# Prepare LabelEncoders for categorical columns
label_encoders = {}
for column in train_data.select_dtypes(include=['object']).columns:
    if column != 'Infected':  # Skip the target column
        le = LabelEncoder()
        le.fit(train_data[column])
        label_encoders[column] = le

# Encode training data
encoded_train_data = train_data.copy()
for column in label_encoders:
    encoded_train_data[column] = label_encoders[column].transform(train_data[column])

# Separate features and target
X = encoded_train_data.drop('Infected', axis=1)
y = encoded_train_data['Infected']

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")

def get_user_input():
    age = int(input("Enter Age: "))
    gender = input("Enter Gender (Male/Female): ")
    location = input("Enter Location (0/1): ")
    contact = input("Contact with Infected Animals (0/1): ")
    fever = input("Fever (0/1): ")
    respiratory = input("Respiratory Symptoms (0/1): ")
    neurological = input("Neurological Symptoms (0/1): ")
    vaccination = input("Vaccination Status (0/1): ")
    occupation = input("Occupation (0/1): ")
    conditions = input("Underlying Health Conditions (0/1): ")
    blood_test = input("Blood Test Results (0/1): ")

    return pd.DataFrame({
        'Age': [age],
        'Gender': [gender],
        'Location': [location],
        'Contact_with_Infected_Animals': [contact],
        'Fever': [fever],
        'Respiratory_Symptoms': [respiratory],
        'Neurological_Symptoms': [neurological],
        'Vaccination_Status': [vaccination],
        'Occupation': [occupation],
        'Underlying_Health_Conditions': [conditions],
        'Blood_Test_Results': [blood_test]
    })

# Get user input and encode it
user_data = get_user_input()
encoded_user_data = user_data.copy()

# Handle unseen labels
for column in label_encoders:
    if column in encoded_user_data:
        # Check if the labels are valid
        try:
            encoded_user_data[column] = label_encoders[column].transform(user_data[column])
        except ValueError as e:
            print(f"Error: {e}. The label '{user_data[column][0]}' is not recognized for column '{column}'.")
            print(f"Assigning a default value for column '{column}'.")
            # Handle unseen labels by assigning a default value, e.g., the first label
            encoded_user_data[column] = label_encoders[column].classes_[1]

# Predict using the trained model
predictions = model.predict(encoded_user_data)
print("Predictions:", predictions)

Accuracy: 1.00


Enter Age:  63
Enter Gender (Male/Female):  Male
Enter Location (0/1):  1
Contact with Infected Animals (0/1):  1
Fever (0/1):  1
Respiratory Symptoms (0/1):  1
Neurological Symptoms (0/1):  1
Vaccination Status (0/1):  1
Occupation (0/1):  1
Underlying Health Conditions (0/1):  1
Blood Test Results (0/1):  1


Predictions: [1]
