## Step 1: Import Libraries

We begin by importing the essential libraries for data loading, processing, and visualization.


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

Matplotlib is building the font cache; this may take a moment.


## Step 2: Load the Dataset

We load the raw weather-related disease dataset from the `data/raw/` directory.
Make sure the CSV file is placed there after downloading from Kaggle.

In [3]:
# Location of the raw Kaggle export, relative to this notebook.
raw_csv_path = '../data/raw/Weather-related disease prediction.csv'

# Read the CSV into the working frame and preview the first rows.
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0,Age,Gender,Temperature (C),Humidity,Wind Speed (km/h),nausea,joint_pain,abdominal_pain,high_fever,chills,...,facial_pain,shortness_of_breath,reduced_smell_and_taste,skin_irritation,itchiness,throbbing_headache,confusion,back_pain,knee_ache,prognosis
0,4,1,25.826,0.74,8.289,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,Heart Attack
1,55,0,21.628,0.6,15.236,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,Influenza
2,45,0,13.8,0.817083,4.291992,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Influenza
3,6,0,37.254,0.61,18.009,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,Dengue
4,70,0,18.162,0.87,17.916,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,Sinusitis


## Step 3: Inspect the Dataset

We check the structure of the data including:
- Number of rows and columns
- Column names and data types
- Missing values

In [None]:
# Structural overview of the loaded frame.
# Multi-line objects (Index, Series, DataFrame) are printed on the line after
# their label via sep="\n"; printing them on the same line as the label pushes
# the first row out of alignment with the rest of the table.
print("The shape of the dataset is: ", df.shape)
print("-" * 50)
print("The columns in the dataset are: ", df.columns, sep="\n")
print("The data types of the columns are: ", df.dtypes, sep="\n")
print("The summary statistics of the dataset are: ", df.describe(), sep="\n")
print("The number of missing values in each column are: ", df.isnull().sum(), sep="\n")
print("The number of unique values in each column are: ", df.nunique(), sep="\n")
# A single scalar, so it stays on the same line as its label.
print("The number of duplicate rows in the dataset are: ", df.duplicated().sum())
# info() emits its own report, so it is called bare rather than wrapped in print().
df.info()

The shape of the dataset is:  (5200, 51)
The columns in the dataset are:  Index(['Age', 'Gender', 'Temperature (C)', 'Humidity', 'Wind Speed (km/h)',
       'nausea', 'joint_pain', 'abdominal_pain', 'high_fever', 'chills',
       'fatigue', 'runny_nose', 'pain_behind_the_eyes', 'dizziness',
       'headache', 'chest_pain', 'vomiting', 'cough', 'shivering',
       'asthma_history', 'high_cholesterol', 'diabetes', 'obesity', 'hiv_aids',
       'nasal_polyps', 'asthma', 'high_blood_pressure', 'severe_headache',
       'weakness', 'trouble_seeing', 'fever', 'body_aches', 'sore_throat',
       'sneezing', 'diarrhea', 'rapid_breathing', 'rapid_heart_rate',
       'pain_behind_eyes', 'swollen_glands', 'rashes', 'sinus_headache',
       'facial_pain', 'shortness_of_breath', 'reduced_smell_and_taste',
       'skin_irritation', 'itchiness', 'throbbing_headache', 'confusion',
       'back_pain', 'knee_ache', 'prognosis'],
      dtype='object')
The data types of the columns are:  Age              

In [10]:
# Class balance of the target column: number of rows per diagnosis.
df.loc[:, 'prognosis'].value_counts()

prognosis
Heart Attack    968
Migraine        897
Influenza       632
Heat Stroke     323
Malaria         319
Stroke          312
Eczema          311
Common Cold     309
Dengue          308
Sinusitis       301
Arthritis       301
Name: count, dtype: int64

## Step 6: Clean and Prepare Dataset

We now clean the dataset by:
- Removing duplicate rows
- Checking for near-duplicate columns (e.g. `pain_behind_eyes` vs `pain_behind_the_eyes`)
- Saving the preprocessed dataset for modeling

In [6]:
# Drop exact duplicate rows, reporting the count before and after.
n_duplicates_before = df.duplicated().sum()
print("Duplicates before:", n_duplicates_before)
df = df.drop_duplicates()
n_duplicates_after = df.duplicated().sum()
print("Duplicates after:", n_duplicates_after)

# Look for columns that may encode the same symptom under two spellings
# (e.g., pain_behind_eyes vs pain_behind_the_eyes).
columns = df.columns
dupes = list(filter(lambda name: "pain_behind" in name, columns))
print("Potential duplicates:", dupes)


Duplicates before: 219
Duplicates after: 0
Potential duplicates: ['pain_behind_the_eyes', 'pain_behind_eyes']


In [9]:
# Sanity-check the two similarly named symptom columns before deciding
# whether one of them is redundant.
both_present = {'pain_behind_the_eyes', 'pain_behind_eyes'}.issubset(df.columns)

if not both_present:
    print("One or both columns are missing.")
else:
    print("Both columns exist.\n")

    # Distribution of each flag
    print("Value counts - pain_behind_the_eyes:\n", df['pain_behind_the_eyes'].value_counts())
    print("\nValue counts - pain_behind_eyes:\n", df['pain_behind_eyes'].value_counts())

    # Row-by-row equality over the whole frame
    identical = df['pain_behind_the_eyes'].eq(df['pain_behind_eyes']).all()
    print("\nAre the columns identical?", identical)

    # Only an exact match is dropped automatically
    if identical:
        df = df.drop(columns=['pain_behind_eyes'])
        print("\nDropped 'pain_behind_eyes' as it's a duplicate.")


Both columns exist.

Value counts - pain_behind_the_eyes:
 pain_behind_the_eyes
0    4481
1     500
Name: count, dtype: int64

Value counts - pain_behind_eyes:
 pain_behind_eyes
0    4951
1      30
Name: count, dtype: int64

Are the columns identical? False


In [11]:
# 'pain_behind_eyes' and 'pain_behind_the_eyes' are NOT identical: the
# element-wise comparison in the previous cell printed False, and after
# de-duplication they hold 30 vs. 500 positive values respectively.
# We keep 'pain_behind_the_eyes' and drop 'pain_behind_eyes', because it has
# less signal and fewer positive values.
# Reassigning (rather than inplace=True) with errors='ignore' keeps this cell
# safe to re-run after the column has already been removed.
df = df.drop(columns=['pain_behind_eyes'], errors='ignore')

## Step 8: Normalize Continuous Features

To ensure consistency across models, we normalize the continuous features:  
- `Age`, `Temperature (C)`, `Humidity`, and `Wind Speed (km/h)`  
We use **Min-Max Scaling** to rescale each of these columns to the range 0 to 1.


In [14]:
# Show the current column names so the scaling targets below can be checked
# against them.
print(list(df.columns))


from sklearn.preprocessing import MinMaxScaler

# Continuous features to rescale; every other column is left as-is.
scale_cols = ['Age', 'Temperature (C)', 'Humidity', 'Wind Speed (km/h)']

# Learn each column's min and max, then map the values onto [0, 1].
# NOTE(review): the scaler is fit on the whole frame; if a train/test split
# happens later, confirm that this does not leak test-set statistics.
scaler = MinMaxScaler()
scaler.fit(df[scale_cols])
df[scale_cols] = scaler.transform(df[scale_cols])

# Summary statistics of the scaled columns (min should be 0, max should be 1).
df[scale_cols].describe()


['Age', 'Gender', 'Temperature (C)', 'Humidity', 'Wind Speed (km/h)', 'nausea', 'joint_pain', 'abdominal_pain', 'high_fever', 'chills', 'fatigue', 'runny_nose', 'pain_behind_the_eyes', 'dizziness', 'headache', 'chest_pain', 'vomiting', 'cough', 'shivering', 'asthma_history', 'high_cholesterol', 'diabetes', 'obesity', 'hiv_aids', 'nasal_polyps', 'asthma', 'high_blood_pressure', 'severe_headache', 'weakness', 'trouble_seeing', 'fever', 'body_aches', 'sore_throat', 'sneezing', 'diarrhea', 'rapid_breathing', 'rapid_heart_rate', 'swollen_glands', 'rashes', 'sinus_headache', 'facial_pain', 'shortness_of_breath', 'reduced_smell_and_taste', 'skin_irritation', 'itchiness', 'throbbing_headache', 'confusion', 'back_pain', 'knee_ache', 'prognosis']


Unnamed: 0,Age,Temperature (C),Humidity,Wind Speed (km/h)
count,4981.0,4981.0,4981.0,4981.0
mean,0.507989,0.625219,0.594821,0.341843
std,0.268938,0.203833,0.227381,0.181261
min,0.0,0.0,0.0,0.0
25%,0.282828,0.487113,0.402649,0.204734
50%,0.525253,0.627336,0.602649,0.325225
75%,0.737374,0.778051,0.777483,0.481034
max,1.0,1.0,1.0,1.0
