In [14]:
# 📌 Step 1: Import Libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# 📌 Step 2: Load Dataset
# The path is relative to the notebook's working directory.
df = pd.read_csv('../Data/your_dataset.csv')
print("✅ Data Loaded")
display(df.head())  # `display` is supplied by the IPython/Jupyter runtime

# 📌 Step 3: Check Basic Info
# Missing values are only reported here; nothing below imputes or drops them.
print("\nMissing Values:\n", df.isnull().sum())
print("\nDuplicates:", df.duplicated().sum())

# Remove duplicates if any (exact full-row duplicates, first occurrence kept)
df = df.drop_duplicates()

# 📌 Step 4: Identify Categorical Columns
# Every object-dtype column is treated as categorical.
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
print("\nCategorical Columns:", categorical_cols)

# 📌 Step 5: Apply Label Encoding to ALL Categorical Columns
# One encoder is fitted PER COLUMN and stored in `encoders`, so each column's
# category <-> integer mapping stays available afterwards, e.g.
#   encoders['State_Name'].classes_
#   encoders['State_Name'].inverse_transform(codes)
# Re-fitting a single shared encoder would overwrite the mapping on every
# iteration and keep only the last column's classes.
# NOTE(review): label codes follow the sorted order of the unique values and
# carry no real ordinal meaning; confirm the downstream model tolerates that.
encoders = {}
le = LabelEncoder()  # keeps `le` defined even when there are no categorical columns
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le
# After the loop `le` is the encoder fitted on the last categorical column,
# which is what the name referred to before this change.

# 📌 Step 6: Verify All Columns are Numeric Now
print("\nData Types After Encoding:\n", df.dtypes)

# 📌 Step 7: Select Features and Target (if needed, you can skip this if already clean)
# Keeping essential columns only; raises KeyError if any listed column is absent.
df = df[['Week', 'State_Code', 'State_Name', 'Disease_Code', 'Incidence_per_Capita', 'Disease_Outbreak']]

# 📌 Step 8: Save Preprocessed Data
# index=False keeps the DataFrame index out of the CSV.
df.to_csv('../Data/preprocessed_data.csv', index=False)
print("\n✅ Preprocessed data saved successfully to '../Data/preprocessed_data.csv'")


✅ Data Loaded


Unnamed: 0,Week,State_Code,State_Name,Disease_Code,Incidence_per_Capita,Disease_Outbreak
0,8,UP,Maharashtra,DEN,0.36,0
1,38,KA,Rajasthan,DEN,2.5,1
2,16,MH,Gujarat,CHK,0.31,0
3,17,KA,Karnataka,MAL,4.46,1
4,31,KA,West Bengal,CHK,0.52,0



Missing Values:
 Week                    0
State_Code              0
State_Name              0
Disease_Code            0
Incidence_per_Capita    0
Disease_Outbreak        0
dtype: int64

Duplicates: 0

Categorical Columns: ['State_Code', 'State_Name', 'Disease_Code']

Data Types After Encoding:
 Week                      int64
State_Code                int64
State_Name                int64
Disease_Code              int64
Incidence_per_Capita    float64
Disease_Outbreak          int64
dtype: object

✅ Preprocessed data saved successfully to '../Data/preprocessed_data.csv'
