In [1]:
import pandas as pd

# Read the training and testing splits from the data directory
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('data/Training.csv', 'data/Testing.csv')
)

# Preview each frame: its first rows, followed by its column index
print("Training Data Head:", train_data.head(), sep="\n")
print("\nTraining Data Columns:", train_data.columns, sep="\n")

print("\nTesting Data Head:", test_data.head(), sep="\n")
print("\nTesting Data Columns:", test_data.columns, sep="\n")

Training Data Head:
   itching  skin_rash  nodal_skin_eruptions  continuous_sneezing  shivering  \
0        1          1                     1                    0          0   
1        0          1                     1                    0          0   
2        1          0                     1                    0          0   
3        1          1                     0                    0          0   
4        1          1                     1                    0          0   

   chills  joint_pain  stomach_pain  acidity  ulcers_on_tongue  ...  scurring  \
0       0           0             0        0                 0  ...         0   
1       0           0             0        0                 0  ...         0   
2       0           0             0        0                 0  ...         0   
3       0           0             0        0                 0  ...         0   
4       0           0             0        0                 0  ...         0   

   skin_peeling  s

In [3]:
# Split the 'prognosis' label column off the testing data so that only the
# feature columns remain in `test_data`. The labels are kept in `y_test`
# instead of being discarded, so they stay available for later evaluation.
# The membership guard makes this cell safe to re-run: without it, a second
# execution raises KeyError because 'prognosis' has already been dropped.
if 'prognosis' in test_data.columns:
    y_test = test_data['prognosis']
    test_data = test_data.drop(columns=['prognosis'])

# Verify the changes
print("Updated Testing Data Columns:")
print(test_data.columns)

Updated Testing Data Columns:
Index(['itching', 'skin_rash', 'nodal_skin_eruptions', 'continuous_sneezing',
       'shivering', 'chills', 'joint_pain', 'stomach_pain', 'acidity',
       'ulcers_on_tongue',
       ...
       'pus_filled_pimples', 'blackheads', 'scurring', 'skin_peeling',
       'silver_like_dusting', 'small_dents_in_nails', 'inflammatory_nails',
       'blister', 'red_sore_around_nose', 'yellow_crust_ooze'],
      dtype='object', length=132)


In [4]:
# Per-column count of missing entries in the training data
print("Missing values in Training Data:", train_data.isna().sum(), sep="\n")

# Per-column count of missing entries in the testing data
print("\nMissing values in Testing Data:", test_data.isna().sum(), sep="\n")

Missing values in Training Data:
itching                 0
skin_rash               0
nodal_skin_eruptions    0
continuous_sneezing     0
shivering               0
                       ..
inflammatory_nails      0
blister                 0
red_sore_around_nose    0
yellow_crust_ooze       0
prognosis               0
Length: 133, dtype: int64

Missing values in Testing Data:
itching                 0
skin_rash               0
nodal_skin_eruptions    0
continuous_sneezing     0
shivering               0
                       ..
small_dents_in_nails    0
inflammatory_nails      0
blister                 0
red_sore_around_nose    0
yellow_crust_ooze       0
Length: 132, dtype: int64


In [6]:
# Show the training columns before splitting. Nothing has been removed from
# the training frame at this point, so 'prognosis' (the target) is expected to
# still be listed; the split below relies on it being present.
print("Training Data Columns:")
print(train_data.columns)

# Separate features and target variable. `train_data` itself is left
# untouched, so this cell can be re-run safely.
X_train = train_data.drop(columns=['prognosis'])
y_train = train_data['prognosis']

# Display the shapes to confirm separation
print("\nFeatures (X_train) shape:", X_train.shape)
print("Target (y_train) shape:", y_train.shape)

Training Data Columns After Removal:
Index(['itching', 'skin_rash', 'nodal_skin_eruptions', 'continuous_sneezing',
       'shivering', 'chills', 'joint_pain', 'stomach_pain', 'acidity',
       'ulcers_on_tongue',
       ...
       'blackheads', 'scurring', 'skin_peeling', 'silver_like_dusting',
       'small_dents_in_nails', 'inflammatory_nails', 'blister',
       'red_sore_around_nose', 'yellow_crust_ooze', 'prognosis'],
      dtype='object', length=133)

Features (X_train) shape: (4920, 132)
Target (y_train) shape: (4920,)


In [7]:
from sklearn.preprocessing import LabelEncoder

# Encode the target labels as integer class ids
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)

# Show which integer each class was assigned: a class's code is its position
# in `le.classes_`
print("\nUnique classes and their encoded values:")
print({class_name: code for code, class_name in enumerate(le.classes_)})


Unique classes and their encoded values:
{'(vertigo) Paroymsal  Positional Vertigo': 0, 'AIDS': 1, 'Acne': 2, 'Alcoholic hepatitis': 3, 'Allergy': 4, 'Arthritis': 5, 'Bronchial Asthma': 6, 'Cervical spondylosis': 7, 'Chicken pox': 8, 'Chronic cholestasis': 9, 'Common Cold': 10, 'Dengue': 11, 'Diabetes ': 12, 'Dimorphic hemmorhoids(piles)': 13, 'Drug Reaction': 14, 'Fungal infection': 15, 'GERD': 16, 'Gastroenteritis': 17, 'Heart attack': 18, 'Hepatitis B': 19, 'Hepatitis C': 20, 'Hepatitis D': 21, 'Hepatitis E': 22, 'Hypertension ': 23, 'Hyperthyroidism': 24, 'Hypoglycemia': 25, 'Hypothyroidism': 26, 'Impetigo': 27, 'Jaundice': 28, 'Malaria': 29, 'Migraine': 30, 'Osteoarthristis': 31, 'Paralysis (brain hemorrhage)': 32, 'Peptic ulcer diseae': 33, 'Pneumonia': 34, 'Psoriasis': 35, 'Tuberculosis': 36, 'Typhoid': 37, 'Urinary tract infection': 38, 'Varicose veins': 39, 'hepatitis A': 40}


In [8]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, so the testing features are
# standardised with the statistics learned from the training data
scaler = StandardScaler().fit(X_train)

# Apply the fitted scaler to both splits
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(test_data)

# Confirm that scaling kept the row/column counts
print("\nScaled Features (X_train_scaled) shape:", X_train_scaled.shape)
print("Scaled Testing Features (X_test_scaled) shape:", X_test_scaled.shape)


Scaled Features (X_train_scaled) shape: (4920, 132)
Scaled Testing Features (X_test_scaled) shape: (42, 132)


In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Build and fit the logistic regression classifier in one step
# (`fit` returns the estimator itself)
logistic_model = LogisticRegression(max_iter=1000).fit(X_train_scaled, y_train_encoded)

# Predict on the same rows the model was fitted on
y_train_pred = logistic_model.predict(X_train_scaled)

# NOTE: every metric below is computed on the training data, so it shows how
# well the model fits that data, not how well it generalises to unseen rows.
print("Training Accuracy:", accuracy_score(y_train_encoded, y_train_pred))
print("\nConfusion Matrix:", confusion_matrix(y_train_encoded, y_train_pred), sep="\n")
print(
    "\nClassification Report:",
    classification_report(y_train_encoded, y_train_pred, target_names=le.classes_),
    sep="\n",
)

Training Accuracy: 1.0

Confusion Matrix:
[[120   0   0 ...   0   0   0]
 [  0 120   0 ...   0   0   0]
 [  0   0 120 ...   0   0   0]
 ...
 [  0   0   0 ... 120   0   0]
 [  0   0   0 ...   0 120   0]
 [  0   0   0 ...   0   0 120]]

Classification Report:
                                         precision    recall  f1-score   support

(vertigo) Paroymsal  Positional Vertigo       1.00      1.00      1.00       120
                                   AIDS       1.00      1.00      1.00       120
                                   Acne       1.00      1.00      1.00       120
                    Alcoholic hepatitis       1.00      1.00      1.00       120
                                Allergy       1.00      1.00      1.00       120
                              Arthritis       1.00      1.00      1.00       120
                       Bronchial Asthma       1.00      1.00      1.00       120
                   Cervical spondylosis       1.00      1.00      1.00       120
            

In [10]:
# Predict encoded class ids for the scaled testing features
y_test_pred = logistic_model.predict(X_test_scaled)

# Map the integer ids back to the original label strings
y_test_pred_labels = le.inverse_transform(y_test_pred)

# Show the decoded predictions
print("\nPredictions on Testing Data:", y_test_pred_labels, sep="\n")


Predictions on Testing Data:
['Fungal infection' 'Allergy' 'GERD' 'Chronic cholestasis' 'Drug Reaction'
 'Peptic ulcer diseae' 'AIDS' 'Diabetes ' 'Gastroenteritis'
 'Bronchial Asthma' 'Hypertension ' 'Migraine' 'Cervical spondylosis'
 'Paralysis (brain hemorrhage)' 'Jaundice' 'Malaria' 'Chicken pox'
 'Dengue' 'Typhoid' 'hepatitis A' 'Hepatitis B' 'Hepatitis C'
 'Hepatitis D' 'Hepatitis E' 'Alcoholic hepatitis' 'Tuberculosis'
 'Common Cold' 'Pneumonia' 'Dimorphic hemmorhoids(piles)' 'Heart attack'
 'Varicose veins' 'Hypothyroidism' 'Hyperthyroidism' 'Hypoglycemia'
 'Osteoarthristis' 'Arthritis' '(vertigo) Paroymsal  Positional Vertigo'
 'Acne' 'Urinary tract infection' 'Psoriasis' 'Impetigo' 'Impetigo']


In [12]:
# Build the submission table: a 1-based row id followed by the predicted label
submission = pd.DataFrame(
    {'Id': range(1, len(y_test_pred_labels) + 1)}
).assign(Predicted=y_test_pred_labels)

# Write it out without the DataFrame index column
submission.to_csv('data/submission.csv', index=False)

print("Submission file created: data/submission.csv")

Submission file created: data/submission.csv
