In [20]:
import pandas as pd 
import numpy as np
#modelling libraries 
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import  accuracy_score,precision_score,recall_score,classification_report
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import  RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import metrics 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer



In [2]:
# from xgboost import XgboostClassifier 


In [3]:
df = pd.read_csv('Symptom2Disease.csv',index_col= False)
df.head()

Unnamed: 0.1,Unnamed: 0,label,text
0,0,Psoriasis,I have been experiencing a skin rash on my arm...
1,1,Psoriasis,"My skin has been peeling, especially on my kne..."
2,2,Psoriasis,I have been experiencing joint pain in my fing...
3,3,Psoriasis,"There is a silver like dusting on my skin, esp..."
4,4,Psoriasis,"My nails have small dents or pits in them, and..."


In [4]:
df.duplicated().sum()

0

Drop the unnecessary column from the dataframe ie 'Unnamed: 0'

In [5]:
df = df.drop(columns = ["Unnamed: 0"])
df.columns

Index(['label', 'text'], dtype='object')

In [6]:


def df_report(df):
    """
    Generate a quick report of the dataframe including:
    - Shape
    - Missing values
    - Duplicate rows
    - Summary statistics
    """
    
    print("📊 DataFrame Report")
    print("-" * 40)
    
    #Basic Information
    print('This is the basic information of the dataset \n')
    print(df.info())
    
    # Shape
    print(f"Shape: {df.shape[0]} rows × {df.shape[1]} columns\n")
    
    # Missing values
    print("🔍 Missing Values (per column):")
    print(df.isnull().sum(), "\n")
    
    # Duplicates
    print(f"📝 Duplicate Rows: {df.duplicated().sum()} \n")
    
    # Summary statistics
    print("📈 Summary Statistics:")
    print(df.describe(include="all").transpose())


In [7]:
df_report(df)

📊 DataFrame Report
----------------------------------------
This is the basic information of the dataset 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200 entries, 0 to 1199
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   1200 non-null   object
 1   text    1200 non-null   object
dtypes: object(2)
memory usage: 18.9+ KB
None
Shape: 1200 rows × 2 columns

🔍 Missing Values (per column):
label    0
text     0
dtype: int64 

📝 Duplicate Rows: 47 

📈 Summary Statistics:
      count unique                                                top freq
label  1200     24                                          Psoriasis   50
text   1200   1153  I've been feeling extremely scratchy, sick, an...    4


In [8]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

class TextPreprocessor:
    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def generate_text_features(self, df, text_column):
        """
        Adds character count, word count, and sentence count columns.
        """
        df.loc[:, 'char_count'] = df[text_column].astype(str).apply(len)
        df.loc[:, 'word_count'] = df[text_column].astype(str).apply(lambda x: len(x.split()))
        df.loc[:, 'sentence_count'] = df[text_column].astype(str).apply(lambda x: x.count('.') + 1)
        return df

    def clean_text(self, text):
        """
        Cleans text: lowercase, remove URLs, hashtags, emojis, punctuation, numbers, and extra spaces.
        """
        text = text.lower()
        text = re.sub(r"http\S+|www\S+|https\S+", '', text)
        text = re.sub(r"#\w+", '', text)

        # Remove emojis
        emoji_pattern = re.compile(
            "["
            u"\U0001F600-\U0001F64F"  # emoticons
            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
            u"\U0001F680-\U0001F6FF"  # transport & map symbols
            u"\U0001F1E0-\U0001F1FF"  # flags
            u"\U00002700-\U000027BF"  # Dingbats
            u"\U000024C2-\U0001F251"
            "]+", flags=re.UNICODE
        )
        text = emoji_pattern.sub(r'', text)

        text = re.sub(r'\d+', '', text)
        text = re.sub(r'[^\w\s]', '', text)
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    def tokenize(self, text):
        """
        Tokenizes and removes stopwords.
        """
        tokens = word_tokenize(text)
        return [t for t in tokens if t not in self.stop_words]

    def lemmatize(self, tokens):
        """
        Lemmatizes a list of tokens.
        """
        return [self.lemmatizer.lemmatize(t) for t in tokens]

    def preprocess(self, df, text_column):
        """
        Full preprocessing:
        - Feature engineering
        - Text cleaning
        - Tokenization
        - Lemmatization
        - Save cleaned text, tokens, lemmatized tokens, and document string (from tokens)
        """
        df = df.copy() 

        df = self.generate_text_features(df, text_column)
        df.loc[:, 'cleaned_text'] = df[text_column].astype(str).apply(self.clean_text)
        df.loc[:, 'tokenized_text'] = df['cleaned_text'].apply(self.tokenize)
        df.loc[:, 'lemmatized_text'] = df['tokenized_text'].apply(self.lemmatize)
        df.loc[:, 'document'] = df['tokenized_text'].apply(lambda x: ' '.join(x))

        return df

In [9]:
df.head()

Unnamed: 0,label,text
0,Psoriasis,I have been experiencing a skin rash on my arm...
1,Psoriasis,"My skin has been peeling, especially on my kne..."
2,Psoriasis,I have been experiencing joint pain in my fing...
3,Psoriasis,"There is a silver like dusting on my skin, esp..."
4,Psoriasis,"My nails have small dents or pits in them, and..."


In [10]:
preprocessor = TextPreprocessor()
data = preprocessor.preprocess(df,"text")

In [11]:
data.head()

Unnamed: 0,label,text,char_count,word_count,sentence_count,cleaned_text,tokenized_text,lemmatized_text,document
0,Psoriasis,I have been experiencing a skin rash on my arm...,141,28,3,i have been experiencing a skin rash on my arm...,"[experiencing, skin, rash, arms, legs, torso, ...","[experiencing, skin, rash, arm, leg, torso, pa...",experiencing skin rash arms legs torso past we...
1,Psoriasis,"My skin has been peeling, especially on my kne...",138,23,3,my skin has been peeling especially on my knee...,"[skin, peeling, especially, knees, elbows, sca...","[skin, peeling, especially, knee, elbow, scalp...",skin peeling especially knees elbows scalp pee...
2,Psoriasis,I have been experiencing joint pain in my fing...,148,28,3,i have been experiencing joint pain in my fing...,"[experiencing, joint, pain, fingers, wrists, k...","[experiencing, joint, pain, finger, wrist, kne...",experiencing joint pain fingers wrists knees p...
3,Psoriasis,"There is a silver like dusting on my skin, esp...",164,32,3,there is a silver like dusting on my skin espe...,"[silver, like, dusting, skin, especially, lowe...","[silver, like, dusting, skin, especially, lowe...",silver like dusting skin especially lower back...
4,Psoriasis,"My nails have small dents or pits in them, and...",140,27,3,my nails have small dents or pits in them and ...,"[nails, small, dents, pits, often, feel, infla...","[nail, small, dent, pit, often, feel, inflamma...",nails small dents pits often feel inflammatory...


In [12]:
data.columns

Index(['label', 'text', 'char_count', 'word_count', 'sentence_count',
       'cleaned_text', 'tokenized_text', 'lemmatized_text', 'document'],
      dtype='object')

## Modelling 

In [13]:
le = LabelEncoder()
data["label_code"] = le.fit_transform(data['label'])

In [14]:
data.columns

Index(['label', 'text', 'char_count', 'word_count', 'sentence_count',
       'cleaned_text', 'tokenized_text', 'lemmatized_text', 'document',
       'label_code'],
      dtype='object')

In [15]:
data["label"].value_counts()

label
Psoriasis                          50
Varicose Veins                     50
peptic ulcer disease               50
drug reaction                      50
gastroesophageal reflux disease    50
allergy                            50
urinary tract infection            50
Malaria                            50
Jaundice                           50
Cervical spondylosis               50
Migraine                           50
Hypertension                       50
Bronchial Asthma                   50
Acne                               50
Arthritis                          50
Dimorphic Hemorrhoids              50
Pneumonia                          50
Common Cold                        50
Fungal infection                   50
Dengue                             50
Impetigo                           50
Chicken pox                        50
Typhoid                            50
diabetes                           50
Name: count, dtype: int64

In [16]:
#View the target columns 
data[["label","label_code"]]

Unnamed: 0,label,label_code
0,Psoriasis,15
1,Psoriasis,15
2,Psoriasis,15
3,Psoriasis,15
4,Psoriasis,15
...,...,...
1195,diabetes,19
1196,diabetes,19
1197,diabetes,19
1198,diabetes,19


In [17]:
data1 = data.copy()

In [24]:
# data = data.drop(columns = ["label"])
vectorizer = CountVectorizer()
X_vectorized = vectorizer.fit_transform(data["document"])
y = data["label_code"]

In [26]:
X_train,y_train,X_test,y_test = train_test_split(X_vectorized,y,test_size = 0.2,random_state = 42)

In [None]:


# Encode labels
le = LabelEncoder()
data["label_code"] = le.fit_transform(data['label'])

# Vectorize documents
vectorizer = CountVectorizer()
X_vectorized = vectorizer.fit_transform(data["document"])
y = data["label_code"]

# Correct order of train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X_vectorized, y, random_state=42
)

# Train Logistic Regression
lr = LogisticRegression(random_state=42, max_iter=1000)  # added max_iter for convergence
lr.fit(X_train, y_train)

# Predictions
y_train_pred = lr.predict(X_train)
y_test_pred = lr.predict(X_test)

# Evaluation
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print("Train Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)
print("\nClassification Report:\n", classification_report(y_test, y_test_pred))


Train Accuracy: 1.0
Test Accuracy: 0.9766666666666667

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00        12
           2       0.93      1.00      0.97        14
           3       1.00      1.00      1.00        11
           4       0.80      1.00      0.89        12
           5       1.00      0.80      0.89        15
           6       1.00      0.93      0.96        14
           7       1.00      1.00      1.00        10
           8       1.00      1.00      1.00        14
           9       1.00      1.00      1.00        13
          10       1.00      1.00      1.00        13
          11       1.00      1.00      1.00        15
          12       1.00      1.00      1.00        14
          13       1.00      1.00      1.00        11
          14       1.00      1.00      1.00        15
          15       1.00      0.89      0.94         9
  