In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib

# Load the symptom CSV
symptom_data = pd.read_csv('symptoms.csv')

# Example list of symptoms input by a user
user_symptoms = [
    "headache", "fever", "nausea"
]

# Initialize the TF-IDF Vectorizer
vectorizer = TfidfVectorizer()

# Fit the vectorizer on the symptoms from CSV
vectorizer.fit(symptom_data['Symptom'])

# Convert user symptoms into feature vectors
user_symptom_vector = vectorizer.transform(user_symptoms)

# Save the vectorizer to a file for later use
joblib.dump(vectorizer, 'symptom_vectorizer.pkl')

# Now you can use this vectorizer to process user input and make predictions
print(user_symptom_vector.toarray())  # This prints the transformed feature vector for user input symptoms


[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0.

In [3]:
import re
import joblib
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

# Load pre-trained model, label encoder, and doctor data
model = joblib.load('./disease_predictor_model.pkl')
label_encoder = joblib.load('./label_encoder.pkl')
doctor_data = pd.read_csv('./diseases_and_doctors.csv')

# Load symptom vectorizer trained with TF-IDF
vectorizer = joblib.load('./symptom_vectorizer.pkl')  # Ensure this is the correct path

# Function to preprocess input symptoms
def preprocess_input(text):
    text = text.lower()  # Convert to lowercase for consistency
    text = re.sub(r'\W', ' ', text)  # Remove non-word characters (punctuation, etc.)
    return text

# Function to extract symptoms and predict disease
def predict_disease_from_text(input_text):
    # Preprocess the input symptoms text
    preprocessed_text = preprocess_input(input_text)
    
    # Vectorize the preprocessed input text (transforming to a vector representation)
    symptoms_vector = vectorizer.transform([preprocessed_text])

    # Predict the disease based on the vectorized symptoms
    disease_prediction = model.predict(symptoms_vector)
    predicted_disease = label_encoder.inverse_transform(disease_prediction)[0]  # Convert numeric label to disease name

    # Find the suitable doctor for the predicted disease
    doctor_row = doctor_data[doctor_data['Disease'].str.lower() == predicted_disease.lower()]
    suitable_doctor = doctor_row['Suitable_Doctor'].values[0] if not doctor_row.empty else "No suitable doctor found"
    
    return predicted_disease, suitable_doctor

# Example user input and prediction
user_input = "I have a severe headache and feeling nauseous"  # Sample user input
predicted_disease, suitable_doctor = predict_disease_from_text(user_input)

print(f"Predicted Disease: {predicted_disease}")
print(f"Suitable Doctor: {suitable_doctor}")




ValueError: X has 133 features, but RandomForestClassifier is expecting 132 features as input.