### Load and Explore the Dataset

In [14]:
import pandas as pd

# Load the datasets
bacteria_diseases = pd.read_csv('Bacteria diseases.csv')
fungi_diseases = pd.read_csv('Fungi Diseases.csv')
virus_diseases = pd.read_csv('Virus diseases.csv')

# Display the first few rows of each dataset
bacteria_diseases.head(), fungi_diseases.head(), virus_diseases.head()


(     English_Name Oruko(Yoruba_Name)  \
 0    Tuberculosis                iko   
 1    Strep Throat         Strep ọfun   
 2  Whooping Cough        Ikọaláìdúró   
 3         Cholera        Arun kolera   
 4    Lyme Disease          Arun Lyme   
 
                                     English_Symptoms  \
 0  Persistent cough, weight loss, night sweats, f...   
 1  Sore throat, difficulty swallowing, red and sw...   
 2  Severe coughing fits, whooping sound when inha...   
 3  Watery diarrhea, dehydration, vomiting, rapid ...   
 4  Bullseye rash, fever, chills, headache, fatigu...   
 
                            Awọn aami aisan(Symptoms)  
 0  Ikọaláìdúró àìdára, àdánù làìpẹ, lagun alẹ, ib...  
 1  Ọfun ọgbẹ, iṣoro gbigbe, pupa ati tonsils wú, ...  
 2  Ikọaláìdúró ti o lagbara, ohun gbigbo nigba mi...  
 3  Igbẹ gbuuru omi, gbigbẹ, ìgbagbogbo, oṣuwọn ok...  
 4  Ẹ̀fọ́rí, ìbànújẹ́, rírẹ̀rẹ̀gẹ̀jigẹ̀, iṣan ara ...  ,
      English_Name Oruko(Yoruba_Name)  \
 0     Candidiasis        Cand

In [19]:
# Correct the column name in the bacteria_diseases dataset
bacteria_diseases.columns = bacteria_diseases.columns.str.strip()

# Combine datasets
diseases = pd.concat([
    bacteria_diseases[['English_Name', 'Oruko(Yoruba_Name)', 'English_Symptoms', 'Awọn aami aisan(Symptoms)']],
    fungi_diseases[['English_Name', 'Oruko(Yoruba_Name)', 'English_Symptoms', 'Awọn aami aisan(Symptoms)']],
    virus_diseases[['English_Name', 'Oruko(Yoruba_Name)', 'English_Symptoms', 'Awọn aami aisan(Symptoms)']]
])

# Remove rows with missing values
diseases = diseases.dropna().reset_index(drop=True)

diseases.tail()

Unnamed: 0,English_Name,Oruko(Yoruba_Name),English_Symptoms,Awọn aami aisan(Symptoms)
575,Sapovirus,Sapovirus,"Acute gastroenteritis: diarrhea, vomiting, fev...","Gastroenteritis nla: gbuuru, ìgbagbogbo, iba, ..."
576,SARS-CoV,SARS-CoV,"Severe acute respiratory syndrome: fever, coug...","Arun atẹgun nla: iba, Ikọaláìdúró, dyspnea, pn..."
577,Smallpox (Variola virus),Smallpox (Variola virus),"Fever, malaise, rash progressing to pustules, ...","Iba, ailera, sisu lilọsiwaju si pustules, scab..."
578,Torovirus,Torovirus,"Gastroenteritis: diarrhea, vomiting, fever","Gastroenteritis: igbe gbuuru, ìgbagbogbo, iba"
579,Vaccinia virus,fáírọ́ọ̀sì Vaccinia,"Localized skin lesions, fever, lymphadenopathy...","Awọn egbo awọ ara ti agbegbe, iba, lymphadenop..."


In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

# Prepare data for training
X = diseases['English_Symptoms']
y = diseases['English_Name']

# Create a pipeline with TF-IDF Vectorizer and Logistic Regression
model_eng = make_pipeline(TfidfVectorizer(), LogisticRegression())

# Train the model
model_eng.fit(X, y)

# Example usage
example_symptoms = "Skin lesions, fever, night sweats, weight loss, swollen lymph nodes"
predicted_disease = model_eng.predict([example_symptoms])[0]
predicted_disease


'Human Herpesvirus 8 (HHV-8)'

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report

# Custom stopword list for Yoruba
yoruba_stopwords = set([
    'ni', 'ati', 'si', 'fun', 'lati', 'ba', 'ko', 'ti', 'o', 'wa', 'mo', 'mi', 'se', 'gbogbo', 'awon', 'naa', 'yan', 'yii', 'won', 'won ni', 'won si', 'won fun', 'won lati', 'won ba', 'won ko', 'won ti', 'won o', 'won wa', 'won mo', 'won mi', 'won se', 'won gbogbo', 'won awon', 'won naa', 'won yan', 'won yii'
])

# Preprocess text: remove stop words and tokenize
def preprocess_text(text):
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in yoruba_stopwords]
    return ' '.join(filtered_words)

# Preprocess Yoruba symptoms
diseases['Awọn aami aisan(Symptoms)'] = diseases['Awọn aami aisan(Symptoms)'].apply(preprocess_text)

# Prepare data for training
X_yoruba = diseases['Awọn aami aisan(Symptoms)']
y_yoruba = diseases['Oruko(Yoruba_Name)']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_yoruba, y_yoruba, test_size=0.2, random_state=42)

# Create a pipeline with TF-IDF Vectorizer and Random Forest
pipeline = make_pipeline(TfidfVectorizer(), RandomForestClassifier(n_estimators=75, random_state=42, max_depth=10, warm_start=True))

# Grid Search with minimal parameter tuning
param_grid = {
    'tfidfvectorizer__ngram_range': [(1, 1), (1, 2)],
}

# Fewer cross-validation splits for faster performance
grid_search = GridSearchCV(pipeline, param_grid, cv=2, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Example usage
example_symptoms_yor = "Mo ni Ẹ̀fọ́rí, ìbànújẹ́, rírẹ̀rẹ̀gẹ̀jigẹ̀, iṣan ara àti ìrora ìsokọ́ra, àwọn ọ̀rá tí ó wú"
predicted_disease_yor = grid_search.predict([preprocess_text(example_symptoms_yor)])[0]
print(predicted_disease_yor)




Arun Lyme
