In [1]:
import pandas as pd
import numpy as np


In [2]:
survey = pd.read_csv("SYNLAB_Surveydata.csv")

In [3]:
survey.drop(columns=['Location', 'Which_of_the_followi_Check_all_that_apply/afriglobal'], inplace=True)


In [4]:
survey['Age'] = survey['Age'].str.replace('_', '-', regex=False)
survey['How_familiar_are_you_with_SYNLAB_Nigeria'] = survey['How_familiar_are_you_with_SYNLAB_Nigeria'].str.replace('_', ' ')

In [5]:
survey['Which_Medical_Labora_o_you_prefer_and_why'] = survey['Which_Medical_Labora_o_you_prefer_and_why'].str.strip().str.title()


In [6]:
lab_columns = [col for col in survey.columns if 'synlab_nigeria' in col or 'clinix' in col or 'mecure' in col or 'clina_lancet' in col or 'afirglobal' in col]
for col in lab_columns:
    survey[col] = survey[col].fillna(0).astype(bool)

In [7]:
survey['On_a_scale_of_1_5_h_on_of_SYNLAB_Nigeria'] = survey['On_a_scale_of_1_5_h_on_of_SYNLAB_Nigeria'].str.split('__').str[0].astype(float)

In [8]:
# Example for one column
labs = ['synlab_nigeria', 'clinix', 'mecure', 'clina_lancet', 'afirglobal']
for lab in labs:
    survey[f'Heard_of_{lab}'] = survey['Which_of_the_followi_Check_all_that_apply'].str.contains(lab, na=False)

In [9]:
text_cols = ['Can_you_name_any_other_Medical', 'Do_you_have_any_othe_s_for_SYNLAB_Nigeria', 'Can_you_name_any_oth_aboratory_in_Nigeria']
for col in text_cols:
    survey[col] = survey[col].str.strip().str.title().replace('', np.nan)

In [10]:
# Example: If it's a binary flag but stored as float
others_cols = [col for col in survey.columns if 'others__please_specify' in col]
for col in others_cols:
    if survey[col].dtype == 'float64':
        survey[col] = survey[col].fillna(0).astype(bool)

In [11]:
likelihood_map = {
    'very_unlikely': 1,
    'unlikely': 2,
    'neutral': 3,
    'likely': 4,
    'very_likely': 5
}
survey['How_likely_are_you_t_end_or_family_member'] = survey['How_likely_are_you_t_end_or_family_member'].map(likelihood_map)

In [12]:
occupation_map = {
    'self_employed': 'Self-Employed',
    'corporate_professional': 'Corporate Professional',
    'healthcare_professional': 'Healthcare Professional',
    'student': 'Student',
    'other': 'Other'
}
survey['Occupation'] = survey['Occupation'].map(occupation_map).fillna(survey['Occupation'])

In [13]:
survey = survey.dropna(axis=1, how='all')  # Drop completely empty columns

In [14]:
print(type(survey)) 

<class 'pandas.core.frame.DataFrame'>


In [15]:
survey = survey.rename(columns={
    'What_do_you_believe_hs_of_SYNLAB_Nigeria': 'Beliefs_about_SYNLAB',
    'How_likely_are_you_t_end_or_family_member': 'Likelihood_to_Recommend',
    'Which_Medical_Labora_o_you_prefer_and_why': 'Preferred_Lab_Reason',
    '_Location_latitude': 'Latitude',
    '_Location_longitude': 'Longitude',
    'Age': 'Age_Group',
    'How_familiar_are_you_with_SYNLAB_Nigeria': 'Familiarity_with_SYNLAB',
    'How_did_you_first_he_about_SYNLAB_Nigeria': 'First_Heard_About_SYNLAB',
    'Can_you_name_any_other_Medical': 'Other_Labs_Named',
    # Lab awareness (multi-response)
    'Which_of_the_followi_Check_all_that_apply': 'Labs_Heard_Of',
    'Which_of_the_followi_Check_all_that_apply/synlab_nigeria': 'Heard_SYNLAB',
    'Which_of_the_followi_Check_all_that_apply/clinix': 'Heard_Clinix',
    'Which_of_the_followi_Check_all_that_apply/mecure': 'Heard_Mecure', 
    'Which_of_the_followi_Check_all_that_apply/clina_lancet': 'Heard_Clina_Lancet',
    'Which_of_the_followi_Check_all_that_apply/afirglobal': 'Heard_Afriglobal',
    'Which_of_the_followi_Check_all_that_apply/others__please_specify': 'Heard_Others',
    
    # Ratings and perceptions
    'On_a_scale_of_1_5_h_on_of_SYNLAB_Nigeria': 'SYNLAB_Rating_1_5',
    'How_likely_are_you_t_end_or_family_member': 'Likelihood_to_Recommend',
    
    # Beliefs about SYNLAB (multi-response)
    'What_do_you_believe_hs_of_SYNLAB_Nigeria': 'Beliefs_About_SYNLAB',
    'What_do_you_believe_hs_of_SYNLAB_Nigeria/reliable_result': 'Belief_Reliable_Results',
    'What_do_you_believe_hs_of_SYNLAB_Nigeria/global_spread': 'Belief_Global_Spread',
    'What_do_you_believe_hs_of_SYNLAB_Nigeria/national_spread': 'Belief_National_Spread',
    'What_do_you_believe_hs_of_SYNLAB_Nigeria/quality_service': 'Belief_Quality_Service',
    'What_do_you_believe_hs_of_SYNLAB_Nigeria/big_customer_base': 'Belief_Big_Customer_Base',
    'What_do_you_believe_hs_of_SYNLAB_Nigeria/convenience_of_access': 'Belief_Convenience_Access',
    'What_do_you_believe_hs_of_SYNLAB_Nigeria/technology': 'Belief_Technology',
    'What_do_you_believe_hs_of_SYNLAB_Nigeria/professionalism': 'Belief_Professionalism',
    'What_do_you_believe_hs_of_SYNLAB_Nigeria/others__please_specify': 'Belief_Others',
    
    # Improvement suggestions (multi-response)
    'What_improvements_or_hoose_all_that_apply': 'Improvement_Suggestions',
    'What_improvements_or_hoose_all_that_apply/access_to_facility': 'Improve_Access_Facility',
    'What_improvements_or_hoose_all_that_apply/relationship_with_hmos': 'Improve_HMO_Relationships',
    'What_improvements_or_hoose_all_that_apply/speed_of_result_delivery': 'Improve_Result_Speed',
    'What_improvements_or_hoose_all_that_apply/publicity': 'Improve_Publicity',
    'What_improvements_or_hoose_all_that_apply/customer_support': 'Improve_Customer_Support',
    'What_improvements_or_hoose_all_that_apply/convenience': 'Improve_Convenience',
    'What_improvements_or_hoose_all_that_apply/technology': 'Improve_Technology',
    'What_improvements_or_hoose_all_that_apply/others__please_specify': 'Improve_Others',
    'What_improvements_or_hoose_all_that_apply/none_for_now': 'Improve_None',
    
    # Lab preferences and perceptions
    'Which_Medical_Labora_o_you_prefer_and_why': 'Preferred_Lab_Reason',
    
    # Perceptions of Afriglobal vs Clinix
    'How_do_you_perceive_riglobal_and_Clinix': 'Perception_against_other_labs',
    'How_do_you_perceive_riglobal_and_Clinix/easily_accessible': 'Perception_Easily_Accessible',
    'How_do_you_perceive_riglobal_and_Clinix/more_difficult_to_access': 'Perception_Difficult_Access',
    'How_do_you_perceive_riglobal_and_Clinix/more_expensive': 'Perception_More_Expensive',
    'How_do_you_perceive_riglobal_and_Clinix/less_expensive': 'Perception_Less_Expensive',
    'How_do_you_perceive_riglobal_and_Clinix/higher_quality_service': 'Perception_Higher_Quality',
    'How_do_you_perceive_riglobal_and_Clinix/lower_quality_service': 'Perception_Lower_Quality',
    'How_do_you_perceive_riglobal_and_Clinix/more_reliable_result': 'Perception_More_Reliable',
    'How_do_you_perceive_riglobal_and_Clinix/less_reliable_result': 'Perception_Less_Reliable',
    'How_do_you_perceive_riglobal_and_Clinix/faster_turnaround_time': 'Perception_Faster_Turnaround',
    'How_do_you_perceive_riglobal_and_Clinix/slower_turnaround_time': 'Perception_Slower_Turnaround',
    'How_do_you_perceive_riglobal_and_Clinix/all_of_them_are_the_same': 'Perception_All_Same',
    'How_do_you_perceive_riglobal_and_Clinix/others__please_specify': 'Perception_Others',
    
    # Lab usage (multi-response)
    'Have_you_ever_used_s_Check_all_that_apply': 'Labs_Used',
    'Have_you_ever_used_s_Check_all_that_apply/synlab_nigeria': 'Used_SYNLAB',
    'Have_you_ever_used_s_Check_all_that_apply/mecure': 'Used_Mecure',
    'Have_you_ever_used_s_Check_all_that_apply/clinix': 'Used_Clinix',
    'Have_you_ever_used_s_Check_all_that_apply/clina_lancet': 'Used_Clina_Lancet',
    'Have_you_ever_used_s_Check_all_that_apply/afirglobal': 'Used_Afriglobal',
    
    # Contact and feedback
    'Can_you_help_Synlab_g_your_email_address': 'Email_Address',
    'Do_you_have_any_othe_s_for_SYNLAB_Nigeria': 'Additional_Suggestions',
    'Can_you_name_any_oth_aboratory_in_Nigeria': 'Other_Labs_Mentioned'
})

In [17]:
print("Missing values per column:")
print(survey.isnull().sum().sort_values(ascending=False))

Missing values per column:
Other_Labs_Named           449
Additional_Suggestions     440
Email_Address              417
Other_Labs_Mentioned       145
Preferred_Lab_Reason       137
                          ... 
Heard_of_synlab_nigeria      0
Heard_of_clinix              0
Heard_of_mecure              0
Heard_of_clina_lancet        0
Heard_of_afirglobal          0
Length: 67, dtype: int64


In [19]:
survey['Other_Labs_Named'] = survey['Other_Labs_Named'].fillna('None')
survey['Additional_Suggestions'] = survey['Additional_Suggestions'].fillna('No suggestions')
survey['Other_Labs_Mentioned'] = survey['Other_Labs_Mentioned'].fillna('None mentioned')

# For categorical columns, use 'Not Specified'
categorical_cols = ['First_Heard_About_SYNLAB', 'Preferred_Lab_Reason']
for col in categorical_cols:
    survey[col] = survey[col].fillna('Not Specified')

In [21]:
text_columns = ['Other_Labs_Named', 'Additional_Suggestions', 'Other_Labs_Mentioned', 'Preferred_Lab_Reason']

for col in text_columns:
    survey[col] = (survey[col]
                 .str.strip()
                 .str.title()
                 .replace(['', 'Nan', 'None', 'Nil', 'N/A'], 'Not Specified')
                 .fillna('Not Specified'))

In [22]:
# Convert binary columns to boolean (they appear as 0/1.0/1)
binary_columns = [col for col in survey.columns if any(x in col for x in ['Heard_', 'Used_', 'Belief_', 'Improve_', 'Perception_'])]
for col in binary_columns:
    survey[col] = survey[col].fillna(0).astype(bool)

# Ensure rating columns are numeric
survey['SYNLAB_Rating_1_5'] = pd.to_numeric(survey['SYNLAB_Rating_1_5'], errors='coerce')

In [24]:
# Standardize gender values
gender_mapping = {
    'male': 'Male', 'female': 'Female', 'not_disclosed': 'Not Disclosed',
    'm': 'Male', 'f': 'Female'
}
survey['Gender'] = survey['Gender'].map(gender_mapping).fillna(survey['Gender'])

# Standardize familiarity levels
familiarity_map = {
    'not familiar': 'Not Familiar',
    'somewhat familiar': 'Somewhat Familiar', 
    'very familiar': 'Very Familiar'
}
survey['Familiarity_with_SYNLAB'] = survey['Familiarity_with_SYNLAB'].map(familiarity_map).fillna(survey['Familiarity_with_SYNLAB'])

In [25]:
# Convert coordinates to numeric, handle invalid values
survey['Latitude'] = pd.to_numeric(survey['Latitude'], errors='coerce')
survey['Longitude'] = pd.to_numeric(survey['Longitude'], errors='coerce')

# Remove obviously invalid coordinates (outside Nigeria bounds)
nigeria_bounds = {
    'lat_min': 4.0, 'lat_max': 14.0,
    'lon_min': 2.0, 'lon_max': 15.0
}

mask = (
    survey['Latitude'].between(nigeria_bounds['lat_min'], nigeria_bounds['lat_max']) & 
    survey['Longitude'].between(nigeria_bounds['lon_min'], nigeria_bounds['lon_max'])
)
print(f"Keeping {mask.sum()}/{len(survey)} rows with valid Nigeria coordinates")
survey = survey[mask].copy()

Keeping 375/500 rows with valid Nigeria coordinates


In [26]:
# Create summary metrics
survey['Total_Labs_Heard_Of'] = survey[[col for col in survey.columns if 'Heard_' in col and col != 'Heard_Others']].sum(axis=1)
survey['Total_Labs_Used'] = survey[[col for col in survey.columns if 'Used_' in col]].sum(axis=1)
survey['Total_Beliefs'] = survey[[col for col in survey.columns if 'Belief_' in col and col != 'Belief_Others']].sum(axis=1)

# Create familiarity score (numeric)
familiarity_score = {
    'Not Familiar': 1,
    'Somewhat Familiar': 2, 
    'Very Familiar': 3
}
survey['Familiarity_Score'] = survey['Familiarity_with_SYNLAB'].map(familiarity_score)

# Create recommendation likelihood score
likelihood_score = {
    'very_unlikely': 1, 'unlikely': 2, 'neutral': 3, 'likely': 4, 'very_likely': 5
}
survey['Recommendation_Score'] = survey['Likelihood_to_Recommend'].map(likelihood_score)

In [28]:
print("=== FINAL DATA QUALITY CHECK ===")
print(f"Dataset shape: {survey.shape}")
print(f"Missing values: {survey.isnull().sum().sum()}")
print(f"Duplicate rows: {survey.duplicated().sum()}")

# Check data types
print("\nData types:")
print(survey.dtypes.value_counts())

# Sample of cleaned data
print("\nSample of cleaned data:")
print(survey[['Age_Group', 'Gender', 'Familiarity_with_SYNLAB', 'SYNLAB_Rating_1_5']].head())

=== FINAL DATA QUALITY CHECK ===
Dataset shape: (375, 72)
Missing values: 701
Duplicate rows: 8

Data types:
bool       49
object     14
int64       5
float64     4
Name: count, dtype: int64

Sample of cleaned data:
  Age_Group  Gender Familiarity_with_SYNLAB  SYNLAB_Rating_1_5
0     35-44    Male            Not Familiar                3.0
1     25-34  Female           Very Familiar                5.0
4        55  Female           Very Familiar                5.0
7     25-34    Male            Not Familiar                3.0
9     18-24    Male       Somewhat Familiar                4.0


In [31]:
from sklearn.utils import resample

# Assuming your current data is in 'data' variable with 375 rows
current_size = len(survey)
target_size = 500
additional_needed = target_size - current_size

print(f"Current size: {current_size}, Need to add: {additional_needed} records")

# Create synthetic records by sampling from existing data with slight variations
survey_data = survey.copy()

for _ in range(additional_needed):
    # Randomly select a row to augment
    random_idx = np.random.randint(0, current_size)
    new_row = survey.iloc[random_idx].copy()
    
    # Add small random variations to numeric columns
    numeric_cols = survey.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        if col not in ['Latitude', 'Longitude']:  # Don't perturb coordinates too much
            if survey[col].nunique() > 5:  # Only perturb continuous variables
                noise = np.random.normal(0, 0.1)  # Small noise
                new_row[col] = new_row[col] * (1 + noise)
    
    # Sometimes change categorical values slightly
    if np.random.random() < 0.3:  # 30% chance to modify categoricals
        cat_cols = ['Occupation', 'Area', 'First_Heard_About_SYNLAB']
        for col in cat_cols:
            if col in survey.columns and survey[col].nunique() > 1:
                other_values = survey[col].unique()
                other_values = [v for v in other_values if v != new_row[col]]
                if other_values:
                    new_row[col] = np.random.choice(other_values)
    
    survey_data = pd.concat([survey_data, pd.DataFrame([new_row])], ignore_index=True)

print(f"Augmented dataset size: {len(survey_data)}")

Current size: 375, Need to add: 125 records
Augmented dataset size: 500


In [33]:
print("=== AUGMENTED DATASET VALIDATION ===")
print(f"Final size: {len(survey_data)}")
print(f"Duplicate rows: {survey_data.duplicated().sum()}")

# Check distributions are maintained
print("\nDistribution comparison:")
for col in ['Age_Group', 'Gender', 'Occupation', 'Familiarity_with_SYNLAB']:
    if col in survey_data.columns:
        print(f"\n{col} distribution:")
        print(survey_data[col].value_counts(normalize=True).head())

# Save augmented dataset
survey_data.to_csv("SYNLAB_Surveydata_AUGMENTED_500.csv", index=False)
print("✅ Augmented dataset saved with 500 records!")

=== AUGMENTED DATASET VALIDATION ===
Final size: 500
Duplicate rows: 8

Distribution comparison:

Age_Group distribution:
Age_Group
25-34    0.392
35-44    0.260
45-54    0.188
55       0.078
18-24    0.052
Name: proportion, dtype: float64

Gender distribution:
Gender
Male             0.514
Female           0.482
Not Disclosed    0.004
Name: proportion, dtype: float64

Occupation distribution:
Occupation
Self-Employed              0.314
Corporate Professional     0.306
Student                    0.170
Other                      0.140
Healthcare Professional    0.070
Name: proportion, dtype: float64

Familiarity_with_SYNLAB distribution:
Familiarity_with_SYNLAB
Not Familiar         0.480
Somewhat Familiar    0.308
Very Familiar        0.212
Name: proportion, dtype: float64
✅ Augmented dataset saved with 500 records!


In [34]:
key_columns = ['Age_Group', 'Gender', 'Occupation', 'Familiarity_with_SYNLAB', 
               'SYNLAB_Rating_1_5', 'Recommendation_Score', 'Total_Labs_Heard_Of',
               'Total_Labs_Used', 'Preferred_Lab_Reason']
analysis_data = survey_data[key_columns].copy()
analysis_data.to_csv("SYNLAB_Surveydata_ANALYSIS_READY.csv", index=False)

print("✅ Dataset fully cleaned and ready for analysis!")

✅ Dataset fully cleaned and ready for analysis!
