# Educators questionnaire data analysis
- This notebook is an analysis of the data from the Educators questionnaire in the TTP survey dataset.
- The dataset is from 2004-2005
- The analysis focuses on the HIV/AIDS-related educator responses and on spatial data
- The data is cleaned first and then analyzed
- File Name = ELRC2004_2005_Educators_data.sav
- Number of variables = 609
- Number of cases = 21358

In [1]:
# Import the required libraries; pyreadstat is used to read the SPSS .sav data file
import numpy as np
import pandas as pd
import pyreadstat
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression


In [2]:
# Read the questionnaire file; read_sav's result is unpacked into the data
# frame (edu_data) and a second object kept as `meta`.
# NOTE(review): this is an absolute, machine-specific path — the notebook will
# not run elsewhere unless the file lives at exactly this location.
sav_path = "C:/Users/didit/OneDrive/Desktop/Research/ELRC2004_2005_Educators_data.sav"
edu_data, meta = pyreadstat.read_sav(sav_path)
# Preview the first rows as the cell output.
edu_data.head()

Unnamed: 0,qnum,specty,prov,District,typinst,school,version,locinst,qstatus,qcom,...,effica,comm_a,comm_b,comm_c,comm_d,comm_e,comm,@total_,id,province
0,27357.0,3.0,6.0,1.0,2.0,1389.0,2.0,1.0,3.0,2.0,...,,,,,,,,1834.418571,1.0,North West
1,26081.0,3.0,6.0,1.0,2.0,1389.0,2.0,1.0,3.0,2.0,...,,,,,,,,1834.418571,1.0,North West
2,18073.0,3.0,6.0,1.0,2.0,1389.0,2.0,1.0,3.0,2.0,...,,,,,,,,1834.418571,1.0,North West
3,6300.0,,7.0,2.0,3.0,499.0,2.0,1.0,2.0,3.0,...,,,,,,,,1038.703704,2.0,Gauteng
4,6888.0,,7.0,2.0,3.0,499.0,2.0,1.0,1.0,3.0,...,,,,,,,,1038.703704,2.0,Gauteng


In [3]:
# Show the column labels of the loaded data frame.
edu_data.columns

Index(['qnum', 'specty', 'prov', 'District', 'typinst', 'school', 'version',
       'locinst', 'qstatus', 'qcom',
       ...
       'effica', 'comm_a', 'comm_b', 'comm_c', 'comm_d', 'comm_e', 'comm',
       '@total_', 'id', 'province'],
      dtype='object', length=609)

In [4]:
# Print a summary of the frame: index range, column count, dtypes and memory usage.
edu_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21358 entries, 0 to 21357
Columns: 609 entries, qnum to province
dtypes: float64(604), object(5)
memory usage: 99.2+ MB


In [5]:
# Tally how often each code occurs in q1_2 (this column is renamed to
# RespondentSex in a later cell).
edu_data['q1_2'].value_counts()

q1_2
2.0    14215
1.0     6731
Name: count, dtype: int64

In [None]:
# Rename the raw questionnaire columns to descriptive names and decode their
# numeric answer codes into readable labels.
#
# Fixes relative to the previous version of this cell:
#   * `.map` was assigned without being called for four columns, which stored
#     the bound method in the column instead of the decoded labels.
#   * Several columns were read under their old `q…` names after they had
#     already been renamed, which raised KeyError.
#   * Columns are only decoded while they still hold numeric codes, so running
#     the cell a second time no longer turns decoded labels into NaN.

# --- 1. Rename columns (single pass) ---------------------------------------
column_renames = {
    'version': 'Language',
    'q1_1': 'AgeLastBirthday',
    'q1_2': 'RespondentSex',
    'q1_3': 'RespondentRace',
    'q1_4': 'RespondentNationality',
    'q1_5': 'RespondentMaritalStatus',
    'q1_6': 'LivingArrangements',
    'q1_7': 'HasDependentChildren',
    'q1_8': 'NumberOfDependentChildren',
    'q1_9': 'HasOtherDependants',
    'q1_10': 'NumberOfOtherDependants',
    'q2_1': 'YearofInitialTeacherTraining',
    'q2_2': 'PostTrainingQualification',
    'q2_4': 'MaritalStatusAtCareerStart',
    'q2_14': 'CurrentLivingArea',
    'q3_1': 'HighestEducationalQualification',
    'q3_2': 'PositionInSchool',
    'q3_3': 'PositionInFET',
    'q4_2aa': 'TeachingLevel',
    'q4_4': 'TeachingExperienceYears',
    'q5_1': 'KnowsEducatorsWithHIV',
    'q5_2': 'EducatorsDiedOfHIVLast2Years',
    'q5_4': 'KnowsLearnersWithHIV',
    # NOTE(review): "Leareners" is a typo, kept so existing references keep working.
    'q5_7a': 'HIVLearenersIncreaseWorkLoad',
    'q5_7b': 'DifficultToTeachHIVLearners',
    'q5_8': 'KnowFamilyMembersWithHIV',
    'q5_9': 'CaringForFamilyMembersWithHIV',
    'q5_10': 'CaredForFamilyMembersWhoDiedofHIV',
    'q5_11a': 'HIVFamilyEffectOnWorkload',
    'q10_1': 'AlcoholConsumptionFrequency',
    'q11_1c': 'AmphetamineUse',
    'q11_1g': 'OpiatesUse',
    'q12_1e': 'EventsEducInstRape',
    'q12_1g': 'EventsEducInstSexualHarassment',
    'q13_1': 'Past12monthsSexualIntercoursePenetrative',
    'q13_2': 'NumberOfSexualPartnersInPast12Months',
    'q13_3a': 'Past12monthsSexWithMale',
    'q13_3b': 'Past12monthsSexWithFemale',
}
edu_data = edu_data.rename(columns=column_renames)

# --- 2. Code -> label dictionaries ------------------------------------------
# The individual mapping names are kept so other cells can still refer to them.
# NOTE(review): the labels 'SA citzen only' and 'Juior Secondary' contain typos;
# they are left unchanged here because other cells may match on these strings.
sex_mapping = {1: 'M', 2: 'F'}
race_mapping = {1: 'African', 2: 'White', 3: 'Colored', 4: 'Indian/Asian', 5: 'Other'}
nationality_mapping = {1: 'SA citzen only', 2: 'SA citizen and other', 3: 'Non-SA citizen'}
marital_status_mapping = {1: 'Married-civil:mag', 2: 'Married-traditional', 3: 'Married-religious', 4: 'Married-ct', 5: 'Married-cr', 6: 'Single',
                          7: 'Married-separated', 8:'Divorced', 9: 'Courtship', 10: 'Widowed', 11: 'Other'}
living_arrangements_mapping = {
    1: 'Alone',
    2: 'With family/relatives',
    3: 'With partner or spouse',
    4: 'With peers/friends/co-workers'
}
has_dependent_children_mapping = {1: 'Yes', 2: 'No'}
has_other_dependants_mapping = {1: 'Yes', 2: 'No'}
post_training_location_mapping = {1: 'Stayed in same area', 2: 'Moved to different area'}
marital_status_at_career_start_mapping = {1: 'Married', 2: 'Engaged', 3: 'Single'}
current_living_area_mapping = {1: 'Urban', 2: 'Non-urban'}
highest_educational_qualification_mapping = {
    1: 'Doctorate/s',
    2: 'Honours/Masters degree',
    3: 'First degree/Higher diplomas',
    4: 'Diplomas/Occupational certificate',
    5: 'Grade 12/Std 10/Matric/N3 without teachers qualification',
    6: 'Grade 8,9,10 or 11 plus teachers qualification',
    7: 'Std 9/Grade 11/N2',
    8: 'Std 8/Grade 10/N1',
    9: 'Up to Std 7/Grade 9'
}
position_in_school_mapping = {
    1: 'Teacher',
    2: 'Senior Teacher',
    3: 'Education Specialist',
    4: 'Dep. Principal/Principal'
}
position_in_fet_mapping = {
    1: 'Lecturer',
    2: 'Senior Lecturer',
    3: 'Head of Division',
    4: 'Dep. Principal/Principal'
}
teaching_level_mapping = {
    1: 'Pre-primary',
    2: 'Junior Primary',
    3: 'Senior Primary',
    4: 'Primary',
    5: 'Juior Secondary',
    6: 'Senior Secondary',
    7: 'Secondary',
    8: 'Intermediary',
    9: 'Foundation',
    10: 'College',
    11: 'University',
    97: 'FET'
}
knows_educators_with_hiv_mapping = {1: 'Yes', 2: 'No'}
educators_died_of_hiv_last_2_years_mapping = {1: 'Yes', 2: 'No'}
knows_learners_with_hiv_mapping = {1: 'Yes', 2: 'No'}
hiv_learners_increase_work_load_mapping = {1: 'Disagree', 2: 'Unsure', 3: 'Agree'}
difficult_to_teach_hiv_learners_mapping = {1: 'Disagree', 2: 'Unsure', 3: 'Agree'}
know_family_members_with_hiv_mapping = {1: 'Yes', 2: 'No'}
caring_for_family_members_with_hiv_mapping = {1: 'Yes', 2: 'No'}
cared_for_family_members_who_died_of_hiv_mapping = {1: 'Yes', 2: 'No'}
hiv_family_effect_on_workload_mapping = {1: 'Disagree', 2: 'Unsure', 3: 'Agree'}
alcohol_consumption_frequency_mapping = {1: 'Not in past 12 months', 2: 'Once a month or less', 3: '2-4 times a month', 4: '2-3 times a week', 5: '4 or more times a weekly'}
amphetamine_use_mapping = {1: 'Yes', 2: 'No'}
opiates_use_mapping = {1: 'Yes', 2: 'No'}
# Q12.1e - Events at educational institution in the last 12 months: someone was raped at the institution
events_educ_inst_rape_mapping = {1: 'Yes', 2: 'No'}
# Q12.1g - Events at educational institution in the last 12 months: someone was sexually harassed at the institution
events_educ_inst_sexual_harassment_mapping = {1: 'Yes', 2: 'No'}
# Q13.1 - In the past 12 months, have you had sexual intercourse (penetrative)?
past_12_months_sexual_intercourse_penetrative_mapping = {1: 'Yes', 2: 'No'}
# Q13.3a - In the past 12 months, have you had sexual intercourse with only man or men
past_12_months_sexual_intercourse_with_male_mapping = {1: 'Yes', 2: 'No'}
# Q13.3b - In the past 12 months, have you had sexual intercourse with only woman or women
past_12_months_sexual_intercourse_with_female_mapping = {1: 'Yes', 2: 'No'}

# --- 3. Decode the categorical columns --------------------------------------
categorical_mappings = {
    'RespondentSex': sex_mapping,
    'RespondentRace': race_mapping,
    'RespondentNationality': nationality_mapping,
    'RespondentMaritalStatus': marital_status_mapping,
    'LivingArrangements': living_arrangements_mapping,
    'HasDependentChildren': has_dependent_children_mapping,
    'HasOtherDependants': has_other_dependants_mapping,
    # TODO(review): no rename above produces 'PostTrainingLocation' (q2_2 becomes
    # 'PostTrainingQualification'); confirm which question this mapping belongs to.
    'PostTrainingLocation': post_training_location_mapping,
    'MaritalStatusAtCareerStart': marital_status_at_career_start_mapping,
    'CurrentLivingArea': current_living_area_mapping,
    'HighestEducationalQualification': highest_educational_qualification_mapping,
    'PositionInSchool': position_in_school_mapping,
    'PositionInFET': position_in_fet_mapping,
    'TeachingLevel': teaching_level_mapping,
    'KnowsEducatorsWithHIV': knows_educators_with_hiv_mapping,
    'EducatorsDiedOfHIVLast2Years': educators_died_of_hiv_last_2_years_mapping,
    'KnowsLearnersWithHIV': knows_learners_with_hiv_mapping,
    'HIVLearenersIncreaseWorkLoad': hiv_learners_increase_work_load_mapping,
    'DifficultToTeachHIVLearners': difficult_to_teach_hiv_learners_mapping,
    'KnowFamilyMembersWithHIV': know_family_members_with_hiv_mapping,
    'CaringForFamilyMembersWithHIV': caring_for_family_members_with_hiv_mapping,
    'CaredForFamilyMembersWhoDiedofHIV': cared_for_family_members_who_died_of_hiv_mapping,
    'HIVFamilyEffectOnWorkload': hiv_family_effect_on_workload_mapping,
    'AlcoholConsumptionFrequency': alcohol_consumption_frequency_mapping,
    'AmphetamineUse': amphetamine_use_mapping,
    'OpiatesUse': opiates_use_mapping,
    'EventsEducInstRape': events_educ_inst_rape_mapping,
    'EventsEducInstSexualHarassment': events_educ_inst_sexual_harassment_mapping,
    'Past12monthsSexualIntercoursePenetrative': past_12_months_sexual_intercourse_penetrative_mapping,
    'Past12monthsSexWithMale': past_12_months_sexual_intercourse_with_male_mapping,
    'Past12monthsSexWithFemale': past_12_months_sexual_intercourse_with_female_mapping,
}

missing_columns = []
for column_name, code_to_label in categorical_mappings.items():
    if column_name not in edu_data.columns:
        # Report instead of raising KeyError so the remaining columns still get decoded.
        missing_columns.append(column_name)
        continue
    if not pd.api.types.is_numeric_dtype(edu_data[column_name]):
        # Already decoded (cell re-run): mapping label strings again would yield NaN.
        continue
    edu_data[column_name] = edu_data[column_name].map(code_to_label)

if missing_columns:
    print(f"Skipped label mapping for missing columns: {missing_columns}")

# --- 4. Ensure the count-like columns are numeric ---------------------------
# Q4.4 (years of teaching experience) and Q13.2 (number of sexual partners in
# the past 12 months); values that cannot be parsed become NaN.
for numeric_column in ('TeachingExperienceYears', 'NumberOfSexualPartnersInPast12Months'):
    edu_data[numeric_column] = pd.to_numeric(edu_data[numeric_column], errors='coerce')

#### What is the distribution of the highest educational qualification among teachers in the dataset?

In [None]:
# Count educators per highest-qualification value and draw the counts as a bar chart.
qualification_distribution = edu_data['HighestEducationalQualification'].value_counts()
ax = qualification_distribution.plot.bar()
# Label the chart through the Axes returned by the plot call.
ax.set_title('Distribution of Highest Educational Qualification')
ax.set_xlabel('Qualification')
ax.set_ylabel('Number of Educators')
plt.show()

#### What is the percentage of teachers who know of learners at their institution living with HIV/AIDS?