In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import os
import time
import random
from urllib.parse import urljoin, urlparse, parse_qs

class ACSEEScraper:
    """
    Scraper for NECTA ACSEE result pages.

    Collects school/centre codes from the alphabetical index pages, scrapes
    each result page into a flat dictionary and writes the collected rows to
    CSV files.
    """

    # Fallback timeout (seconds) for NECTA requests; overridden per instance
    # in __init__. Without a timeout a stalled connection blocks forever.
    request_timeout = 30

    def __init__(self, base_url="https://onlinesys.necta.go.tz/results/2023/acsee/",
                 request_timeout=30):
        """
        Initialize the ACSEE scraper with the base URL for results.

        Args:
            base_url: Base URL for ACSEE results (must end with '/')
            request_timeout: Seconds to wait for each NECTA HTTP response
        """
        self.base_url = base_url
        self.academics_url = "https://www.school.co.tz/"  # s{school_no:04d}/academics"
        self.results_url = urljoin(self.base_url, "results/")
        self.index_url = urljoin(self.base_url, "indexfiles/")
        self.request_timeout = request_timeout
        # Rows accumulate here across scrape_all_schools() calls.
        self.schools_data = []

    def get_school_codes_from_indexes(self):
        """
        Extract all school and center codes by scraping the index pages (A-Z).

        Index pages that return 404 or fail to download are skipped.

        Returns:
            List of unique school and center codes (both 's' and 'p'
            prefixed), in the order they were first seen
        """
        all_codes = []

        # Letters for index pages (a through z)
        letters = "abcdefghijklmnopqrstuvwxyz"

        for letter in letters:
            index_page_url = urljoin(self.index_url, f"index_{letter}.htm")
            print(f"Fetching school codes from index page: {index_page_url}")

            try:
                # Add delay to be respectful to the server
                time.sleep(random.uniform(1, 2))

                response = requests.get(index_page_url, timeout=self.request_timeout)

                # Skip to next letter if page not found
                if response.status_code == 404:
                    print(f"Index page for letter '{letter}' not found, skipping...")
                    continue

                response.raise_for_status()
                soup = BeautifulSoup(response.text, 'html.parser')

                # Find all links to school results
                links = soup.find_all('a', href=lambda href: href and ('.htm' in href))

                # Extract the school code from each href
                # Format could be like '..\results\s5732.htm' or similar
                page_codes = []
                for link in links:
                    code_match = re.search(r'[sp]\d+', link.get('href'))
                    if code_match:
                        page_codes.append(code_match.group(0))

                all_codes.extend(page_codes)
                # Report the codes actually extracted, not the number of links.
                print(f"Found {len(page_codes)} codes on index page {letter}.")

            except requests.exceptions.RequestException as e:
                print(f"Error fetching index page {letter}: {e}")
                continue

        # Drop repeated codes (order-preserving) so no page is scraped twice.
        unique_codes = list(dict.fromkeys(all_codes))
        print(f"Total school and center codes found: {len(unique_codes)}")
        return unique_codes

    def get_school_codes(self, start_code=None, end_code=None, from_indexes=True):
        """
        Get a list of school codes to scrape.

        Args:
            start_code: Optional starting school code (e.g. 's0784')
            end_code: Optional ending school code (inclusive)
            from_indexes: Whether to get codes from index pages

        Returns:
            List of school codes
        """
        if from_indexes:
            return self.get_school_codes_from_indexes()

        # Manual code generation as fallback
        if start_code and end_code:
            prefix = start_code[0]  # 's' or 'p'
            first_number = int(start_code[1:])
            last_number = int(end_code[1:])
            # Keep the zero padding of start_code: 's0784' must yield 's0785',
            # not 's785', because result pages are named with padded codes.
            width = len(start_code) - 1
            return [f"{prefix}{number:0{width}d}"
                    for number in range(first_number, last_number + 1)]

        # Default to a small set of example codes
        return ["s5732", "s5733", "s5734", "p5433", "p1093", "p5932"]  # Add more codes as needed

    def scrape_school(self, school_code):
        """
        Scrape data for a specific school or private center.

        Args:
            school_code: The school code (e.g., 's5732' or 'p5433')

        Returns:
            Dictionary containing scraped school/center data. On failure the
            dictionary holds only 'school_code' and 'error'.
        """
        school_url = urljoin(self.results_url, f"{school_code}.htm")
        print(f"Scraping: {school_url}")

        try:
            # Add delay to be respectful to the server
            time.sleep(random.uniform(1, 3))

            response = requests.get(school_url, timeout=self.request_timeout)
            response.raise_for_status()  # Raise exception for HTTP errors

            soup = BeautifulSoup(response.text, 'html.parser')

            # Codes prefixed with 'p' are private centers
            is_private_center = school_code.startswith('p')

            # Extract school/center name: the text following the code in <h3>
            school_name = None
            h3_tag = soup.find('h3')
            if h3_tag:
                school_name_text = h3_tag.get_text(strip=True)
                code_pattern = r'[SP]\d+'
                school_name_match = re.search(f"{code_pattern}\\s+(.*)", school_name_text, re.IGNORECASE)
                if school_name_match:
                    school_name = school_name_match.group(1)

            # Initialize performance data
            performance_data = {
                'school_code': school_code,
                'school_name': school_name,
                'is_private_center': is_private_center,
                'region': None,
                'gpa': None,
                'grade': None,
                'passed_candidates': None
            }

            if is_private_center:
                # For private centers, extract gender-based division data
                self._extract_private_center_data(soup, performance_data)
            else:
                # For regular schools, extract standard performance data
                self._extract_school_data(soup, performance_data, school_code)

            return performance_data

        except requests.exceptions.RequestException as e:
            print(f"Error scraping {school_code}: {e}")
            return {
                'school_code': school_code,
                'error': str(e)
            }
        except Exception as e:
            # Best effort: one unparsable page must not abort the whole run.
            print(f"Unexpected error scraping {school_code}: {e}")
            return {
                'school_code': school_code,
                'error': str(e)
            }

    @staticmethod
    def _text_after_label(soup, label):
        """Return the text of the <p> following the <p> containing `label`, or None."""
        label_tag = soup.find('p', string=lambda t: t and label in t)
        if label_tag is None:
            return None
        value_tag = label_tag.find_next('p')
        if value_tag is None:
            return None
        return value_tag.get_text(strip=True)

    def _extract_school_data(self, soup, performance_data, school_code):
        """
        Extract data specific to regular schools.

        Args:
            soup: BeautifulSoup object of the school page
            performance_data: Dictionary to populate with school data
            school_code: School code, used to look up staff/student counts
        """
        # Extract school region
        region_text = self._text_after_label(soup, 'EXAMINATION CENTRE REGION')
        if region_text is not None:
            performance_data['region'] = region_text

        # Find GPA and grade
        gpa_text = self._text_after_label(soup, 'EXAMINATION CENTRE GPA')
        if gpa_text is not None:
            gpa_match = re.search(r'(\d+\.\d+)', gpa_text)
            grade_match = re.search(r'GRADE\s+([A-F])\s+\((.*?)\)', gpa_text)

            if gpa_match:
                performance_data['gpa'] = float(gpa_match.group(1))
            if grade_match:
                performance_data['grade'] = grade_match.group(1)
                performance_data['grade_description'] = grade_match.group(2)

        # Find passed candidates
        passed_text = self._text_after_label(soup, 'TOTAL PASSED CANDIDATES')
        if passed_text is not None:
            try:
                performance_data['passed_candidates'] = int(passed_text)
            except ValueError:
                pass  # Leave as None when the text is not a valid integer

        # Extract division statistics
        division_headers = ['REGIST', 'ABSENT', 'SAT', 'WITHHELD', 'NO-CA', 'CLEAN',
                            'DIV I', 'DIV II', 'DIV III', 'DIV IV', 'DIV 0']

        # Find the division table: the first table with a 'REGIST' header
        div_summary_table = None
        for table in soup.find_all('table'):
            if table.find('p', string=lambda t: t and 'REGIST' in t):
                div_summary_table = table
                break

        if div_summary_table:
            data_cells = div_summary_table.find_all('p', align='CENTER')

            # Skip header cells and process data cells
            data_values = []
            for cell in data_cells:
                cell_text = cell.get_text(strip=True)
                if cell_text and cell_text not in division_headers:
                    try:
                        data_values.append(int(cell_text))
                    except ValueError:
                        data_values.append(None)

            # Map values to headers only when the counts line up, so a
            # differently shaped table cannot shift values between columns.
            if len(data_values) == len(division_headers):
                for header, value in zip(division_headers, data_values):
                    performance_data[header.lower().replace(' ', '_')] = value

            # NOTE(review): counts are only looked up when a division table
            # was found (as in the original code) - confirm this is intended.
            self._extract_staff_counts(performance_data, school_code)

    def _extract_staff_counts(self, performance_data, school_code):
        """
        Add student/teacher counts from the school's academics page.

        Best effort: any failure is printed and the count keys are left unset.

        Args:
            performance_data: Dictionary to populate with the counts
            school_code: School code used to build the academics URL
        """
        try:
            response = requests.get(urljoin(self.academics_url, f"{school_code}/academics"), timeout=10)
            response.raise_for_status()
            page = BeautifulSoup(response.content, 'html.parser')

            counts = {}
            counts_container = page.find('div', class_='count-staff-students')
            if counts_container:
                for div in counts_container.find_all('div', recursive=False):
                    text_content = div.get_text(strip=True)
                    if ':' not in text_content:
                        continue
                    # Label is the text before the colon, minus 'Number of'
                    label = text_content.split(':', 1)[0].replace('Number of', '').strip()
                    # The value is taken from the <span> inside the div
                    span_value = div.find('span')
                    if span_value:
                        counts[label.upper()] = span_value.get_text(strip=True)

            performance_data["STUDENTS"] = counts.get("STUDENTS", "Not Available")
            performance_data["TEACHERS"] = counts.get("TEACHERS", "Not Available")
            performance_data["STUDENT-TEACHER RATIO"] = counts.get("STUDENT-TEACHER RATIO", "Not Available")

        except Exception as e:
            print("Error scraping counts:", e)

    def _extract_private_center_data(self, soup, performance_data):
        """
        Extract data specific to private centers.

        Args:
            soup: BeautifulSoup object of the center page
            performance_data: Dictionary to populate with center data
        """
        # Find the table with gender-specific division data via its "SEX" header
        gender_division_table = None
        for table in soup.find_all('table'):
            if table.find('td', string=lambda t: t and 'SEX' in t):
                gender_division_table = table
                break

        if not gender_division_table:
            return

        divisions = ['i', 'ii', 'iii', 'iv', '0']

        # Initialize every gender/division column so all centers share a schema
        for gender in ['f', 'm', 't']:  # Female, Male, Total
            for div_num in divisions:
                performance_data[f'div_{div_num}_{gender}'] = None

        for row in gender_division_table.find_all('tr'):
            cells = row.find_all('td')
            if len(cells) < 6:  # Should have SEX, I, II, III, IV, 0 columns
                continue

            gender_cell = cells[0].get_text(strip=True)
            if gender_cell not in ('F', 'M', 'T'):
                continue
            gender = gender_cell.lower()

            try:
                counts = [int(cell.get_text(strip=True)) for cell in cells[1:6]]
            except ValueError:
                # A non-numeric cell leaves the whole row as None
                continue

            for div_num, count in zip(divisions, counts):
                performance_data[f'div_{div_num}_{gender}'] = count

            # Calculate total candidates by gender
            performance_data[f'total_{gender}'] = sum(counts)

    @staticmethod
    def _progress_path(output_file):
        """Return `output_file` with 'temp_' prefixed to the file name, not the directory."""
        directory, filename = os.path.split(output_file)
        return os.path.join(directory, f"temp_{filename}")

    @staticmethod
    def _suffixed_path(output_file, suffix):
        """Return `output_file` with `suffix` inserted before its extension."""
        root, extension = os.path.splitext(output_file)
        return f"{root}{suffix}{extension}"

    def scrape_all_schools(self, start_code=None, end_code=None, output_file="acsee_results_2024.csv",
                           from_indexes=True, max_schools=None, school_types="both"):
        """
        Scrape data for all schools and save to CSV.

        Progress is written to a 'temp_'-prefixed file next to `output_file`
        every 10 codes. The directory of `output_file` is created if missing.

        Args:
            start_code: Optional starting school code
            end_code: Optional ending school code
            output_file: Filename for the output CSV
            from_indexes: Whether to get school codes from index pages
            max_schools: Maximum number of schools to scrape (None for all)
            school_types: Which types to scrape - "schools" (s), "centers" (p), or "both"

        Returns:
            DataFrame with all scraped data
        """
        school_codes = self.get_school_codes(start_code, end_code, from_indexes)

        # Filter codes based on school_types parameter
        if school_types == "schools":
            school_codes = [code for code in school_codes if code.startswith('s')]
        elif school_types == "centers":
            school_codes = [code for code in school_codes if code.startswith('p')]
        # For "both", use all codes

        # Limit the number of schools if specified
        if max_schools:
            school_codes = school_codes[:max_schools]

        total_codes = len(school_codes)
        print(f"Starting to scrape {total_codes} schools/centers")

        # Create the target directory up front so a long scrape cannot fail
        # on its first save.
        output_dir = os.path.dirname(output_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        progress_file = self._progress_path(output_file)

        for position, code in enumerate(school_codes, start=1):
            school_data = self.scrape_school(code)
            if school_data:
                self.schools_data.append(school_data)

            # Save progress periodically and show progress
            if position % 10 == 0 or position == total_codes:
                progress_pct = position / total_codes * 100
                print(f"Progress: {position}/{total_codes} ({progress_pct:.1f}%) schools/centers scraped")

                pd.DataFrame(self.schools_data).to_csv(progress_file, index=False)

        if not self.schools_data:
            print("No data collected.")
            return pd.DataFrame()

        # Create final DataFrame and save to CSV
        df = pd.DataFrame(self.schools_data)

        # Save separate files for schools and centers if both were scraped.
        # The column is absent when every scrape failed (error-only rows).
        if school_types == "both" and 'is_private_center' in df.columns:
            schools_df = df[df['is_private_center'].eq(False)]
            centers_df = df[df['is_private_center'].eq(True)]

            if len(schools_df) > 0:
                schools_filename = self._suffixed_path(output_file, '_schools')
                schools_df.to_csv(schools_filename, index=False)
                print(f"School data saved to {schools_filename}")

            if len(centers_df) > 0:
                centers_filename = self._suffixed_path(output_file, '_centers')
                centers_df.to_csv(centers_filename, index=False)
                print(f"Center data saved to {centers_filename}")

        # Save combined data
        df.to_csv(output_file, index=False)
        print(f"All data successfully saved to {output_file}")
        return df

# Entry point: runs with notebook-configured values under Jupyter, otherwise
# with command line arguments.
if __name__ == "__main__":
    import sys

    def _analyze_or_scrape(scraper, input_file, analyze, **scrape_options):
        """
        Analyze an existing CSV when requested, otherwise scrape.

        Args:
            scraper: Scraper instance to use
            input_file: CSV to analyze, or None to scrape
            analyze: Whether analysis was requested
            **scrape_options: Passed through to scraper.scrape_all_schools()

        Returns:
            The loaded (analysis) or scraped DataFrame

        Raises:
            NotImplementedError: analysis was requested but the scraper has
                no analyze_results() method
        """
        if input_file and analyze:
            # Look the method up defensively so a request for analysis fails
            # with a clear message when the scraper class does not provide it.
            analyze_results = getattr(scraper, "analyze_results", None)
            if analyze_results is None:
                raise NotImplementedError(
                    f"{type(scraper).__name__} does not provide analyze_results(); "
                    "run without analysis to scrape instead."
                )

            print(f"Loading data from {input_file} for analysis...")
            data = pd.read_csv(input_file)
            analysis_results = analyze_results(data)

            # Save analysis results to JSON
            analysis_file = input_file.replace('.csv', '_analysis.json')
            pd.DataFrame([analysis_results]).to_json(analysis_file, orient='records')
            print(f"Analysis results saved to {analysis_file}")
            return data

        # Otherwise, scrape data
        return scraper.scrape_all_schools(**scrape_options)

    # Check if running in Jupyter notebook/IPython
    is_jupyter = 'ipykernel' in sys.modules

    # For Jupyter notebook usage
    if is_jupyter:
        # You can modify these values directly when running in a notebook
        from_indexes = True         # Get school codes from index pages
        start_code = None           # Optional starting school code (e.g., 's5000')
        end_code = None             # Optional ending school code (e.g., 's5100')
        output_file = 'data/acsee_results_2023.csv'  # Output CSV filename
        max_schools = None          # Maximum number of schools to scrape (None for all)
        school_types = 'both'       # Type of institutions to scrape: 'schools', 'centers', or 'both'
        input_file = None           # Input CSV file for analysis (instead of scraping)
        analyze = False             # Analyze input_file instead of scraping

        # Create scraper instance
        scraper = ACSEEScraper()

        df = _analyze_or_scrape(
            scraper,
            input_file,
            analyze,
            start_code=start_code,
            end_code=end_code,
            output_file=output_file,
            from_indexes=from_indexes,
            max_schools=max_schools,
            school_types=school_types
        )

    # For command line usage
    else:
        import argparse

        parser = argparse.ArgumentParser(description='Scrape ACSEE 2023 examination results.')
        parser.add_argument('--from-indexes', action='store_true', help='Get school codes from index pages')
        parser.add_argument('--start-code', type=str, help='Starting school code (e.g., s5000)')
        parser.add_argument('--end-code', type=str, help='Ending school code (e.g., s5100)')
        parser.add_argument('--output', type=str, default='acsee_results_2023.csv', help='Output CSV filename')
        parser.add_argument('--max-schools', type=int, help='Maximum number of schools to scrape')
        parser.add_argument('--school-types', type=str, choices=['schools', 'centers', 'both'],
                            default='both', help='Type of institutions to scrape: schools, centers or both')
        parser.add_argument('--analyze', action='store_true', help='Perform analysis on scraped data')
        parser.add_argument('--input', type=str, help='Input CSV file for analysis (instead of scraping)')

        args = parser.parse_args()

        # The class defined in this file is ACSEEScraper.
        scraper = ACSEEScraper()

        df = _analyze_or_scrape(
            scraper,
            args.input,
            args.analyze,
            start_code=args.start_code,
            end_code=args.end_code,
            output_file=args.output,
            from_indexes=args.from_indexes,
            max_schools=args.max_schools,
            school_types=args.school_types
        )

Fetching school codes from index page: https://onlinesys.necta.go.tz/results/2023/acsee/indexfiles/index_a.htm
Found 83 codes on index page a.
Fetching school codes from index page: https://onlinesys.necta.go.tz/results/2023/acsee/indexfiles/index_b.htm
Found 98 codes on index page b.
Fetching school codes from index page: https://onlinesys.necta.go.tz/results/2023/acsee/indexfiles/index_c.htm
Found 51 codes on index page c.
Fetching school codes from index page: https://onlinesys.necta.go.tz/results/2023/acsee/indexfiles/index_d.htm
Found 50 codes on index page d.
Fetching school codes from index page: https://onlinesys.necta.go.tz/results/2023/acsee/indexfiles/index_e.htm
Found 45 codes on index page e.
Fetching school codes from index page: https://onlinesys.necta.go.tz/results/2023/acsee/indexfiles/index_f.htm
Found 37 codes on index page f.
Fetching school codes from index page: https://onlinesys.necta.go.tz/results/2023/acsee/indexfiles/index_g.htm
Found 52 codes on index page g.

In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import os
import time
import random
from urllib.parse import urljoin, urlparse, parse_qs

# Load the scraped results for cleaning.
# NOTE(review): this reads the 2024-named file, while the scraper cell writes
# 'data/acsee_results_2023.csv' and the cleaned export is named 2023 - confirm
# this is the intended input file.
df = pd.read_csv('data/acsee_results_2024.csv')
df

Unnamed: 0,school_code,school_name,is_private_center,region,gpa,grade,passed_candidates,regist,absent,sat,...,no-ca,clean,div_i,div_ii,div_iii,div_iv,div_0,STUDENTS,TEACHERS,STUDENT-TEACHER RATIO
0,s3470,ABBEY SECONDARY SCHOOL,False,MTWARA,2.8788,,11.0,11.0,0.0,11.0,...,0.0,11.0,1.0,9.0,1.0,0.0,0.0,460,40,12:1
1,s0784,AIRWING SECONDARY SCHOOL,False,DAR ES SALAAM,2.8927,,87.0,97.0,0.0,97.0,...,0.0,87.0,16.0,49.0,22.0,0.0,0.0,1250,80,16:1
2,s1682,AL-RIYAMI ACADEMY SECONDARY SCHOOL,False,MJINI MAGHARIBI,3.5889,,15.0,15.0,0.0,15.0,...,0.0,15.0,0.0,5.0,10.0,0.0,0.0,,,
3,s5394,ABDULRAHIM BUSOKA SECONDARY SCHOOL,False,SHINYANGA,2.2356,,133.0,134.0,1.0,133.0,...,0.0,133.0,81.0,47.0,5.0,0.0,0.0,,,
4,p1093,ALDERGATE SECONDARY SCHOOL CENTRE,True,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1211,s1251,ZIBA SECONDARY SCHOOL,False,TABORA,1.6182,,196.0,198.0,2.0,196.0,...,0.0,196.0,196.0,0.0,0.0,0.0,0.0,570,22,26:1
1212,s0222,ZANAKI SECONDARY SCHOOL,False,DAR ES SALAAM,2.8966,,179.0,183.0,4.0,179.0,...,0.0,179.0,30.0,115.0,34.0,0.0,0.0,,,
1213,s5682,ZANZIBAR UNIVERSITY HIGH SCHOOL,False,MJINI MAGHARIBI,2.9898,,49.0,51.0,2.0,49.0,...,0.0,49.0,9.0,23.0,17.0,0.0,0.0,,,
1214,s3464,ZOGOWALE SECONDARY SCHOOL,False,PWANI,2.8214,,182.0,182.0,0.0,182.0,...,0.0,182.0,55.0,81.0,45.0,1.0,0.0,,,


In [4]:
# Drop the columns that are not needed for the cleaned export. Reassign
# instead of using inplace=True so the frame object displayed by the load
# cell is not modified underneath its rendered output.
df = df.drop(columns=[
    'grade', 'is_private_center', 'passed_candidates', 'withheld', 'no-ca',
    'region', 'STUDENTS', 'TEACHERS',
    'div_i', 'div_ii', 'div_iii', 'div_iv', 'div_0',
    'clean', 'sat', 'absent',
])

In [5]:
# Remove every row that still has a missing value in any remaining column.
# Reassigned rather than mutated in place so earlier references to the frame
# are not changed silently.
df = df.dropna(axis=0)

In [6]:
# Preview only: the re-indexed frame is displayed but not assigned, so `df`
# itself is unchanged by this cell.
df.reset_index()

Unnamed: 0,index,school_code,school_name,gpa,regist,STUDENT-TEACHER RATIO
0,0,s3470,ABBEY SECONDARY SCHOOL,2.8788,11.0,12:1
1,1,s0784,AIRWING SECONDARY SCHOOL,2.8927,97.0,16:1
2,13,s3914,ALFAGEMS SECONDARY SCHOOL,3.0131,299.0,35:1
3,15,s3532,ACACIA SECONDARY SCHOOL,2.3796,37.0,13:1
4,17,s1343,ANNE MARIE SECONDARY SCHOOL,2.6581,155.0,Not Available
...,...,...,...,...,...,...
189,1197,s5000,WAMA-NAKAYAMA SECONDARY SCHOOL,2.4016,61.0,11:1
190,1202,s4087,WIZA SECONDARY SCHOOL,2.7745,17.0,10:1
191,1204,s1474,WHITE LAKE SECONDARY SCHOOL,2.0797,23.0,16:1
192,1207,s2335,YOHANNES SECONDARY SCHOOL,2.1131,28.0,20:1


In [7]:
# Keep only the rows whose ratio is not the 'Not Available' placeholder string.
df = df.loc[df['STUDENT-TEACHER RATIO'] != 'Not Available']

In [8]:
# Renumber the rows 0..n-1; the previous row labels are kept in a new 'index'
# column. Reassigned instead of inplace=True because `df` is a filtered
# selection here, and in-place changes on a selection can trigger pandas'
# SettingWithCopyWarning.
df = df.reset_index()

In [9]:
# Give the remaining columns export-friendly names.
# NOTE(review): 'enlorment' looks like a typo for 'enrollment'; it is kept
# because the export cell selects the column by this exact name - rename both
# together if the CSV schema may change.
df = df.rename(columns={'regist':'enlorment','STUDENT-TEACHER RATIO':'student_teacher_ratio'})
df

Unnamed: 0,index,school_code,school_name,gpa,enlorment,student_teacher_ratio
0,0,s3470,ABBEY SECONDARY SCHOOL,2.8788,11.0,12:1
1,1,s0784,AIRWING SECONDARY SCHOOL,2.8927,97.0,16:1
2,13,s3914,ALFAGEMS SECONDARY SCHOOL,3.0131,299.0,35:1
3,15,s3532,ACACIA SECONDARY SCHOOL,2.3796,37.0,13:1
4,19,s0182,AL-FAROUQ SEMINARY,3.0513,13.0,9:1
...,...,...,...,...,...,...
183,1197,s5000,WAMA-NAKAYAMA SECONDARY SCHOOL,2.4016,61.0,11:1
184,1202,s4087,WIZA SECONDARY SCHOOL,2.7745,17.0,10:1
185,1204,s1474,WHITE LAKE SECONDARY SCHOOL,2.0797,23.0,16:1
186,1207,s2335,YOHANNES SECONDARY SCHOOL,2.1131,28.0,20:1


In [10]:
# Inspect the column names available for the export.
df.columns

Index(['index', 'school_code', 'school_name', 'gpa', 'enlorment',
       'student_teacher_ratio'],
      dtype='object')

In [12]:
# Write the cleaned table; only the listed columns are exported (the helper
# 'index' column is left out) and the row index is not written.
df.to_csv(
    'data/acsee_clean_data_2023.csv',
    index=False,
    columns=['school_code', 'school_name', 'enlorment', 'student_teacher_ratio', 'gpa'],
)