In [3]:
import mysql.connector
import os
import pandas as pd

# Adding audit_report csv to the database

We have added the auditor report to the database and integrated it with the `visits` table. 
President Naledi appointed this auditor to investigate anomalous records identified in the `water_quality` table. This auditor re-visited a randomly selected subset of 1620 records, as provided by Chidi Kunto, with the aim of re-recording the quality_scores. The auditor also spoke to citizens close to the sources in order to understand why there were discrepancies.


Joining the audit report with visits table

In [9]:

# Establish a connection with the specified auth_plugin
cnx = mysql.connector.connect(
    user='root', 
    password='', 
    host='127.0.0.1', 
    database='md_water_services',
    auth_plugin='mysql_native_password'
)
mycursor = cnx.cursor()

# The query retrieves records by joining the auditor_report and visits tables on location_id, 
# returning the location_id (as both audit_location and visit_location), the true_water_source_score from auditor_report,
# and the record_id from visits.
query = """
    SELECT
        auditor_report.location_id AS audit_location,
        auditor_report.true_water_source_score,
        visits.location_id AS visit_location,
        visits.record_id
    FROM
        auditor_report
    JOIN
        visits ON auditor_report.location_id = visits.location_id;
"""

# Execute the query
mycursor.execute(query)

# Fetch the results
results = mycursor.fetchall()

# Fetch column headers
column_headers = [desc[0] for desc in mycursor.description]

# Create a pandas DataFrame
df = pd.DataFrame(results, columns=column_headers)

# Print the DataFrame
print(df)

# Close the cursor and connection
mycursor.close()
cnx.close()


     audit_location  true_water_source_score visit_location  record_id
0         SoRu34980                        1      SoRu34980       5185
1         AkRu08112                        3      AkRu08112      59367
2         AkLu02044                        0      AkLu02044      37379
3         AkHa00421                        3      AkHa00421      51627
4         SoRu35221                        0      SoRu35221      28758
...             ...                      ...            ...        ...
2693      SoRu36378                        9      SoRu36378      19733
2694      KiRu27180                        6      KiRu27180      42498
2695      HaRu16981                        9      HaRu16981       2716
2696      HaDe16326                        1      HaDe16326      41884
2697      HaRu20394                        9      HaRu20394      24662

[2698 rows x 4 columns]


Now that we have the record_id for each location, our next step is to retrieve the corresponding scores from the water_quality table. We
are particularly interested in the subjective_quality_score. To do this, we'll JOIN the visits table and the water_quality table, using the
record_id as the connecting key.

In [11]:
# Establish a connection with the specified auth_plugin
cnx = mysql.connector.connect(
    user='root', 
    password='', 
    host='127.0.0.1', 
    database='md_water_services',
    auth_plugin='mysql_native_password'
)
mycursor = cnx.cursor()

# Define the query
query = """
    SELECT
        auditor_report.location_id AS audit_location,
        auditor_report.true_water_source_score,
        visits.location_id AS visit_location,
        visits.record_id,
        water_quality.subjective_quality_score
    FROM
        auditor_report
    JOIN
        visits ON auditor_report.location_id = visits.location_id
    JOIN
        water_quality ON visits.record_id = water_quality.record_id;
"""

# Execute the query
mycursor.execute(query)

# Fetch the results
results = mycursor.fetchall()

# Fetch column headers
column_headers = [desc[0] for desc in mycursor.description]

# Create a pandas DataFrame
df = pd.DataFrame(results, columns=column_headers)

# Print the DataFrame
print(df)

# Close the cursor and connection
mycursor.close()
cnx.close()


     audit_location  true_water_source_score visit_location  record_id  \
0         SoRu34980                        1      SoRu34980       5185   
1         AkRu08112                        3      AkRu08112      59367   
2         AkLu02044                        0      AkLu02044      37379   
3         AkHa00421                        3      AkHa00421      51627   
4         SoRu35221                        0      SoRu35221      28758   
...             ...                      ...            ...        ...   
2693      SoRu36378                        9      SoRu36378      19733   
2694      KiRu27180                        6      KiRu27180      42498   
2695      HaRu16981                        9      HaRu16981       2716   
2696      HaDe16326                        1      HaDe16326      41884   
2697      HaRu20394                        9      HaRu20394      24662   

      subjective_quality_score  
0                            1  
1                            3  
2           

It doesn't matter if  columns are in a different format, because we are about to clean this up a bit. Since it is a duplicate, we can drop one of
the location_id columns. Let's leave record_id and rename the scores to surveyor_score and auditor_score to make it clear which scores
we're looking at in the results set.

In [14]:
# Establish a connection with the specified auth_plugin
cnx = mysql.connector.connect(
    user='root', 
    password='', 
    host='127.0.0.1', 
    database='md_water_services',
    auth_plugin='mysql_native_password'
)
mycursor = cnx.cursor()
# Define the query
query = """
    SELECT
        auditor_report.location_id,
        visits.record_id,
        auditor_report.true_water_source_score AS auditor_score,
        water_quality.subjective_quality_score AS surveyor_score
    FROM
        auditor_report
    JOIN
        visits ON auditor_report.location_id = visits.location_id
    JOIN
        water_quality ON visits.record_id = water_quality.record_id;
"""
# Execute the query
mycursor.execute(query)

# Fetch the results
results = mycursor.fetchall()

# Fetch column headers
column_headers = [desc[0] for desc in mycursor.description]

# Create a pandas DataFrame
df = pd.DataFrame(results, columns=column_headers)

# Print the DataFrame
print(df)

# Close the cursor and connection
mycursor.close()
cnx.close()


     location_id  record_id  auditor_score  surveyor_score
0      SoRu34980       5185              1               1
1      AkRu08112      59367              3               3
2      AkLu02044      37379              0               0
3      AkHa00421      51627              3               3
4      SoRu35221      28758              0               0
...          ...        ...            ...             ...
2693   SoRu36378      19733              9               9
2694   KiRu27180      42498              6               6
2695   HaRu16981       2716              9               9
2696   HaDe16326      41884              1               1
2697   HaRu20394      24662              9               9

[2698 rows x 4 columns]


A good starting point is to check if the auditor's and exployees' scores agree. There are many ways to do it. We can have a
WHERE clause and check if surveyor_score = auditor_score

In [15]:
# Establish a connection with the specified auth_plugin
cnx = mysql.connector.connect(
    user='root', 
    password='', 
    host='127.0.0.1', 
    database='md_water_services',
    auth_plugin='mysql_native_password'
)
mycursor = cnx.cursor()

# Define the query to check if the scores agree
query = """
    SELECT
        visits.record_id,
        auditor_report.location_id AS audit_location,
        auditor_report.true_water_source_score AS auditor_score,
        water_quality.subjective_quality_score AS surveyor_score
    FROM
        auditor_report
    JOIN
        visits ON auditor_report.location_id = visits.location_id
    JOIN
        water_quality ON visits.record_id = water_quality.record_id
    WHERE
        auditor_report.true_water_source_score = water_quality.subjective_quality_score;
"""

# Execute the query
mycursor.execute(query)

# Fetch the results
results = mycursor.fetchall()

# Fetch column headers
column_headers = [desc[0] for desc in mycursor.description]

# Create a pandas DataFrame
df = pd.DataFrame(results, columns=column_headers)

# Print the DataFrame
print(df)

# Close the cursor and connection
mycursor.close()
cnx.close()



      record_id audit_location  auditor_score  surveyor_score
0          5185      SoRu34980              1               1
1         59367      AkRu08112              3               3
2         37379      AkLu02044              0               0
3         51627      AkHa00421              3               3
4         28758      SoRu35221              0               0
...         ...            ...            ...             ...
2500      19733      SoRu36378              9               9
2501      42498      KiRu27180              6               6
2502       2716      HaRu16981              9               9
2503      41884      HaDe16326              1               1
2504      24662      HaRu20394              9               9

[2505 rows x 4 columns]


We got 2505 rows, Some of the locations were visited multiple times, so these records are duplicated here. To fix it, we set visits.visit_count
= 1 in the WHERE clause. We make  sure we reference the alias we used for visits in the join.

In [16]:
# Establish a connection with the specified auth_plugin
cnx = mysql.connector.connect(
    user='root', 
    password='', 
    host='127.0.0.1', 
    database='md_water_services',
    auth_plugin='mysql_native_password'
)
mycursor = cnx.cursor()

# Define the query to check if the scores agree
query = """
    SELECT
        visits.record_id,
        auditor_report.location_id AS audit_location,
        auditor_report.true_water_source_score AS auditor_score,
        water_quality.subjective_quality_score AS surveyor_score
    FROM
        auditor_report
    JOIN
        visits ON auditor_report.location_id = visits.location_id
    JOIN
        water_quality ON visits.record_id = water_quality.record_id
    WHERE
        auditor_report.true_water_source_score = water_quality.subjective_quality_score
        and visits.visit_count=1;
"""

# Execute the query
mycursor.execute(query)

# Fetch the results
results = mycursor.fetchall()

# Fetch column headers
column_headers = [desc[0] for desc in mycursor.description]

# Create a pandas DataFrame
df = pd.DataFrame(results, columns=column_headers)

# Print the DataFrame
print(df)

# Close the cursor and connection
mycursor.close()
cnx.close()



      record_id audit_location  auditor_score  surveyor_score
0          5185      SoRu34980              1               1
1         59367      AkRu08112              3               3
2         37379      AkLu02044              0               0
3         51627      AkHa00421              3               3
4         28758      SoRu35221              0               0
...         ...            ...            ...             ...
1513      19733      SoRu36378              9               9
1514      42498      KiRu27180              6               6
1515       2716      HaRu16981              9               9
1516      41884      HaDe16326              1               1
1517      24662      HaRu20394              9               9

[1518 rows x 4 columns]


With the duplicates removed I now get 1518. What does this mean considering the auditor visited 1620 sites?
I think that is an excellent result. 1518/1620 = 94% of the records the auditor checked were correct

But that means that 102 records are incorrect. So let's look at those.

In [20]:


# Establish a connection with the specified auth_plugin
cnx = mysql.connector.connect(
    user='root', 
    password='', 
    host='127.0.0.1', 
    database='md_water_services',
    auth_plugin='mysql_native_password'
)
mycursor = cnx.cursor()

# Define the query to check mismatched scores, considering only the first visit
query_mismatch = """
    SELECT
        visits.record_id,
        auditor_report.location_id AS audit_location,
        auditor_report.true_water_source_score AS auditor_score,
        water_quality.subjective_quality_score AS surveyor_score
    FROM
        auditor_report
    JOIN
        visits ON auditor_report.location_id = visits.location_id
    JOIN
        water_quality ON visits.record_id = water_quality.record_id
    WHERE
        auditor_report.true_water_source_score != water_quality.subjective_quality_score
        AND visits.visit_count = 1;
"""

# Execute the query for mismatches
mycursor.execute(query_mismatch)

# Fetch the results
results_mismatch = mycursor.fetchall()

# Get column names
column_names_mismatch = [desc[0] for desc in mycursor.description]

# Create a DataFrame
df_mismatch = pd.DataFrame(results_mismatch, columns=column_names_mismatch)

# Print the DataFrame
print("Records where surveyor's score does not match auditor's score (first visit only):")
print(df_mismatch)

# Check the count of mismatched records
print("\nNumber of mismatched records:", len(df_mismatch))

# Close the cursor and connection
mycursor.close()
cnx.close()


Records where surveyor's score does not match auditor's score (first visit only):
     record_id audit_location  auditor_score  surveyor_score
0        21160      AkRu05215              3              10
1         7938      KiRu29290              3              10
2        43140      KiHa22748              9              10
3        18495      SoRu37841              6              10
4        33931      KiRu27884              1              10
..         ...            ...            ...             ...
97       47831      AmPw12480              5              10
98       47055      AmRu14842              2              10
99       31888      AkRu03358              1              10
100      57735      AmRu13433              9              10
101      15929      AmAm09956              1              10

[102 rows x 4 columns]

Number of mismatched records: 102


Since we used some of this data in our previous analyses, we need to make sure those results are still valid, now we know some of them are
incorrect. We didn't use the scores that much, but we relied a lot on the type_of_water_source, so let's check if there are any errors there.

So, to do this, we need to grab the type_of_water_source column from the water_source table and call it survey_source, using the
source_id column to JOIN. Also select the type_of_water_source from the auditor_report table, and call it auditor_source.

In [35]:


# Establish a connection with the specified auth_plugin
cnx = mysql.connector.connect(
    user='root', 
    password='', 
    host='127.0.0.1', 
    database='md_water_services',
    auth_plugin='mysql_native_password'
)
mycursor = cnx.cursor()

# Define the query to check discrepancies in type_of_water_source
query_type_discrepancies = """
    SELECT
    visits.record_id,
    auditor_report.location_id AS audit_location,
    auditor_report.type_of_water_source AS auditor_source,
    water_source.type_of_water_source AS survey_source,
    auditor_report.true_water_source_score AS auditor_score,
    water_quality.subjective_quality_score AS surveyor_score
FROM
    auditor_report
JOIN
    visits ON auditor_report.location_id = visits.location_id
JOIN
    water_source ON visits.source_id = water_source.source_id
JOIN
    water_quality ON visits.record_id = water_quality.record_id
WHERE
     auditor_report.true_water_source_score != water_quality.subjective_quality_score AND
     visits.visit_count = 1;

"""

# Execute the query for type discrepancies
mycursor.execute(query_type_discrepancies)

# Fetch the results
results_type_discrepancies = mycursor.fetchall()

# Get column names
column_names_type_discrepancies = [desc[0] for desc in mycursor.description]

# Create a DataFrame
df_type_discrepancies = pd.DataFrame(results_type_discrepancies, columns=column_names_type_discrepancies)

# Print the DataFrame
print("Records where auditor's type of water source does not match survey's type of water source:")
print(df_type_discrepancies)

# Check the count of discrepancies
print("\nNumber of discrepancies in type of water source:", len(df_type_discrepancies))

# Close the cursor and connection
mycursor.close()
cnx.close()


Records where auditor's type of water source does not match survey's type of water source:
     record_id audit_location      auditor_source       survey_source  \
0        21160      AkRu05215                well                well   
1         7938      KiRu29290          shared_tap          shared_tap   
2        43140      KiHa22748  tap_in_home_broken  tap_in_home_broken   
3        18495      SoRu37841          shared_tap          shared_tap   
4        33931      KiRu27884                well                well   
..         ...            ...                 ...                 ...   
97       47831      AmPw12480          shared_tap          shared_tap   
98       47055      AmRu14842                well                well   
99       31888      AkRu03358                well                well   
100      57735      AmRu13433  tap_in_home_broken  tap_in_home_broken   
101      15929      AmAm09956                well                well   

     auditor_score  surveyor_sco

So what I can see is that the types of sources look the same! So even though the scores are wrong, the integrity of the type_of_water_source
data we analysed last time is not affected.

let's look at where these errors may have come from. At some of the locations, employees assigned scores incorrectly, and those records
ended up in this results set.

I think there are two reasons this can happen.
1. These workers are all humans and make mistakes so this is expected.
2. Unfortunately, the alternative is that someone assigned scores incorrectly on purpose!


In either case, the employees are the source of the errors, so let's JOIN the assigned_employee_id for all the people on our list from the visits
table to our query. Remember, our query shows the shows the 102 incorrect records, so when we join the employee data, we can see which
employees made these incorrect records.

In [36]:


# Establish a connection with the specified auth_plugin
cnx = mysql.connector.connect(
    user='root', 
    password='', 
    host='127.0.0.1', 
    database='md_water_services',
    auth_plugin='mysql_native_password'
)

# Define the query
query = """
    SELECT
        visits.location_id,
        visits.record_id,
        visits.assigned_employee_id,
        auditor_report.true_water_source_score AS auditor_score,
        water_quality.subjective_quality_score AS surveyor_score
    FROM
        auditor_report
    JOIN
        visits ON auditor_report.location_id = visits.location_id
    JOIN
        water_quality ON visits.record_id = water_quality.record_id
    JOIN
        employee ON visits.assigned_employee_id = employee.assigned_employee_id
    WHERE
        auditor_report.true_water_source_score != water_quality.subjective_quality_score
        AND visits.visit_count = 1;
"""

# Execute the query and load into a pandas DataFrame
df = pd.read_sql(query, con=cnx)

# Print the DataFrame
print(df)

# Close the connection
cnx.close()


    location_id  record_id  assigned_employee_id  auditor_score  \
0     AkRu05215      21160                    34              3   
1     KiRu29290       7938                     1              3   
2     KiHa22748      43140                     1              9   
3     SoRu37841      18495                    34              6   
4     KiRu27884      33931                     1              1   
..          ...        ...                   ...            ...   
97    AmPw12480      47831                     5              5   
98    AmRu14842      47055                     1              2   
99    AkRu03358      31888                     3              1   
100   AmRu13433      57735                     3              9   
101   AmAm09956      15929                     3              1   

     surveyor_score  
0                10  
1                10  
2                10  
3                10  
4                10  
..              ...  
97               10  
98               10

  df = pd.read_sql(query, con=cnx)


So now we can link the incorrect records to the employees who recorded them. The ID's don't help us to identify them. We have employees' names
stored along with their IDs, so let's fetch their names from the employees table instead of the ID's.

In [39]:


# Establish a connection with the specified auth_plugin
cnx = mysql.connector.connect(
    user='root', 
    password='', 
    host='127.0.0.1', 
    database='md_water_services',
    auth_plugin='mysql_native_password'
)

# Define the query
query = """
    SELECT
        visits.location_id,
        visits.record_id,
        employee.employee_name,
        auditor_report.true_water_source_score AS auditor_score,
        water_quality.subjective_quality_score AS surveyor_score
    FROM
        auditor_report
    JOIN
        visits ON auditor_report.location_id = visits.location_id
    JOIN
        water_quality ON visits.record_id = water_quality.record_id
    JOIN
        employee ON visits.assigned_employee_id = employee.assigned_employee_id
    WHERE
        auditor_report.true_water_source_score != water_quality.subjective_quality_score
        AND visits.visit_count = 1;
"""

# Execute the query and load into a pandas DataFrame
df = pd.read_sql(query, con=cnx)

# Print the DataFrame
print(df)

# Close the connection
cnx.close()


    location_id  record_id   employee_name  auditor_score  surveyor_score
0     AkRu05215      21160      Rudo Imani              3              10
1     KiRu29290       7938     Bello Azibo              3              10
2     KiHa22748      43140     Bello Azibo              9              10
3     SoRu37841      18495      Rudo Imani              6              10
4     KiRu27884      33931     Bello Azibo              1              10
..          ...        ...             ...            ...             ...
97    AmPw12480      47831  Zuriel Matembo              5              10
98    AmRu14842      47055     Bello Azibo              2              10
99    AkRu03358      31888  Malachi Mavuso              1              10
100   AmRu13433      57735  Malachi Mavuso              9              10
101   AmAm09956      15929  Malachi Mavuso              1              10

[102 rows x 5 columns]


  df = pd.read_sql(query, con=cnx)


Well this query is massive and complex, so maybe it is a good idea to save this as a CTE, so when we do more analysis, we can just call that CTE
like it was a table.

In [49]:


# Establish a connection with the specified auth_plugin
cnx = mysql.connector.connect(
    user='root', 
    password='', 
    host='127.0.0.1', 
    database='md_water_services',
    auth_plugin='mysql_native_password'
)

# Define the query with CTE
query = """
    WITH Incorrect_records AS (
        SELECT
            visits.location_id,
            visits.record_id,
            employee.employee_name,
            auditor_report.true_water_source_score AS auditor_score,
            water_quality.subjective_quality_score AS surveyor_score
        FROM
            auditor_report
        JOIN
            visits ON auditor_report.location_id = visits.location_id
        JOIN
            water_quality ON visits.record_id = water_quality.record_id
        JOIN
            employee ON visits.assigned_employee_id = employee.assigned_employee_id
        WHERE
            auditor_report.true_water_source_score != water_quality.subjective_quality_score
            AND visits.visit_count = 1
    )
    select * from Incorrect_records;

"""

# Execute the query and load into a pandas DataFrame
df = pd.read_sql(query, con=cnx)

# Print the DataFrame
print(df)

# Close the connection
cnx.close()


    location_id  record_id   employee_name  auditor_score  surveyor_score
0     AkRu05215      21160      Rudo Imani              3              10
1     KiRu29290       7938     Bello Azibo              3              10
2     KiHa22748      43140     Bello Azibo              9              10
3     SoRu37841      18495      Rudo Imani              6              10
4     KiRu27884      33931     Bello Azibo              1              10
..          ...        ...             ...            ...             ...
97    AmPw12480      47831  Zuriel Matembo              5              10
98    AmRu14842      47055     Bello Azibo              2              10
99    AkRu03358      31888  Malachi Mavuso              1              10
100   AmRu13433      57735  Malachi Mavuso              9              10
101   AmAm09956      15929  Malachi Mavuso              1              10

[102 rows x 5 columns]


  df = pd.read_sql(query, con=cnx)


In [56]:


# Establish a connection with the specified auth_plugin
cnx = mysql.connector.connect(
  user='root',
  password='',
  host='127.0.0.1',
  database='md_water_services',
  auth_plugin='mysql_native_password'
)

# Define the query to create the table 'incorrect' (modify schema as needed)
create_table_query = """
  CREATE TABLE IF NOT EXISTS incorrect_records(
    location_id VARCHAR(255),
    record_id INT,
    employee_name VARCHAR(255),
    auditor_score INT,
    surveyor_score INT
  );
"""

# Define the query to insert data
insert_query = """
  INSERT INTO incorrect_records (location_id, record_id, employee_name, auditor_score, surveyor_score)
  SELECT
    visits.location_id,
    visits.record_id,
    employee.employee_name,
    auditor_report.true_water_source_score AS auditor_score,
    water_quality.subjective_quality_score AS surveyor_score
  FROM
    auditor_report
  JOIN
    visits ON auditor_report.location_id = visits.location_id
  JOIN
    water_quality ON visits.record_id = water_quality.record_id
  JOIN
    employee ON visits.assigned_employee_id = employee.assigned_employee_id
  WHERE
    auditor_report.true_water_source_score != water_quality.subjective_quality_score
    AND visits.visit_count = 1;
"""

# Create a cursor object
cursor = cnx.cursor()

# Execute the CREATE TABLE query first
cursor.execute(create_table_query)
cnx.commit()

# Execute the INSERT query to populate the table
cursor.execute(insert_query)
cnx.commit()

# Print success message (optional)
print("Table 'incorrect' created and data inserted successfully!")

# Close the connection
cnx.close()


Table 'incorrect' created and data inserted successfully!


Now that we defined Incorrect_records, we can query it like any other table.

Let's first get a unique list of employees from this table.

In [57]:


# Establish a connection with the specified auth_plugin
cnx = mysql.connector.connect(
    user='root', 
    password='', 
    host='127.0.0.1', 
    database='md_water_services',
    auth_plugin='mysql_native_password'
)

# Define the query with CTE
query = """
    
    select DISTINCT employee_name from Incorrect_records;

"""

# Execute the query and load into a pandas DataFrame
df = pd.read_sql(query, con=cnx)

# Print the DataFrame
print(df)

# Close the connection
cnx.close()


     employee_name
0       Rudo Imani
1      Bello Azibo
2   Zuriel Matembo
3    Yewande Ebele
4    Jengo Tumaini
5        Farai Nia
6   Malachi Mavuso
7     Makena Thabo
8   Lalitha Kaburi
9      Gamba Shani
10     Enitan Zuri
11   Thandiwe Kito
12       Pili Zola
13       Usafi Ayo
14    Deka Osumare
15        Ona Sefu
16      Xola Uzuri


  df = pd.read_sql(query, con=cnx)


I got 17 employees

let's calculate how many mistakes each employee made. So basically we want to count how many times their name is in
Incorrect_records list, and then group them by name

In [61]:


# Establish a connection with the specified auth_plugin
cnx = mysql.connector.connect(
    user='root', 
    password='', 
    host='127.0.0.1', 
    database='md_water_services',
    auth_plugin='mysql_native_password'
)

# Define the query with CTE
query = """
    
    select employee_name,COUNT(*) AS number_of_mistakes from Incorrect_records 
    GROUP BY employee_name ORDER BY number_of_mistakes DESC;

"""

# Execute the query and load into a pandas DataFrame
df = pd.read_sql(query, con=cnx)

# Print the DataFrame
print(df)

# Close the connection
cnx.close()


     employee_name  number_of_mistakes
0      Bello Azibo                  26
1   Malachi Mavuso                  21
2   Zuriel Matembo                  17
3   Lalitha Kaburi                   7
4       Rudo Imani                   5
5        Farai Nia                   4
6      Enitan Zuri                   4
7    Yewande Ebele                   3
8    Jengo Tumaini                   3
9     Makena Thabo                   3
10     Gamba Shani                   3
11   Thandiwe Kito                   1
12       Pili Zola                   1
13       Usafi Ayo                   1
14    Deka Osumare                   1
15        Ona Sefu                   1
16      Xola Uzuri                   1


  df = pd.read_sql(query, con=cnx)


It looks like some of our surveyors are making a lot of "mistakes" while many of the other surveyors are only making a few.

Ok, so thinking about this a bit. How would we go about finding out if any of our employees are corrupt?

Let's say all employees make mistakes, if someone is corrupt, they will be making a lot of "mistakes", more than average, for example. But someone
could just be clumsy, so we should try to get more evidence...

Our auditor did say some of the things he heard on the streets were quite shady, and he recorded this in the statements column. Considering
both of these sources should give us a pretty reliable answer.

# Gathering Evidence

So let's try to find all of the employees who have an above-average number of mistakes.

In [64]:


# Establish a connection to the MySQL database
cnx = mysql.connector.connect(
    user='root', 
    password='', 
    host='127.0.0.1', 
    database='md_water_services',
    auth_plugin='mysql_native_password'
)

# Define the query to create the Incorrect_records CTE and get the data
query = """
WITH incorrect_records AS (
    SELECT
        visits.location_id,
        visits.record_id,
        employee.employee_name,
        auditor_report.true_water_source_score AS auditor_score,
        water_quality.subjective_quality_score AS surveyor_score
    FROM
        auditor_report
    JOIN
        visits ON auditor_report.location_id = visits.location_id
    JOIN
        water_quality ON visits.record_id = water_quality.record_id
    JOIN
        employee ON visits.assigned_employee_id = employee.assigned_employee_id
    WHERE
        auditor_report.true_water_source_score != water_quality.subjective_quality_score
        AND visits.visit_count = 1
)
SELECT
    employee_name,
    COUNT(*) AS number_of_mistakes
FROM
    incorrect_records
GROUP BY
    employee_name;
"""

# Execute the query and load the data into a Pandas DataFrame
error_count_df = pd.read_sql(query, con=cnx)

# Calculate the average number of mistakes
avg_error_count = error_count_df['number_of_mistakes'].mean()

# Get the suspect list
suspect_list = error_count_df[error_count_df['number_of_mistakes'] > avg_error_count]

# Print the suspect list
print("Employees with above-average mistakes:")
print(suspect_list)

# Close the connection
cnx.close()


Employees with above-average mistakes:
    employee_name  number_of_mistakes
1     Bello Azibo                  26
2  Zuriel Matembo                  17
6  Malachi Mavuso                  21
8  Lalitha Kaburi                   7


  error_count_df = pd.read_sql(query, con=cnx)


Let's start by cleaning up our code a bit. First, Incorrect_records is a result we'll be using for the rest of the analysis, but it makes the
query a bit less readable. So, let's convert it to a VIEW. We can then use it as if it was a table. It will make our code much simpler to read, but, it
comes at a cost. We can add comments to CTEs in our code, so if we return to that query a year later, we can read those comments and quickly
understand what Incorrect_records represents. If we save it as a VIEW, it is not as obvious. So we should add comments in places where we
use Incorrect_records.

So, replace WITH with CREATE VIEW like this, and I added the statements column

In [69]:

# Establish a connection to the MySQL database
cnx = mysql.connector.connect(
    user='root', 
    password='', 
    host='127.0.0.1', 
    database='md_water_services',
    auth_plugin='mysql_native_password'
)

# Create a cursor object
cursor = cnx.cursor()

# Drop the view if it exists
drop_view_query = "DROP VIEW IF EXISTS Incorrect_records;"
cursor.execute(drop_view_query)

# Define the query to create the Incorrect_records view
create_view_query = """
CREATE VIEW Incorrect_records AS
SELECT
    auditor_report.location_id,
    visits.record_id,
    employee.employee_name,
    auditor_report.true_water_source_score AS auditor_score,
    wq.subjective_quality_score AS surveyor_score,
    auditor_report.statements AS statements
FROM
    auditor_report
JOIN
    visits ON auditor_report.location_id = visits.location_id
JOIN
    water_quality AS wq ON visits.record_id = wq.record_id
JOIN
    employee ON employee.assigned_employee_id = visits.assigned_employee_id
WHERE
    visits.visit_count = 1
    AND auditor_report.true_water_source_score != wq.subjective_quality_score;
"""
cursor.execute(create_view_query)

# Define the query to get the data from the Incorrect_records view
query = """
SELECT
    employee_name,
    COUNT(*) AS number_of_mistakes
FROM
    Incorrect_records
GROUP BY
    employee_name;
"""

# Execute the query and load the data into a Pandas DataFrame
error_count_df = pd.read_sql(query, con=cnx)

# Calculate the average number of mistakes
avg_error_count = error_count_df['number_of_mistakes'].mean()

# Get the suspect list
suspect_list = error_count_df[error_count_df['number_of_mistakes'] > avg_error_count]

# Print the suspect list
print("Employees with above-average mistakes:")
print(suspect_list)

# Close the cursor and connection
cursor.close()
cnx.close()


Employees with above-average mistakes:
    employee_name  number_of_mistakes
1     Bello Azibo                  26
2  Zuriel Matembo                  17
6  Malachi Mavuso                  21
8  Lalitha Kaburi                   7


  error_count_df = pd.read_sql(query, con=cnx)


In [70]:


# Establish a connection with the specified auth_plugin
cnx = mysql.connector.connect(
    user='root', 
    password='', 
    host='127.0.0.1', 
    database='md_water_services',
    auth_plugin='mysql_native_password'
)

# Define the query with CTE
query = """
    SELECT * FROM Incorrect_records;
"""

# Execute the query and load into a pandas DataFrame
df = pd.read_sql(query, con=cnx)

# Print the DataFrame
print(df)

# Close the connection
cnx.close()


    location_id  record_id   employee_name  auditor_score  surveyor_score  \
0     AkRu05215      21160      Rudo Imani              3              10   
1     KiRu29290       7938     Bello Azibo              3              10   
2     KiHa22748      43140     Bello Azibo              9              10   
3     SoRu37841      18495      Rudo Imani              6              10   
4     KiRu27884      33931     Bello Azibo              1              10   
..          ...        ...             ...            ...             ...   
97    AmPw12480      47831  Zuriel Matembo              5              10   
98    AmRu14842      47055     Bello Azibo              2              10   
99    AkRu03358      31888  Malachi Mavuso              1              10   
100   AmRu13433      57735  Malachi Mavuso              9              10   
101   AmAm09956      15929  Malachi Mavuso              1              10   

                                            statements  
0    Villagers adm

  df = pd.read_sql(query, con=cnx)


Next, we convert the query error_count, we made earlier, into a CTE. Test it to make sure it gives the same result again, using SELECT * FROM
Incorrect_records. On large queries like this, it is better to build the query, and test each step, because fixing errors becomes harder as the
query grows.

In [72]:


# Establish a connection with the specified auth_plugin
cnx = mysql.connector.connect(
    user='root', 
    password='', 
    host='127.0.0.1', 
    database='md_water_services',
    auth_plugin='mysql_native_password'
)

# Define the query with CTE
query = """
    WITH error_count AS ( -- This CTE calculates the number of mistakes each employee made
SELECT
employee_name,
COUNT(employee_name) AS number_of_mistakes
FROM
Incorrect_records

GROUP BY
employee_name)
SELECT * FROM error_count;
"""

# Execute the query and load into a pandas DataFrame
df = pd.read_sql(query, con=cnx)

# Print the DataFrame
print(df)

# Close the connection
cnx.close()


     employee_name  number_of_mistakes
0       Rudo Imani                   5
1      Bello Azibo                  26
2   Zuriel Matembo                  17
3    Yewande Ebele                   3
4    Jengo Tumaini                   3
5        Farai Nia                   4
6   Malachi Mavuso                  21
7     Makena Thabo                   3
8   Lalitha Kaburi                   7
9      Gamba Shani                   3
10     Enitan Zuri                   4
11   Thandiwe Kito                   1
12       Pili Zola                   1
13       Usafi Ayo                   1
14    Deka Osumare                   1
15        Ona Sefu                   1
16      Xola Uzuri                   1


  df = pd.read_sql(query, con=cnx)


calculating the average of the number_of_mistakes in error_count.

In [73]:


# Establish a connection to the MySQL database
cnx = mysql.connector.connect(
    user='root', 
    password='', 
    host='127.0.0.1', 
    database='md_water_services',
    auth_plugin='mysql_native_password'
)

# Create a cursor object
cursor = cnx.cursor()

# Drop the view if it exists


# Define the query to calculate the average number of mistakes
query = """
WITH error_count AS (
    SELECT
        employee_name,
        COUNT(employee_name) AS number_of_mistakes
    FROM
        Incorrect_records
    GROUP BY
        employee_name
)
SELECT AVG(number_of_mistakes) AS avg_error_count_per_empl FROM error_count;
"""

# Execute the query and fetch the result
cursor.execute(query)
avg_error_count = cursor.fetchone()[0]

# Print the average error count
print(f"\nAverage Number of Mistakes: {avg_error_count}")

# Close the cursor and connection
cursor.close()
cnx.close()



Average Number of Mistakes: 6.0000


To find the employees who made more mistakes than the average person, we need the employee's names, the number of mistakes each one
made, and filter the employees with an above-average number of mistakes.

In [74]:

# Establish a connection to the MySQL database
cnx = mysql.connector.connect(
    user='root', 
    password='', 
    host='127.0.0.1', 
    database='md_water_services',
    auth_plugin='mysql_native_password'
)

# Create a cursor object
cursor = cnx.cursor()


# Define the query to find employees with above-average mistakes
query = """
WITH error_count AS (
    SELECT
        employee_name,
        COUNT(employee_name) AS number_of_mistakes
    FROM
        Incorrect_records
    GROUP BY
        employee_name
),
avg_error AS (
    SELECT AVG(number_of_mistakes) AS avg_error_count_per_empl FROM error_count
)
SELECT
    ec.employee_name,
    ec.number_of_mistakes
FROM
    error_count ec,
    avg_error ae
WHERE
    ec.number_of_mistakes > ae.avg_error_count_per_empl;
"""

# Execute the query and load the results into a pandas DataFrame
suspect_list_df = pd.read_sql(query, con=cnx)

# Print the suspect list
print("Employees with above-average mistakes:")
print(suspect_list_df)

# Close the cursor and connection
cursor.close()
cnx.close()


Employees with above-average mistakes:
    employee_name  number_of_mistakes
0     Bello Azibo                  26
1  Zuriel Matembo                  17
2  Malachi Mavuso                  21
3  Lalitha Kaburi                   7


  suspect_list_df = pd.read_sql(query, con=cnx)


These are the employees who made more mistakes, on average, than their peers.

We should look at the Incorrect_records table again, and isolate all of the records these four employees gathered. We should also look at the
statements for these records to look for patterns.

In [75]:


# Establish a connection to the MySQL database
cnx = mysql.connector.connect(
    user='root', 
    password='', 
    host='127.0.0.1', 
    database='md_water_services',
    auth_plugin='mysql_native_password'
)




# Define the query to get the records of the suspect employees
query = """
WITH error_count AS (
    SELECT
        employee_name,
        COUNT(employee_name) AS number_of_mistakes
    FROM
        Incorrect_records
    GROUP BY
        employee_name
),
avg_error AS (
    SELECT AVG(number_of_mistakes) AS avg_error_count_per_empl FROM error_count
),
suspect_list AS (
    SELECT
        ec.employee_name,
        ec.number_of_mistakes
    FROM
        error_count ec,
        avg_error ae
    WHERE
        ec.number_of_mistakes > ae.avg_error_count_per_empl
)
SELECT
    ir.location_id,
    ir.record_id,
    ir.employee_name,
    ir.auditor_score,
    ir.surveyor_score,
    ir.statements
FROM
    Incorrect_records ir
JOIN
    suspect_list sl ON ir.employee_name = sl.employee_name;
"""

# Execute the query and load the results into a pandas DataFrame
suspect_records_df = pd.read_sql(query, con=cnx)

# Print the suspect records
print("Records of employees with above-average mistakes:")
print(suspect_records_df)

# Close the cursor and connection
cursor.close()
cnx.close()


Records of employees with above-average mistakes:
   location_id  record_id   employee_name  auditor_score  surveyor_score  \
0    AmRu14842      47055     Bello Azibo              2              10   
1    HaYa21395      28729     Bello Azibo              2              10   
2    AkRu05741      19961     Bello Azibo              3              10   
3    AmRu14887      31028     Bello Azibo              9              10   
4    KiMr24919      23328     Bello Azibo              1              10   
..         ...        ...             ...            ...             ...   
66   SoIl32770       7548  Lalitha Kaburi              0              10   
67   AkRu07310      37457  Lalitha Kaburi              9              10   
68   AkRu04935      16410  Lalitha Kaburi              3              10   
69   SoKo33094      16159  Lalitha Kaburi              0              10   
70   KiIs24083      28404  Lalitha Kaburi              2              10   

                                     

  suspect_records_df = pd.read_sql(query, con=cnx)


let's add the statements column to the Incorrect_records view. Then pull up all of the records where the employee_name is in the
suspect list.

In [78]:


# Establish a connection to the MySQL database
cnx = mysql.connector.connect(
    user='root', 
    password='', 
    host='127.0.0.1', 
    database='md_water_services',
    auth_plugin='mysql_native_password'
)



# Define the query to get the suspect records
query = """
WITH error_count AS (
    SELECT
        employee_name,
        COUNT(employee_name) AS number_of_mistakes
    FROM
        Incorrect_records
    GROUP BY
        employee_name
),
avg_error AS (
    SELECT AVG(number_of_mistakes) AS avg_error_count_per_empl FROM error_count
),
suspect_list AS (
    SELECT
        ec.employee_name,
        ec.number_of_mistakes
    FROM
        error_count ec,
        avg_error ae
    WHERE
        ec.number_of_mistakes > ae.avg_error_count_per_empl
)
SELECT
    ir.employee_name,
    ir.location_id,
    ir.statements
FROM
    Incorrect_records ir
WHERE
    ir.employee_name IN (SELECT employee_name FROM suspect_list);
"""

# Execute the query and load the results into a pandas DataFrame
suspect_records_df = pd.read_sql(query, con=cnx)

# Print the suspect records
print("Records of employees with above-average mistakes:")
print(suspect_records_df)

# Close the cursor and connection
cursor.close()
cnx.close()


Records of employees with above-average mistakes:
     employee_name location_id  \
0      Bello Azibo   KiRu29290   
1      Bello Azibo   KiHa22748   
2      Bello Azibo   KiRu27884   
3   Zuriel Matembo   KiZu31170   
4      Bello Azibo   AkRu06495   
..             ...         ...   
66  Zuriel Matembo   AmPw12480   
67     Bello Azibo   AmRu14842   
68  Malachi Mavuso   AkRu03358   
69  Malachi Mavuso   AmRu13433   
70  Malachi Mavuso   AmAm09956   

                                           statements  
0   A young artist sketches the faces in the queue...  
1   A young girl's hopeful eyes are clouded by mis...  
2   A traditional healer's empathy turns to bitter...  
3   A community leader stood with his people, expr...  
4   A healthcare worker in the queue expressed fea...  
..                                                ...  
66  A village chief's dignity is marred by frustra...  
67  An elderly woman's weary eyes reflect the toll...  
68  A traveling trader's pause in the

  suspect_records_df = pd.read_sql(query, con=cnx)


If you have a look, you will notice some alarming statements about these four officials (look at these records: AkRu04508, AkRu07310,
KiRu29639, AmAm09607, for example. See how the word "cash" is used a lot in these statements.

we will filter the records tha have 'cash'

In [80]:


# Establish a connection to the MySQL database
cnx = mysql.connector.connect(
    user='root', 
    password='', 
    host='127.0.0.1', 
    database='md_water_services',
    auth_plugin='mysql_native_password'
)

# Define the query to get records mentioning "cash"
query = """
WITH error_count AS (
    SELECT
        employee_name,
        COUNT(employee_name) AS number_of_mistakes
    FROM
        Incorrect_records
    GROUP BY
        employee_name
),
avg_error AS (
    SELECT AVG(number_of_mistakes) AS avg_error_count_per_empl FROM error_count
),
suspect_list AS (
    SELECT
        ec.employee_name,
        ec.number_of_mistakes
    FROM
        error_count ec,
        avg_error ae
    WHERE
        ec.number_of_mistakes > ae.avg_error_count_per_empl
)
SELECT
    ir.employee_name,
    ir.location_id,
    ir.statements
FROM
    Incorrect_records ir
WHERE
    ir.employee_name IN (SELECT employee_name FROM suspect_list)
    AND ir.statements LIKE '%cash%';
"""

# Execute the query and load the results into a pandas DataFrame
cash_related_records_df = pd.read_sql(query, con=cnx)

# Print the filtered records
print("Records of employees with above-average mistakes that mention 'cash':")
print(cash_related_records_df)

# Close the connection
cnx.close()


Records of employees with above-average mistakes that mention 'cash':
     employee_name location_id  \
0      Bello Azibo   AkRu05741   
1      Bello Azibo   AmRu14887   
2      Bello Azibo   KiMr24919   
3      Bello Azibo   KiRu29639   
4      Bello Azibo   AkRu04508   
5      Bello Azibo   KiRu27065   
6      Bello Azibo   HaSe21323   
7      Bello Azibo   KiIs23853   
8   Zuriel Matembo   HaRu20146   
9   Zuriel Matembo   HaSe20888   
10  Zuriel Matembo   SoIl32575   
11  Zuriel Matembo   AkRu05880   
12  Zuriel Matembo   SoRu38331   
13  Malachi Mavuso   AmAm09956   
14  Malachi Mavuso   AmRu15719   
15  Malachi Mavuso   KiRu25347   
16  Malachi Mavuso   AmAm09607   
17  Lalitha Kaburi   KiRu29329   
18  Lalitha Kaburi   AkRu07310   

                                           statements  
0   An air of mistrust surrounded the official, as...  
1   Villagers expressed their discomfort with an o...  
2   Suspicion and unease colored the villagers' ac...  
3   An unsettling atmosph

  cash_related_records_df = pd.read_sql(query, con=cnx)


Check if there are any employees in the Incorrect_records table with statements mentioning "cash" that are not in our suspect list.

In [82]:

# Establish a connection to the MySQL database
cnx = mysql.connector.connect(
    user='root', 
    password='', 
    host='127.0.0.1', 
    database='md_water_services',
    auth_plugin='mysql_native_password'
)

# Define queries
query_incorrect_records = """
SELECT *
FROM Incorrect_records;
"""

query_suspect_list = """
WITH error_count AS (
    SELECT
        employee_name,
        COUNT(employee_name) AS number_of_mistakes
    FROM
        Incorrect_records
    GROUP BY
        employee_name
),
avg_error AS (
    SELECT AVG(number_of_mistakes) AS avg_error_count_per_empl FROM error_count
),
suspect_list AS (
    SELECT
        ec.employee_name,
        ec.number_of_mistakes
    FROM
        error_count ec,
        avg_error ae
    WHERE
        ec.number_of_mistakes > ae.avg_error_count_per_empl
)
SELECT employee_name
FROM suspect_list;
"""

# Execute the queries and load into Pandas DataFrames
incorrect_records_df = pd.read_sql(query_incorrect_records, con=cnx)
suspect_list_df = pd.read_sql(query_suspect_list, con=cnx)

# Get the list of suspect employees
suspect_list = suspect_list_df['employee_name'].tolist()

# Filter records mentioning 'cash'
cash_records_df = incorrect_records_df[incorrect_records_df['statements'].str.contains('cash', case=False, na=False)]

# Exclude records from suspect employees
non_suspect_cash_records_df = cash_records_df[~cash_records_df['employee_name'].isin(suspect_list)]

# Print the results
print("Records mentioning 'cash' not from suspect employees:")
print(non_suspect_cash_records_df)

# Close the connection
cnx.close()


Records mentioning 'cash' not from suspect employees:
Empty DataFrame
Columns: [location_id, record_id, employee_name, auditor_score, surveyor_score, statements]
Index: []


  incorrect_records_df = pd.read_sql(query_incorrect_records, con=cnx)
  suspect_list_df = pd.read_sql(query_suspect_list, con=cnx)


I get an empty result, so no one, except the four suspects, has these allegations of bribery.

So we can sum up the evidence we have for Zuriel Matembo, Malachi Mavuso, Bello Azibo and Lalitha Kaburi:
1. They all made more mistakes than their peers on average.
2. They all have incriminating statements made against them, and only them.
Keep in mind, that this is not decisive proof, but it is concerning enough that we should flag it. Pres. Naledi has worked hard to stamp out
corruption, so she would urge us to report this.