In [1]:
import altair as alt
import pandas as pa

# Load the per-student test-score table from a CSV given by relative path.
test_data = pa.read_csv("test_scores.csv")
# Bare last expression so the notebook renders the frame as the cell output.
test_data


Unnamed: 0,school,school_setting,school_type,classroom,teaching_method,n_student,student_id,gender,lunch,pretest,posttest
0,ANKYI,Urban,Non-public,6OL,Standard,20.0,2FHT3,Female,Does not qualify,62.0,72.0
1,ANKYI,Urban,Non-public,6OL,Standard,20.0,3JIVH,Female,Does not qualify,66.0,79.0
2,ANKYI,Urban,Non-public,6OL,Standard,20.0,3XOWE,Male,Does not qualify,64.0,76.0
3,ANKYI,Urban,Non-public,6OL,Standard,20.0,556O0,Female,Does not qualify,61.0,77.0
4,ANKYI,Urban,Non-public,6OL,Standard,20.0,74LOE,Male,Does not qualify,64.0,76.0
...,...,...,...,...,...,...,...,...,...,...,...
2128,ZOWMK,Urban,Public,ZBH,Standard,30.0,T8LSK,Female,Does not qualify,39.0,55.0
2129,ZOWMK,Urban,Public,ZBH,Standard,30.0,VNP26,Female,Qualifies for reduced/free lunch,38.0,46.0
2130,ZOWMK,Urban,Public,ZBH,Standard,30.0,YDR1Z,Female,Qualifies for reduced/free lunch,45.0,51.0
2131,ZOWMK,Urban,Public,ZBH,Standard,30.0,YUEIH,Male,Qualifies for reduced/free lunch,46.0,53.0


The two questions that I want to answer with this data set are:
1. Do girls test higher than boys?
2. Do Non-public school students score better than public school students?

In [610]:
# Pretest scores of female students who attend public schools.

# School codes from the data set, passed as the x-axis sort order below.
# NOTE(review): the x field is student_id, so none of these school codes can
# match an x value; per the Vega-Lite sort docs unlisted values keep their
# data order -- confirm that is the ordering wanted here.
school_list = [
    'ANKYI', 'CCAAW', 'CIMBB', 'CUQAM', 'DNQDD', 'FBUMG', 'GJJHK',
    'GOKXL', 'GOOBU', 'IDGFP', 'KFZMY', 'KZKKE', 'LAYPA', 'OJOBU',
    'QOQTS', 'UAGPU', 'UKPGS', 'UUUQX', 'VHDHF', 'VKWQH', 'VVTVA',
    'ZMNYA', 'ZOWMK',
]

# Add pretest_scores (pretest / 100, so it can be formatted as a percentage)
# and keep only the columns the chart needs. assign() returns a new frame,
# so test_data itself is left untouched.
female_test_scores = (
    test_data
    .assign(pretest_scores=lambda scores: scores.pretest / 100)
    .filter(["school", "student_id", "school_type", "gender", "pretest_scores"])
)
female_test_scores_public = female_test_scores.query(
    "gender == 'Female' & school_type == 'Public'"
)

# One bar per student, coloured by school.
female_public_pretest_chart = (
    alt.Chart(female_test_scores_public, title="Pretest scores female public schools")
    .mark_bar()
    .encode(
        x=alt.X("student_id", title="student_id", sort=school_list),
        y=alt.Y("pretest_scores", title="Test_scores %", axis=alt.Axis(format="%")),
        color="school",
    )
)

# Red horizontal rule at the mean pretest score of the filtered students.
female_pretesting_avg_line = (
    alt.Chart(female_test_scores_public)
    .mark_rule(color="red")
    .encode(y=alt.Y("mean(pretest_scores):Q"))
)

# Text label on the rule showing the mean as a whole-number percentage.
avg_line_text = female_pretesting_avg_line.mark_text(dx=-15).encode(
    text=alt.Text("mean(pretest_scores):Q", format=",.0%")
)

# Rotated text label per student showing that student's own score.
pretest_scores = female_public_pretest_chart.mark_text(angle=90, dx=-15).encode(
    text=alt.Text("pretest_scores:Q", format=",.0%")
)

# Layer bars, mean rule, mean label and per-student labels into one chart.
female_public_pretest_chart + female_pretesting_avg_line + avg_line_text + pretest_scores



In [611]:

# Posttest scores of female students who attend public schools.

# Add posttest_scores (posttest / 100, so it can be formatted as a percentage)
# and keep only the columns the chart needs.
female_test_scores = (
    test_data
    .assign(posttest_scores=lambda scores: scores.posttest / 100)
    .filter(["school", "student_id", "school_type", "gender", "posttest_scores"])
)
female_test_scores_public = female_test_scores.query(
    "gender == 'Female' & school_type == 'Public'"
)

# One bar per student, coloured by school (school_list comes from an earlier cell).
female_public_posttest_chart = (
    alt.Chart(female_test_scores_public, title="Posttest scores female public schools")
    .mark_bar()
    .encode(
        x=alt.X("student_id", title="student_id", sort=school_list),
        y=alt.Y("posttest_scores", title="Test_scores %", axis=alt.Axis(format="%")),
        color="school",
    )
)

# Red horizontal rule at the mean posttest score of the filtered students.
female_posttesting_avg_line = (
    alt.Chart(female_test_scores_public)
    .mark_rule(color="red")
    .encode(y=alt.Y("mean(posttest_scores):Q"))
)

# Text label on the rule showing the mean as a whole-number percentage.
avg_line_text = female_posttesting_avg_line.mark_text(dx=-15).encode(
    text=alt.Text("mean(posttest_scores):Q", format=",.0%")
)

# Rotated text label per student showing that student's posttest score.
posttest_scores = female_public_posttest_chart.mark_text(
    angle=90, baseline="middle", dx=-15
).encode(
    text=alt.Text("posttest_scores:Q", format=",.0%")
)

# Layer bars, mean rule, mean label and per-student labels into one chart.
female_public_posttest_chart + female_posttesting_avg_line + avg_line_text + posttest_scores




In [612]:
# Pretest scores of female students who attend non-public schools.

# Add pretest_scores (pretest / 100, so it can be formatted as a percentage)
# and keep only the columns the chart needs.
female_test_scores = (
    test_data
    .assign(pretest_scores=lambda scores: scores.pretest / 100)
    .filter(["school", "student_id", "school_type", "gender", "pretest_scores"])
)
female_test_scores_nonpublic = female_test_scores.query(
    "gender == 'Female' & school_type == 'Non-public'"
)

# One bar per student, coloured by school (school_list comes from an earlier cell).
female_nonpublic_pretest_chart = (
    alt.Chart(female_test_scores_nonpublic, title="Pretest scores female nonpublic schools")
    .mark_bar()
    .encode(
        x=alt.X("student_id", title="student_id", sort=school_list),
        y=alt.Y("pretest_scores", title="Test_scores %", axis=alt.Axis(format="%")),
        color="school",
    )
)

# Red horizontal rule at the mean pretest score of the filtered students.
female_pretesting_avg_line = (
    alt.Chart(female_test_scores_nonpublic)
    .mark_rule(color="red")
    .encode(y=alt.Y("mean(pretest_scores):Q"))
)

# Text label on the rule showing the mean as a whole-number percentage.
avg_line_text = female_pretesting_avg_line.mark_text(dx=-15).encode(
    text=alt.Text("mean(pretest_scores):Q", format=",.0%")
)

# Rotated text label per student showing that student's own score.
pretest_scores = female_nonpublic_pretest_chart.mark_text(angle=90, dx=-15).encode(
    text=alt.Text("pretest_scores:Q", format=",.0%")
)

# Layer bars, mean rule, mean label and per-student labels into one chart.
female_nonpublic_pretest_chart + female_pretesting_avg_line + avg_line_text + pretest_scores

In [613]:
# Posttest scores of female students who attend non-public schools.

# Add posttest_scores (posttest / 100, so it can be formatted as a percentage)
# and keep only the columns the chart needs.
female_test_scores = (
    test_data
    .assign(posttest_scores=lambda scores: scores.posttest / 100)
    .filter(["school", "student_id", "school_type", "gender", "posttest_scores"])
)
female_test_scores_nonpublic = female_test_scores.query(
    "gender == 'Female' & school_type == 'Non-public'"
)

# One bar per student, coloured by school (school_list comes from an earlier cell).
female_nonpublic_posttest_chart = (
    alt.Chart(female_test_scores_nonpublic, title="Posttest scores female nonpublic schools")
    .mark_bar()
    .encode(
        x=alt.X("student_id", title="student_id", sort=school_list),
        y=alt.Y("posttest_scores", title="Test_scores %", axis=alt.Axis(format="%")),
        color="school",
    )
)

# Red horizontal rule at the mean posttest score of the filtered students.
female_posttesting_avg_line = (
    alt.Chart(female_test_scores_nonpublic)
    .mark_rule(color="red")
    .encode(y=alt.Y("mean(posttest_scores):Q"))
)

# Text label on the rule showing the mean as a whole-number percentage.
avg_line_text = female_posttesting_avg_line.mark_text(dx=-15).encode(
    text=alt.Text("mean(posttest_scores):Q", format=",.0%")
)

# Rotated text label per student showing that student's own score.
posttest_scores = female_nonpublic_posttest_chart.mark_text(angle=90, dx=-15).encode(
    text=alt.Text("posttest_scores:Q", format=",.0%")
)

# Layer bars, mean rule, mean label and per-student labels into one chart.
female_nonpublic_posttest_chart + female_posttesting_avg_line + avg_line_text + posttest_scores

In [3]:
# Pretest scores of male students who attend public schools.

# Add pretest_scores (pretest / 100, so it can be formatted as a percentage)
# and keep only the columns the chart needs.
male_test_scores = (
    test_data
    .assign(pretest_scores=lambda scores: scores.pretest / 100)
    .filter(["school", "student_id", "school_type", "gender", "pretest_scores"])
)
male_test_scores_public = male_test_scores.query(
    "gender == 'Male' & school_type == 'Public'"
)

# One bar per student, coloured by school (school_list comes from an earlier cell).
male_public_pretest_chart = (
    alt.Chart(male_test_scores_public, title="Pretest scores male public schools")
    .mark_bar()
    .encode(
        x=alt.X("student_id", title="student_id", sort=school_list),
        y=alt.Y("pretest_scores", title="Test_scores %", axis=alt.Axis(format="%")),
        color="school",
    )
)

# Red horizontal rule at the mean pretest score of the filtered students.
male_pretesting_avg_line = (
    alt.Chart(male_test_scores_public)
    .mark_rule(color="red")
    .encode(y=alt.Y("mean(pretest_scores):Q"))
)

# Text label on the rule showing the mean as a whole-number percentage.
avg_line_text = male_pretesting_avg_line.mark_text(dx=-15).encode(
    text=alt.Text("mean(pretest_scores):Q", format=",.0%")
)

# Rotated text label per student showing that student's own score.
pretest_scores = male_public_pretest_chart.mark_text(angle=90, dx=-15).encode(
    text=alt.Text("pretest_scores:Q", format=",.0%")
)

# Layer bars, mean rule, mean label and per-student labels into one chart.
male_public_pretest_chart + male_pretesting_avg_line + avg_line_text + pretest_scores

In [615]:
# Posttest scores of male students who attend public schools.

# Add posttest_scores (posttest / 100, so it can be formatted as a percentage)
# and keep only the columns the chart needs.
male_test_scores = (
    test_data
    .assign(posttest_scores=lambda scores: scores.posttest / 100)
    .filter(["school", "student_id", "school_type", "gender", "posttest_scores"])
)
male_test_scores_public = male_test_scores.query(
    "gender == 'Male' & school_type == 'Public'"
)

# One bar per student, coloured by school (school_list comes from an earlier cell).
male_public_posttest_chart = (
    alt.Chart(male_test_scores_public, title="Posttest scores male public schools")
    .mark_bar()
    .encode(
        x=alt.X("student_id", title="student_id", sort=school_list),
        y=alt.Y("posttest_scores", title="Test_scores %", axis=alt.Axis(format="%")),
        color="school",
    )
)

# Red horizontal rule at the mean posttest score of the filtered students.
male_posttesting_avg_line = (
    alt.Chart(male_test_scores_public)
    .mark_rule(color="red")
    .encode(y=alt.Y("mean(posttest_scores):Q"))
)

# Text label on the rule showing the mean as a whole-number percentage.
avg_line_text = male_posttesting_avg_line.mark_text(dx=-15).encode(
    text=alt.Text("mean(posttest_scores):Q", format=",.0%")
)

# Rotated text label per student showing that student's own score.
posttest_scores = male_public_posttest_chart.mark_text(angle=90, dx=-15).encode(
    text=alt.Text("posttest_scores:Q", format=",.0%")
)

# Layer bars, mean rule, mean label and per-student labels into one chart.
male_public_posttest_chart + male_posttesting_avg_line + avg_line_text + posttest_scores

In [616]:
# Pretest scores of male students who attend non-public schools.

# Add pretest_scores (pretest / 100, so it can be formatted as a percentage)
# and keep only the columns the chart needs.
male_test_scores = (
    test_data
    .assign(pretest_scores=lambda scores: scores.pretest / 100)
    .filter(["school", "student_id", "school_type", "gender", "pretest_scores"])
)
male_test_scores_nonpublic = male_test_scores.query(
    "gender == 'Male' & school_type == 'Non-public'"
)

# One bar per student, coloured by school (school_list comes from an earlier cell).
# The title previously read "male public schools" although the data is
# filtered to Non-public; it now matches the data being plotted.
male_nonpublic_pretest_chart = (
    alt.Chart(male_test_scores_nonpublic, title="Pretest scores male nonpublic schools")
    .mark_bar()
    .encode(
        x=alt.X("student_id", title="student_id", sort=school_list),
        y=alt.Y("pretest_scores", title="Test_scores %", axis=alt.Axis(format="%")),
        color="school",
    )
)

# Red horizontal rule at the mean pretest score of the filtered students.
male_pretesting_avg_line = (
    alt.Chart(male_test_scores_nonpublic)
    .mark_rule(color="red")
    .encode(y=alt.Y("mean(pretest_scores):Q"))
)

# Text label on the rule showing the mean as a whole-number percentage.
avg_line_text = male_pretesting_avg_line.mark_text(dx=-15).encode(
    text=alt.Text("mean(pretest_scores):Q", format=",.0%")
)

# Rotated text label per student showing that student's own score.
pretest_scores = male_nonpublic_pretest_chart.mark_text(angle=90, dx=-15).encode(
    text=alt.Text("pretest_scores:Q", format=",.0%")
)

# Layer bars, mean rule, mean label and per-student labels into one chart.
male_nonpublic_pretest_chart + male_pretesting_avg_line + avg_line_text + pretest_scores

In [617]:
# Posttest scores of male students who attend non-public schools.

# Add posttest_scores (posttest / 100, so it can be formatted as a percentage)
# and keep only the columns the chart needs.
male_test_scores = (
    test_data
    .assign(posttest_scores=lambda scores: scores.posttest / 100)
    .filter(["school", "student_id", "school_type", "gender", "posttest_scores"])
)
male_test_scores_nonpublic = male_test_scores.query(
    "gender == 'Male' & school_type == 'Non-public'"
)

# One bar per student, coloured by school (school_list comes from an earlier cell).
male_nonpublic_posttest_chart = (
    alt.Chart(male_test_scores_nonpublic, title="Posttest scores male nonpublic schools")
    .mark_bar()
    .encode(
        x=alt.X("student_id", title="student_id", sort=school_list),
        y=alt.Y("posttest_scores", title="Test_scores %", axis=alt.Axis(format="%")),
        color="school",
    )
)

# Red horizontal rule at the mean posttest score of the filtered students.
male_posttesting_avg_line = (
    alt.Chart(male_test_scores_nonpublic)
    .mark_rule(color="red")
    .encode(y=alt.Y("mean(posttest_scores):Q"))
)

# Text label on the rule showing the mean as a whole-number percentage.
avg_line_text = male_posttesting_avg_line.mark_text(dx=-15).encode(
    text=alt.Text("mean(posttest_scores):Q", format=",.0%")
)

# Rotated text label per student showing that student's own score.
posttest_scores = male_nonpublic_posttest_chart.mark_text(angle=90, dx=-15).encode(
    text=alt.Text("posttest_scores:Q", format=",.0%")
)

# Layer bars, mean rule, mean label and per-student labels into one chart.
male_nonpublic_posttest_chart + male_posttesting_avg_line + avg_line_text + posttest_scores

In [618]:
# Pretest scores of all students (both genders) who attend public schools.

# Add public_pretest_scores (pretest / 100, so it can be formatted as a
# percentage), keep only the columns the chart needs, then keep public schools.
over_all_public_school = (
    test_data
    .assign(public_pretest_scores=lambda scores: scores.pretest / 100)
    .filter(["school", "student_id", "school_type", "public_pretest_scores"])
    .query("school_type == 'Public'")
)

# One bar per student, coloured by school (school_list comes from an earlier cell).
# A title is set so this chart can be told apart from the other overall charts.
public_school_pretest_chart = (
    alt.Chart(over_all_public_school, title="Pretest scores public schools")
    .mark_bar()
    .encode(
        x=alt.X("student_id", title="student_id", sort=school_list),
        y=alt.Y("public_pretest_scores", title="Test_scores %", axis=alt.Axis(format="%")),
        color="school",
    )
)

# Red horizontal rule at the mean pretest score of public-school students.
public_avg_line = (
    alt.Chart(over_all_public_school)
    .mark_rule(color="red")
    .encode(y=alt.Y("mean(public_pretest_scores)"))
)

# Text label on the rule showing the mean as a whole-number percentage.
text_avg_line = public_avg_line.mark_text().encode(
    text=alt.Text("mean(public_pretest_scores):Q", format=",.0%")
)

# Rotated text label per student showing that student's own score.
public_pretest_student_scores = public_school_pretest_chart.mark_text(
    angle=90, dx=-15
).encode(
    text=alt.Text("public_pretest_scores:Q", format=",.0%")
)

# Layer bars, mean rule, mean label and per-student labels into one chart.
public_school_pretest_chart + public_avg_line + text_avg_line + public_pretest_student_scores

In [619]:
# Posttest scores of all students (both genders) who attend public schools.
#
# This cell used to be an exact copy of the public-school pretest cell, so the
# public posttest chart was never drawn. It now plots the posttest column,
# mirroring the non-public pretest/posttest pair of cells.

# Add public_posttest_scores (posttest / 100, so it can be formatted as a
# percentage), keep only the columns the chart needs, then keep public schools.
over_all_public_school = (
    test_data
    .assign(public_posttest_scores=lambda scores: scores.posttest / 100)
    .filter(["school", "student_id", "school_type", "public_posttest_scores"])
    .query("school_type == 'Public'")
)

# One bar per student, coloured by school (school_list comes from an earlier cell).
public_school_posttest_chart = (
    alt.Chart(over_all_public_school, title="Posttest scores public schools")
    .mark_bar()
    .encode(
        x=alt.X("student_id", title="student_id", sort=school_list),
        y=alt.Y("public_posttest_scores", title="Test_scores %", axis=alt.Axis(format="%")),
        color="school",
    )
)

# Red horizontal rule at the mean posttest score of public-school students.
public_avg_line = (
    alt.Chart(over_all_public_school)
    .mark_rule(color="red")
    .encode(y=alt.Y("mean(public_posttest_scores)"))
)

# Text label on the rule showing the mean as a whole-number percentage.
text_avg_line = public_avg_line.mark_text().encode(
    text=alt.Text("mean(public_posttest_scores):Q", format=",.0%")
)

# Rotated text label per student showing that student's own score.
public_posttest_student_scores = public_school_posttest_chart.mark_text(
    angle=90, dx=-15
).encode(
    text=alt.Text("public_posttest_scores:Q", format=",.0%")
)

# Layer bars, mean rule, mean label and per-student labels into one chart.
public_school_posttest_chart + public_avg_line + text_avg_line + public_posttest_student_scores

In [620]:
# Pretest scores of all students (both genders) who attend non-public schools.

# Add nonpublic_pretest_scores (pretest / 100, so it can be formatted as a
# percentage), keep only the columns the chart needs, then keep non-public schools.
over_all_nonpublic_school = (
    test_data
    .assign(nonpublic_pretest_scores=lambda scores: scores.pretest / 100)
    .filter(["school", "student_id", "school_type", "nonpublic_pretest_scores"])
    .query("school_type == 'Non-public'")
)

# One bar per student, coloured by school (school_list comes from an earlier cell).
# A title is set so this chart can be told apart from the other overall charts.
nonpublic_school_pretest_chart = (
    alt.Chart(over_all_nonpublic_school, title="Pretest scores nonpublic schools")
    .mark_bar()
    .encode(
        x=alt.X("student_id", title="student_id", sort=school_list),
        y=alt.Y("nonpublic_pretest_scores", title="Test_scores %", axis=alt.Axis(format="%")),
        color="school",
    )
)

# Red horizontal rule at the mean pretest score of non-public-school students.
nonpublic_avg_line = (
    alt.Chart(over_all_nonpublic_school)
    .mark_rule(color="red")
    .encode(y=alt.Y("mean(nonpublic_pretest_scores)"))
)

# Text label on the rule showing the mean as a whole-number percentage.
text_avg_line = nonpublic_avg_line.mark_text().encode(
    text=alt.Text("mean(nonpublic_pretest_scores):Q", format=",.0%")
)

# Rotated text label per student showing that student's own score.
nonpublic_pretest_student_scores = nonpublic_school_pretest_chart.mark_text(
    angle=90, dx=-15
).encode(
    text=alt.Text("nonpublic_pretest_scores:Q", format=",.0%")
)

# Layer bars, mean rule, mean label and per-student labels into one chart.
nonpublic_school_pretest_chart + nonpublic_avg_line + text_avg_line + nonpublic_pretest_student_scores

In [621]:
# Posttest scores of all students (both genders) who attend non-public schools.

# Add nonpublic_posttest_scores (posttest / 100, so it can be formatted as a
# percentage), keep only the columns the chart needs, then keep non-public schools.
over_all_nonpublic_school = (
    test_data
    .assign(nonpublic_posttest_scores=lambda scores: scores.posttest / 100)
    .filter(["school", "student_id", "school_type", "nonpublic_posttest_scores"])
    .query("school_type == 'Non-public'")
)

# One bar per student, coloured by school (school_list comes from an earlier cell).
# A title is set so this chart can be told apart from the other overall charts.
nonpublic_school_posttest_chart = (
    alt.Chart(over_all_nonpublic_school, title="Posttest scores nonpublic schools")
    .mark_bar()
    .encode(
        x=alt.X("student_id", title="student_id", sort=school_list),
        y=alt.Y("nonpublic_posttest_scores", title="Test_scores %", axis=alt.Axis(format="%")),
        color="school",
    )
)

# Red horizontal rule at the mean posttest score of non-public-school students.
nonpublic_avg_line = (
    alt.Chart(over_all_nonpublic_school)
    .mark_rule(color="red")
    .encode(y=alt.Y("mean(nonpublic_posttest_scores)"))
)

# Text label on the rule showing the mean as a whole-number percentage.
text_avg_line = nonpublic_avg_line.mark_text().encode(
    text=alt.Text("mean(nonpublic_posttest_scores):Q", format=",.0%")
)

# Rotated text label per student showing that student's own score.
nonpublic_posttest_student_scores = nonpublic_school_posttest_chart.mark_text(
    angle=90, dx=-15
).encode(
    text=alt.Text("nonpublic_posttest_scores:Q", format=",.0%")
)

# Layer bars, mean rule, mean label and per-student labels into one chart.
nonpublic_school_posttest_chart + nonpublic_avg_line + text_avg_line + nonpublic_posttest_student_scores

The answers that I got from the data set are as follows. For question 1, "Do girls score higher than boys?", the answer is no: in this data set the boys and the girls scored at the same average.
For question 2, "Do kids in non-public schools score higher?", the answer is yes: the data shows that kids who go to private school score 11% higher on the pretest and 12% higher on the posttest.