#### Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read cleaned_university_data.csv into the working DataFrame `unis`.
unis = pd.read_csv(filepath_or_buffer='cleaned_university_data.csv')

In [3]:
# Dimensions of the loaded frame as (rows, columns).
unis.shape

(1302, 25)

In [4]:
# Column labels available in the loaded frame.
unis.columns

Index(['college', 'state_code', 'state_name', 'region', 'division',
       'public_or_private', 'number_applications_received',
       'number_applications_accepted', 'number_applications_not_accepted',
       'percentage_applications_accepted', 'number_new_students_enrolled',
       'percentage_new_students_from_top_10',
       'percentage_new_students_from_top_25', 'number_ft_undergrad',
       'number_pt_undergrad', 'in_state_tuition', 'out_of_state_tuition',
       'room', 'board', 'additional_fees', 'estimated_book_costs',
       'estimated_personal_spending', 'percentage_faculty_with_phd',
       'student_faculty_ratio', 'graduation_rate'],
      dtype='object')

*---------------------------------------------------------------------------------------------------------------------------------------------*

In [5]:
# Count rows per state and show the five states with the most rows.
(
    unis
    .groupby('state_name')['state_name']
    .count()
    .sort_values(ascending=False)
    .head()
)

state_name
New York         101
Pennsylvania      83
California        70
Texas             60
Massachusetts     56
Name: state_name, dtype: int64

In [6]:
# Per-state graduation-rate summary: mean, median and number of non-null
# rates, ranked by mean (then median), five highest shown.
(
    unis
    .groupby('state_name')['graduation_rate']
    .agg(['mean', 'median', 'count'])
    .round(2)
    .sort_values(by=['mean', 'median'], ascending=False)
    .reset_index()
    .head()
)

Unnamed: 0,state_name,mean,median,count
0,Rhode Island,76.62,76.5,8
1,District of Columbia,75.5,73.5,6
2,Connecticut,73.88,76.0,17
3,Pennsylvania,73.64,73.0,81
4,Massachusetts,72.38,73.0,52


In [7]:
# Average Tuition by State and Public/Private Status
#
# Normalise any numeric institution codes (1 / 2) to the lowercase labels the
# rest of the notebook relies on ('public' / 'private'). Values that are
# already text labels do not match the mapping and are left untouched.
unis['public_or_private'] = unis['public_or_private'].replace({1: 'public', 2: 'private'})

# Mean in-state and out-of-state tuition for every state / institution-type
# pair, rounded to 2 decimals (first five rows shown).
(
    unis
    .groupby(['state_name', 'public_or_private'])
    [['in_state_tuition', 'out_of_state_tuition']]
    .mean()
    .round(2)
    .reset_index()
    .head()
)

Unnamed: 0,state_name,public_or_private,in_state_tuition,out_of_state_tuition
0,Alabama,private,7040.33,7040.33
1,Alabama,public,1952.77,3922.31
2,Alaska,private,7560.0,7560.0
3,Alaska,public,1742.0,5226.0
4,Arizona,private,8335.0,8335.0


In [8]:
# Mean in-state / out-of-state tuition per state and institution type, with
# the gap between the two (computed from the rounded means) as an extra column.
tuition_diff = (
    unis
    .groupby(['state_name', 'public_or_private'])
    [['in_state_tuition', 'out_of_state_tuition']]
    .mean()
    .round(2)
    .reset_index()
    .assign(
        tuition_difference=lambda means: (
            means['out_of_state_tuition'] - means['in_state_tuition']
        )
    )
)

# Ten state / institution-type pairs with the largest out-of-state premium.
tuition_diff.sort_values(by='tuition_difference', ascending=False).head(10)

Unnamed: 0,state_name,public_or_private,in_state_tuition,out_of_state_tuition,tuition_difference
67,North Carolina,public,793.21,6959.36,6166.15
59,New Hampshire,public,2910.0,8973.33,6063.33
9,California,public,2588.5,8646.63,6058.13
79,Rhode Island,public,2687.0,8512.0,5825.0
93,Virginia,public,3061.8,8644.8,5583.0
91,Vermont,public,4087.5,9537.0,5449.5
5,Arizona,public,1828.0,7204.67,5376.67
95,Washington,public,2385.67,7672.67,5287.0
11,Colorado,public,1792.09,7066.73,5274.64
31,Iowa,public,2291.0,7298.67,5007.67


*---------------------------------------------------------------------------------------------------------------------------------------------*

In [9]:
# Average Graduation Rate by State (five highest)
(
    unis
    .groupby('state_name')['graduation_rate']
    .mean()
    .round(2)
    .sort_values(ascending=False)
    .reset_index()
    .head()
)

Unnamed: 0,state_name,graduation_rate
0,Rhode Island,76.62
1,District of Columbia,75.5
2,Connecticut,73.88
3,Pennsylvania,73.64
4,Massachusetts,72.38


In [28]:
# Average graduation rate by state, sorted high to low (five lowest shown).
(
    unis
    .groupby('state_name')['graduation_rate']
    .mean()
    .round(2)
    .sort_values(ascending=False)
    .reset_index()
    .tail()
)

Unnamed: 0,state_name,graduation_rate
46,Hawaii,42.5
47,New Mexico,42.0
48,Oklahoma,40.5
49,Nevada,40.0
50,Alaska,27.0


In [10]:
# Graduation Rate by Institution Type (Public vs. Private)
(
    unis
    .groupby('public_or_private')['graduation_rate']
    .mean()
    .round(2)
    .reset_index()
)

Unnamed: 0,public_or_private,graduation_rate
0,private,66.17
1,public,50.18


In [64]:
# Graduation Rate by State and Institution Type (Public vs. Private)
# unstack() pivots the innermost group key (public_or_private) into columns,
# leaving (state_name, region) as the row index.
(
    unis
    .groupby(['state_name', 'region', 'public_or_private'])['graduation_rate']
    .mean()
    .round(2)
    .unstack()
    .head()
)

Unnamed: 0_level_0,public_or_private,private,public
state_name,region,Unnamed: 2_level_1,Unnamed: 3_level_1
Alabama,South,51.82,44.15
Alaska,West,15.0,39.0
Arizona,West,81.0,46.0
Arkansas,South,66.0,40.29
California,West,63.15,55.81


In [65]:
# Mean graduation rate per (state_name, region), with one column per
# institution type ('private', 'public') produced by unstack().
public_vs_private = (
    unis
    .groupby(['state_name', 'region', 'public_or_private'])['graduation_rate']
    .mean()
    .round(2)
    .unstack()
)

# True where the private mean is strictly greater than the public mean.
# NOTE: any comparison against NaN evaluates to False, so rows missing either
# the 'private' or the 'public' value are flagged False as well.
public_vs_private['private_gr_gt_public_gr'] = (
    public_vs_private['private'] > public_vs_private['public']
)

# Rows where the flag is False (this includes rows with a missing side).
public_vs_private[~public_vs_private['private_gr_gt_public_gr']]

Unnamed: 0_level_0,public_or_private,private,public,private_gr_gt_public_gr
state_name,region,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Alaska,West,15.0,39.0,False
District of Columbia,South,75.5,,False
Hawaii,West,,42.5,False
Iowa,Midwest,63.2,68.0,False
Michigan,Midwest,55.85,58.27,False
Nevada,West,29.0,45.5,False
South Dakota,Midwest,51.4,51.75,False
Utah,West,41.5,44.67,False
Wyoming,West,,45.0,False


In [67]:
# Background notes on the rows that have a missing value:
# Hawaii apparently has private universities.
# Wyoming apparently has only one private university:
# a for-profit tribal college in Fort Washakie.
#
# The University of the District of Columbia is the only public institution in
# Washington, DC. It is a land-grant university and a member of the
# Thurgood Marshall Fund. UDC is an HBCU with an open admissions policy.

# NaN count per column, followed by the rows with a NaN in any column.
print(public_vs_private.isna().sum())
public_vs_private.loc[public_vs_private.isna().any(axis='columns')]

public_or_private
private                    2
public                     1
private_gr_gt_public_gr    0
dtype: int64


Unnamed: 0_level_0,public_or_private,private,public,private_gr_gt_public_gr
state_name,region,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
District of Columbia,South,75.5,,False
Hawaii,West,,42.5,False
Wyoming,West,,45.0,False


In [68]:
# Keep rows where the flag is False AND no column is NaN, i.e. drop the three
# rows with a NaN in either `private` or `public`.
public_vs_private.loc[
    ~public_vs_private['private_gr_gt_public_gr']
    & public_vs_private.notna().all(axis='columns')
]

# 3 West and 3 Midwest according to the region breakdown selected: an even split.
# Question: Was the chosen option the best region breakdown option? Remains to be seen.

Unnamed: 0_level_0,public_or_private,private,public,private_gr_gt_public_gr
state_name,region,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Alaska,West,15.0,39.0,False
Iowa,Midwest,63.2,68.0,False
Michigan,Midwest,55.85,58.27,False
Nevada,West,29.0,45.5,False
South Dakota,Midwest,51.4,51.75,False
Utah,West,41.5,44.67,False


In [14]:
# Mean graduation rate per region, highest first.
(
    unis
    .groupby('region')['graduation_rate']
    .mean()
    .round(2)
    .sort_values(ascending=False)
    .reset_index()
)

Unnamed: 0,region,graduation_rate
0,Northeast,69.71
1,Midwest,60.95
2,West,55.59
3,South,54.52


In [27]:
# Same regional view, now split by public_or_private as well; highest mean first.
(
    unis
    .groupby(['region', 'public_or_private'])['graduation_rate']
    .mean()
    .round(2)
    .sort_values(ascending=False)
    .reset_index()
)
# Variant: swap .mean() for .agg(['mean', 'median']) and drop the sort to see
# both statistics per group.

Unnamed: 0,region,public_or_private,graduation_rate
0,Northeast,private,74.76
1,Midwest,private,65.1
2,West,private,61.8
3,South,private,60.17
4,Northeast,public,55.7
5,Midwest,public,52.14
6,West,public,49.47
7,South,public,46.56


In [17]:
# Mean graduation rate for every region / institution-type pair, highest first.
(
    unis
    .groupby(['region', 'public_or_private'])['graduation_rate']
    .mean()
    .round(2)
    .sort_values(ascending=False)
    .reset_index()
)

Unnamed: 0,region,public_or_private,graduation_rate
0,Northeast,private,74.76
1,Midwest,private,65.1
2,West,private,61.8
3,South,private,60.17
4,Northeast,public,55.7
5,Midwest,public,52.14
6,West,public,49.47
7,South,public,46.56


In [22]:
# Mean and median of graduation rate and acceptance percentage for every
# region / institution-type pair.
(
    unis
    .groupby(['region', 'public_or_private'])
    [['graduation_rate', 'percentage_applications_accepted']]
    .agg(['mean', 'median'])
    .round(2)
    .reset_index()
)
# Variant: use .mean() in place of .agg(['mean', 'median']) for means only.

Unnamed: 0_level_0,region,public_or_private,graduation_rate,graduation_rate,percentage_applications_accepted,percentage_applications_accepted
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean,median,mean,median
0,Midwest,private,65.1,67.0,80.71,82.9
1,Midwest,public,52.14,52.0,82.97,83.34
2,Northeast,private,74.76,75.0,71.11,76.25
3,Northeast,public,55.7,56.0,64.33,65.72
4,South,private,60.17,60.0,76.02,79.34
5,South,public,46.56,45.0,76.13,75.57
6,West,private,61.8,64.5,72.92,76.84
7,West,public,49.47,48.0,75.01,77.69


*---------------------------------------------------------------------------------------------------------------------------------------------*