In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
# Load the NCWIT raw-data CSV; the splits in the cells below all start from this frame.
df = pd.read_csv('NCWIT_DataV2_RawData_Edited.csv')

  interactivity=interactivity, compiler=compiler, result=result)


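The `splitData` helper class (defined under *Helper Functions and Classes* below) takes an attribute name (`attr`) and a DataFrame and splits the data by the values of `attr`; its `toCsv(loc)` method saves one CSV file per value to the directory `loc`. The helper cells must be run before the cells in this section.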

#### Viz 1

Does retention differ depending on when students declare their majors?

> Attributes: school year, When do students typically declare their major?, new enrollments(male & female), transfer students, left institution,  
> When do students typically declare their major?


In [8]:
# First split the raw data by when majors are declared, then split each of
# those groups by school year and register the result for retention analysis.
declerations = splitData("When do students typically declare their major?", df)
retention = Retention()
for declerationType in declerations.attrValues:
    rowsForType = declerations.attrDict[declerationType]
    rowsBySchoolYear = splitData("School Year", rowsForType).getAttrs()
    typeBlock = DeclerationTypeBlock(declerationType, rowsBySchoolYear)
    retention.addDeclerationType(declerationType, typeBlock)

In [9]:
# retention.getDeclerationType("Upon Enrollment").buildRetentionDataFrame()

In [10]:
# Write one retention-rate CSV per declaration type; each file is named
# "<declaration type>.csv" and placed under this path prefix.
retention.to_csv("./processed-data/retention/")

### Helper Functions and Classes

In [11]:
class splitData(object):
    '''
    Partitions a dataframe into one sub-dataframe per distinct value of a column.

    Params: attribute name and a dataframe 
    Variables:
        @df: The dataframe that was passed in
        @attr: Name of the column the data is split on
        @attrValues: Set of all distinct values found in df[attr]; repeated
            missing values (NaN) are collapsed into a single entry
        @attrDict: Dictionary with an entry containing each attrValue and its
            corresponding dataframe (the rows of df holding that value)
    '''
    def __init__(self, attr, df):
        self.df = df
        self.attr = attr
        self.attrValues = self.setAttrValues()
        self.attrDict = self.splitDataByAttrToDict()
    def setAttrValues(self):
        '''Return the set of distinct values in the attr column.'''
        # drop_duplicates() collapses repeated NaN into one entry. Adding the
        # raw values to a set one by one can keep several NaN entries, because
        # NaN never compares equal to itself.
        return set(self.df[self.attr].drop_duplicates())
    def splitDataByAttrToDict(self):
        '''Return a dict mapping every value in attrValues to its matching rows.'''
        column = self.df[self.attr]
        splitDict = {}
        for val in self.attrValues:
            if pd.api.types.is_scalar(val) and pd.isna(val):
                # `column == NaN` is False for every row, so rows with a missing
                # attribute have to be selected explicitly.
                mask = column.isna()
            else:
                mask = column == val
            splitDict[val] = self.df.loc[mask]
        return splitDict
    def getAttr(self, attrName):
        '''
        Return the dataframe stored for attrName.
        Prints a message and returns None when attrName is not a known value.
        '''
        if attrName not in self.attrDict:
            print("Attribute not found!")
            return
        return self.attrDict[attrName]
    def getAttrs(self):
        '''Return the whole value -> dataframe dictionary.'''
        return self.attrDict
    def toCsv(self, loc):
        '''
        Write one CSV per attribute value, without the index column.
        loc is used as a plain string prefix: each file is loc + str(value) + '.csv'.
        '''
        for val in self.attrValues:
            self.attrDict[val].to_csv(loc + str(val) + '.csv', index=False)

In [4]:
class Retention(object):
    '''
    Collection of retention blocks, keyed by declaration type.

    Variables:
        @declerationTypes: Dictionary of declaration type -> registered block
    '''
    def __init__(self):
        self.declerationTypes = {}
    def addDeclerationType(self, declerationType, df):
        '''
        Register df under declerationType, replacing any earlier entry.
        Despite its name, df is not used as a dataframe here: to_csv() calls
        buildRetentionDataFrame() on it, so it must provide that method.
        '''
        self.declerationTypes[declerationType] = df
    def getDeclerationType(self, declerationType):
        '''
        Return the block registered for declerationType.
        Prints a message and returns None when the type is unknown.
        '''
        if declerationType not in self.declerationTypes:
            print("Decleration type not found!")
            return
        return self.declerationTypes[declerationType]
    def to_csv(self, loc):
        '''
        Write one retention CSV per registered declaration type, without the
        index column. loc is used as a plain string prefix: each file is
        loc + str(declerationType) + '.csv'. The directory part of loc is
        created first when it does not exist yet.
        '''
        outDir = os.path.dirname(loc)
        if outDir:
            # DataFrame.to_csv does not create missing folders, so a fresh
            # checkout would otherwise fail on the first write.
            os.makedirs(outDir, exist_ok=True)
        for declerationType, block in self.declerationTypes.items():
            block.buildRetentionDataFrame().to_csv(loc + str(declerationType) + '.csv', index=False)

In [5]:
class DeclerationTypeBlock(object):
    '''
    All school years recorded for one declaration type.

    Params:
        @declerationType: The declaration-type value this block stands for
        @df: Despite the name, a mapping of school year -> dataframe with that
            year's rows; it is iterated and indexed by year
    Variables:
        @declerationType: The declaration type passed in
        @years: Dictionary of school year -> yearBlock
    '''
    def __init__(self, declerationType, df):
        self.declerationType = declerationType
        self.years = self.buildYearsDict(df)
    def buildYearsDict(self, df):
        '''Wrap every year's dataframe in a yearBlock, keyed by school year.'''
        return {year: yearBlock(self.declerationType, year, df[year]) for year in df}
    def getYear(self, year):
        '''
        Return the yearBlock stored for year.
        Prints a message and returns None when the year is unknown.
        '''
        if year not in self.years:
            print("The option for declared type not found!")
            return
        return self.years[year]
    def buildCalculationsDataFrame(self):
        '''Return a dataframe with one row of summary counts per school year.'''
        return pd.DataFrame([self.years[year].buildRetentionDict() for year in self.years])
    def buildRetentionDataFrame(self):
        '''
        Return a dataframe with the female and male retention rate of every
        school year relative to the year sorted directly before it.
        Columns: year, FemaleRetentionRate, MaleRetentionRate. The first
        (sorted) year has no predecessor and therefore gets no row.
        '''
        years = sorted(self.years)
        if len(years) < 2:
            # Nothing to compare; do not touch the year data at all.
            return pd.DataFrame([])
        # Summarise each year once; every summary is needed for up to two rows.
        summaries = [self.years[year].buildRetentionDict() for year in years]
        retDictList = []
        for i in range(1, len(years)):
            previous, current = summaries[i-1], summaries[i]
            # calculateRetention(y1, y2) divides by y1's total and removes new
            # and transfer students from y2, so the earlier year must be y1
            # and the later year y2.
            retDictList.append({"year": years[i],
                                "FemaleRetentionRate": calculateRetention(previous, current, "Female"),
                                "MaleRetentionRate": calculateRetention(previous, current, "Male")})
        return pd.DataFrame(retDictList)

In [6]:
class yearBlock(object):
    '''
    The rows of one school year within one declaration type, with helpers
    that add up the head-count columns of those rows.

    Variables:
        @declerationType: Declaration type the rows belong to
        @year: School year the rows belong to
        @df: Dataframe holding the rows
        @grades: Grade names used in the column labels
        @races: Race/ethnicity names used in the column labels
    '''
    def __init__(self, declerationType, year, df):
        self.declerationType = declerationType
        self.year = year
        self.df = df
        self.grades = ["Freshmen", "Sophomores", "Juniors", "Seniors", "5th Yr Seniors"]
        self.races = ["Asian", "Black/African American", "Hispanics of any race", "American Indian/Alaska Native", "Native Hawaiian/OtherPacific Islander", "Two or more races", "White"]
    def getDataFrame(self):
        '''Return the wrapped dataframe.'''
        return self.df
    def getCountByGradeAndGender(self, grade, gender):
        '''Sum the per-race columns of one grade and gender over all rows.'''
        return sum(
            self.df[grade + ", " + gender + ": " + race + " (" + self.getGradeAbb(grade) + " " + self.getGenderAbb(gender) + ")"].sum()
            for race in self.races
        )
    def getCountByGrade(self, grade):
        '''Sum the female and male counts of one grade.'''
        femaleCount = self.getCountByGradeAndGender(grade, "Female")
        maleCount = self.getCountByGradeAndGender(grade, "Male")
        return femaleCount + maleCount
    def getCountByGender(self, gender):
        '''Sum the per-race "Totals" columns of one gender over all rows.'''
        return sum(
            self.df["Totals, " + gender + ": " + race + " (Tot. " + self.getGenderAbb(gender) + ")"].sum()
            for race in self.races
        )
    def getNewEnrollmentCount(self, gender):
        '''Sum the new-enrollment column of one gender over all rows.'''
        column = "Enroll, " + gender + ": New Enrollments (Enrl " + self.getGenderAbb(gender) + ")"
        return self.df[column].sum()
    def getTransferCount(self, gender):
        '''Sum the transfer-student column of one gender over all rows.'''
        column = "Enroll, " + gender + ": Transfer Students (Enrl " + self.getGenderAbb(gender) + ")"
        return self.df[column].sum()
    def getGenderAbb(self, gender):
        '''Return "F" for "Female" and "M" for anything else.'''
        if gender == "Female":
            return "F"
        return "M"
    def getGradeAbb(self, grade):
        '''Return the grade abbreviation used in column labels; unknown grades give "5 Sen".'''
        if grade == "Freshmen":
            return "Fshm"
        if grade == "Sophomores":
            return "Sph."
        if grade == "Juniors":
            return "Jun."
        if grade == "Seniors":
            return "Sen."
        return "5 Sen"
    def buildRetentionDict(self):
        '''Return the year's identifying fields and per-gender counts as one dict.'''
        return {
            "School Year": self.year,
            "When do students typically declare their major?": self.declerationType,
            "Female, New Enrollments": self.getNewEnrollmentCount("Female"),
            "Male, New Enrollments": self.getNewEnrollmentCount("Male"),
            "Female, Transfer": self.getTransferCount("Female"),
            "Male, Transfer": self.getTransferCount("Male"),
            "Total, Female": self.getCountByGender("Female"),
            "Total, Male": self.getCountByGender("Male"),
        }

In [7]:
def calculateRetention(y1, y2, gender=None):
    '''
    Retention rate of one gender between two yearly summaries, capped at 1.

    The rate is (y2 total - y2 new enrollments - y2 transfers) / y1 total.

    Params:
        @y1: Summary dict of the year used as the denominator; must contain
            the key "Total, <gender>"
        @y2: Summary dict of the year whose total, minus its new enrollments
            and transfers, forms the numerator; must contain the keys
            "Total, <gender>", "<gender>, New Enrollments" and "<gender>, Transfer"
        @gender: "Female" or "Male"; required even though it has a default
    Returns: 0 when the y1 total is 0, 1 when the rate is 1 or more, otherwise
        the rate itself. The rate is not clamped from below, so it is negative
        when y2's new enrollments plus transfers exceed y2's total.
    Raises: TypeError when gender is None.
    '''
    if gender is None:
        # The keys are built from gender; fail with a readable message instead
        # of an opaque string-concatenation error.
        raise TypeError("calculateRetention() requires a gender ('Female' or 'Male')")
    totYear1 = y1["Total, " + gender]
    retained = y2["Total, " + gender] - y2[gender + ", New Enrollments"] - y2[gender + ", Transfer"]
    if totYear1 == 0:
        # No students in the denominator year: report 0 rather than divide by zero.
        return 0
    rate = retained / totYear1
    return 1 if rate >= 1 else rate