In [1]:
import pandas as pd
import numpy as np

In [6]:
# Load the edited raw-data CSV; the relative path is resolved against the kernel's working directory.
df = pd.read_csv('NCWIT_DataV2_RawData_Edited.csv')

  interactivity=interactivity, compiler=compiler, result=result)


The `splitData` class (defined under *Helper Functions and Classes* below) takes an attribute name (`attr`) and a dataframe and splits the data by the values of that attribute; its `toCsv(loc)` method then saves one CSV file per value to the directory `loc`.

#### Viz 1

Differences in retention depending on when students declare their majors?

> Attributes: school year, When do students typically declare their major?, new enrollments(male & female), transfer students, left institution,  
> When do students typically declare their major?


In [7]:
# Split the raw frame by the declaration-timing column, then split each of those
# groups by "School Year" and register the per-year frames on the Retention object
# as one DeclerationTypeBlock per declaration type.
declerations = splitData("When do students typically declare their major?", df)
retention = Retention()
for declerationType in declerations.attrValues:
    # getAttrs() returns the {school year: dataframe} dictionary for this declaration type.
    retention.addDeclerationType(declerationType, DeclerationTypeBlock(declerationType, splitData("School Year", declerations.attrDict[declerationType]).getAttrs()))

In [8]:
# Write one retention CSV per declaration type, named "<declaration type>.csv".
# The argument is used as a plain string prefix, so the trailing "/" is required.
# NOTE(review): presumably the directory must already exist — confirm before running.
retention.to_csv("./processed-data/retention/")

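### Helper Functions and Classes

Run the cells in this section before the Viz 1 cells above: they define the `splitData`, `Retention`, `DeclerationTypeBlock` and `yearBlock` classes used there.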

In [2]:
class splitData(object):
    '''
    Split a dataframe into one sub-dataframe per distinct value of a column.

    Params: attribute (column) name and a dataframe
    Variables:
        @df: The dataframe passed in
        @attr: The attribute (column) name passed in
        @attrValues: Set of the distinct, non-missing values found in df[attr]
        @attrDict: Dictionary with an entry for each attrValue and its corresponding
                   dataframe (the rows of df whose attr equals that value), keyed in
                   order of first appearance in df so downstream output is reproducible
    '''
    def __init__(self, attr, df):
        self.df = df
        self.attr = attr
        self.attrValues = self.setAttrValues()
        self.attrDict = self.splitDataByAttrToDict()
    def _orderedAttrValues(self):
        '''Return the distinct non-missing values of df[attr] in first-appearance order.'''
        # Missing values are dropped: NaN never compares equal to itself, so it
        # would otherwise produce one key per missing cell, each mapped to an
        # empty dataframe (``column == NaN`` matches no rows).
        # dict.fromkeys de-duplicates while keeping insertion order (Python 3.7+).
        return list(dict.fromkeys(self.df[self.attr].dropna().tolist()))
    def setAttrValues(self):
        '''Return the set of distinct non-missing values of df[attr].'''
        return set(self._orderedAttrValues())
    def splitDataByAttrToDict(self):
        '''Return {value: rows of df where df[attr] == value} for every distinct value.'''
        column = self.df[self.attr]
        return {val: self.df.loc[column == val] for val in self._orderedAttrValues()}
    def getAttr(self, attrName):
        '''Return the dataframe for attrName, or print a message and return None if absent.'''
        if attrName not in self.attrDict:
            print("Attribute not found!")
            return
        return self.attrDict[attrName]
    def getAttrs(self):
        '''Return the full {value: dataframe} dictionary.'''
        return self.attrDict
    def toCsv(self, loc):
        '''
        Write each sub-dataframe to "<loc><value>.csv".

        loc is used as a plain string prefix, so it must end with a path
        separator when it names a directory.
        '''
        for val, frame in self.attrDict.items():
            frame.to_csv(loc + str(val) + '.csv')

In [3]:
class Retention(object):
    '''
    Collection of per-declaration-type blocks that can be exported to CSV.

    Variables:
        @declerationTypes: Dictionary mapping a declaration type to the object
                           registered for it via addDeclerationType
    '''
    def __init__(self):
        self.declerationTypes = dict()
    def addDeclerationType(self, declerationType, df):
        '''
        Store df under declerationType, replacing any previous entry.

        to_csv calls buildRetentionDataFrame() on every stored value, so the
        stored object must provide that method for the export to work.
        '''
        self.declerationTypes.update({declerationType: df})
    def getdeclerationType(self, declerationType):
        '''Return the entry stored for declerationType, or print a message and return None.'''
        if declerationType in self.declerationTypes:
            return self.declerationTypes[declerationType]
        print("Decleration type not found!")
        return None
    def to_csv(self, loc):
        '''
        Write each entry's retention dataframe to "<loc><declaration type>.csv".

        loc is used as a plain string prefix for the output path.
        '''
        for typeName, block in self.declerationTypes.items():
            retentionFrame = block.buildRetentionDataFrame()
            retentionFrame.to_csv(loc + str(typeName) + '.csv')

In [4]:
class DeclerationTypeBlock(object):
    '''
    Holds one yearBlock per school year for a single declaration type.

    Params: the declaration type and a mapping from school year to the data for
    that year (despite its name, df is iterated by key and indexed with df[year]).
    Variables:
        @declerationType: The declaration type passed in
        @years: Dictionary mapping each year key to its yearBlock
    '''
    def __init__(self, declerationType, df):
        self.declerationType = declerationType
        self.years = self.buildYearsDict(df)
    def buildYearsDict(self, df):
        '''Wrap every entry of df in a yearBlock, keyed by the same year.'''
        return {
            year: yearBlock(self.declerationType, year, df[year])
            for year in df
        }
    def getYear(self, year):
        '''Return the yearBlock stored for year, or print a message and return None.'''
        if year in self.years:
            return self.years[year]
        print("The option for declared type not found!")
        return None
    def buildRetentionDataFrame(self):
        '''Return a dataframe with one row per year, built from each year's retention dict.'''
        rows = [block.buildRetentionDict() for block in self.years.values()]
        return pd.DataFrame(rows)

In [5]:
class yearBlock(object):
    '''
    Enrollment, attrition and graduation counts for one school year of one
    declaration type.

    Params: declaration type, school year, and the dataframe holding the rows
    for that combination. Counts are column sums over that dataframe, which
    must contain columns named
        "<grade>, <gender>: <category> (<grade abbreviation> <gender abbreviation>)"
    where <category> is each entry of self.races,
    "Left Institution (not graduated)" or "Graduated".
    Variables:
        @declerationType: The declaration type passed in
        @year: The school year passed in
        @df: The dataframe passed in
        @grades: Grade labels that are summed over for per-gender totals
        @races: Race/ethnicity labels that are summed over for enrollment counts
    '''
    # Abbreviation used in the column-name suffix for each grade; any grade not
    # listed here falls back to "5 Sen".
    _gradeAbvs = {"Freshmen": "Fshm", "Sophomores": "Sph.", "Juniors": "Jun.", "Seniors": "Sen."}

    def __init__(self, declerationType, year, df):
        self.declerationType = declerationType
        self.year = year
        self.df = df
        self.grades = ["Freshmen", "Sophomores", "Juniors", "Seniors", "5th Yr Seniors"]
        self.races = ["Asian", "Black/African American", "Hispanics of any race", "American Indian/Alaska Native", "Native Hawaiian/OtherPacific Islander", "Two or more races", "White"]
    def _columnName(self, grade, gender, category):
        '''Build the dataframe column name for a grade/gender/category combination.'''
        gradeAbv = self._gradeAbvs.get(grade, "5 Sen")
        # Any gender other than "Female" is abbreviated as "M".
        genderAbv = "F" if gender == "Female" else "M"
        return grade + ", " + gender + ": " + category + " (" + gradeAbv + " " + genderAbv + ")"
    def getDataFrame(self):
        '''Return the underlying dataframe.'''
        return self.df
    def getCountByGradeAndGender(self, grade, gender):
        '''Return the enrollment count for grade and gender, summed over all races.'''
        return sum(self.df[self._columnName(grade, gender, race)].sum() for race in self.races)
    def getCountByGender(self, gender):
        '''Return the enrollment count for gender, summed over all grades.'''
        return sum(self.getCountByGradeAndGender(grade, gender) for grade in self.grades)
    def getCountByGrade(self, grade):
        '''Return the enrollment count for grade, female plus male.'''
        return self.getCountByGradeAndGender(grade, "Female") + self.getCountByGradeAndGender(grade, "Male")
    def getLeftCountByGradeAndGender(self, grade, gender):
        '''Return how many students of grade and gender left the institution without graduating.'''
        return self.df[self._columnName(grade, gender, "Left Institution (not graduated)")].sum()
    def getLeftCountByGender(self, gender):
        '''Return the left-institution count for gender, summed over all grades.'''
        return sum(self.getLeftCountByGradeAndGender(grade, gender) for grade in self.grades)
    def getLeftCountByGrade(self, grade):
        '''Return the left-institution count for grade, female plus male.'''
        return self.getLeftCountByGradeAndGender(grade, "Female") + self.getLeftCountByGradeAndGender(grade, "Male")
    def getGradCountByGradeAndGender(self, grade, gender):
        '''Return how many students of grade and gender graduated.'''
        return self.df[self._columnName(grade, gender, "Graduated")].sum()
    def getGradCountByGender(self, gender):
        '''Return the graduated count for gender, summed over all grades.'''
        return sum(self.getGradCountByGradeAndGender(grade, gender) for grade in self.grades)
    def getGradCountByGrade(self, grade):
        '''Return the graduated count for grade, female plus male.'''
        return self.getGradCountByGradeAndGender(grade, "Female") + self.getGradCountByGradeAndGender(grade, "Male")
    def buildRetentionDict(self):
        '''
        Return one flat dictionary of all counts for this year.

        Keys are inserted in a fixed order (identifying fields, then per-grade
        counts, then per-gender totals); that order becomes the column order of
        any dataframe built from the result.
        '''
        ret = {"School Year": self.year, "When do students typically declare their major?": self.declerationType}
        for grade in self.grades:
            ret[grade+", Female"] = self.getCountByGradeAndGender(grade, "Female")
            ret[grade+", Male"] = self.getCountByGradeAndGender(grade, "Male")
            ret[grade+", Female: Left Institution"] = self.getLeftCountByGradeAndGender(grade, "Female")
            ret[grade+", Male: Left Institution"] = self.getLeftCountByGradeAndGender(grade, "Male")
            ret[grade+", Female: Graduated"] = self.getGradCountByGradeAndGender(grade, "Female")
            ret[grade+", Male: Graduated"] = self.getGradCountByGradeAndGender(grade, "Male")
            ret["Total, "+grade + ": Left Institution"] = self.getLeftCountByGrade(grade)
            ret["Total, "+grade + ": Graduated"] = self.getGradCountByGrade(grade)
            ret["Total, "+grade] = self.getCountByGrade(grade)
        ret["Total, Female"] = self.getCountByGender("Female")
        ret["Total, Male"] = self.getCountByGender("Male")
        ret["Total, Female: Left Institution"] = self.getLeftCountByGender("Female")
        ret["Total, Male: Left Institution"] = self.getLeftCountByGender("Male")
        # Graduated totals must come from the graduated counters; this key was
        # previously filled from the left-institution counter.
        ret["Total, Female: Graduated"] = self.getGradCountByGender("Female")
        ret["Total, Male: Graduated"] = self.getGradCountByGender("Male")

        return ret