In [80]:
import pandas as pd
import os
import fnmatch
from library import start
from library import clean_for_merge

In [81]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [82]:
teacher_datapath = os.path.join(start.data_path, 'tea', 'teachers', 'certification1617/' )
teacher_datapath

'/Users/kylieleblancKylie/domino/dofis/data/tea/teachers/certification1617/'

In [83]:
teachers_varlist = ['DISTRICT NUMBER', 'DISTRICT NAME', 'SCRAMBLED UNIQUE ID',
                    'SUBJECT AREA NAME 1','SUBJECT AREA NAME 2', 'SUBJECT AREA NAME 3' ]
cert_varlist = ['PERSONID_SCRAM', 'DISTRICT', 'CREDENTIAL_TYPE', 'CERTIFICATION_LEVEL',
               'CREDENTIALED_GRADES', 'SUBJECT_AREA', 'ROLE_CREDENTIALED_FOR',
               'POPULATION_CREDENTIALED_FOR']

# Certfication

## Append all certification datasets from 2016-17

In [84]:
pattern = "CERTIFICATION*.csv"
cert_files = []
for entry in os.listdir(teacher_datapath)  :  
    if fnmatch.fnmatch(entry, pattern):
            cert_files.append(entry)
cert_files.sort()
dirs_cert = [teacher_datapath + file for file in cert_files]
df_list = [pd.read_csv(file, sep=",", encoding = "ISO-8859-1", dtype = object) for file in dirs_cert]
certification = pd.concat(df_list)

In [85]:
certification = certification.sort_values(by = 'PERSONID_SCRAM')
print('number of certifications: ', len(certification))
certification.head()

number of certifications:  2302505


Unnamed: 0,PERSONID_SCRAM,DISTRICT,REGION,CERTIFICATE_ID_NUMBER,CERTIFICATE_LIFE,CREDENTIAL_TYPE,CERTIFICATE_PREPARATION_ROUTE,CERTIFICATE_EFFECTIVE_DATE,CERTIFICATE_EXPIRATION_DATE,SUBJECT_AREA,SUBJECT,FULLER_CERTIFICATE_DESCRIPTION,NUMBER_OF_RENEWAL_CYCLES,CREDENTIALED_GRADES,CERTIFICATION_LEVEL,ROLE_CREDENTIALED_FOR,POPULATION_CREDENTIALED_FOR,PREPARATION_PROGRAM,EMAIL_ADDRESS
124646,*30*0**45,101902,4,2025511,LIFE,Paraprofessional,Paraprofessional Program,10FEB1997:00:00:00,,Other,Not Applicable,Educational Aide II,,,Not Applicable,Educational Aide,Not Applicable,ALDINE ISD,gcollins2@aldine.k12.tx.us
243278,*30*0*045,57912,10,3323797,5YR,Standard Professional,Standard Program,11AUG2007:00:00:00,31MAR2009:00:00:00,Professional,Principal,Principal,1.0,Grades EC-12,Professional Service,Principal,Not Applicable,UNIVERSITY OF TEXAS - ARLINGTON,
243283,*30*0*045,57912,10,4862369,5YR,Standard,Certification by Exam,01APR2015:00:00:00,31MAR2021:00:00:00,Special Education,Generic Special Education,Generic Special Education,3.0,,Endorsement,Teacher,Special Education,STATE BOARD FOR EDUCATOR CERTIFICATION,
243280,*30*0*045,57912,10,2770534,5YR,Standard,Certification by Exam,08DEC2003:00:00:00,31MAR2009:00:00:00,Special Education,Generic Special Education,Generic Special Education,1.0,,Endorsement,Teacher,Special Education,STATE BOARD FOR EDUCATOR CERTIFICATION,
243281,*30*0*045,57912,10,3567548,5YR,Standard,Certification by Exam,01APR2009:00:00:00,31MAR2015:00:00:00,Special Education,Generic Special Education,Generic Special Education,2.0,,Endorsement,Teacher,Special Education,STATE BOARD FOR EDUCATOR CERTIFICATION,


# Keep only latest certification of duplicates

In [87]:
certification['expiration'] = pd.to_datetime(certification.CERTIFICATE_EXPIRATION_DATE.str.slice(0, 9))

In [88]:
cert_vars = ['PERSONID_SCRAM', 'DISTRICT', 
             'CERTIFICATION_LEVEL', 'SUBJECT', 'CREDENTIALED_GRADES','POPULATION_CREDENTIALED_FOR',
             'CREDENTIAL_TYPE', 'CERTIFICATION_LEVEL', 'CERTIFICATE_PREPARATION_ROUTE',
             'CERTIFICATE_LIFE', 'expiration']
certification = certification.sort_values(by = cert_vars, ascending = True)
cert_vars.remove('expiration')
certification = certification.drop_duplicates(subset = cert_vars, keep = 'last')

Unnamed: 0,PERSONID_SCRAM,DISTRICT,REGION,CERTIFICATE_ID_NUMBER,CERTIFICATE_LIFE,CREDENTIAL_TYPE,CERTIFICATE_PREPARATION_ROUTE,CERTIFICATE_EFFECTIVE_DATE,CERTIFICATE_EXPIRATION_DATE,SUBJECT_AREA,SUBJECT,FULLER_CERTIFICATE_DESCRIPTION,NUMBER_OF_RENEWAL_CYCLES,CREDENTIALED_GRADES,CERTIFICATION_LEVEL,ROLE_CREDENTIALED_FOR,POPULATION_CREDENTIALED_FOR,PREPARATION_PROGRAM,EMAIL_ADDRESS,expiration
124646,*30*0**45,101902,4,2025511,LIFE,Paraprofessional,Paraprofessional Program,10FEB1997:00:00:00,,Other,Not Applicable,Educational Aide II,,,Not Applicable,Educational Aide,Not Applicable,ALDINE ISD,gcollins2@aldine.k12.tx.us,NaT
243275,*30*0*045,57912,10,1888828,SYR,Emergency Non-Certified,Permit Program,10AUG1998:00:00:00,31AUG1999:00:00:00,Bilingual Education,Bilingual Spanish,Elementary Bilingual-Spanish,,Grades 1-6,Elementary,Teacher,Bilingual Students,TYLER ISD,,1999-08-31
243272,*30*0*045,57912,10,1748220,LIFE,Provisional,Standard Program,20FEB1999:00:00:00,,English Language Arts,English,Elementary English,,Grades 1-8,Elementary,Teacher,Regular Students,UNIVERSITY OF TEXAS - TYLER,,NaT
243273,*30*0*045,57912,10,1748221,LIFE,Provisional,Standard Program,20FEB1999:00:00:00,,General Elementary (Self-Contained),Self-Contained,Elementary Self-Contained,,Grades 1-8,Elementary,Teacher,Regular Students,UNIVERSITY OF TEXAS - TYLER,,NaT
243274,*30*0*045,57912,10,1748222,LIFE,Provisional,Standard Program,20FEB1999:00:00:00,,Foreign Language,Spanish,Elementary Spanish,,Grades 1-8,Elementary,Teacher,Regular Students,UNIVERSITY OF TEXAS - TYLER,,NaT
243279,*30*0*045,57912,10,1779383,LIFE,Provisional,Certification by Exam,14JUN1999:00:00:00,,Bilingual Education,Bilingual Spanish,Bilingual/ESL-Spanish,,,Endorsement,Teacher,Bilingual Students,STATE BOARD FOR EDUCATOR CERTIFICATION,,NaT
243280,*30*0*045,57912,10,2770534,5YR,Standard,Certification by Exam,08DEC2003:00:00:00,31MAR2009:00:00:00,Special Education,Generic Special Education,Generic Special Education,1.0,,Endorsement,Teacher,Special Education,STATE BOARD FOR EDUCATOR CERTIFICATION,,2009-03-31
243281,*30*0*045,57912,10,3567548,5YR,Standard,Certification by Exam,01APR2009:00:00:00,31MAR2015:00:00:00,Special Education,Generic Special Education,Generic Special Education,2.0,,Endorsement,Teacher,Special Education,STATE BOARD FOR EDUCATOR CERTIFICATION,,2015-03-31
243283,*30*0*045,57912,10,4862369,5YR,Standard,Certification by Exam,01APR2015:00:00:00,31MAR2021:00:00:00,Special Education,Generic Special Education,Generic Special Education,3.0,,Endorsement,Teacher,Special Education,STATE BOARD FOR EDUCATOR CERTIFICATION,,2021-03-31
243276,*30*0*045,57912,10,2042153,LIFE,Paraprofessional,Paraprofessional Program,15AUG1994:00:00:00,,Other,Not Applicable,Educational Secretary I,,,Not Applicable,Educational Secretary,Not Applicable,TYLER ISD,,NaT


## Reshape
Need to reshape for merge so that each teacher is a single row. 

In [94]:
# create count within each scrabled id
certification['idx'] = certification.groupby('PERSONID_SCRAM').cumcount()
print('some teachers have as many as ', certification.idx.max(), 'current certifications')
print(certification[certification.idx > 10].PERSONID_SCRAM.nunique())

some teachers have as many as  4 current certifications
0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [113]:
certification_wide = certification.pivot(index='PERSONID_SCRAM',columns='idx')[['ROLE_CREDENTIALED_FOR', 'CREDENTIAL_TYPE','CREDENTIALED_GRADES', 'SUBJECT']]

In [114]:
cols = certification_wide.columns
ind = pd.Index([e[0] + str(e[1]) for e in cols.tolist()])
certification_wide.columns = ind
certification_wide

Unnamed: 0_level_0,CERTIFICATION_LEVEL0,CERTIFICATION_LEVEL1,CERTIFICATION_LEVEL2,CERTIFICATION_LEVEL3,CERTIFICATION_LEVEL4,CREDENTIAL_TYPE0,CREDENTIAL_TYPE1,CREDENTIAL_TYPE2,CREDENTIAL_TYPE3,CREDENTIAL_TYPE4,CREDENTIALED_GRADES0,CREDENTIALED_GRADES1,CREDENTIALED_GRADES2,CREDENTIALED_GRADES3,CREDENTIALED_GRADES4,SUBJECT0,SUBJECT1,SUBJECT2,SUBJECT3,SUBJECT4
PERSONID_SCRAM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
*30*0*045,Endorsement,Not Applicable,Professional Service,,,Standard,Paraprofessional,Standard,,,,,Grades EC-12,,,Generic Special Education,Not Applicable,Principal,,
*30*0LP36,Elementary,Supplemental,,,,Standard,Standard,,,,Grades 4-8,,,,,Generalist,English as a Second Language,,,
*30*0V*46,Not Applicable,,,,,Paraprofessional,,,,,,,,,,Not Applicable,,,,
*30*1*440,Not Applicable,,,,,Standard Paraprofessional,,,,,,,,,,Not Applicable,,,,
*30*10*41,Not Applicable,,,,,Paraprofessional,,,,,,,,,,Not Applicable,,,,
*30*11241,All Level,Elementary,Not Applicable,,,Standard,Standard,Standard Paraprofessional,,,Grades PK-12,Grades 4-8,,,,Physical Education,Generalist,Not Applicable,,
*30*11245,All Level,Elementary,Elementary,Secondary,Special Education,Standard,Standard,Standard,Standard,Standard,Grades EC-12,Grades 4-8,Grades EC-4,Grades 8-12,Grades EC-12,Health Education,Generalist,Generalist,English Language Arts and Reading,Generic Special Education
*30*11346,Not Applicable,,,,,Paraprofessional,,,,,,,,,,Not Applicable,,,,
*30*1F241,Not Applicable,,,,,Paraprofessional,,,,,,,,,,Not Applicable,,,,
*30*1FV49,Elementary,Elementary,,,,Standard,Standard,,,,Grades 1-8,Grades 1-8,,,,Reading,Self-Contained,,,


# Teachers

In [11]:
pattern = "TEACHER_MASTER*.TXT"
teacher_files = []
for entry in os.listdir(teacher_datapath)  :  
    if fnmatch.fnmatch(entry, pattern):
            teacher_files.append(entry)
teacher_files.sort()
dirs_teachers = [teacher_datapath + file for file in teacher_files]

In [12]:
df_list = [pd.read_csv(file, sep=",", encoding = "ISO-8859-1", dtype = object) for file in dirs_teachers]
teachers = pd.concat(df_list)
teachers = teachers[teachers_varlist]
print('length: ', len(teachers))
teachers.head()

length:  363174


Unnamed: 0,DISTRICT NUMBER,DISTRICT NAME,SCRAMBLED UNIQUE ID,SUBJECT AREA NAME 1,SUBJECT AREA NAME 2,SUBJECT AREA NAME 3
0,31901,BROWNSVILLE ISD,*30*5P341,SELF-CONTAINED,ENGLISH LANGUAGE ARTS,MATHEMATICS
1,31901,BROWNSVILLE ISD,*3034L040,CAREER & TECHNOLOGY EDUCATION,,
2,31901,BROWNSVILLE ISD,*3040V545,ENGLISH LANGUAGE ARTS,MATHEMATICS,SCIENCE
3,31901,BROWNSVILLE ISD,*3106V240,OTHER,,
4,31901,BROWNSVILLE ISD,*31271Q42,MATHEMATICS,SOCIAL STUDIES,PHYSICAL ED. & HEALTH


In [18]:
teachers = teachers.set_index('SCRAMBLED UNIQUE ID')

# Merge

In [21]:
data = teachers.merge(certification, how = 'left',
                       left_index = True, right_index = True, indicator = True)
len(data)

1737286

In [22]:
data.head(50)

Unnamed: 0,DISTRICT NUMBER,DISTRICT NAME,SUBJECT AREA NAME 1,SUBJECT AREA NAME 2,SUBJECT AREA NAME 3,DISTRICT,CREDENTIAL_TYPE,CERTIFICATION_LEVEL,CREDENTIALED_GRADES,SUBJECT_AREA,ROLE_CREDENTIALED_FOR,POPULATION_CREDENTIALED_FOR,_merge
*30*00034,61902,LEWISVILLE ISD,MATHEMATICS,PHYSICAL ED. & HEALTH,,61902,Standard,Elementary,Grades 4-8,Mathematics,Teacher,Regular Students,both
*30*00034,61902,LEWISVILLE ISD,MATHEMATICS,PHYSICAL ED. & HEALTH,,61902,Standard,Supplemental,,Bilingual Education,Teacher,ESL Students,both
*30*00034,61902,LEWISVILLE ISD,MATHEMATICS,PHYSICAL ED. & HEALTH,,61902,Standard,Elementary,Grades EC-6,General Elementary (Self-Contained),Teacher,Regular Students,both
*30*00034,61902,LEWISVILLE ISD,MATHEMATICS,PHYSICAL ED. & HEALTH,,61902,Standard,Secondary,Grades 7-12,Mathematics,Teacher,Regular Students,both
*30*0F443,101903,ALIEF ISD,ENGLISH LANGUAGE ARTS,SOCIAL STUDIES,FOREIGN LANGUAGE,101903,Probationary,Elementary,Grades PK-6,Bilingual Education,Teacher,Bilingual Students,both
*30*0F443,101903,ALIEF ISD,ENGLISH LANGUAGE ARTS,SOCIAL STUDIES,FOREIGN LANGUAGE,101903,Provisional,Elementary,Grades PK-6,Bilingual Education,Teacher,Bilingual Students,both
*30*0LP36,15910,NORTH EAST ISD,ENGLISH LANGUAGE ARTS,,,15910,Probationary,Elementary,Grades 4-8,General Elementary (Self-Contained),Teacher,Regular Students,both
*30*0LP36,15910,NORTH EAST ISD,ENGLISH LANGUAGE ARTS,,,15910,Probationary Extension,Elementary,Grades 4-8,General Elementary (Self-Contained),Teacher,Regular Students,both
*30*0LP36,15910,NORTH EAST ISD,ENGLISH LANGUAGE ARTS,,,15910,Standard,Elementary,Grades 4-8,General Elementary (Self-Contained),Teacher,Regular Students,both
*30*0LP36,15910,NORTH EAST ISD,ENGLISH LANGUAGE ARTS,,,15910,Standard,Supplemental,,Bilingual Education,Teacher,ESL Students,both


In [24]:
len(data[data._merge == 'left_only'])

8913

In [15]:
len(teachers)

363174

In [None]:
#data[data['_merge'] == 'both']
data[data['DISTRICT NAME'] == 'BROWNSVILLE ISD']

In [None]:
data[data['DISTRICT NAME'] == 'LEANDER ISD']

In [None]:
data[data['DISTRICT'] == ]

In [None]:
len(teachers)

In [None]:
len(certification)