In [1]:
import pandas as pd
import numpy as np

In [2]:
#Read CSV

#Note: Student personal data has been redacted

# Use a forward slash in the relative path: it works on every OS, whereas the
# previous "Datasets\PTK-..." literal depended on "\P" being an unrecognised
# escape sequence (which newer Python versions warn about).
df = pd.read_csv("Datasets/PTK-Names-31824.csv")

# Replace the source IDs with integer codes from pd.factorize. Only the codes
# (element 0 of the returned tuple) are kept; indexing avoids assigning to `_`,
# which IPython uses for the previous cell output.
df['ID'] = pd.factorize(df['ID'])[0]

In [3]:
#Initial check of dataframe
df

Unnamed: 0,ID,First Name,Last Name,State,Country,Allow Calls or Text,Phi Theta Kappa Member,Ethnicity,Date of Birth,Gender,...,Expected Graduation Date,Enrollment Level,Acct Creation Date,Chapter Officer,Regional Officer,Intl Officer,Favorited,Downloaded,Viewed,Recruited Date
0,0,Student 1 First Name,Student 1 Last Name,PA,USA,Y,Y,Other,4/3/1986,Female,...,5/1/2024,FT,4/14/2022,N,N,N,N,Y,N,3/15/2024
1,1,Student 2 First Name,Student 2 Last Name,PA,USA,Y,Y,Black/African American,1/29/2002,Female,...,5/1/2024,FT,6/2/2021,N,N,N,N,Y,N,3/15/2024
2,2,Student 3 First Name,Student 3 Last Name,NJ,USA,Y,Y,More than one race,5/11/2006,Female,...,5/1/2024,FT,9/28/2023,N,N,N,N,Y,N,11/10/2023
3,3,Student 4 First Name,Student 4 Last Name,MA,USA,Y,Y,More than one race,2/12/2002,Male,...,5/1/2024,FT,4/3/2023,N,N,N,N,Y,N,11/10/2023
4,4,Student 5 First Name,Student 5 Last Name,PA,USA,Y,Y,White,1/21/2002,Male,...,5/1/2024,PT,10/25/2023,N,N,N,N,Y,N,3/15/2024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5483,5483,Student 5484 First Name,Student 5484 Last Name,NJ,USA,Y,Y,I prefer not to answer,7/14/2004,Female,...,12/31/2024,FT,1/30/2023,N,N,N,N,Y,N,3/15/2024
5484,5484,Student 5485 First Name,Student 5485 Last Name,PA,USA,Y,Y,White,10/4/2003,Male,...,12/31/2024,PT,9/23/2023,N,N,N,N,Y,N,3/15/2024
5485,5485,Student 5486 First Name,Student 5486 Last Name,NJ,USA,Y,Y,White,1/11/1999,Male,...,12/31/2024,FT,5/18/2023,N,N,N,N,Y,N,3/15/2024
5486,5486,Student 5487 First Name,Student 5487 Last Name,NJ,USA,Y,Y,Black/African American,11/14/2003,Female,...,12/31/2024,FT,1/25/2024,N,N,N,N,Y,N,3/15/2024


In [4]:
#Get info on dataframe
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5488 entries, 0 to 5487
Data columns (total 34 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   ID                        5488 non-null   int64  
 1   First Name                5488 non-null   object 
 2   Last Name                 5488 non-null   object 
 3   State                     5486 non-null   object 
 4   Country                   5488 non-null   object 
 5   Allow Calls or Text       5185 non-null   object 
 6   Phi Theta Kappa Member    5488 non-null   object 
 7   Ethnicity                 5427 non-null   object 
 8   Date of Birth             5419 non-null   object 
 9   Gender                    5424 non-null   object 
 10  Veteran                   5318 non-null   object 
 11  US Citizen                5488 non-null   object 
 12  Pell Grant                5210 non-null   object 
 13  Dually Enrolled           1115 non-null   object 
 14  Fully On

In [5]:
#Check shape
df.shape


(5488, 34)

In [6]:
#Drop unneeded columns
# Columns that are removed before the remaining ones are renamed for the CRM.
unneeded_columns = [
    "Acct Creation Date",
    "Veteran",
    "US Citizen",
    "ID",
    "Phi Theta Kappa Member",
    "Chapter Officer",
    "Current Campus",
    "Intl Edu Interest",
    "Regional Officer",
    "Pell Grant",
    "Dually Enrolled",
    "Country",
    "Fully Online Interest",
    "Grad School Interest",
    "Intl Officer",
    "Favorited",
    "Downloaded",
    "Viewed",
    "Recruited Date",
    "Current Major Code",
    "IPEDS ID",
]
df = df.drop(columns=unneeded_columns)

In [7]:
#Rename columns to match CRM naming convention
# Source column name -> CRM column name. DataFrame.rename ignores keys that are
# not present in the frame, so entries without a matching column are no-ops.
crm_column_names = {
    'City': 'Address City',
    'State': 'Address State',
    'Zip Code': 'Address Zip',
    'Allow Calls or Text': 'Text Permission',
    'Date of Birth': 'Birthdate',
    "Current Major": 'Area of Interest 1',
    'Current College': 'School Name',
    'Expected Graduation Date': 'Graduation Year',
    'Enrollment Level': 'Admit Type',
    'CEEB_CODE': 'CEEB',
}
df = df.rename(columns=crm_column_names)

In [8]:
#Convert birthdate to datetime
# errors='coerce' turns unparseable values into NaT instead of raising.
# .dt.date then keeps only the calendar date, so the column holds Python
# date objects (object dtype) rather than datetime64 values.
df['Birthdate'] = pd.to_datetime(df['Birthdate'], errors='coerce').dt.date

In [9]:
#Convert grad year to datetime
# At this point 'Graduation Year' still holds the full expected graduation date
# (it was renamed from 'Expected Graduation Date'). No errors= argument is
# passed, so an unparseable value raises instead of becoming NaT.
df['Graduation Year'] = pd.to_datetime(df['Graduation Year'])

In [10]:
#second dtype check
df.dtypes

First Name                    object
Last Name                     object
Address State                 object
Text Permission               object
Ethnicity                     object
Birthdate                     object
Gender                        object
GPA                          float64
Area of Interest 1            object
School Name                   object
CEEB                         float64
Graduation Year       datetime64[ns]
Admit Type                    object
dtype: object

In [11]:
import difflib

#Converting academic interests to University-offered majors to allow for import and internal data analysis

# Dictionary mapping of academic interests to University majors.
# Keys must stay byte-exact: several look truncated (e.g. 'Health Information/Medic')
# or carry trailing/double spaces, presumably as they come from the source export.
# NOTE(review): the "Undecided" targets are spelled both 'Arts & Sciences' and
# 'Arts and Sciences' below -- confirm which spelling the CRM expects and unify.
interests_to_majors = {
    'Biology, General': 'Biology',
    'Psychology': 'Psychology',
    'Game and Interactive Media Design': 'Game Design and Development',
    'Liberal Arts and Science': 'Undecided: College of Arts & Sciences',
    'Biology/Biological Sciences, General': 'Biology',
    'Liberal Arts & Sciences, Gen Studies & Humanities': 'Undecided: College of Arts & Sciences',
    'Social Work': 'Social Work',
    'Design and Applied Arts': 'Art and Design',
    'Allied Health and Medical Assisting Services': 'Health Science',
    'Radiologic Technology/Science - Radiographer': 'Radiography',
    'Communication, General': 'Strategic Communication, Public Relations and Advertising',
    'Human Services, General': 'Social Work',
    'Speech-Language Pathology/Pathologist': 'Communication Disorders',
    'Adult Health Nurse/Nursing': 'Nursing',
    'Personality Psychology': 'Psychology',
    'Animation, Interactive Technology, Video Graphics': 'Immersive Media and Mixed Reality',
    'Fine and Studio Arts': 'Art and Design',
    'Fashion/Apparel Design': 'Fashion Design',
    'General Studies': 'General Studies',
    'Computer & Information Sciences & Support Services': 'Computer Science',
    'Business/Office Automation/Technology/Data Entry': 'Information Technology',
    'Finance, General': 'Finance',
    'Computer and Information Systems Security': 'Information Technology',
    'Health Services/Allied Health/Health Sciences, Gen': 'Health Science',
    'Computer Science': 'Computer Science',
    'Pre-Dentistry Studies': 'Biology',
    'Business Administration, Management and Operations': 'Management',
    'Veterinary/Animal Health': 'Biology',
    'Public Health': 'Health Science',
    'Criminal Justice/Law Enforcement Administration': 'Criminal Justice',
    'Registered Nursing/Registered Nurse': 'Nursing',
    'American  History (United States)': 'History',
    'Biochemistry': 'Biochemistry',
    'Information Technology': 'Information Technology',
    'Interior Design': 'Art and Design',
    'Accounting and Business/Management': 'Accounting',
    'Nursing Science': 'Nursing',
    'Pediatric Nurse/Nursing': 'Nursing',
    'Pre-Medicine/Pre-Medical Studies': 'Biology',
    'Environmental Science': 'Environmental Science',
    'Community Health and Preventive Medicine': 'Health Science',
    'Fashion Merchandising': 'Fashion Marketing and Merchandising',
    'Nursing Practice': 'Nursing',
    'History': 'History',
    'Accounting': 'Accounting',
    'Liberal Arts and Sciences/Liberal Studies': 'Undecided: College of Arts and Sciences',
    'Music Technology': 'Music',
    'Graphic Design': 'Art and Design',
    'Psychology, Other': 'Psychology',
    'Clinical Psychology': 'Psychology',
    'Pre-Nursing Studies': 'Nursing',
    'English Language and Literature, General': 'English',
    'Film/Video and Photographic Arts': 'Media Arts',
    'Social Sciences': 'Psychology',
    'Law': 'Law',
    'Art History, Criticism and Conservation': 'Art and Design',
    'Journalism': 'English',
    'Computer/Information Technology Administration and': 'Information Technology',
    'Creative Writing': 'English',
    'Cyber/Computer Forensics and Counterterrorism': 'Criminal Justice',
    'Applied Psychology': 'Psychology',
    'Mathematics': 'Mathematics',
    'Physical Therapy Technician/Assistant': 'Exercise Science',
    'Digital Communication and Media/Multimedia': 'Media Arts',
    'Legal Assistant/Paralegal': 'Legal Studies',
    'Communication and Media Studies': 'Strategic Communication, Public Relations and Advertising',
    'Management Science': 'Management',
    'Child Development': 'Child Development',
    'Maternal/Child Health and Neonatal Nurse/Nursing': 'Nursing',
    'Political Science and Government': 'Political Science',
    'Illustration': 'Art and Design',
    'Drama and Dramatics/Theatre Arts, General': 'Theatre Arts',
    'Accounting and Finance': 'Accounting',
    'Health and Wellness, General': 'Health Science',
    'Marketing': 'Marketing',
    'Health/Medical Preparatory Programs': 'Health Science',
    'Design and Visual Communications, General': 'Strategic Communication, Public Relations and Advertising',
    'Chemistry': 'Chemistry',
    'Fine Arts and Art Studies, Other': 'Art and Design',
    'Criminal Justice/Police Science': 'Criminal Justice',
    'Electrical and Electronics Engineering': 'Electrical Engineering',
    'Cinematography and Film/Video Production': 'Media Arts',
    'Criminology': 'Criminal Justice',
    'Industrial and Organizational Psychology': 'Psychology',
    'Health Information/Medic': 'Health Science',
    'Family Practice Nurse/Nursing': 'Nursing',
    'Clinical Nutrition/Nutritionist': 'Health Science',
    'Occupational Therapist Assistant': 'Occupational Therapy',
    'Sociology': 'Sociology',
    'Film/Cinema/Video Studies': 'Media Arts',
    'Computer Programming': 'Computer Science',
    'Business/Commerce, General': 'Business Administration',
    'Sports Communication': 'Communication',
    'Forensic Psychology': 'Psychology',
    'Fiber, Textile and Weaving Arts': 'Art and Design',
    'Nursing Education': 'Nursing',
    'Music Performance, General': 'Music',
    'Photography': 'Photography',
    'International Relations and Affairs': 'Global Affairs',
    'Pre-Law Studies': 'Political Science',
    'Literature': 'English',
    'Criminal Justice and Corrections': 'Criminal Justice',
    'Human Resources Management and Services': 'Management',
    'Legal Studies, General': 'Law',
    'Commercial and Advertising Art': 'Art and Design',
    'Graphic Communications, Other': 'Art and Design',
    'Medical Insurance Specialist/Medical Biller': 'Health Science',
    'Philosophy': 'Philosophy',
    'Broadcast Journalism': 'English',
    'Counseling Psychology': 'Psychology',
    'Computer and Information Sciences, General': 'Computer Science',
    'Legal Professions and Studies': 'Law',
    'Geographic Information Science and Cartography': 'Information Technology',
    'Digital Arts': 'Art and Design',
    'Small Business Administration/Management': 'Management',
    # Fixed: was 'Undecided:College of Arts and Sciences' (missing space after the colon).
    'Art/Art Studies, General': 'Undecided: College of Arts and Sciences',
    'Health and Medical Administrative Services': 'Health Science',
    'Registered Nurse, Nursing Administration, Nursing': 'Nursing',
    'Criminalistics and Criminal Science': 'Criminal Justice',
    'Mental Health Counseling/Counselor': 'Psychology',
    'Public Administration & Social Service Professions': 'Management',
    'Computer and Information Sciences, Other': 'Computer Science',
    # Fixed: was misspelled 'Managemennt', which produced a separate bogus major.
    'Business and Personal/Fi': 'Management',
    'Medical/Clinical Assistant': 'Health Science',
    'Public Relations, Advertising, and Applied Communi': 'Strategic Communication, Public Relations and Advertising',
    'Project Management': 'Management',
    'Actuarial Science': 'Mathematics',
    'Graphic Communications': 'Art and Design',
    'Commercial Photography': 'Art and Design',
    'Communication and Media Studies, Other': 'Media Arts',
    'Fine/Studio Arts, General': 'Art and Design',
    'Computer Engineering': 'Electrical Engineering',
    'Pastoral Studies/Counseling': 'Psychology',
    'Social Psychology': 'Psychology',
    'Medical Insurance Coding Specialist/Coder': 'Health Science',
    'International Business': 'Global Affairs',
    'Network and System Administration/Administrator': 'Information Technology',
    'Nurse Midwife/Nursing Midwifery': 'Nursing',
    'Psychiatric/Mental Health Nurse/Nursing': 'Nursing',
    'Business, Management, Marketing, and Related Suppo': 'Marketing',
    'Medical/Health Managemen': 'Health Science',
    'Humanities/Humanistic Studies': 'Undecided: College of Arts and Sciences',
    'Critical Care Nursing': 'Nursing',
    'Economics': 'Economics',
    'Science Technologies/Technicians, Other': 'Information Technology',
    'Finance and Financial Management Services': 'Finance',
    'Entrepreneurship/Entrepreneurial Studies': 'Entrepreneurship',
    'Environmental Studies': 'Biology',
    'Physics': 'Physics',
    'Mass Communication/Media Studies': 'Communication',
    'Computer Systems Networking and Telecommunications': 'Information Technology',
    'Public Policy Analysis': 'Public Policy',
    'Natural Resources/Conservation, General': 'Biology',
    'Industrial Radiologic Technology/Technician': 'Radiography',
    'Business and Social Skills': 'Management',
    'Advertising': 'Marketing',
    'Administrative Assistant': 'Management',
    'Human Resources Manageme': 'Management',
    'Entrepreneurial and Small Business Operations': 'Entrepreneurship',
    'Health Professions and Related Clinical Sciences, ': 'Health Science',
    'Musical Theatre': 'Theatre Arts',
    'Chemical Technology/Technician': 'Chemistry',
    'Research and Experimental Psychology': 'Psychology',
    'Physical Sciences': 'Exercise Science',
    'Acting': 'Theatre Arts',
    'Nurse Anesthetist': 'Nursing',
    'Computer/Information Technology Services Administr': 'Information Technology',
    'Business/Corporate Communications': 'Strategic Communication, Public Relations and Advertising',
    'Investments and Securities': 'Finance',
    'Health/Health Care Administration/Management': 'Health Science',
    'American Government and Politics (United States)': 'Political Science',
    'Visual and Performing Arts': 'Undecided: College of Arts and Sciences',
    'Dietetic Technician': 'Biology',
    'School Psychology': 'Psychology',
    'Health/Medical Psychology': 'Psychology',
    'Visual and Performing Arts, Other': 'Undecided: College of Arts and Sciences',
    'Emergency Room/Trauma Nursing': 'Nursing',
    'Dietetics/Dietitian': 'Nutrition',
    'Radio and Television': 'Media Arts',
    'Forensic Science and Technology': 'Psychology',
    'Cognitive Psychology and Psycholinguistics': 'Psychology',
    'Optics/Optical Sciences': 'Physics',
    'Substance Abuse/Addiction Counseling': 'Psychology',
    'Perioperative/Operating ': 'Nursing',
    'Clinical Nurse Specialist': 'Nursing',
    'Neuroscience': 'Neuroscience',
    'Forestry': 'Biology',
    'Arts, Entertainment,and Media Management': 'Management',
    'Computer Graphics': 'Art and Design',
    'Radio, Television, and Digital Communication, Othe': 'Media Arts',
    'Clinical, Counseling and Applied Psychology': 'Psychology',
    'Community Health Services/Liaison/Counseling': 'Health Science',
    'Dietetics and Clinical Nutrition Services': 'Health Science',
    'Banking, Corporate, Finance, and Securities Law': 'Law',
    'Medical Office Assistant/Specialist': 'Health Science',
    'Biology Technician/Biote': 'Biology',
    'Logistics, Materials, and Supply Chain Management': 'Management',
    'Criminal Justice/Safety Studies': 'Criminal Justice',
    'Physician Assistant': 'Biology',
    'Management Information Systems, General': 'Information Technology',
    'Nursing Administration': 'Nursing',
    'Voice and Opera': 'Music',
    'Hospitality and Recreation Marketing Operations': 'Management',
    'Anthropology': 'Anthropology',
    'Computer Software Engineering': 'Computer Engineering',
    'Biomedical Sciences, General': 'Biology',
    'American/U.S. Law/Legal Studies/Jurisprudence': 'Law',
    'Web Page, Digital/Multimedia and Information Res': 'Art and Design',
    'Physical Therapy/Therapist': 'Exercise Science',
    'Intermedia/Multimedia': 'Immersive Media and Mixed Reality',
    'Wildlife, Fish and Wildlands Science/Management': 'Biology',
    'Professional, Technical,': 'General Studies',
    'Developmental and Child Psychology': 'Psychology',
    'Pre-Physical Therapy Studies': 'Exercise Science',
    'Health Services Administration': 'Health Science',
    'Athletic Training/Trainer': 'Exercise Science',
    'Cell/Cellular Biology and Anatomical Sciences': 'Biology',
    'Ceramic Arts and Ceramics': 'Art and Design',
    'Audiology/Audiologist an': 'Communication Disorders',
    'Medical Office Management/Administration': 'Health Science',
    'Mathematics and Statistics': 'Mathematics',
    'Computer and Information Sciences and Support Serv': 'Computer Science',
    'International Finance': 'Finance',
    'Marketing/Marketing Management, General': 'Marketing',
    'Family Psychology': 'Psychology',
    'Hospital and Health Care': 'Health Science',
    'Behavioral Aspects of Health': 'Health Science',
    'Web/Multimedia Management and Webmaster': 'Information Technology',
    'Communication, Journalism, Other': 'Strategic Communication, Public Relations and Advertising',
    'Computer Engineering, Other': 'Computer Engineering',
    'Computer Software and Media Applications, Other': 'Computer Engineering',
    'Liberal Arts and Sciences, General Studies and Hum': 'General Studies',
    'Social Sciences, Other': 'Sociology',
    'Geriatric Nurse/Nursing': 'Nursing',
    'Palliative Care Nursing': 'Nursing',
    'Applied Mathematics': 'Mathematics',
    'Pre-Veterinary Studies': 'Biology',
    'Education Policy Analysis': 'Education',
    'Educational Psychology': 'Psychology',
    'Speech Communication and Rhetoric': 'English',
    'Occupational Therapy/Therapist': 'Biology',
    'Biochemistry and Molecular Biology': 'Biochemistry',
    'Management Information Systems and Services': 'Information Technology',
    'Human Resources Development': 'Management',
    'Mental and Social Health Services and Allied Profe': 'Psychology',
    'Natural Resources and Conservation': 'Biology',
    'Education, General': 'Education',
    'Cell/Cellular and Molecular Biology': 'Biology',
    'Communication Disorders Sciences and Services': 'Communication Disorders',
    'Accounting and Related Services': 'Accounting',
    'Computer Support Specialist': 'Computer Science',
    'Dance': 'Dance',
    'Financial Forensics and Fraud Investigation': 'Criminal Justice',
    'Business/Managerial Economics': 'Economics',
    'Nuclear and Industrial R': 'Electrical Engineering',
    'Court Reporting/Court Reporter': 'Criminal Justice',
    'Psychoanalysis and Psychotherapy': 'Psychology',
    'Radiologist Assistant': 'Radiologic Technology',
    'Electrical, Electronics and Communications Enginee': 'Electrical Engineering',
    'Mathematical Biology': 'Mathematics',
    'Pharmacy Technician/Assistant': 'Biology',
    'Prepress/Desktop Publishing & Digital Imaging Desi': 'Art and Design',
    'Computer Programming, Other': 'Computer Science',
    'Computer Software and Media Applications': 'Computer Engineering',
    'Philosophy and Religious Studies': 'Philosophy',
    'Econometrics and Quantitative Economics': 'Economics',
    'Visual and Performing Arts, General': 'Art and Design',
    'Health/Medical Preparatory Programs, Other': 'Health Science',
    'Clinical/Medical Social Work': 'Social Work',
    'Clinical Child Psychology': 'Psychology',
    'Public Administration': 'Management',
    'Speech-Language Pathology Assistant': 'Communication Disorders',
    'Drama/Theatre Arts and Stagecraft': 'Theatre Arts',
    'Geography': 'History',
    'Spanish Language and Literature': 'Spanish',
    'Psychology, General': 'Psychology',
    'International Marketing': 'Marketing',
    'Computer Programming, Specific Applications': 'Computer Science'
}

# List of academic interests.
# Derived from the mapping's keys (dicts preserve insertion order) instead of being
# maintained as a second hand-written copy, so the two can no longer drift apart.
academic_interests = list(interests_to_majors)

# Function to find the closest match from a list of options
def find_closest_match(interest, mapping=None, cutoff=0.6):
    """Translate an academic-interest label into a University major.

    Parameters
    ----------
    interest : object
        The raw interest value. Anything that is not a ``str`` (e.g. NaN or
        None from a missing cell) yields ``None``.
    mapping : dict, optional
        Interest label -> major. Defaults to the module-level
        ``interests_to_majors`` dictionary.
    cutoff : float, optional
        Minimum similarity ratio passed to ``difflib.get_close_matches``;
        0.6 is difflib's own default, so omitting it keeps the old behaviour.

    Returns
    -------
    str or None
        The major of the exact or closest-matching key, or ``None`` when the
        input is not a string or no key reaches ``cutoff``.
    """
    if not isinstance(interest, str):
        return None

    if mapping is None:
        mapping = interests_to_majors

    # Fast path: an exact key is always difflib's best match (ratio 1.0), so a
    # plain lookup gives the same answer without scoring every key.
    if interest in mapping:
        return mapping[interest]

    matches = difflib.get_close_matches(interest, mapping.keys(), n=1, cutoff=cutoff)
    return mapping[matches[0]] if matches else None

# Sanity check: run every known interest label through the matcher
majors = list(map(find_closest_match, academic_interests))

# Show each interest next to the major it resolved to
for interest, major in zip(academic_interests, majors):
    print(f'{interest}: {major}')

Biology, General: Biology
Psychology: Psychology
Game and Interactive Media Design: Game Design and Development
Liberal Arts and Science: Undecided: College of Arts & Sciences
Biology/Biological Sciences, General: Biology
Liberal Arts & Sciences, Gen Studies & Humanities: Undecided: College of Arts & Sciences
Social Work: Social Work
Design and Applied Arts: Art and Design
Allied Health and Medical Assisting Services: Health Science
Radiologic Technology/Science - Radiographer: Radiography
Communication, General: Strategic Communication, Public Relations and Advertising
Human Services, General: Social Work
Speech-Language Pathology/Pathologist: Communication Disorders
Adult Health Nurse/Nursing: Nursing
Personality Psychology: Psychology
Animation, Interactive Technology, Video Graphics: Immersive Media and Mixed Reality
Fine and Studio Arts: Art and Design
Fashion/Apparel Design: Fashion Design
General Studies: General Studies
Computer & Information Sciences & Support Services: Comput

In [12]:
#Apply closest match function
# Pass the function directly; wrapping it in a lambda adds nothing.
df['Area of Interest 1'] = df['Area of Interest 1'].apply(find_closest_match)

In [13]:
#Extract the calendar year from the graduation date into a temporary 'Year' column
# ('Graduation Year' still holds full datetimes here; 'Year' is renamed to
# 'Graduation Year' in a later cell once the datetime column has been dropped.)
df['Year'] = df['Graduation Year'].dt.year

In [14]:
#Derive the academic term from the expected graduation date
# Dates strictly before 2024-08-30 map to 'Fall 2024', everything else to
# 'Spring 2025'. A missing date (NaT) compares False and therefore also ends up
# as 'Spring 2025'.
condition = df['Graduation Year'].lt(pd.Timestamp('2024-08-30'))
values = np.where(condition, 'Fall 2024', 'Spring 2025')

# Create the new column 'Academic Term'
df['Academic Term'] = values

In [15]:
#Drop unneeded Graduation Year column following successful conversion
# The full datetime is no longer needed: its year lives in 'Year' and the term
# in 'Academic Term'. Dropping it frees the name for the rename in the next cell.
df = df.drop(columns='Graduation Year')

In [16]:
# Give the extracted year the 'Graduation Year' column name
df = df.rename(columns={'Year':'Graduation Year'})

In [17]:
#Fill missing admit types with 'FT' for internal data organization
# (the column already exists -- it was renamed from 'Enrollment Level'; only
# null entries are replaced)
df['Admit Type'] = df['Admit Type'].fillna('FT')

In [18]:
#Final dataframe display
df

Unnamed: 0,First Name,Last Name,Address State,Text Permission,Ethnicity,Birthdate,Gender,GPA,Area of Interest 1,School Name,CEEB,Admit Type,Graduation Year,Academic Term
0,Student 1 First Name,Student 1 Last Name,PA,Y,Other,1986-04-03,Female,3.83,Biology,Community College of Philadelphia,2682.0,FT,2024,Fall 2024
1,Student 2 First Name,Student 2 Last Name,PA,Y,Black/African American,2002-01-29,Female,3.50,Psychology,Harrisburg Area Community College,2309.0,FT,2024,Fall 2024
2,Student 3 First Name,Student 3 Last Name,NJ,Y,More than one race,2006-05-11,Female,3.83,Psychology,Bergen Community College,2032.0,FT,2024,Fall 2024
3,Student 4 First Name,Student 4 Last Name,MA,Y,More than one race,2002-02-12,Male,3.80,Game Design and Development,Quinsigamond Community College,3714.0,FT,2024,Fall 2024
4,Student 5 First Name,Student 5 Last Name,PA,Y,White,2002-01-21,Male,3.60,Undecided: College of Arts & Sciences,Delaware County Community College,2125.0,PT,2024,Fall 2024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5483,Student 5484 First Name,Student 5484 Last Name,NJ,Y,I prefer not to answer,2004-07-14,Female,3.80,Biology,Bergen Community College,2032.0,FT,2024,Spring 2025
5484,Student 5485 First Name,Student 5485 Last Name,PA,Y,White,2003-10-04,Male,3.50,Art and Design,Northampton Community College,2573.0,PT,2024,Spring 2025
5485,Student 5486 First Name,Student 5486 Last Name,NJ,Y,White,1999-01-11,Male,3.90,Criminal Justice,Hudson County Community College,2291.0,FT,2024,Spring 2025
5486,Student 5487 First Name,Student 5487 Last Name,NJ,Y,Black/African American,2003-11-14,Female,3.88,Computer Science,Bergen Community College,2032.0,FT,2024,Spring 2025


In [19]:
#Export
#df.to_excel("PTK-Data-Cleaned.xlsx")