# Francesca Bruno
# 10/1/2024 (Updated 10/17/2024)
# Part Two of Project 1 (Cleaning Data)
## Cleaning the data on the 2024 Non-Majors Survey Results and 2024 Majors Survey Results

In [162]:
import pandas as pd
# Load the two raw survey exports from the working directory:
#   df  -> non-majors survey responses
#   df2 -> majors survey responses
df, df2 = (
    pd.read_csv(csv_name)
    for csv_name in (
        "Non-Majors Survey Results - Fall 2024.csv",
        "MajorsSurveyResults2024.csv",
    )
)

## Using replace to swap all spaces with underscores so the names of the columns can be used in the code, and making everything lowercase for convenience

In [164]:
# Majors survey (df2): swap spaces for underscores and lowercase every header in one
# chained pass, then remove the trailing underscore left on the age column.
df2.columns = df2.columns.str.replace(" ", "_").str.lower()
df2.rename(columns={'age_': 'age'}, inplace=True)
print(df2.columns)

# Non-majors survey (df): same header normalization. Its 'age_' column is renamed
# in a later cell, not here.
df.columns = df.columns.str.replace(" ", "_").str.lower()
print(df.columns)

Index(['timestamp', 'which_course_are_you_enrolled_in?',
       'how_did_you_hear_about_county_college_of_morris?_[ccm_web_site]',
       'how_did_you_hear_about_county_college_of_morris?_[social_media]',
       'how_did_you_hear_about_county_college_of_morris?_[community_event]',
       'how_did_you_hear_about_county_college_of_morris?_[family_member_or_friend]',
       'how_did_you_hear_about_county_college_of_morris?_[current_ccm_student]',
       'how_did_you_hear_about_county_college_of_morris?_[ccm_alumni]',
       'how_did_you_hear_about_county_college_of_morris?_[high_school_teacher]',
       'how_did_you_hear_about_county_college_of_morris?_[high_school_counselor]',
       'how_did_you_hear_about_county_college_of_morris?_[in-app_advertisement]',
       'how_did_you_hear_about_county_college_of_morris?_[employer]',
       'how_did_you_hear_about_county_college_of_morris?_[billboard]',
       'how_did_you_hear_about_county_college_of_morris?_[television]',
       'how_did_you_h

## Removing the timestamp column from the non-majors data because it is unnecessary to the analysis, removing the extra underscore after the name of its Age column, and keeping only the age column of the majors data

In [166]:
# Majors data: keep only the age column. After this line df2 is that single
# column rather than the full survey table.
df2 = df2['age']

# Non-majors data: discard the timestamp column, then strip the trailing
# underscore from the age column's name. Both calls modify df in place.
df.drop(columns='timestamp', inplace=True)
df.rename(columns={'age_': 'age'}, inplace=True)
print(df.columns)

Index(['which_course_are_you_currently_enrolled_in?',
       'what_motivated_you_to_seek_a_computing_class_at_ccm?_[it’s_a_required_class_for_the_degree_i’m_seeking]',
       'what_motivated_you_to_seek_a_computing_class_at_ccm?_[to_keep_current_in_computing_skills]',
       'what_motivated_you_to_seek_a_computing_class_at_ccm?_[career_advancement]',
       'what_motivated_you_to_seek_a_computing_class_at_ccm?_[career_change]',
       'what_motivated_you_to_seek_a_computing_class_at_ccm?_[professional_development]',
       'what_motivated_you_to_seek_a_computing_class_at_ccm?_[job_displacement]',
       'what_motivated_you_to_seek_a_computing_class_at_ccm?_[relocation]',
       'what_motivated_you_to_seek_a_computing_class_at_ccm?_[it_industry_certifications]',
       'what_motivated_you_to_seek_a_computing_class_at_ccm?_[financial]',
       'what_motivated_you_to_seek_a_computing_class_at_ccm?_[personal_enrichment]',
       'what_motivated_you_to_seek_a_computing_class_at_ccm?_[curios

## Replacing every response to Race/Ethnicity that selected multiple options with 'Multi-Racial' using regular expressions.
### The pattern is written as a raw string (r'...') with the case-insensitive flag (?i). It matches any response that, between its very beginning (^) and its very end ($), contains a ';' anywhere (.*;.*). Every matching response is changed to 'Multi-Racial'.

In [168]:
# Race/ethnicity responses before cleaning; multi-select answers are joined with ';'.
# The author notes the column looks the same for df2.
# NOTE(review): df2 was narrowed to its age column in an earlier cell, so that remark
# can only describe the raw majors file — confirm.
df.loc[:, 'race/ethnicity']

0     Hispanic or Latino;Black/African American;Whit...
1                                    Hispanic or Latino
2                                   Choose not to reply
3                                    Hispanic or Latino
4                                       White/Caucasian
                            ...                        
87                   Hispanic or Latino;White/Caucasian
88                                   Hispanic or Latino
89                               Black/African American
90                                   Hispanic or Latino
91                                                Asian
Name: race/ethnicity, Length: 92, dtype: object

In [169]:
# fixed
# Pattern -> replacement map: any response containing a ';' (i.e. more than one
# option selected) is replaced in full by the single label 'Multi-Racial'.
fixed = {r'(?i)^.*;.*$': 'Multi-Racial'}

# Apply the map as a regex replacement and write the result back to the column.
df['race/ethnicity'] = df['race/ethnicity'].replace(to_replace=fixed, regex=True)
print(df['race/ethnicity'])

0               Multi-Racial
1         Hispanic or Latino
2        Choose not to reply
3         Hispanic or Latino
4            White/Caucasian
               ...          
87              Multi-Racial
88        Hispanic or Latino
89    Black/African American
90        Hispanic or Latino
91                     Asian
Name: race/ethnicity, Length: 92, dtype: object


## Dropping some unnecessary columns

In [171]:
# Drop two contiguous runs of columns, selected by position: 1 through 11 and
# 30 through 68 (both inclusive). Positions are translated to labels first, so the
# drop itself is label-based.
# NOTE(review): the column listing shown after the rename step no longer includes
# 'race/ethnicity', so the column cleaned above appears to fall inside these
# positions — confirm that is intended.
df = df.drop(columns=df.columns[[*range(1, 12), *range(30, 69)]])
print(df.columns)

Index(['which_course_are_you_currently_enrolled_in?',
       'prior_to_applying_to_college,_did_you_participate_in_any_of_the_following_events_or_activities_at_the_county_college_of_morris_and/or_with_the_department_of_information_technologies,_if_at_all?_[open_house]',
       'prior_to_applying_to_college,_did_you_participate_in_any_of_the_following_events_or_activities_at_the_county_college_of_morris_and/or_with_the_department_of_information_technologies,_if_at_all?_[instant_decision_day]',
       'prior_to_applying_to_college,_did_you_participate_in_any_of_the_following_events_or_activities_at_the_county_college_of_morris_and/or_with_the_department_of_information_technologies,_if_at_all?_[on-campus_information_session]',
       'prior_to_applying_to_college,_did_you_participate_in_any_of_the_following_events_or_activities_at_the_county_college_of_morris_and/or_with_the_department_of_information_technologies,_if_at_all?_[titan's_tuesday_virtual_information_session]',
       'prior_to

## Renaming columns

In [173]:
# Give the first 18 columns short names, matched by position: the i-th name in the
# list below replaces whatever label currently sits at column position i.
# Columns beyond position 17 keep their existing names.
df.rename(
    columns={
        df.columns[position]: short_name
        for position, short_name in enumerate([
            'course',
            'openHouse',
            'instantDecision',
            'onCampInfo',
            'virtualInfo',
            'womenWhoDare',
            'collegeFair',
            'schoolCompsciClass',
            'schoolCompsciClub',
            'competitions',
            'afterschoolCamp',
            'summerCamp',
            'AP',
            'dualEnrollment',
            'familyFriendInfluence',
            'familyFriendWork',
            'highschoolTeacher',
            'employer',
        ])
    },
    inplace=True,
)
print(df.columns)

Index(['course', 'openHouse', 'instantDecision', 'onCampInfo', 'virtualInfo',
       'womenWhoDare', 'collegeFair', 'schoolCompsciClass',
       'schoolCompsciClub', 'competitions', 'afterschoolCamp', 'summerCamp',
       'AP', 'dualEnrollment', 'familyFriendInfluence', 'familyFriendWork',
       'highschoolTeacher', 'employer',
       'how_did_you_hear_about_county_college_of_morris__[ccm_web_site]',
       'age'],
      dtype='object')


## Saving it to a new csv file

In [175]:
# Write the cleaned data to new CSV files in the working directory, leaving the
# row index out of both files.
df.to_csv(path_or_buf='CLEANED_NonMajors2024.csv', index=False)   # non-majors survey
df2.to_csv(path_or_buf='CLEANED_Majors2024.csv', index=False)     # majors data (age only)