### Lonet Joseph
### October 13, 2023
### Explore the Project 1 dataset using Python, and perform data cleaning using pandas.

### Data Exploration

In [16]:
import pandas as pd

In [17]:
# Load the raw survey export. A raw string keeps every backslash in the
# Windows path literal; the previous literal relied on "\D" being passed
# through as an unrecognized escape sequence, which Python warns about.
# NOTE(review): this is an absolute, machine-specific path — adjust it to
# wherever the CSV lives when running elsewhere.
df = pd.read_csv(r"C:\Users\jlone\OneDrive\Desktop\CCM Computing Entry Survey - Fall 2022.csv")

In [18]:
# Basic properties and summary statistics

# Shape of the DataFrame as a (rows, columns) tuple
shape_info = df.shape

# DataFrame.info() prints its report and returns None, so it is called only
# for its printed output instead of being bound to a name that would always
# hold None.
df.info()

# Descriptive statistics for every column, regardless of dtype
desc_stats = df.describe(include='all')

# Show the shape together with the first rows of the statistics table
shape_info, desc_stats.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 453 entries, 0 to 452
Data columns (total 92 columns):
 #   Column                                                                                                                                                                                                                                                           Non-Null Count  Dtype  
---  ------                                                                                                                                                                                                                                                           --------------  -----  
 0   Timestamp                                                                                                                                                                                                                                                        453 non-null    object 
 1   Which course are you enrolled in

((453, 92),
                         Timestamp Which course are you enrolled in?  \
 count                         453                               449   
 unique                        450                                 5   
 top     2022/10/05 9:38:11 AM AST        CMP 128 Computer Science I   
 freq                            2                               268   
 mean                          NaN                               NaN   
 
        How did you hear about County College of Morris? [CCM Web site]  \
 count                                                 453                
 unique                                                  3                
 top                                                    No                
 freq                                                  265                
 mean                                                  NaN                
 
        How did you hear about County College of Morris? [Social Media]  \
 count                     

In [19]:
 # Listing all column names
# Materialize the column labels as a plain Python list so the cell shows them
column_names = df.columns.to_list()
column_names


['Timestamp',
 'Which course are you enrolled in?',
 'How did you hear about County College of Morris? [CCM Web site]',
 'How did you hear about County College of Morris? [Social Media]',
 'How did you hear about County College of Morris? [Community Event]',
 'How did you hear about County College of Morris? [Family member or friend]',
 'How did you hear about County College of Morris? [Current CCM student]',
 'How did you hear about County College of Morris? [CCM Alumni]',
 'How did you hear about County College of Morris? [High School Teacher]',
 'How did you hear about County College of Morris? [High School Counselor]',
 'How did you hear about County College of Morris? [In-app advertisement]',
 'How did you hear about County College of Morris? [Employer]',
 'How did you hear about County College of Morris? [Billboard]',
 'How did you hear about County College of Morris? [Television]',
 'How did you hear about County College of Morris? [Radio]',
 'How did you hear about County Colle

### Data Cleaning

In [20]:
 # Renaming the columns

def _bracket_label_mapping(columns, question, prefix):
    """Build a rename mapping for the sub-columns of one survey question.

    Every column whose name contains ``question`` is mapped to
    ``prefix`` followed by a normalized label. The label is the text after
    the last ``[`` and before the following ``]`` in the column name, with
    spaces replaced by underscores and letters lower-cased. Other
    punctuation (such as ``/``, ``-`` or ``'``) is left untouched.

    Parameters
    ----------
    columns : iterable of str
        Column names to scan.
    question : str
        Question text that identifies the columns to rename.
    prefix : str
        Prefix prepended to each normalized label.

    Returns
    -------
    dict
        Mapping of original column name to simplified column name.
    """
    mapping = {}
    for col in columns:
        if question in col:
            label = col.split('[')[-1].split(']')[0]
            mapping[col] = prefix + label.replace(' ', '_').lower()
    return mapping


# Simplifying "How did you hear about County College of Morris?" columns
hear_about_ccm_mapping = _bracket_label_mapping(
    df.columns,
    'How did you hear about County College of Morris?',
    'heard_from_',
)

# Simplifying "To what extent did the following impact your decision to attend County College of Morris?" columns
impact_decision_mapping = _bracket_label_mapping(
    df.columns,
    'To what extent did the following impact your decision to attend County College of Morris?',
    'impact_',
)

# Simplifying "What motivated you to seek a computing degree/certificate at CCM?" columns
motivation_mapping = _bracket_label_mapping(
    df.columns,
    'What motivated you to seek a computing degree/certificate at CCM?',
    'motivation_',
)

# Simplifying "Did you receive information about the CCM computing programs from any of the following sources?" columns
info_sources_mapping = _bracket_label_mapping(
    df.columns,
    'Did you receive information about the CCM computing programs from any of the following sources?',
    'info_from_',
)

# Manually renaming some columns for brevity and clarity
manual_mapping = {
    'Which course are you enrolled in?': 'course_enrolled',
    'Was a computing major/certificate your first choice, or did you change majors from a different CCM program? If you changed majors, indicate what your first major was.': 'major_change_details',
    'On a scale of 1 to 5, with 1 being not at all interested and 5 being extremely interested, how interested are you in taking more computing classes?': 'interest_in_computing_classes',
    'Please explain your answer to the question above.  Why or why not would you be interested in taking another computing class?': 'reason_interest_in_computing',
    'Gender': 'gender',
    'Race/ethnicity': 'race_ethnicity',
    'Age ': 'age'  # Note: The original column name has an extra space
}

# Combining all renaming mappings; later dictionaries win on a key clash,
# so the manual entries take precedence.
all_mappings = {
    **hear_about_ccm_mapping,
    **impact_decision_mapping,
    **motivation_mapping,
    **info_sources_mapping,
    **manual_mapping,
}

# Applying the renaming to a new frame; `df` keeps the original headers
df_renamed = df.rename(columns=all_mappings)

# Displaying the renamed columns
df_renamed.columns.tolist()



['Timestamp',
 'course_enrolled',
 'heard_from_ccm_web_site',
 'heard_from_social_media',
 'heard_from_community_event',
 'heard_from_family_member_or_friend',
 'heard_from_current_ccm_student',
 'heard_from_ccm_alumni',
 'heard_from_high_school_teacher',
 'heard_from_high_school_counselor',
 'heard_from_in-app_advertisement',
 'heard_from_employer',
 'heard_from_billboard',
 'heard_from_television',
 'heard_from_radio',
 'heard_from_home_mailer',
 'heard_from_other',
 'impact_affordable_cost',
 'impact_location/convenience',
 'impact_choice_of_programs',
 'impact_online_offerings',
 'impact_family/friend_referral',
 'impact_faculty/staff',
 'impact_college_reputation',
 'impact_financial_aid',
 'impact_scholarships',
 'impact_small_class_sizes',
 'impact_extra-curricular_opportunities',
 'impact_accepted_my_transfer_credits',
 'impact_negative_experience_at_another_college',
 'impact_nj_stars_program',
 'impact_ability_to_transfer_ccm_credits_to_a_4-year_school',
 'impact_get_college_

In [21]:
# Share of missing entries per column, expressed as a percentage of all rows
null_counts = df.isnull().sum()
missing_percentage = null_counts.div(len(df)).mul(100)

# Columns with the most missing data come first
missing_percentage.sort_values(ascending=False)


Please explain your answer to the question above.  Why or why not would you be interested in taking another computing class?                                                                                                            73.951435
On a scale of 1 to 5, with 1 being not at all interested and 5 being extremely interested, how interested are you in taking more computing classes?                                                                                     73.951435
To what extent did the following impact your decision to attend County College of Morris? [NJ Stars Program]                                                                                                                            26.710817
Was a computing major/certificate your first choice, or did you change majors from a different CCM program? If you changed majors, indicate what your first major was.                                                                  26.269316
What motivated you to seek a com

In [22]:
# Sorting the renamed columns into relevance tiers

# Tier 1: highly relevant
highly_relevant = [
    'course_enrolled',
    'heard_from_ccm_web_site',
    'heard_from_social_media',
    'heard_from_community_event',
    'heard_from_family_member_or_friend',
    'heard_from_current_ccm_student',
    'heard_from_ccm_alumni',
    'heard_from_high_school_teacher',
    'heard_from_high_school_counselor',
    'major_change_details',
    'motivation_to_get_a_job_in_the_computing_field',
    "motivation_transfer_to_bachelor's_level_program",
    'motivation_career_advancement',
    'motivation_career_change',
    'motivation_professional_development',
    'info_from_high_school_guidance_counselor',
    'info_from_high_school_teacher',
    'info_from_ccm_information_technologies_website',
    'info_from_ccm_admissions',
    'info_from_ccm_advisor/counselor',
    'gender',
    'race_ethnicity',
    'age',
]

# Tier 2: potentially relevant
potentially_relevant = [
    'impact_affordable_cost',
    'impact_location/convenience',
    'impact_choice_of_programs',
    'impact_online_offerings',
    'impact_family/friend_referral',
    'impact_college_reputation',
    'interest_in_computing_classes',
    'reason_interest_in_computing',
]

# Tier 3: less relevant
less_relevant = [
    'Timestamp',
    'info_from_employer',
    'info_from_ccm_workforce_development',
    'info_from_nj_workforce_development_program',
    'info_from_other',
]

# Any column that was not placed in a tier above counts as irrelevant.
# The tiered names are gathered once so each column needs a single lookup.
tiered_columns = {*highly_relevant, *potentially_relevant, *less_relevant}
irrelevant = [col for col in df_renamed.columns if col not in tiered_columns]

categorized_columns = {
    'Highly Relevant': highly_relevant,
    'Potentially Relevant': potentially_relevant,
    'Less Relevant': less_relevant,
    'Irrelevant': irrelevant,
}

categorized_columns


{'Highly Relevant': ['course_enrolled',
  'heard_from_ccm_web_site',
  'heard_from_social_media',
  'heard_from_community_event',
  'heard_from_family_member_or_friend',
  'heard_from_current_ccm_student',
  'heard_from_ccm_alumni',
  'heard_from_high_school_teacher',
  'heard_from_high_school_counselor',
  'major_change_details',
  'motivation_to_get_a_job_in_the_computing_field',
  "motivation_transfer_to_bachelor's_level_program",
  'motivation_career_advancement',
  'motivation_career_change',
  'motivation_professional_development',
  'info_from_high_school_guidance_counselor',
  'info_from_high_school_teacher',
  'info_from_ccm_information_technologies_website',
  'info_from_ccm_admissions',
  'info_from_ccm_advisor/counselor',
  'gender',
  'race_ethnicity',
  'age'],
 'Potentially Relevant': ['impact_affordable_cost',
  'impact_location/convenience',
  'impact_choice_of_programs',
  'impact_online_offerings',
  'impact_family/friend_referral',
  'impact_college_reputation',
  '

In [27]:
# Remove both the irrelevant and the less relevant columns in one pass
columns_to_drop = [*irrelevant, *less_relevant]
df_cleaned = df_renamed.drop(columns=columns_to_drop)

# Preview the first few rows of the cleaned dataframe
df_cleaned.head()


Unnamed: 0,course_enrolled,heard_from_ccm_web_site,heard_from_social_media,heard_from_community_event,heard_from_family_member_or_friend,heard_from_current_ccm_student,heard_from_ccm_alumni,heard_from_high_school_teacher,heard_from_high_school_counselor,impact_affordable_cost,...,info_from_high_school_teacher,info_from_ccm_information_technologies_website,info_from_ccm_admissions,info_from_ccm_advisor/counselor,major_change_details,interest_in_computing_classes,reason_interest_in_computing,gender,race_ethnicity,age
0,CMP 128 Computer Science I,Yes,Yes,Yes,Yes,No,No,Yes,Yes,Some Impact,...,Yes,Yes,Yes,Yes,First Choice,,,Man,White/Caucasian,19-20
1,,No,No,No,No,Yes,No,No,No,Some Impact,...,No,Yes,No,Yes,First Choice,,,Man,White/Caucasian,25-34
2,CMP 128 Computer Science I,No,No,No,No,No,No,Yes,Yes,High Impact,...,,,,,,1.0,Im really only taking this class for the requi...,Woman,White/Caucasian,19-20
3,CMP 128 Computer Science I,No,No,Yes,No,Yes,No,No,No,High Impact,...,Yes,No,Yes,No,First Choice,,,Man,Asian,18 and younger
4,CMP 128 Computer Science I,Yes,No,No,Yes,No,No,Yes,No,,...,No,No,No,No,First Choice,,,Man,White/Caucasian,25-34


In [28]:
# Distinct values currently present in the two columns that get condensed next
unique_race, unique_course = (
    df_cleaned[column].unique()
    for column in ('race_ethnicity', 'course_enrolled')
)

unique_race, unique_course


(array(['White/Caucasian', 'Asian', 'Choose not to reply',
        'Hispanic or Latino;White/Caucasian', 'Hispanic or Latino',
        'American Indian/Native American/Alaska Native', 'Multi-Racial',
        'American Indian/Native American/Alaska Native;Multi-Racial',
        'Black/African American;White/Caucasian;Multi-Racial',
        'Black/African American',
        'Hispanic or Latino;American Indian/Native American/Alaska Native;Asian;Black/African American;Native Hawaiian/Other Pacific Islander;White/Caucasian',
        'Hispanic or Latino;Black/African American',
        'White/Caucasian;Multi-Racial',
        'Asian;Native Hawaiian/Other Pacific Islander;White/Caucasian',
        'Asian;White/Caucasian', 'Hispanic or Latino;Multi-Racial',
        'Hispanic or Latino;American Indian/Native American/Alaska Native;White/Caucasian;Multi-Racial',
        'American Indian/Native American/Alaska Native;Native Hawaiian/Other Pacific Islander;Multi-Racial',
        'Asian;Native Hawa

In [29]:
# Cleaning and condensing the 'race_ethnicity' column

# Function to group race/ethnicity values
def group_race(race):
    """Collapse a raw race/ethnicity answer into a single category.

    Missing values are returned unchanged. An answer that lists several
    selections (contains ``;``) or mentions "Multi-Racial" becomes
    "Multi-Racial". Otherwise the first matching marker in the table below
    decides the category, and an answer matching no marker is returned
    as-is.

    Matching is case-sensitive, so the "Asian" marker does not match the
    lowercase "asian" inside "White/Caucasian".
    """
    if pd.isnull(race):
        return race

    if ";" in race or "Multi-Racial" in race:
        return "Multi-Racial"

    # (substring to look for, category to return), checked in this order
    single_groups = (
        ("Hispanic", "Hispanic or Latino"),
        ("Asian", "Asian"),
        ("White/Caucasian", "White/Caucasian"),
        ("Black/African American", "Black/African American"),
        ("American Indian/Native American/Alaska Native", "Native American"),
    )
    for marker, category in single_groups:
        if marker in race:
            return category

    return race

# Condense every answer in 'race_ethnicity' with group_race
race_grouped = df_cleaned['race_ethnicity'].apply(group_race)
df_cleaned['race_ethnicity'] = race_grouped

# Distinct categories left after condensing
unique_race_updated = df_cleaned['race_ethnicity'].unique()
unique_race_updated


array(['White/Caucasian', 'Asian', 'Choose not to reply', 'Multi-Racial',
       'Hispanic or Latino', 'Native American', 'Black/African American'],
      dtype=object)

In [30]:
# Cleaning and condensing the 'course_enrolled' column

# Function to group course values
def group_course(course):
    """Collapse a raw course title into a short course category.

    Missing values are returned unchanged. Otherwise the first row of the
    table below with a keyword contained in the title decides the category,
    and a title matching no keyword is returned as-is.
    """
    if pd.isnull(course):
        return course

    # (keywords, category): a row matches when any keyword occurs in the title
    keyword_groups = (
        (("Computer Science",), "Computer Science"),
        (("Programming", "Python"), "Programming"),
        (("Internet & Web Page Design",), "Web Design"),
        (("Intro to IT",), "IT Introduction"),
        (("Information Security",), "Information Security"),
    )
    for keywords, category in keyword_groups:
        if any(keyword in course for keyword in keywords):
            return category

    return course

# Condense every title in 'course_enrolled' with group_course
course_grouped = df_cleaned['course_enrolled'].apply(group_course)
df_cleaned['course_enrolled'] = course_grouped

# Distinct categories left after condensing
unique_course_updated = df_cleaned['course_enrolled'].unique()
unique_course_updated


array(['Computer Science', nan, 'Web Design', 'Programming',
       'IT Introduction', 'Information Security'], dtype=object)

In [31]:
# Flag rows that DataFrame.duplicated() reports as duplicates
is_duplicate = df_cleaned.duplicated()
duplicate_rows = df_cleaned.loc[is_duplicate]

# Number of duplicated rows found
len(duplicate_rows)


0

In [32]:
# Display the cleaned DataFrame as this cell's output
df_cleaned

Unnamed: 0,course_enrolled,heard_from_ccm_web_site,heard_from_social_media,heard_from_community_event,heard_from_family_member_or_friend,heard_from_current_ccm_student,heard_from_ccm_alumni,heard_from_high_school_teacher,heard_from_high_school_counselor,impact_affordable_cost,...,info_from_high_school_teacher,info_from_ccm_information_technologies_website,info_from_ccm_admissions,info_from_ccm_advisor/counselor,major_change_details,interest_in_computing_classes,reason_interest_in_computing,gender,race_ethnicity,age
0,Computer Science,Yes,Yes,Yes,Yes,No,No,Yes,Yes,Some Impact,...,Yes,Yes,Yes,Yes,First Choice,,,Man,White/Caucasian,19-20
1,,No,No,No,No,Yes,No,No,No,Some Impact,...,No,Yes,No,Yes,First Choice,,,Man,White/Caucasian,25-34
2,Computer Science,No,No,No,No,No,No,Yes,Yes,High Impact,...,,,,,,1.0,Im really only taking this class for the requi...,Woman,White/Caucasian,19-20
3,Computer Science,No,No,Yes,No,Yes,No,No,No,High Impact,...,Yes,No,Yes,No,First Choice,,,Man,Asian,18 and younger
4,Computer Science,Yes,No,No,Yes,No,No,Yes,No,,...,No,No,No,No,First Choice,,,Man,White/Caucasian,25-34
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
448,Computer Science,No,No,No,Yes,No,Yes,Yes,Yes,High Impact,...,No,No,No,No,exercise science,,,Man,Multi-Racial,19-20
449,Computer Science,No,Yes,No,No,No,No,No,No,High Impact,...,No,No,No,No,First Choice,,,Man,Hispanic or Latino,21-24
450,Computer Science,Don't recall,No,No,No,Don't recall,Don't recall,Yes,Yes,High Impact,...,Don't recall,Yes,Don't recall,No,Biology,,,Man,Multi-Racial,18 and younger
451,Computer Science,No,No,No,Yes,No,No,No,No,Some Impact,...,No,Yes,Yes,No,First Choice,,,Man,White/Caucasian,19-20


In [33]:
# Persist the cleaned data as CSV; index=False leaves the row index out of the file.
output_path = 'cleaned_data.csv'
df_cleaned.to_csv(output_path, index=False)