In [23]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [24]:
# Load the raw student records; the relative path is resolved against the
# notebook's working directory.
df_school = pd.read_csv(filepath_or_buffer='../dataset/dataset.csv')

In [25]:
# List the distinct labels of the raw 'Target' column.
pd.unique(df_school['Target'])

array(['Dropout', 'Graduate', 'Enrolled'], dtype=object)

In [26]:
# Encode the outcome as a binary dropout flag: 1 for 'Dropout', 0 for every
# other label.
# The dtype guard keeps the cell safe to re-run: comparing the already-encoded
# integers with 'Dropout' would otherwise overwrite every row with 0.
if not pd.api.types.is_numeric_dtype(df_school['Target']):
    df_school['Target'] = df_school['Target'].eq('Dropout').astype('int64')



In [27]:
# Confirm that 'Target' now holds only the encoded flag values.
pd.unique(df_school['Target'])

array([1, 0], dtype=int64)

In [28]:


# Short aliases for the dataset's verbose column headers (raw header -> alias).
column_mapping = {
    'Marital status': 'MaritalStat',
    'Application mode': 'AppMode',
    'Application order': 'AppOrder',
    'Course': 'Course',
    'Daytime/evening attendance': 'DayEveningAtt',
    'Previous qualification': 'PrevQual',
    'Nacionality': 'Nationality',  # key keeps the raw spelling 'Nacionality'
    "Mother's qualification": 'MotherQual',
    "Father's qualification": 'FatherQual',
    "Mother's occupation": 'MotherOcc',
    "Father's occupation": 'FatherOcc',
    'Displaced': 'Displaced',
    'Educational special needs': 'EduNeeds',
    'Debtor': 'Debtor',
    'Tuition fees up to date': 'FeesUpdated',
    'Gender': 'Gender',
    'Scholarship holder': 'Scholarship',
    'Age at enrollment': 'AgeEnroll',
    'International': 'International',
    # Curricular-unit columns: the same six fields for each of the two
    # semesters, so the entries are generated instead of spelled out.
    **{
        f'Curricular units {ordinal} sem ({field})': f'CU{semester}{alias}'
        for semester, ordinal in ((1, '1st'), (2, '2nd'))
        for field, alias in (
            ('credited', 'Credited'),
            ('enrolled', 'Enrolled'),
            ('evaluations', 'Evaluations'),
            ('approved', 'Approved'),
            ('grade', 'Grade'),
            ('without evaluations', 'NoEvals'),
        )
    },
    'Unemployment rate': 'UnempRate',
    'Inflation rate': 'InflationRate',
    'GDP': 'GDP',
    'Target': 'Target',
}



# Apply the short aliases. The result is reassigned instead of renaming with
# inplace=True, which keeps the step chainable; column labels that are not
# keys of column_mapping are left unchanged.
df_school = df_school.rename(columns=column_mapping)


In [29]:
# Print column names, non-null counts and dtypes to check the renamed frame.
df_school.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4424 entries, 0 to 4423
Data columns (total 35 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   MaritalStat     4424 non-null   int64  
 1   AppMode         4424 non-null   int64  
 2   AppOrder        4424 non-null   int64  
 3   Course          4424 non-null   int64  
 4   DayEveningAtt   4424 non-null   int64  
 5   PrevQual        4424 non-null   int64  
 6   Nationality     4424 non-null   int64  
 7   MotherQual      4424 non-null   int64  
 8   FatherQual      4424 non-null   int64  
 9   MotherOcc       4424 non-null   int64  
 10  FatherOcc       4424 non-null   int64  
 11  Displaced       4424 non-null   int64  
 12  EduNeeds        4424 non-null   int64  
 13  Debtor          4424 non-null   int64  
 14  FeesUpdated     4424 non-null   int64  
 15  Gender          4424 non-null   int64  
 16  Scholarship     4424 non-null   int64  
 17  AgeEnroll       4424 non-null   i

In [30]:
# Decode the binary gender flag: 1 -> 'male', 0 -> 'female'.
gender_replace = dict([(1, 'male'), (0, 'female')])
df_school['Gender'] = df_school['Gender'].replace(to_replace=gender_replace)

# Nationality labels keyed by code; the list is in code order, starting at 1.
nationality_replace = dict(enumerate(
    [
        'Portuguese',
        'German',
        'Spanish',
        'Italian',
        'Dutch',
        'English',
        'Lithuanian',
        'Angolan',
        'Cape Verdean',
        'Guinean',
        'Mozambican',
        'Santomean',
        'Turkish',
        'Brazilian',
        'Romanian',
        'Moldova (Republic of)',
        'Mexican',
        'Ukrainian',
        'Russian',
        'Cuban',
        'Colombian',
    ],
    start=1,
))
# Swap nationality codes for their labels; values that are not keys of
# nationality_replace are left as they are.
df_school['Nationality'] = (
    df_school['Nationality']
    .replace(to_replace=nationality_replace)
)

In [31]:
# Decode the marital-status codes 1-6 into readable labels.
marital_mapping = dict(zip(
    range(1, 7),
    ('Single', 'Married', 'Widower', 'Divorced', 'Facto union', 'Legally separated'),
))
df_school['MaritalStat'] = df_school['MaritalStat'].replace(to_replace=marital_mapping)


In [32]:
# Detailed qualification label for every raw code (1-34). The table is declared
# label-first so codes sharing a label sit together, then flattened into a
# code -> label dict ordered by code.
qualification_mapping = dict(sorted(
    (code, label)
    for label, codes in {
        'Primary Education': (7, 8, 9, 18, 20, 21, 25, 26, 27, 28),
        'Secondary Education': (1, 10, 11, 12, 13, 14, 15, 17, 19, 22, 23),
        'Higher Education - Undergraduate': (2, 3, 6, 30, 31, 32),
        'Higher Education - Graduate': (4, 5, 33, 34),
        'Vocational/Technical': (16, 29),
        'Unknown': (24,),
    }.items()
    for code in codes
))

# Coarser grouping: both higher-education tiers collapse into a single
# 'Higher Education' level, every other label maps to itself.
grouped_qualifications = {
    label: 'Higher Education' if label.startswith('Higher Education') else label
    for label in (
        'Primary Education',
        'Secondary Education',
        'Higher Education - Undergraduate',
        'Higher Education - Graduate',
        'Vocational/Technical',
        'Unknown',
    )
}


# Bucket the parental qualification codes into broad education levels:
# code -> detailed label (qualification_mapping) -> grouped level
# (grouped_qualifications).
# FatherQual is overwritten in place, while MotherQual keeps its raw codes and
# the grouped labels go to the new MotherQual_Cate column.
for raw_col, grouped_col in (("FatherQual", "FatherQual"), ("MotherQual", "MotherQual_Cate")):
    df_school[grouped_col] = (
        df_school[raw_col]
        .map(qualification_mapping)
        .replace(grouped_qualifications)
    )


In [39]:
# Course names keyed by code; the list is in code order, starting at 1.
course_mapping = dict(enumerate(
    [
        'Biofuel Production Technologies',
        'Animation and Multimedia Design',
        'Social Service (evening attendance)',
        'Agronomy',
        'Communication Design',
        'Veterinary Nursing',
        'Informatics Engineering',
        'Equiniculture',
        'Management',
        'Social Service',
        'Tourism',
        'Nursing',
        'Oral Hygiene',
        'Advertising and Marketing Management',
        'Journalism and Communication',
        'Basic Education',
        'Management (evening attendance)',
    ],
    start=1,
))

# Broader study area for every course name. The areas are declared with their
# member courses; iterating over course_mapping keeps the keys in course-code
# order.
new_course_mapping = {
    course: area
    for course in course_mapping.values()
    for area, members in {
        'Science and Technology': (
            'Biofuel Production Technologies',
            'Agronomy',
            'Informatics Engineering',
        ),
        'Arts, Design, and Social Sciences': (
            'Animation and Multimedia Design',
            'Social Service (evening attendance)',
            'Communication Design',
            'Equiniculture',
            'Social Service',
            'Tourism',
            'Journalism and Communication',
            'Basic Education',
        ),
        'Health, Business, and Management': (
            'Veterinary Nursing',
            'Management',
            'Nursing',
            'Oral Hygiene',
            'Advertising and Marketing Management',
            'Management (evening attendance)',
        ),
    }.items()
    if course in members
}

# Translate each course code to its name and then to its broad study area.
df_school['Course'] = (
    df_school['Course']
    .map(course_mapping)
    .map(new_course_mapping)
)


In [40]:
# Decode the attendance flag: 1 -> 'daytime', 0 -> 'evening'.
df_school['DayEveningAtt'] = df_school['DayEveningAtt'].replace(
    to_replace={1: 'daytime', 0: 'evening'},
)


In [35]:
# List the distinct previous-qualification codes present in the data.
pd.unique(df_school['PrevQual'])

array([ 1, 12, 16, 14,  8,  3, 15,  2,  4,  9, 17, 11,  6,  7, 13,  5, 10],
      dtype=int64)

In [37]:
# Label for each previous-qualification code. The labels are declared with
# their member codes; the outer tuple fixes the key order of the resulting
# dict.
prev_qual_mapping = {
    code: label
    for code in (1, 12, 16, 14, 8, 3, 15, 2, 4, 9, 17, 11, 6, 7, 13, 5, 10)
    for label, codes in {
        'Secondary Education': (1,),
        'Basic Education': (10, 11, 12, 13),
        'Incomplete Secondary Education': (7, 8),
        'Higher Education (Undergraduate)': (2, 3, 15, 16),
        'Higher Education (Postgraduate)': (4, 5, 17),
        'Other or Vocational Training': (6, 9, 14),
    }.items()
    if code in codes
}

# Swap each previous-qualification code for its label; a code that is not a
# key of prev_qual_mapping becomes NaN.
df_school['PrevQual'] = (
    df_school['PrevQual']
    .map(prev_qual_mapping)
)


In [41]:
# List the distinct raw values of the 'Scholarship' flag.
pd.unique(df_school['Scholarship'])

array([0, 1], dtype=int64)

In [None]:
# Recode the binary 'Scholarship' flag into readable labels: 0 -> 'No', 1 -> 'Yes'.
scholarship_mapping = dict(zip((0, 1), ('No', 'Yes')))
df_school['Scholarship'] = (
    df_school['Scholarship']
    .map(scholarship_mapping)
)


In [43]:
# List the distinct application-mode codes present in the data.
pd.unique(df_school['AppMode'])

array([ 8,  6,  1, 12,  9, 17, 15, 16, 14,  4, 13,  7,  3,  2,  5, 18, 10,
       11], dtype=int64)

In [None]:
# Application-mode codes collapsed into three admission routes. The routes are
# declared with their member codes and flattened into a code -> route dict.
app_mode_mapping = {
    code: route
    for route, codes in {
        'General Admission': (1, 8, 9, 12),
        'Special Contingents or Conditions': (2, 3, 4, 5, 6, 7, 10, 11),
        'Course/Institution Changes': (13, 14, 15, 16, 17, 18),
    }.items()
    for code in codes
}

# Swap each application-mode code for its admission route; a code that is not
# a key of app_mode_mapping becomes NaN.
df_school['AppMode'] = (
    df_school['AppMode']
    .map(app_mode_mapping)
)


In [44]:
# List the distinct application-order values present in the data.
pd.unique(df_school['AppOrder'])

array([5, 1, 2, 4, 3, 6, 9, 0], dtype=int64)

In [45]:
# List the distinct mother's-occupation codes present in the data.
pd.unique(df_school['MotherOcc'])

array([ 6,  4, 10,  8,  5,  2, 16,  1,  7,  3, 12,  9, 20, 28, 13, 29, 23,
       32, 30, 18, 24, 19, 11, 21, 15, 27, 31, 14, 22, 17, 26, 25],
      dtype=int64)

In [None]:
# Occupation codes collapsed into three skill bands. The bands are declared
# with their member codes and flattened into a code -> band dict.
# NOTE(review): codes 11, 13, 14, 15 and 35 have no entry here, although
# 11, 13, 14 and 15 appear among the MotherOcc values shown by the notebook.
occ_mapping = {
    code: band
    for band, codes in {
        'Highly Skilled/Professional': (
            1, 2, 3, 4, 5, 16, 20, 21, 19, 23, 24, 25, 26, 22, 17, 18, 29, 28, 27,
        ),
        'Skilled/Semi-Skilled': (
            6, 7, 8, 9, 30, 31, 32, 33, 34, 36, 37, 38, 39, 40, 41, 42,
        ),
        'Unskilled': (10, 12, 43, 44, 45, 46),
    }.items()
    for code in codes
}

# Collapse the parental occupation codes into the skill bands of occ_mapping.
# A bare .map() turns every code without an entry into NaN, and the MotherOcc
# values shown by the notebook include 11, 13, 14 and 15, which occ_mapping
# lacks. Those rows are labelled 'Unknown' explicitly instead of silently
# becoming missing values.
for occ_col in ('MotherOcc', 'FatherOcc'):
    # Only raw numeric codes are recoded, so re-running the cell is a no-op
    # rather than relabelling the already-converted strings.
    if pd.api.types.is_numeric_dtype(df_school[occ_col]):
        df_school[occ_col] = df_school[occ_col].map(occ_mapping).fillna('Unknown')


In [47]:
# List the distinct raw values of the 'Displaced' flag.
pd.unique(df_school['Displaced'])

array([1, 0], dtype=int64)

In [None]:
# Recode the binary 'Displaced' flag into readable labels: 0 -> 'No', 1 -> 'Yes'.
Displaced_mapping = dict(zip((0, 1), ('No', 'Yes')))
df_school['Displaced'] = (
    df_school['Displaced']
    .map(Displaced_mapping)
)


In [48]:
# List the distinct raw values of the 'EduNeeds' flag.
pd.unique(df_school['EduNeeds'])

array([0, 1], dtype=int64)

In [None]:
# Recode the binary 'EduNeeds' flag into readable labels: 0 -> 'No', 1 -> 'Yes'.
EduNeeds_mapping = dict(zip((0, 1), ('No', 'Yes')))
df_school['EduNeeds'] = (
    df_school['EduNeeds']
    .map(EduNeeds_mapping)
)


In [49]:
# List the distinct raw values of the 'Debtor' flag.
pd.unique(df_school['Debtor'])

array([0, 1], dtype=int64)

In [None]:
# Recode the binary 'Debtor' flag into readable labels: 0 -> 'No', 1 -> 'Yes'.
Debtor_mapping = dict(zip((0, 1), ('No', 'Yes')))
df_school['Debtor'] = (
    df_school['Debtor']
    .map(Debtor_mapping)
)


In [50]:
# List the distinct raw values of the 'FeesUpdated' flag.
pd.unique(df_school['FeesUpdated'])

array([1, 0], dtype=int64)

In [None]:
# Recode the binary 'FeesUpdated' flag into readable labels: 0 -> 'No', 1 -> 'Yes'.
FeesUpdated_mapping = dict(zip((0, 1), ('No', 'Yes')))
df_school['FeesUpdated'] = (
    df_school['FeesUpdated']
    .map(FeesUpdated_mapping)
)


In [51]:
# List the distinct raw values of the 'International' flag.
pd.unique(df_school['International'])

array([0, 1], dtype=int64)

In [None]:
# Recode the binary 'International' flag into readable labels: 0 -> 'No', 1 -> 'Yes'.
International_mapping = dict(zip((0, 1), ('No', 'Yes')))
df_school['International'] = (
    df_school['International']
    .map(International_mapping)
)


In [38]:
# Preview the first five rows of the frame.
df_school.head(n=5)

Unnamed: 0,MaritalStat,AppMode,AppOrder,Course,DayEveningAtt,PrevQual,Nationality,MotherQual,FatherQual,MotherOcc,...,CU2Evaluations,CU2Approved,CU2Grade,CU2NoEvals,UnempRate,InflationRate,GDP,Target,MotherQual_Cate,PrevQual_Category
0,Single,8,5,Arts and Design,daytime,1,Portuguese,13,Secondary Education,6,...,0,0,0.0,0,10.8,1.4,1.74,1,Secondary Education,Secondary Education
1,Single,6,1,Social Sciences and Services,daytime,1,Portuguese,1,Higher Education,4,...,6,6,13.666667,0,13.9,-0.3,0.79,0,Secondary Education,Secondary Education
2,Single,1,5,Arts and Design,daytime,1,Portuguese,22,Primary Education,10,...,0,0,0.0,0,10.8,1.4,1.74,1,Secondary Education,Secondary Education
3,Single,8,2,Communication and Media Studies,daytime,1,Portuguese,23,Primary Education,6,...,10,5,12.4,0,9.4,-0.8,-3.12,0,Secondary Education,Secondary Education
4,Married,12,1,Social Sciences and Services,evening,1,Portuguese,22,Primary Education,10,...,6,6,13.0,0,13.9,-0.3,0.79,0,Secondary Education,Secondary Education
