#### Luqman

#### Basic preprocessing, nulls, duplicates, fixing names and separating the data columns

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# This line configures matplotlib to show figures embedded in the Jupyter notebook, 
# instead of opening a new window for each figure.
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [2]:
sp = pd.read_csv("Student performance (Polytechnic Institute of Portalegre).csv")
sp.head()

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,17,5,171,1,1,122.0,1,19,12,...,0,0,0,0,0.0,0,10.8,1.4,1.74,Dropout
1,1,15,1,9254,1,1,160.0,1,1,3,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,Graduate
2,1,1,5,9070,1,1,122.0,1,37,37,...,0,6,0,0,0.0,0,10.8,1.4,1.74,Dropout
3,1,17,2,9773,1,1,122.0,1,38,37,...,0,6,10,5,12.4,0,9.4,-0.8,-3.12,Graduate
4,2,39,1,8014,0,1,100.0,1,37,38,...,0,6,6,6,13.0,0,13.9,-0.3,0.79,Graduate


In [3]:
sp.describe

<bound method NDFrame.describe of       Marital status  Application mode  Application order  Course  \
0                  1                17                  5     171   
1                  1                15                  1    9254   
2                  1                 1                  5    9070   
3                  1                17                  2    9773   
4                  2                39                  1    8014   
...              ...               ...                ...     ...   
4419               1                 1                  6    9773   
4420               1                 1                  2    9773   
4421               1                 1                  1    9500   
4422               1                 1                  1    9147   
4423               1                10                  1    9773   

      Daytime/evening attendance\t  Previous qualification  \
0                                1                       1   
1            

### Handling Missing data/null values

In [4]:
has_nan = sp.isnull().values.any()
print(has_nan)

False


In [5]:
null_values = sp.isnull()
print(null_values)

      Marital status  Application mode  Application order  Course  \
0              False             False              False   False   
1              False             False              False   False   
2              False             False              False   False   
3              False             False              False   False   
4              False             False              False   False   
...              ...               ...                ...     ...   
4419           False             False              False   False   
4420           False             False              False   False   
4421           False             False              False   False   
4422           False             False              False   False   
4423           False             False              False   False   

      Daytime/evening attendance\t  Previous qualification  \
0                            False                   False   
1                            False             

In [6]:
null_count = sp.isnull().sum()
print(null_count)

Marital status                                    0
Application mode                                  0
Application order                                 0
Course                                            0
Daytime/evening attendance\t                      0
Previous qualification                            0
Previous qualification (grade)                    0
Nacionality                                       0
Mother's qualification                            0
Father's qualification                            0
Mother's occupation                               0
Father's occupation                               0
Admission grade                                   0
Displaced                                         0
Educational special needs                         0
Debtor                                            0
Tuition fees up to date                           0
Gender                                            0
Scholarship holder                                0
Age at enrol

#### This dataset has no null values

In [11]:
duplicates = sp.duplicated().sum()
print(duplicates)

0


#### Duplicate Data: This dataset has no duplicates

In [12]:
sp['Target'].unique()

array(['Dropout', 'Graduate', 'Enrolled'], dtype=object)

In [13]:
# Changing target values to numerical numbers
Target_Status = pd.DataFrame({'Target':['Dropout', 'Graduate', 'Enrolled'], 'Target_status': [1,2,3]})
Target_Status

Unnamed: 0,Target,Target_status
0,Dropout,1
1,Graduate,2
2,Enrolled,3


In [16]:
clean_sp = sp.merge(Target_Status, on='Target')

In [18]:
# Drop the orinal target collumn
clean_sp.head()
num_sp = clean_sp.drop(columns=['Target'])
num_sp.head()

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target_status
0,1,17,5,171,1,1,122.0,1,19,12,...,0,0,0,0,0.0,0,10.8,1.4,1.74,1
1,1,15,1,9254,1,1,160.0,1,1,3,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,2
2,1,1,5,9070,1,1,122.0,1,37,37,...,0,6,0,0,0.0,0,10.8,1.4,1.74,1
3,1,17,2,9773,1,1,122.0,1,38,37,...,0,6,10,5,12.4,0,9.4,-0.8,-3.12,2
4,2,39,1,8014,0,1,100.0,1,37,38,...,0,6,6,6,13.0,0,13.9,-0.3,0.79,2


In [22]:
num_sp = num_sp.rename(columns={'Nacionality': 'Nationality', }) # Fix mispelling

In [26]:
num_sp = num_sp.rename(columns={'Daytime/evening attendance\t': 'Daytime/evening attendance'}) #Getting rid of a tab

In [36]:
# Data Relevant to Q1
spQ1 = num_sp[['Marital status', 'Daytime/evening attendance', 'Nationality', 'Displaced', 'Educational special needs', 'Debtor', 'Tuition fees up to date', 
               'Gender', 'Scholarship holder', 'International', 'Age at enrollment', 'Target_status' ]]
spQ1.head()

Unnamed: 0,Marital status,Daytime/evening attendance,Nationality,Displaced,Educational special needs,Debtor,Tuition fees up to date,Gender,Scholarship holder,International,Age at enrollment,Target_status
0,1,1,1,1,0,0,1,1,0,0,20,1
1,1,1,1,1,0,0,0,1,0,0,19,2
2,1,1,1,1,0,0,0,1,0,0,19,1
3,1,1,1,1,0,0,1,0,0,0,20,2
4,2,0,1,0,0,0,1,0,0,0,45,2


In [37]:
# Data Relevant to Q2
spQ2 = num_sp[['Mother\'s qualification', 'Father\'s qualification', 'Mother\'s occupation', 'Father\'s occupation', 'Unemployment rate', 'Inflation rate', 
               'GDP', 'Target_status' ]]
spQ2.head()

Unnamed: 0,Mother's qualification,Father's qualification,Mother's occupation,Father's occupation,Unemployment rate,Inflation rate,GDP,Target_status
0,19,12,5,9,10.8,1.4,1.74,1
1,1,3,3,3,13.9,-0.3,0.79,2
2,37,37,9,9,10.8,1.4,1.74,1
3,38,37,5,3,9.4,-0.8,-3.12,2
4,37,38,9,9,13.9,-0.3,0.79,2


In [38]:
# Data Relevant to Q3
spQ3 = num_sp[['Application mode', 'Application order', 'Course', 'Previous qualification', 'Previous qualification (grade)', 'Admission grade', 'Target_status' ]]
spQ3.head()

Unnamed: 0,Application mode,Application order,Course,Previous qualification,Previous qualification (grade),Admission grade,Target_status
0,17,5,171,1,122.0,127.3,1
1,15,1,9254,1,160.0,142.5,2
2,1,5,9070,1,122.0,124.8,1
3,17,2,9773,1,122.0,119.6,2
4,39,1,8014,1,100.0,141.5,2


In [39]:
# Data Relevant to Q4
spQ4 = num_sp[['Curricular units 1st sem (credited)', 'Curricular units 1st sem (enrolled)', 'Curricular units 1st sem (evaluations)', 
               'Curricular units 1st sem (approved)', 'Curricular units 1st sem (grade)', 'Curricular units 1st sem (without evaluations)', 
               'Curricular units 2nd sem (credited)', 'Curricular units 2nd sem (enrolled)', 'Curricular units 2nd sem (evaluations)', 
               'Curricular units 2nd sem (approved)', 'Curricular units 2nd sem (grade)', 'Curricular units 2nd sem (without evaluations)', 'Target_status' ]]
spQ4.head()

Unnamed: 0,Curricular units 1st sem (credited),Curricular units 1st sem (enrolled),Curricular units 1st sem (evaluations),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Target_status
0,0,0,0,0,0.0,0,0,0,0,0,0.0,0,1
1,0,6,6,6,14.0,0,0,6,6,6,13.666667,0,2
2,0,6,0,0,0.0,0,0,6,0,0,0.0,0,1
3,0,6,8,6,13.428571,0,0,6,10,5,12.4,0,2
4,0,6,9,5,12.333333,0,0,6,6,6,13.0,0,2


In [42]:
spQ1.to_csv('spQ1.csv', index=False)

In [43]:
spQ2.to_csv('spQ2.csv', index=False)

In [44]:
spQ3.to_csv('spQ3.csv', index=False)

In [45]:
spQ4.to_csv('spQ4.csv', index=False)