In [1]:
## Importing Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.cm as cm
import matplotlib as mpl
import warnings

## Importing Specific Modules from scikit-learn

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression

## Importing Metrics and Preprocessing Tools

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score, precision_score, recall_score, accuracy_score, f1_score, roc_curve
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

## Notebook-wide setup
# Silence every warning for the rest of the session. NOTE(review): this also
# hides deprecation warnings raised by pandas / scikit-learn calls in later
# cells — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')
# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline

## Data Cleaning and Preprocessing

In [2]:
## Load the raw dataset from data.csv (path is relative to the working directory).
# NOTE: the header 'Daytime/evening attendance\t' carries a trailing tab in the
# file, so that column has to be referenced with the tab included.
df = pd.read_csv(filepath_or_buffer="data.csv")

In [3]:
## Count the missing entries in each column (isna is the canonical name for isnull)
df.isna().sum()

Marital status                                    0
Application mode                                  0
Application order                                 0
Course                                            0
Daytime/evening attendance\t                      0
Previous qualification                            0
Previous qualification (grade)                    0
Nacionality                                       0
Mother's qualification                            0
Father's qualification                            0
Mother's occupation                               0
Father's occupation                               0
Admission grade                                   0
Displaced                                         0
Educational special needs                         0
Debtor                                            0
Tuition fees up to date                           0
Gender                                            0
Scholarship holder                                0
Age at enrol

In [6]:
# Clean the loaded frame: remove duplicate rows, coerce the fractional columns
# to whole numbers, fix the misspelled column name, encode the target labels,
# and write a reduced copy to disk.

# Boolean mask of rows that repeat an earlier row. Computed before the drop so
# the duplicates can still be inspected afterwards (e.g. duplicates.sum()).
duplicates = df.duplicated()

# Rebind instead of using inplace=True so the object loaded from data.csv is
# not mutated behind the back of anything else that references it.
df = df.drop_duplicates()

# Round every float column to 0 decimal places.
# NOTE: this discards the fractional part of the grade and macro-economic columns.
df = df.round(0)

# Columns cast to int64 once rounded.
# NOTE(review): presumably these are all of the float columns — confirm with df.dtypes.
whole_number_columns = [
    'Admission grade',
    'Previous qualification (grade)',
    'Curricular units 1st sem (grade)',
    'Curricular units 2nd sem (grade)',
    'Unemployment rate',
    'Inflation rate',
    'GDP',
]
# A dict-based astype sets each dtype explicitly instead of going through a
# multi-column __setitem__.
df = df.astype({column: np.int64 for column in whole_number_columns})

# Fix the misspelled column name.
df = df.rename(columns={'Nacionality': 'Nationality'})

# Encode the 'Target' labels: 'Dropout' -> 0, 'Graduate' -> 1, 'Enrolled' -> 2.
# replace() leaves already-encoded values untouched, so re-running the cell is
# safe. infer_objects() makes the object -> integer conversion explicit: the
# silent downcast replace() used to perform is deprecated in pandas 2.2, and
# the resulting FutureWarning is hidden by the notebook's global warnings filter.
target_codes = {'Dropout': 0, 'Graduate': 1, 'Enrolled': 2}
df = (
    df.replace({'Target': target_codes})
      .assign(Target=lambda frame: frame['Target'].infer_objects())
)

# Drop, by name, the columns excluded from the cleaned dataset.
# `columns=` already implies the axis, so no `axis=1` is needed.
columns_to_drop = [
    'Application mode',
    'Application order',
    'Debtor',
    'Marital status',
    'Unemployment rate',
    'Inflation rate',
]
df1 = df.drop(columns=columns_to_drop)

# Save the reduced frame (df1) to a new CSV file without the index column.
df1.to_csv('cleandata1.csv', index=False)
df1

Unnamed: 0,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nationality,Mother's qualification,Father's qualification,Mother's occupation,Father's occupation,Admission grade,...,Curricular units 1st sem (grade),Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),GDP,Target
0,171,1,1,122,1,19,12,5,9,127,...,0,0,0,0,0,0,0,0,2,0
1,9254,1,1,160,1,1,3,3,3,142,...,14,0,0,6,6,6,14,0,1,1
2,9070,1,1,122,1,37,37,9,9,125,...,0,0,0,6,0,0,0,0,2,0
3,9773,1,1,122,1,38,37,5,3,120,...,13,0,0,6,10,5,12,0,-3,1
4,8014,0,1,100,1,37,38,9,9,142,...,12,0,0,6,6,6,13,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4419,9773,1,1,125,1,1,1,5,4,122,...,14,0,0,6,8,5,13,0,-4,1
4420,9773,1,1,120,105,1,1,9,9,119,...,12,0,0,6,6,2,11,0,2,0
4421,9500,1,1,154,1,37,37,9,9,150,...,15,0,0,8,9,1,14,0,1,0
4422,9147,1,1,180,1,37,37,7,4,154,...,14,0,0,5,6,5,12,0,-3,1
