##### Note: all cell outputs have been cleared to reduce the file size for the GitHub upload

#### Import necessary libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import plotly.express as px
from pandas.plotting import parallel_coordinates

#### Load dataset (The already cleaned and transformed dataset from week 1)

In [None]:
# Read the week-1 cleaned/transformed CSV into a DataFrame.
# The path is relative, so the file is looked up in the current working directory.
school_data = pd.read_csv("cleaned_transformed_dataset.csv")

In [None]:
# Lift pandas' display limit on the number of columns so wide frames are shown in full.
pd.set_option("display.max_columns", None)

In [None]:
# Preview the first few rows to confirm the dataset loaded as expected.
school_data.head()

#### Univariate Analysis

##### - Descriptive Statistics

In [None]:
# Summary statistics, transposed so that each variable occupies one row.
school_data.describe().T

In [None]:
# List the column names; the variable lists defined below must match these exactly.
school_data.columns

In [None]:
# Columns treated as numerical throughout the analysis, one name per line.
numerical_variables = [
    # Application and admission
    "Application order",
    "Previous qualification (grade)",
    "Admission grade",
    "Age at enrollment",
    # First-semester curricular units
    "Curricular units 1st sem (credited)",
    "Curricular units 1st sem (enrolled)",
    "Curricular units 1st sem (evaluations)",
    "Curricular units 1st sem (approved)",
    "Curricular units 1st sem (grade)",
    "Curricular units 1st sem (without evaluations)",
    # Second-semester curricular units
    "Curricular units 2nd sem (credited)",
    "Curricular units 2nd sem (enrolled)",
    "Curricular units 2nd sem (evaluations)",
    "Curricular units 2nd sem (approved)",
    "Curricular units 2nd sem (grade)",
    "Curricular units 2nd sem (without evaluations)",
    # Economic indicators
    "Unemployment rate",
    "Inflation rate",
    "GDP",
    # Aggregate columns
    'Total Curricular Units 1st Semester',
    'Total Curricular Units 2nd Semester',
    'Total Credits Earned',
    'Total Units Enrolled',
    'Weighted Grade 1st Semester',
    'Weighted Grade 2nd Semester',
    'GPA',
]

##### - Histograms for numerical variables

In [None]:
import warnings

# Ignore every FutureWarning from here on so such notices do not clutter cell output.
warnings.filterwarnings('ignore', category=FutureWarning)

In [None]:
# Grid geometry: four panels per row, with as many rows as the variable list requires
ncols = 4
num_columns = len(numerical_variables)
nrows = int(np.ceil(num_columns / ncols))

# One tall figure holding every histogram; flatten the 2-D axes grid into a 1-D sequence
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(15, 5 * nrows))
axes = axes.flatten()

# Pair each variable with its own panel and draw a 20-bin histogram with a KDE overlay
for panel, col in zip(axes, numerical_variables):
    sns.histplot(school_data[col], ax=panel, kde=True, bins=20)
    panel.set_title(f"Distribution of {col}")
    panel.grid(False)

# Panels beyond the last variable are left over from rounding the grid up; drop them
for spare_panel in axes[num_columns:]:
    fig.delaxes(spare_panel)

# Tidy the spacing, write the image to disk, then display it
plt.tight_layout()
plt.savefig('Histograms for numerical variables.png', dpi=300)
plt.show()

##### - Boxplot for numerical variables

In [None]:
# Box plots of every numerical variable, split across several figures ("pages")
# so that no single figure becomes unreadably tall.
num_columns = len(numerical_variables)

# Grid shape of one page
fig_columns = 4
fig_rows = 5
plots_per_figure = fig_columns * fig_rows

# Walk through the variable list one page at a time; `page` numbers the output files
for page, start in enumerate(range(0, num_columns, plots_per_figure), start=1):
    end = min(start + plots_per_figure, num_columns)
    fig, axes = plt.subplots(nrows=fig_rows, ncols=fig_columns, figsize=(15, 5 * fig_rows))
    axes = axes.flatten()

    # Draw one box plot per variable on this page
    for i, col in enumerate(numerical_variables[start:end]):
        sns.boxplot(x=school_data[col], ax=axes[i])
        axes[i].set_title(f"Box Plot of {col}")
        axes[i].grid(False)

    # Remove the panels that were not needed on this page
    for j in range(i + 1, len(axes)):
        fig.delaxes(axes[j])

    # Adjust spacing between subplots
    plt.tight_layout()

    # Include the page number in the filename: with a fixed name every page
    # would overwrite the previous one and only the last page would survive.
    plt.savefig(f'Boxplot for numerical variables ({page}).png', dpi=300)
    plt.show()

##### - Bar charts for categorical variables

In [None]:
# Columns treated as categorical throughout the analysis, one name per line.
categorical_variables = [
    "Marital status",
    "Application mode",
    "Course",
    "Previous qualification",
    "Nationality",
    "Mother's qualification",
    "Father's qualification",
    "Daytime/evening attendance",
    "Displaced",
    "Debtor",
    "Tuition fees up to date",
    "Gender",
    "Scholarship holder",
    "International",
    "Mother's occupation",
    "Father's occupation",
    "Target",
]

In [None]:
# Count plots of every categorical variable, split across several figures ("pages").

# Grid shape of one page
fig_columns = 3
fig_rows = 4
plots_per_figure = fig_columns * fig_rows

# Walk through the variable list one page at a time; `page` numbers the output files
for page, start in enumerate(range(0, len(categorical_variables), plots_per_figure), start=1):
    end = min(start + plots_per_figure, len(categorical_variables))
    fig, axes = plt.subplots(nrows=fig_rows, ncols=fig_columns, figsize=(18, 5 * fig_rows))
    axes = axes.flatten()

    # Plot each categorical variable on this page
    for i, col in enumerate(categorical_variables[start:end]):
        # Decide the orientation once: more than 10 distinct values would crowd
        # the x-axis, so those variables get horizontal bars instead.
        many_categories = len(school_data[col].unique()) > 10  # Adjust threshold if necessary

        if many_categories:
            sns.countplot(data=school_data, y=col, ax=axes[i], palette='viridis')
        else:
            sns.countplot(data=school_data, x=col, ax=axes[i], palette='viridis')
            # Vertical bars: tilt the category labels for readability
            axes[i].tick_params(axis='x', rotation=45)

        axes[i].set_title(f"Bar Plot of {col}")
        axes[i].grid(False)

    # Remove the panels that were not needed on this page
    for j in range(i + 1, len(axes)):
        fig.delaxes(axes[j])

    # Adjust spacing between subplots
    plt.tight_layout()

    # Include the page number in the filename: with a fixed name every page
    # would overwrite the previous one and only the last page would survive.
    plt.savefig(f'Bar charts for categorical variables ({page}).png', dpi=300)
    plt.show()

#### Bivariate Analysis

##### - Scatter plots for pairs of numerical variables

In [None]:
from itertools import combinations

# Scatter plots for pairs of numerical variables on a fixed 10 x 3 grid.
num_vars = len(numerical_variables)
num_cols = 3
num_rows = 10

fig, axes = plt.subplots(nrows=num_rows, ncols=num_cols, figsize=(15, 25))  # Adjusted figure size

# Flatten the axes array for easier indexing
axes = axes.flatten()

# Each unordered pair exactly once. Pairing every variable with every *other*
# variable would also produce the mirrored (b, a) for each (a, b), wasting
# panels on plots that only swap the axes.
pairs = list(combinations(numerical_variables, 2))

# zip stops at the shorter sequence, so at most len(axes) pairs are drawn;
# the remaining pairs are intentionally left out of this overview figure.
for ax, (var1, var2) in zip(axes, pairs):
    ax.scatter(school_data[var1], school_data[var2])
    ax.set_xlabel(var1, fontsize=8)
    ax.set_ylabel(var2, fontsize=8)
    ax.set_title(f"{var1} vs. {var2}", fontsize=10)

# Hide any panels left empty when there are fewer pairs than panels
for ax in axes[len(pairs):]:
    ax.axis('off')

# Adjust layout to reduce space
plt.tight_layout(pad=1.0)  # Reduced padding between subplots
plt.subplots_adjust(hspace=0.5, wspace=0.3)  # Reduced space between rows and columns
plt.savefig('Scatter plots for pairs of numerical variables.png', dpi=300)
plt.show()


##### - Box plots of numerical variables grouped by categorical variables

In [None]:
# One box plot per (numerical variable, categorical variable) combination.
for num_var in numerical_variables:
    for cat_var in categorical_variables:
        plt.figure(figsize=(8, 6))
        sns.boxplot(x=cat_var, y=num_var, data=school_data)
        plt.title(f'Box Plot of {num_var} by {cat_var}')
        plt.xticks(rotation=45)  # Rotate x-axis labels for better readability

        # Finalise the layout before saving so the file matches what is displayed
        plt.tight_layout()

        # Give every figure its own file: a fixed name would be overwritten on
        # each iteration. "/" is replaced because column names such as
        # "Daytime/evening attendance" would otherwise be read as a directory path.
        file_stem = f"Boxplot of {num_var} by {cat_var}".replace("/", "-")
        plt.savefig(f"{file_stem}.png", dpi=300, bbox_inches='tight')
        plt.show()

        # Release the figure; otherwise hundreds of figures may stay open
        # when the backend does not close them on show().
        plt.close()

##### - Correlation analysis

In [None]:
# Restrict to numeric dtypes first, since a correlation is only defined for numbers
numeric_data = school_data.select_dtypes(
    include=['number'],
)

# Pairwise Pearson correlation between all numeric columns
correlation_matrix = numeric_data.corr(
    method="pearson",
)

In [None]:
# Display the Pearson correlation matrix computed in the previous cell.
correlation_matrix

In [None]:
# Heatmap of the Pearson correlation matrix on a fixed [-1, 1] colour scale.
heatmap_options = dict(
    vmin=-1,
    vmax=1,
    annot=False,       # cell values are not written into the cells
    fmt='.2f',
    cmap="YlGnBu",
    cbar=True,
    linewidths=0.5,
)

plt.figure(figsize=(10, 10))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title("Pearson Correlation")

# Save a tightly cropped copy, then display the figure
plt.savefig("Heatmap.png", dpi=300, bbox_inches='tight')
plt.show()

##### - Chi-square tests for categorical variables

In [None]:
from itertools import combinations

# Chi-square test of independence for every pair of categorical variables.
# The test is symmetric in its two variables, so each unordered pair is tested
# once; looping over both (a, b) and (b, a) would double the work and the output.
for var1, var2 in combinations(categorical_variables, 2):
    # Create a contingency table of observed counts
    contingency_table = pd.crosstab(school_data[var1], school_data[var2])

    # Perform the chi-square test (the expected frequencies are returned but not reported)
    chi2, p_value, dof, expected = chi2_contingency(contingency_table)

    # Print the results
    print(f"Chi-Square Test for {var1} and {var2}")
    print(f"Chi-Square Statistic: {chi2}")
    print(f"P-value: {p_value}")
    print(f"Degrees of Freedom: {dof}")
    print("\n")

#### Multivariate Analysis

##### - Pair plots

In [None]:
# Pair plot over every column of the dataset, coloured by the Target class.
sns.set_style("whitegrid")
sns.pairplot(
    data=school_data,
    hue="Target",
    height=3,
)
plt.show()

- The full pairplot is too large and too dense to read. Based on the correlation analysis, I'll pick the most relevant columns for the pairplot analysis.

In [None]:
# Columns kept for the reduced pair plot.
subset = [
    "Curricular units 1st sem (approved)",
    "Curricular units 1st sem (grade)",
    "Curricular units 2nd sem (approved)",
    "Curricular units 2nd sem (grade)",
    "Tuition fees up to date",
    "Scholarship holder",
]

In [None]:
# Reduced pair plot: only the selected columns, coloured by Target.
sns.set_style("whitegrid")

# One viridis colour per distinct Target value
num_classes = school_data['Target'].nunique()
viridis_colors = sns.color_palette("viridis", num_classes)

# Target has to be part of the plotted frame so it can drive the hue
pairplot_columns = subset + ['Target']
sns.pairplot(
    school_data[pairplot_columns],
    hue="Target",
    height=3,
    aspect=1.2,
    palette=viridis_colors,
)

# Write the figure to disk, then display it
plt.savefig('Pairplot.png', dpi=300)
plt.show()


##### - Principal component analysis (PCA)

In [None]:
# Everything except the Target column serves as PCA input
features = school_data.drop(columns="Target")

In [None]:
# Standardize the features: learn the scaling parameters from them, then apply
# the scaling to the same data.
scaler = StandardScaler()
scaler.fit(features)
scaled_data = scaler.transform(features)

In [None]:
# Perform PCA
# Fit on the standardized features and project them onto the first 2 principal
# components (two so the result can be drawn as a 2-D scatter plot).
pca = PCA(n_components=2)
pca_data = pca.fit_transform(scaled_data)

In [None]:
# Compare dimensions before and after the projection (scaled input first, PCA output second)
for matrix in (scaled_data, pca_data):
    print(matrix.shape)

In [None]:
# Fraction of the total variance captured by each fitted principal component
explained_variance = pca.explained_variance_ratio_
print("Explained Variance Ratio:", explained_variance)

In [None]:
# Scatter of the two principal components, coloured by the Target column.
first_component = pca_data[:, 0]
second_component = pca_data[:, 1]

plt.figure(figsize=(8, 6))
plt.scatter(
    first_component,
    second_component,
    c=school_data["Target"],
    cmap='plasma',
    alpha=0.7,
)
plt.title('PCA of Dataset')
plt.xlabel('First Principal Component')
plt.ylabel('Second Principal Component')
plt.colorbar()

# Write the figure to disk, then display it
plt.savefig('PCA.png', dpi=300)
plt.show()


In [None]:
# Display the component matrix of the fitted PCA (one row per principal component).
pca.components_

In [None]:
# Label the component matrix with the original feature names so the heatmap
# shows how strongly each feature contributes to each component.
school_comp = pd.DataFrame(
    data=pca.components_,
    columns=features.columns,
)

plt.figure(figsize=(12, 6))
sns.heatmap(school_comp, cmap="plasma")

# Write the figure to disk, then display it
plt.savefig('PCA Heatmap.png', dpi=300)
plt.show()

##### - Parallel coordinates plot

In [None]:
# Parallel-coordinates view of the whole dataset, one line per row, coloured by Target.
set2_colormap = plt.get_cmap("Set2")

plt.figure(figsize=(14, 8))
parallel_coordinates(
    school_data,
    class_column='Target',
    colormap=set2_colormap,
)

# Vertical tick labels keep the feature names from overlapping
plt.xticks(rotation=90, fontsize=12)
plt.title('Parallel Coordinates Plot', fontsize=16)
plt.xlabel('Features', fontsize=14)
plt.ylabel('Values', fontsize=14)

# Tidy the layout, write the image to disk, then display it
plt.tight_layout()
plt.savefig('Parallel coordinates plot.png', dpi=300)
plt.show()

#### Advanced Visualization

##### - Distribution of Admission Grades

In [None]:
# Histogram of admission grades, matching this section's heading.
# The cell previously plotted 'Age at enrollment', which has its own histogram
# further down in this notebook.
fig = px.histogram(school_data, x='Admission grade', nbins=30, title='Distribution of Admission Grades', height=500,
                  color_discrete_sequence=px.colors.qualitative.Set2)
fig.show()

In [None]:
# Pie chart: share of rows per 'Marital status' value
fig = px.pie(
    school_data,
    names='Marital status',
    title='Marital Status Distribution',
    color_discrete_sequence=px.colors.qualitative.Set2,
)
fig.show()

In [None]:
# Pie chart: share of rows per 'Target' value (default plotly colours)
fig = px.pie(
    school_data,
    names='Target',
    title='Target Status Distribution',
)
fig.show()

In [None]:
# Number of rows per 'Scholarship holder' value
scholarship_holder_counts = school_data['Scholarship holder'].value_counts()

# Bar chart of those counts: categories on the x-axis, frequencies on the y-axis
axis_labels = {'x': 'Scholarship holder', 'y': 'Count'}
fig = px.bar(
    x=scholarship_holder_counts.index,
    y=scholarship_holder_counts.values,
    title='Scholarship holder Breakdown',
    labels=axis_labels,
    color_discrete_sequence=px.colors.qualitative.Set3,
)

fig.show()

In [None]:
# Pie chart: share of rows per 'Scholarship holder' value
fig = px.pie(
    school_data,
    names='Scholarship holder',
    title='Scholarship holder Distribution',
    color_discrete_sequence=px.colors.qualitative.Set3,
)
fig.show()

In [None]:
# Categorizing 'Age at enrollment' into five equal-width groups.
# NOTE(review): the bin edges assume the column was scaled to the range [0, 1]
# in the week-1 preprocessing - confirm against that notebook.
# include_lowest=True closes the first bin on the left; without it a value of
# exactly 0 falls outside (0, 0.2] and becomes NaN, dropping those rows from the plot.
school_data['Age Group'] = pd.cut(
    school_data['Age at enrollment'],
    bins=[0, 0.2, 0.4, 0.6, 0.8, 1],
    labels=['0-0.2', '0.2-0.4', '0.4-0.6', '0.6-0.8', '0.8-1'],
    include_lowest=True,
)

# Plotting the boxplot for GPA by Age Group
fig = px.box(school_data, x='Age Group', y='GPA', title='GPA Distribution by Age Group', height = 500)
fig.show()


In [None]:
# Count the rows in every (Target, Gender) combination and flatten to a tidy frame
group_sizes = school_data.groupby(['Target', 'Gender']).size()
count_data = group_sizes.reset_index(name='Count')

# Side-by-side bars: one cluster per Gender value, one bar per Target value
fig = px.bar(
    count_data,
    x="Gender",
    y="Count",
    color="Target",
    barmode='group',
    title="Distribution of Target by Gender",
    labels={"Gender": "Gender Value", "Count": "Count", "Target": "Target (0=Dropout, 1=Enrolled, 2=Graduate)"},
    category_orders={"Gender": [0, 1]},
    height=500,
)
fig.show()

In [None]:
# Aggregate data to get the number of rows per (Target, Tuition fees up to date) combination
count_data = school_data.groupby(['Target', 'Tuition fees up to date']).size().reset_index(name='Count')

# Create grouped bar plot
fig = px.bar(count_data, 
             x="Tuition fees up to date", 
             y="Count", 
             color="Target",
             color_discrete_sequence=px.colors.sequential.Plasma,
             barmode='group',
             title="Distribution of Target by Tuition fees up to date",
             labels={"Tuition fees up to date": "Tuition fees up to date Value", "Count": "Count", "Target": "Target (0=Dropout, 1=Enrolled, 2=Graduate)"},
             # The ordering must be keyed by the column on the x-axis; the previous
             # "Gender" key (copied from the gender chart) had no effect on this plot.
             category_orders={"Tuition fees up to date": [0, 1]}, height = 500)
fig.show()

In [None]:
# Histogram of 'Age at enrollment' with plotly's automatic binning
fig = px.histogram(
    school_data,
    x='Age at enrollment',
    title='Age Distribution at Enrollment',
    height=500,
    color_discrete_sequence=px.colors.qualitative.Set2,
)
fig.show()

In [None]:
# Pie chart: share of rows per 'Displaced' value
fig = px.pie(
    school_data,
    names='Displaced',
    title='Displaced Distribution',
    color_discrete_sequence=px.colors.qualitative.Set3,
)
fig.show()

In [None]:
# Number of rows per 'Daytime/evening attendance' value
Daytime_evening_attendance_counts = school_data['Daytime/evening attendance'].value_counts()

# Bar chart of those counts: categories on the x-axis, frequencies on the y-axis
axis_labels = {'x': 'Daytime/evening attendance', 'y': 'Count'}
fig = px.bar(
    x=Daytime_evening_attendance_counts.index,
    y=Daytime_evening_attendance_counts.values,
    title='Daytime/evening attendance Breakdown',
    labels=axis_labels,
    color_discrete_sequence=px.colors.qualitative.Set3,
    height=500,
)

fig.show()

In [None]:
# Scatter of GPA against age at enrollment, one point per row
fig = px.scatter(
    school_data,
    x='Age at enrollment',
    y='GPA',
    title='Age vs. GPA',
    height=500,
)
fig.show()

In [None]:
# Box plots of each parent's qualification split by Target: mother first, then father.
parent_plots = (
    ("Mother's qualification", "Mother's Qualification vs Target"),
    ("Father's qualification", "Father's Qualification vs Target"),
)

for qualification_column, plot_title in parent_plots:
    fig = px.box(school_data, x='Target', y=qualification_column, title=plot_title, height=500)
    fig.show()


In [None]:
# Pie chart: share of rows per 'Debtor' value
fig = px.pie(
    school_data,
    names='Debtor',
    title='Percentage of Students with Debt',
    color_discrete_sequence=px.colors.qualitative.Set3,
)
fig.show()


In [None]:
# Box plot of admission grade per course.
# Column names are case-sensitive: the dataset column is "Course" (as listed in
# categorical_variables); the lowercase "course" does not match it.
fig = px.box(school_data, x="Course", y="Admission grade", title="Admission Grade by Course", height = 500)
fig.show()


In [None]:
# Scatter of GPA against admission grade, coloured by Target
axis_titles = {'Admission grade': 'Admission Grade', 'GPA': 'Final GPA'}
fig = px.scatter(
    school_data,
    x='Admission grade',
    y='GPA',
    color='Target',
    title="Admission Grade vs GPA by Target",
    labels=axis_titles,
    height=500,
)
fig.show()


In [None]:
# 1. Stacked bar chart of Target outcomes by Marital Status
# Rows = marital status, columns = Target value, cells = number of students.
# fill_value=0 keeps combinations that never occur as 0 instead of NaN.
marital_status_dropout = school_data.groupby(['Marital status', 'Target']).size().unstack(fill_value=0)

# DataFrame.plot draws with matplotlib and returns a matplotlib Axes, so the
# axis titles are set on that Axes. The plotly `fig` left over from an earlier
# cell is unrelated to this chart and must not be updated or re-shown here.
ax = marital_status_dropout.plot(kind='bar', stacked=True)
ax.set_xlabel('Marital Status')
ax.set_ylabel('Number of Students')
plt.show()

In [None]:
# Bubble chart: credits earned against GPA; marker size follows
# 'Total Units Enrolled' and marker colour follows Target.
fig = px.scatter(
    school_data,
    x='Total Credits Earned',
    y='GPA',
    size='Total Units Enrolled',
    color='Target',
    title='Total Credits Earned vs. GPA (Bubble Size: Total Units Enrolled)',
    height=500,
)
fig.show()
