Import all your packages

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import f_oneway

Load the dataset

In [None]:
# Load the raw dataset from a CSV file given by a relative path.
students_df = pd.read_csv(filepath_or_buffer="StudentsPerformance.csv")
# Show the first five rows (head()'s default) as the cell output.
students_df.head(n=5)

Overview of the dataset

In [None]:
# Report the DataFrame's (rows, columns) shape and its column labels.
dataset_shape = students_df.shape
column_labels = students_df.columns
print("Dataset shape:", dataset_shape)
print("Columns:", column_labels)

Any data cleaning or preprocessing can be done here,
e.g. encoding categorical data, dropping columns, etc.

## Univariate Analysis

### Descriptive Stats

In [None]:
# Summary statistics via describe(); the quartiles listed here are the same
# ones describe() reports when no percentiles are given.
students_df.describe(percentiles=[0.25, 0.5, 0.75])

### Visualisation of Descriptive Stats

#### Histograms

In [None]:
# Draw the per-column histograms in a single 1x3 row.
# The K-Means cell further down adds a numeric 'Cluster' column to
# students_df. If this cell is re-run afterwards, that extra column no longer
# fits the fixed 1x3 layout and pandas raises a ValueError, so 'Cluster' is
# dropped here when present (errors='ignore' makes it a no-op on the first run).
students_df.drop(columns='Cluster', errors='ignore').hist(figsize=(10, 3), layout=(1, 3))
plt.tight_layout()
plt.show()

#### Boxplots

In [None]:
# Box plots of the DataFrame's columns, drawn on one set of axes.
students_df.boxplot(figsize=(10, 8))
plt.show()

# Count missing values per column. The label is printed on its own line
# (sep="\n") so the first row of the Series stays aligned with the rest
# instead of being pushed right by the label text.
print("Missing values:", students_df.isnull().sum(), sep="\n")

## Bivariate Analysis

#### Scatter Plots

In [None]:
# Pairwise-relationship grid drawn by seaborn for the whole DataFrame.
sns.pairplot(data=students_df)
plt.show()

#### Correlation Matrix

In [None]:
# Correlate the last three columns of the dataset (presumably the numeric
# score columns -- confirm against the CSV).
# The selection is positional, and the K-Means cell appends a 'Cluster'
# column; dropping it first (when present) keeps the same three columns
# selected if this cell is re-run after clustering.
last_three_columns = students_df.drop(columns='Cluster', errors='ignore').iloc[:, -3:]
corr_matrix = last_three_columns.corr()

# Annotated heatmap of the pairwise correlation coefficients.
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.show()

## Multivariate Analysis

#### PCA

In [None]:
# Features: the last three columns of the dataset (presumably the numeric
# score columns -- confirm against the CSV).
# The selection is positional, and the K-Means cell appends a 'Cluster'
# column. Without the drop, re-running this cell after clustering would
# silently feed the cluster labels into the scaler and the PCA.
X = students_df.drop(columns='Cluster', errors='ignore').iloc[:, -3:]

# Standardise the features before PCA so each contributes on the same scale.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# No n_components is given, so every component is kept and the full
# explained-variance profile is available for the plot below.
pca = PCA()
principalComponents = pca.fit_transform(X_scaled)

# Plot the cumulative explained-variance ratio against the component count.
cumulative_variance = pca.explained_variance_ratio_.cumsum()
plt.figure(figsize=(10, 6))
plt.plot(range(1, len(cumulative_variance) + 1), cumulative_variance, marker='o')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('Scree Plot')
plt.show()

#### K-Means Clustering

In [None]:
# Partition the standardised features (X_scaled, from the PCA cell) into two
# clusters. n_init is passed explicitly because scikit-learn's default for it
# changed between releases (10 -> 'auto') and some versions emit a
# FutureWarning when it is left unset; pinning it keeps the clustering
# behaviour the same across versions. random_state fixes the initialisation.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
kmeans.fit(X_scaled)

# Store each row's cluster label alongside the original data.
students_df['Cluster'] = kmeans.labels_

# Show the clusters in the plane of the first two principal components
# (principalComponents, from the PCA cell), coloured by cluster label.
sns.scatterplot(
    x=principalComponents[:, 0],
    y=principalComponents[:, 1],
    hue=students_df['Cluster'],
    palette='viridis',
)
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title('K-means Clustering')
plt.show()