# CS Students Performance Data Analysis
## Comprehensive Exploratory Data Analysis and Insights

## Setup and Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings

# Silence library warnings so they do not clutter the notebook output
warnings.filterwarnings('ignore')

# Plot appearance: seaborn grid theme plus default figure size and font size
sns.set_style('whitegrid')
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 10,
})

print("Libraries imported successfully!")

ModuleNotFoundError: No module named 'seaborn'

## Download and Load Data

First, you'll need to:
1. Install Kaggle CLI: `pip install kaggle`
2. Download your Kaggle API token from https://www.kaggle.com/account
3. Place it at ~/.kaggle/kaggle.json

Alternatively, download the CSV manually from https://www.kaggle.com/datasets/zahranusratt/cs-students-performance-dataset and save it as `cs_students.csv` in the working directory.

In [None]:
import os
import subprocess
from pathlib import Path

# Local file name that the loading cell reads
csv_file = 'cs_students.csv'
data_path = Path(csv_file)

if data_path.exists():
    print(f"Found {csv_file}")
else:
    print("Attempting to download from Kaggle...")
    try:
        # Download and unzip into the working directory using the Kaggle CLI
        subprocess.run(['kaggle', 'datasets', 'download', '-d', 'zahranusratt/cs-students-performance-dataset',
                       '-p', '.', '--unzip'], check=True)
    except (subprocess.CalledProcessError, OSError) as e:
        # CalledProcessError: the CLI ran but exited with a non-zero status.
        # OSError (incl. FileNotFoundError): the `kaggle` executable could not be launched.
        print(f"Error downloading: {e}")
        print("Please download manually from: https://www.kaggle.com/datasets/zahranusratt/cs-students-performance-dataset")
    else:
        # A zero exit status does not guarantee the extracted CSV carries the
        # expected name, so only report success once the file is really there.
        if data_path.exists():
            print("Dataset downloaded successfully!")
        else:
            print(f"Download finished, but {csv_file} was not found in the working directory.")
            print(f"Rename the extracted CSV to {csv_file} before continuing.")

In [None]:
# Read the CSV into a DataFrame, report its dimensions and preview the top rows
df = pd.read_csv('cs_students.csv')
print(f"Dataset shape: {df.shape}", "\nFirst few rows:", sep="\n")
df.head()

## Data Overview

In [None]:
# Basic size and dtype overview of the loaded DataFrame
record_count, feature_count = df.shape
print("Dataset Information:")
print(f"Total Records: {record_count}")
print(f"Total Features: {feature_count}")
print("\nColumn Names and Types:")
print(df.dtypes)

In [None]:
# Summary statistics for every column (include='all' covers non-numeric ones too)
print("Statistical Summary:")
summary_table = df.describe(include='all')
summary_table

In [None]:
# Check for missing values: per-column null count and share of all rows
print("Missing Values:")
missing = df.isnull().sum()
missing_pct = (missing / len(df)) * 100
missing_df = pd.DataFrame({
    'Missing Count': missing,
    'Percentage': missing_pct
})

# Only show the table when it has rows; printing an empty frame would emit a
# confusing "Empty DataFrame" dump right before the all-clear message.
columns_with_gaps = missing_df[missing_df['Missing Count'] > 0]
if columns_with_gaps.empty:
    print("No missing values found!")
else:
    # Worst-affected columns first
    print(columns_with_gaps.sort_values('Missing Count', ascending=False))

In [None]:
# Numbered listing of every column name, starting at 1
print("All Columns:")
for position, column_name in enumerate(df.columns, start=1):
    print(f"{position}. {column_name}")

## Exploratory Data Analysis (EDA)

### Numerical Features Analysis

In [None]:
# Get numerical columns
numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
print(f"Numerical columns: {numerical_cols}")

# Distribution of numerical features, two histograms per row
n_cols = len(numerical_cols)
if n_cols == 0:
    # plt.subplots cannot build a grid with zero rows, so skip plotting entirely
    print("No numerical features to plot.")
else:
    n_rows = (n_cols + 1) // 2
    fig, axes = plt.subplots(n_rows, 2, figsize=(14, 4 * n_rows))
    axes = axes.flatten()

    for ax, col in zip(axes, numerical_cols):
        # Drop NaN explicitly so binning only ever sees real observations
        ax.hist(df[col].dropna(), bins=30, color='steelblue', edgecolor='black', alpha=0.7)
        ax.set_title(f'Distribution of {col}', fontsize=12, fontweight='bold')
        ax.set_xlabel(col)
        ax.set_ylabel('Frequency')
        ax.grid(axis='y', alpha=0.3)

    # Remove the unused trailing panel when the number of features is odd
    for spare_ax in axes[n_cols:]:
        fig.delaxes(spare_ax)

    plt.tight_layout()
    plt.show()
    print(f"\nDisplayed distributions for {n_cols} numerical features")

In [None]:
# Box plots for numerical features, two panels per row.
# Relies on `numerical_cols` and `n_cols` computed in the histogram cell.
if n_cols == 0:
    # plt.subplots cannot build a grid with zero rows, so skip plotting entirely
    print("No numerical features to plot.")
else:
    n_rows = (n_cols + 1) // 2
    fig, axes = plt.subplots(n_rows, 2, figsize=(14, 4 * n_rows))
    axes = axes.flatten()

    for ax, col in zip(axes, numerical_cols):
        # A single NaN turns the computed quartiles into NaN and leaves the
        # panel blank, so plot only the observed values. Vertical boxes are
        # matplotlib's default orientation.
        ax.boxplot(df[col].dropna())
        ax.set_title(f'Boxplot of {col}', fontsize=12, fontweight='bold')
        ax.set_ylabel(col)
        ax.grid(axis='y', alpha=0.3)

    # Remove the unused trailing panel when the number of features is odd
    for spare_ax in axes[n_cols:]:
        fig.delaxes(spare_ax)

    plt.tight_layout()
    plt.show()
    print("Box plots displayed")

### Categorical Features Analysis

In [None]:
# Collect the object-dtype columns and print the frequency of each value
object_columns = df.select_dtypes(include=['object']).columns
categorical_cols = object_columns.tolist()
print(f"Categorical columns: {categorical_cols}")

# Iterating an empty list does nothing, so no separate emptiness check is needed
for col in categorical_cols:
    print(f"\n{col}:")
    print(df[col].value_counts())

In [None]:
# Bar chart of value counts for each categorical column, two charts per row
if categorical_cols:
    n_categorical = len(categorical_cols)
    grid_rows = (n_categorical + 1) // 2
    fig, axes = plt.subplots(grid_rows, 2, figsize=(14, 4 * grid_rows))
    axes = axes.flatten()

    for ax, col in zip(axes, categorical_cols):
        value_frequencies = df[col].value_counts()
        value_frequencies.plot(kind='bar', ax=ax, color='coral', edgecolor='black')
        ax.set_title(f'Distribution of {col}', fontsize=12, fontweight='bold')
        ax.set_xlabel(col)
        ax.set_ylabel('Count')
        ax.tick_params(axis='x', rotation=45)
        ax.grid(axis='y', alpha=0.3)

    # Delete the leftover panel when the number of columns is odd
    for spare_ax in axes[n_categorical:]:
        fig.delaxes(spare_ax)

    plt.tight_layout()
    plt.show()

### Correlation Analysis

In [None]:
# Correlation heatmap plus a ranked list of the strongest feature pairs.
# Needs at least two numerical columns to have any pair to compare.
if len(numerical_cols) > 1:
    # Correlation matrix
    corr_matrix = df[numerical_cols].corr()

    # Heatmap
    plt.figure(figsize=(12, 8))
    sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm',
                center=0, square=True, linewidths=1, cbar_kws={"shrink": 0.8})
    plt.title('Correlation Matrix of Numerical Features', fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.show()

    print("\nTop Correlations (excluding 1.0):")
    # Walk the upper triangle only, so each pair appears once and the
    # diagonal (a column against itself) is excluded.
    corr_columns = corr_matrix.columns
    corr_pairs = []
    for i, first_col in enumerate(corr_columns):
        for j in range(i + 1, len(corr_columns)):
            pair_corr = corr_matrix.iloc[i, j]
            # A constant column has undefined (NaN) correlation; NaN keys make
            # the sort order unreliable, so leave those pairs out.
            if pd.notna(pair_corr):
                corr_pairs.append((first_col, corr_columns[j], pair_corr))

    # Rank by strength regardless of sign and keep the ten strongest
    corr_pairs_sorted = sorted(corr_pairs, key=lambda pair: abs(pair[2]), reverse=True)[:10]
    for col1, col2, corr in corr_pairs_sorted:
        print(f"{col1} <-> {col2}: {corr:.3f}")

### Performance Distribution Analysis

In [None]:
# If there's a performance or grade column, analyze it.
# Columns are picked by keyword match on the name, then restricted to numeric
# dtypes because the histogram, mean and median below need numbers.
print("Looking for performance-related columns...")
performance_keywords = ['grade', 'score', 'gpa', 'performance', 'mark', 'final', 'result']
# str() guards against non-string column labels
keyword_matches = [col for col in df.columns
                   if any(keyword in str(col).lower() for keyword in performance_keywords)]

numeric_names = set(df.select_dtypes(include=[np.number]).columns)
performance_cols = [col for col in keyword_matches if col in numeric_names]
skipped_cols = [col for col in keyword_matches if col not in numeric_names]

if skipped_cols:
    print(f"Skipping non-numeric performance columns: {skipped_cols}")

if performance_cols:
    print(f"Found performance columns: {performance_cols}")

    # squeeze=False always returns a 2-D array of axes, even for a single column
    fig, axes = plt.subplots(1, len(performance_cols), figsize=(5*len(performance_cols), 5),
                             squeeze=False)
    axes = axes.ravel()

    for ax, col in zip(axes, performance_cols):
        values = df[col].dropna()
        mean_value = values.mean()
        median_value = values.median()

        ax.hist(values, bins=30, color='green', alpha=0.7, edgecolor='black')
        ax.set_title(f'Distribution of {col}', fontsize=12, fontweight='bold')
        ax.set_xlabel(col)
        ax.set_ylabel('Frequency')
        ax.axvline(mean_value, color='red', linestyle='--', label=f'Mean: {mean_value:.2f}')
        ax.axvline(median_value, color='blue', linestyle='--', label=f'Median: {median_value:.2f}')
        ax.legend()
        ax.grid(axis='y', alpha=0.3)

    plt.tight_layout()
    plt.show()
else:
    print("No specific performance columns found. Check data structure.")

### Relationships Between Variables

In [None]:
# Scatter plot matrix for numerical features, capped at six columns so the
# grid stays readable.
max_scatter_features = 6

if not numerical_cols:
    subset_cols = []
    print("No numerical features available for a scatter matrix.")
elif len(numerical_cols) <= max_scatter_features:
    subset_cols = numerical_cols
    scatter_title = 'Scatter Matrix of Numerical Features'
else:
    print(f"Too many numerical features ({len(numerical_cols)}) for scatter matrix. Creating subset...")
    # Takes the first columns in DataFrame order; no importance ranking is applied
    subset_cols = numerical_cols[:max_scatter_features]
    scatter_title = f'Scatter Matrix (First {max_scatter_features} Numerical Features)'

if subset_cols:
    pd.plotting.scatter_matrix(df[subset_cols], figsize=(12, 10),
                               diagonal='hist', alpha=0.7)
    plt.suptitle(scatter_title, fontsize=14, fontweight='bold', y=1.00)
    plt.tight_layout()
    plt.show()

## Statistical Insights

In [None]:
# Per-feature centre, spread and shape statistics for the numerical columns
print("Skewness and Kurtosis Analysis:")
print("=" * 60)

numeric_view = df[numerical_cols]
summary_columns = {
    'Mean': numeric_view.mean(),
    'Median': numeric_view.median(),
    'Std Dev': numeric_view.std(),
    'Skewness': numeric_view.skew(),
    'Kurtosis': numeric_view.kurtosis(),
}
stats_df = pd.DataFrame(summary_columns)

# Three decimals keep the table compact
print(stats_df.round(3))

In [None]:
# Outlier detection using IQR method: a value is flagged when it lies more
# than 1.5 * IQR below the first quartile or above the third quartile.
print("\nOutlier Detection (IQR Method):")
print("="*60)

total_rows = len(df)
outlier_summary = {}
for col in numerical_cols:
    values = df[col]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    # Count via a boolean mask instead of copying the matching rows.
    # NaN compares False on both sides, so missing values are never flagged.
    outlier_count = int(((values < lower_bound) | (values > upper_bound)).sum())
    outlier_summary[col] = {
        'Count': outlier_count,
        # Guard against an empty frame to avoid dividing by zero
        'Percentage': (outlier_count / total_rows) * 100 if total_rows else 0.0,
    }

# orient='index' builds one row per feature while keeping 'Count' an integer
# column; passing `columns` guarantees both columns exist even when there are
# no numerical features at all.
outlier_df = pd.DataFrame.from_dict(outlier_summary, orient='index',
                                    columns=['Count', 'Percentage'])
outlier_df = outlier_df[outlier_df['Count'] > 0].sort_values('Count', ascending=False)

if len(outlier_df) > 0:
    print(outlier_df)
else:
    print("No significant outliers detected")

## Key Findings Summary

In [None]:
# Text recap of the dataset: size, feature counts, per-feature numeric ranges
# and the most frequent values of each categorical column.
print("\n" + "="*70)
print("DATA ANALYSIS SUMMARY")
print("="*70)

# Total null cells across the whole frame, computed once
missing_total = int(df.isnull().sum().sum())

print("\n1. DATASET OVERVIEW:")
print(f"   - Total Records: {len(df):,}")
print(f"   - Total Features: {len(df.columns)}")
print(f"   - Numerical Features: {len(numerical_cols)}")
print(f"   - Categorical Features: {len(categorical_cols)}")
print(f"   - Missing Values: {'None' if missing_total == 0 else missing_total}")

print("\n2. NUMERICAL FEATURES SUMMARY:")
for col in numerical_cols:
    values = df[col]
    print(f"   - {col}: Mean={values.mean():.2f}, Std={values.std():.2f}, "
          f"Range=[{values.min():.2f}, {values.max():.2f}]")

if categorical_cols:
    print("\n3. CATEGORICAL FEATURES:")
    for col in categorical_cols:
        print(f"   - {col}: {df[col].nunique()} unique values")
        # to_dict() yields plain Python ints; dict(series) would keep NumPy
        # scalars, which print as np.int64(...) under NumPy 2.
        top_values = df[col].value_counts().head(3).to_dict()
        print(f"     Top 3: {top_values}")

print("\n" + "="*70)

## Recommendations for Further Analysis

In [None]:
# Static list of follow-up analysis ideas, printed verbatim
further_analysis_notes = """
Suggestions for deeper analysis:

1. PREDICTIVE MODELING:
   - Build a regression model to predict student performance
   - Use features like study hours, GPA, etc. as predictors

2. CLUSTERING:
   - Identify student groups with similar characteristics
   - Use K-means or hierarchical clustering

3. STATISTICAL TESTS:
   - Perform ANOVA to compare performance across groups
   - Chi-square tests for categorical relationships

4. FEATURE ENGINEERING:
   - Create derived features (e.g., study intensity ratios)
   - Normalize features for modeling

5. TIME SERIES ANALYSIS:
   - If temporal data exists, analyze performance trends

6. SEGMENTATION:
   - Identify high/low performers
   - Analyze characteristics of top students
"""
print(further_analysis_notes)