In [34]:
import pandas as pd
import numpy as np
import os
import time
from itertools import cycle
from rich import print
from rich.console import Console
from rich.table import Table
from rich.panel import Panel
from rich.style import Style
from rich.panel import Panel
from rich.live import Live
from dataset_visualiser import visualize_dataset


In [35]:
df = pd.read_csv("final_dataset_corrupt.csv")
df.head()

Unnamed: 0,University ID,University Name,Program Type,Location,GRE Score,TOEFL Score,GPA,Ranking,Placement Rate (%),Tuition Fee (USD/year),...,Student Last Name,Student Email,Phone,Date of Birth,Funding Source,Funding Amount,Funding Year,Program Name,Program Duration (Years),Application Deadline
0,1,"University of California, Berkeley - Undergrad...",Undergraduate,Nebraska,298.0,110.0,2.72,97054,98.62,68075,...,Jones,bradley.jones0@example.edu,(203)-744-2631,14-12-1989,"Martin, Silva and Reid",194621,2022,Animal technologist Program,5,29-08-2025
1,2,Princeton University - Diploma School,Diploma,Maryland,328.0,118.0,3.8,24489,65.59,47834,...,Mcfarland,kendra.mcfarland1@example.edu,(655)-615-0763,21-02-2004,Wells-Warren,158259,2025,International aid/development worker Program,3,17-09-2025
2,3,Yale University - MBA School,MBA,Oklahoma,331.0,86.0,2.71,4813,95.95,76884,...,Harding,steven.harding2@example.edu,(308)-709-8479,20-08-2000,Hernandez-Roberson,490781,2025,Museum/gallery curator Program,3,01-10-2025
3,4,Columbia University - Diploma School,Diploma,Oklahoma,316.0,113.0,2.52,35276,68.18,77522,...,Love,brianna.love3@example.edu,(483)-926-2897,30-01-1997,Gibson and Sons,655164,2015,Dancer Program,2,29-09-2025
4,5,California Institute of Technology - MBA School,MBA,North Carolina,302.0,95.0,3.46,86811,93.6,79694,...,Jones,kayla.jones4@example.edu,(427)-897-0089,09-04-2001,Hernandez Ltd,781768,2020,"Sales professional, IT Program",5,10-05-2025


In [36]:
# Set the file path to your dataset
file_path = visualize_dataset("final_dataset_corrupt.csv")


Output()

In [61]:
 # Get column names
print("\nColumn names:")
column_names = df.columns
print(column_names)

In [38]:
# Check missing values in a nicer format
print("\nMissing values summary:")
missing_values = df.isnull().sum().reset_index()
missing_values.columns = ['Column', 'Missing Values']
missing_values['Missing Percentage'] = (missing_values['Missing Values'] / len(df)) * 100
missing_values = missing_values.sort_values('Missing Values', ascending=False)
display(missing_values)

Unnamed: 0,Column,Missing Values,Missing Percentage
13,Student Email,3030,3.0
6,GPA,3030,3.0
5,TOEFL Score,3008,2.978218
4,GRE Score,2995,2.965347
14,Phone,2993,2.963366
0,University ID,0,0.0
20,Program Duration (Years),0,0.0
19,Program Name,0,0.0
18,Funding Year,0,0.0
17,Funding Amount,0,0.0


In [39]:
 # Get data types
print("\nData types:")
print(df.dtypes)

In [40]:
 # Get basic statistics for numerical columns
print("\nSummary statistics for numerical columns:")
display(df.describe())

Unnamed: 0,University ID,TOEFL Score,GPA,Ranking,Placement Rate (%),Tuition Fee (USD/year),Average Pay (USD/year),Funding Amount,Funding Year,Program Duration (Years)
count,101000.0,97992.0,97970.0,101000.0,101000.0,101000.0,101000.0,101000.0,101000.0,101000.0
mean,50500.5,98.954068,2.997546,50500.5,80.011989,52494.002238,124908.369228,504473.16798,2020.000703,2.998663
std,29156.332931,15.571199,0.577345,29156.332931,11.542622,21699.635339,43250.654796,286482.632235,3.166253,1.416693
min,1.0,-1.0,2.0,1.0,60.0,15000.0,50001.0,10001.0,2015.0,1.0
25%,25250.75,89.0,2.5,25250.75,70.05,33610.0,87499.75,255296.0,2017.0,2.0
50%,50500.5,100.0,3.0,50500.5,80.0,52521.0,124916.5,504172.0,2020.0,3.0
75%,75750.25,110.0,3.5,75750.25,90.0,71374.25,162326.25,753791.75,2023.0,4.0
max,101000.0,120.0,4.0,101000.0,100.0,90000.0,199997.0,999959.0,2025.0,5.0


In [41]:
# Get value counts for categorical columns (first few)
print("\nSample value counts for categorical columns:")
categorical_columns = df.select_dtypes(include=['object']).columns
for col in categorical_columns[:3]:  # Show first 3 categorical columns
    print(f"\nValue counts for {col}:")
    display(df[col].value_counts().head())


University Name
Stanford University - PhD School                       2158
Massachusetts Institute of Technology - MS School      2128
Princeton University - Undergraduate School            2114
University of Pennsylvania - Diploma School            2090
University of California, Berkeley - Diploma School    2085
Name: count, dtype: int64

Program Type
MS               20341
Diploma          20276
PhD              20246
MBA              20073
Undergraduate    20064
Name: count, dtype: int64

Location
Connecticut    6155
New Mexico     6151
Kentucky       5986
Oklahoma       5979
Indiana        4096
Name: count, dtype: int64

# DATA PREPROCESSING


## Step 2: Data Preprocessing - 
### 1. Handle missing values
### 2. Check data types and convert if necessary
### 3. Feature normalization (for numerical features)
### 4. Encode categorical variables

In [42]:
# Step 2: Data Preprocessing

# 2.1 Handle missing values
print("Handling missing values...")

# For numerical columns, fill missing values with median (more robust than mean for potentially skewed data)
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
for col in numerical_cols:
    if df[col].isnull().sum() > 0:
        median_value = df[col].median()
        df[col] = df[col].fillna(median_value)
        print(f"Filled missing values in {col} with median: {median_value}")

# For categorical columns, fill missing values with mode (most frequent value)
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    if df[col].isnull().sum() > 0:
        mode_value = df[col].mode()[0]
        df[col] = df[col].fillna(mode_value)
        print(f"Filled missing values in {col} with mode: {mode_value}")





In [43]:
# 2.2 Check data types and convert if necessary
print("\nChecking and converting data types...")

# Convert 'GRE Score' to numeric if it's not already
if 'GRE Score' in df.columns and df['GRE Score'].dtype == 'object':
    df['GRE Score'] = pd.to_numeric(df['GRE Score'], errors='coerce')
    # Fill newly created NaNs (from conversion errors)
    df['GRE Score'] = df['GRE Score'].fillna(df['GRE Score'].median())
    print("Converted 'GRE Score' to numeric")

# Convert 'Date of Birth' to datetime
if 'Date of Birth' in df.columns:
    df['Date of Birth'] = pd.to_datetime(df['Date of Birth'], errors='coerce')
    print("Converted 'Date of Birth' to datetime")

  df['Date of Birth'] = pd.to_datetime(df['Date of Birth'], errors='coerce')


In [44]:
# 2.3 Feature normalization (for numerical features)
print("\nNormalizing numerical features...")

# Select columns to normalize (exclude ID columns and dates)
cols_to_normalize = ['GRE Score', 'TOEFL Score', 'GPA', 'Ranking', 
                    'Placement Rate (%)', 'Tuition Fee (USD/year)', 
                    'Average Pay (USD/year)']

# Check if columns exist
cols_to_normalize = [col for col in cols_to_normalize if col in df.columns]

# Apply Min-Max scaling
for col in cols_to_normalize:
    min_val = df[col].min()
    max_val = df[col].max()
    df[col] = (df[col] - min_val) / (max_val - min_val)
    print(f"Normalized {col}")


In [45]:
# 2.4 Encode categorical variables (memory-efficient approach)
print("\nEncoding categorical variables (memory-efficient approach)...")

# Instead of one-hot encoding all categorical variables which can cause memory issues,
# let's use label encoding for high-cardinality columns and be selective about one-hot encoding

# Choose specific columns for one-hot encoding (with fewer unique values)
# Count unique values in each categorical column
print("\nUnique values in categorical columns:")
for col in categorical_cols:
    unique_count = df[col].nunique()
    print(f"{col}: {unique_count} unique values")

# Choose only categorical columns with fewer unique values for one-hot encoding
categorical_cols_to_encode = []
for col in categorical_cols:
    if df[col].nunique() < 10 and col in ['Program Type', 'Location']:  # Limit to columns with <10 unique values
        categorical_cols_to_encode.append(col)

print(f"\nSelected for one-hot encoding: {categorical_cols_to_encode}")

# Process one column at a time to avoid memory issues
for col in categorical_cols_to_encode:
    print(f"One-hot encoding: {col}")
    dummies = pd.get_dummies(df[col], prefix=col, drop_first=True)
    
    # Add the dummy columns to the dataframe
    df = pd.concat([df, dummies], axis=1)
    
    # Drop the original column
    df = df.drop(col, axis=1)
    
    print(f"Added {dummies.shape[1]} dummy columns for {col}")

# For remaining categorical columns, use label encoding
from sklearn.preprocessing import LabelEncoder

# Identify remaining categorical columns
remaining_cat_cols = [col for col in categorical_cols if col not in categorical_cols_to_encode]
label_encoders = {}

for col in remaining_cat_cols:
    if col in ['University Name', 'Student First Name', 'Student Last Name', 'Student Email']:
        # Skip encoding identifier columns
        continue
        
    le = LabelEncoder()
    df[f'{col}_encoded'] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le
    print(f"Label encoded: {col}")

# Display the first few rows of the preprocessed dataset
print("\nPreprocessed dataset (first 5 rows):")
display(df.head(5))

Unnamed: 0,University ID,University Name,Location,GRE Score,TOEFL Score,GPA,Ranking,Placement Rate (%),Tuition Fee (USD/year),Average Pay (USD/year),...,Program Type_MS,Program Type_PhD,Program Type_Undergraduate,Location_encoded,GRE Score_encoded,Phone_encoded,Date of Birth_encoded,Funding Source_encoded,Program Name_encoded,Application Deadline_encoded
0,1,"University of California, Berkeley - Undergrad...",Nebraska,0.475,0.917355,0.36,0.96093,0.9655,0.707667,0.849236,...,False,False,True,17,38,448,252,24803,36,140
1,2,Princeton University - Diploma School,Maryland,0.85,0.983471,0.9,0.242458,0.13975,0.437787,0.165724,...,False,False,False,11,68,55400,5434,43248,326,81
2,3,Yale University - MBA School,Oklahoma,0.8875,0.719008,0.355,0.047644,0.89875,0.82512,0.577069,...,False,False,False,25,71,13199,4154,17388,390,4
3,4,Columbia University - Diploma School,Oklahoma,0.7,0.942149,0.26,0.349261,0.2045,0.833627,0.965619,...,False,False,False,25,56,34414,2856,13672,149,141
4,5,California Institute of Technology - MBA School,North Carolina,0.525,0.793388,0.73,0.859513,0.84,0.862587,0.001987,...,False,False,False,22,42,27573,4386,17160,522,42


In [46]:
# Save the preprocessed dataset
preprocessed_path = r"C:\Users\masood\Downloads\Machine Learning Project\preprocessed_dataset.csv"
df.to_csv(preprocessed_path, index=False)
print(f"\nPreprocessed dataset saved to: {preprocessed_path}")