In [1]:
# Basic data handling and operations
import numpy as np
import pandas as pd

# Visualization tools
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning - model selection, evaluation, preprocessing, etc.
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import make_pipeline

# Machine learning algorithms
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.svm import SVC, SVR
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.neural_network import MLPClassifier, MLPRegressor

# ARIMA
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.stattools import acf

# Path to dataset
enrollment_path = '../data/enrollment_training_data.xlsx'

# Load the data into Pandas DataFrames
enrollment_data = pd.read_excel(enrollment_path)

# Drop the unnecessary columns before splitting the data
enrollment_data.drop(columns=['DISTRICT_NAME', 'SCHOOL_NAME'], inplace=True)

# Display the first few rows of the DataFrames to verify
enrollment_data.head(10)

Unnamed: 0,YEAR,COUNTY_DISTRICT_CODE,SCHOOL_CODE,GRADE,TOTAL_ENROLLMENT,COHORT_SURVIVAL_RATE,RESIDENT_ESTIMATE,ENROLLMENT_RESIDENT_PROPORTION
0,2010,39141,1050,6,41,1.0,29567.0,0.001387
1,2010,39141,1050,7,43,1.04878,29567.0,0.001454
2,2010,39141,1050,8,40,1.025641,29567.0,0.001353
3,2010,39141,1050,9,472,1.145631,29567.0,0.015964
4,2010,39141,1050,10,356,0.887781,29567.0,0.01204
5,2010,39141,1050,11,380,1.24183,29567.0,0.012852
6,2010,39141,1050,12,295,1.003401,29567.0,0.009977
7,2010,39141,1050,0-8,124,1.024793,29567.0,0.004194
8,2010,39141,1050,9-12,1503,1.063694,29567.0,0.050834
9,2010,39141,1050,0-12,1627,1.060626,29567.0,0.055028


In [2]:
# Define the mapping for grade spans
grade_span_mapping = {'0-8': 13, '9-12': 14, '0-12': 15}

# Replace grade spans with the defined mapping
enrollment_data['GRADE'] = enrollment_data['GRADE'].replace(grade_span_mapping)

# Check the first few rows to ensure the replacement was successful
enrollment_data.head(25)

Unnamed: 0,YEAR,COUNTY_DISTRICT_CODE,SCHOOL_CODE,GRADE,TOTAL_ENROLLMENT,COHORT_SURVIVAL_RATE,RESIDENT_ESTIMATE,ENROLLMENT_RESIDENT_PROPORTION
0,2010,39141,1050,6,41,1.0,29567.0,0.001387
1,2010,39141,1050,7,43,1.04878,29567.0,0.001454
2,2010,39141,1050,8,40,1.025641,29567.0,0.001353
3,2010,39141,1050,9,472,1.145631,29567.0,0.015964
4,2010,39141,1050,10,356,0.887781,29567.0,0.01204
5,2010,39141,1050,11,380,1.24183,29567.0,0.012852
6,2010,39141,1050,12,295,1.003401,29567.0,0.009977
7,2010,39141,1050,13,124,1.024793,29567.0,0.004194
8,2010,39141,1050,14,1503,1.063694,29567.0,0.050834
9,2010,39141,1050,15,1627,1.060626,29567.0,0.055028


In [6]:
# Replace NaN values with 0 in the Cohort_Survival_Rate column using recommended approach
enrollment_data['COHORT_SURVIVAL_RATE'] = enrollment_data['COHORT_SURVIVAL_RATE'].fillna(0)

# Check for NaN values in the entire DataFrame
nan_values = enrollment_data.isna().sum()
print("Number of NaN values in each column:")
print(nan_values)

# Check for blank values in the entire DataFrame
# Replace '' with whatever you consider a 'blank' value, such as ' ' for spaces.
blank_values = (enrollment_data == '').sum()
print("\nNumber of blank values in each column:")
print(blank_values)


Number of NaN values in each column:
YEAR                                0
COUNTY_DISTRICT_CODE                0
SCHOOL_CODE                         0
GRADE                               0
TOTAL_ENROLLMENT                    0
COHORT_SURVIVAL_RATE                0
RESIDENT_ESTIMATE                 352
ENROLLMENT_RESIDENT_PROPORTION    352
dtype: int64

Number of blank values in each column:
YEAR                              0
COUNTY_DISTRICT_CODE              0
SCHOOL_CODE                       0
GRADE                             0
TOTAL_ENROLLMENT                  0
COHORT_SURVIVAL_RATE              0
RESIDENT_ESTIMATE                 0
ENROLLMENT_RESIDENT_PROPORTION    0
dtype: int64


In [7]:
# Define the new cutoff year for splitting the data
cutoff_year = 2021

# Split the data based on the new cutoff year
train_data = enrollment_data[enrollment_data['YEAR'] <= cutoff_year]
test_data = enrollment_data[enrollment_data['YEAR'] == 2022]  # Use 2022 as the test set

# Separate features and target variable for training and testing
X_train = train_data.drop(['TOTAL_ENROLLMENT'], axis=1)
y_train = train_data['TOTAL_ENROLLMENT']
X_test = test_data.drop(['TOTAL_ENROLLMENT'], axis=1)
y_test = test_data['TOTAL_ENROLLMENT']


In [8]:
# 1. Select a Machine Learning Algorithm
from sklearn.linear_model import LinearRegression

# 2. Train the Model
model = LinearRegression()
model.fit(X_train, y_train)

# 3. Evaluate the Model
predictions = model.predict(X_test)
mse = mean_squared_error(y_test, predictions)
print("Mean Squared Error:", mse)


Mean Squared Error: 649.6860342375368
