In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import os

In [4]:
def get_file_path():
    file_path = "data\GPA\AY24prjGPA.csv"
    if os.path.isfile(file_path):
        return file_path
    else:
        print(f"Error: File not found at '{file_path}'.")
        exit(1)

# Create directory structure
base_dir = 'gpa_analysis'
image_dir = os.path.join(base_dir, 'images')
os.makedirs(image_dir, exist_ok=True)

# Function to save plots
def save_plot(fig, filename):
    fig.savefig(os.path.join(image_dir, filename))
    plt.close(fig)

# Get file path
file_path = get_file_path()

# Load the data
try:
    df = pd.read_csv(file_path)
except Exception as e:
    print(f"Error reading the file: {e}")
    print("Please make sure the file is a valid CSV and you have permission to read it.")
    exit(1)

# Check for missing values
missing_values = df.isnull().sum()
print("Missing values in each column:")
print(missing_values)

# Remove rows with missing values
df_clean = df.dropna()
print(f"Original dataset shape: {df.shape}")
print(f"Clean dataset shape: {df_clean.shape}")

# Descriptive Analytics
desc_stats = df_clean.describe()
print(desc_stats)
desc_stats.to_csv(os.path.join(base_dir, 'descriptive_statistics.csv'))

# Calculate average GPA
avg_gpa = df_clean['GPA'].mean()
print(f"Average GPA: {avg_gpa:.2f}")

# Diagnostic Analytics

# GPA Distribution
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df_clean['GPA'], kde=True, ax=ax)
ax.set_title('Distribution of GPAs')
ax.set_xlabel('GPA')
ax.set_ylabel('Count')
save_plot(fig, 'gpa_distribution.png')

# GPA by Team
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(x='Team', y='GPA', data=df_clean, ax=ax)
ax.set_title('GPA Distribution by Team')
save_plot(fig, 'gpa_by_team.png')

# Correlation between Number of Grades and GPA
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x='# Grds', y='GPA', data=df_clean, ax=ax)
ax.set_title('Correlation between Number of Grades and GPA')
ax.set_xlabel('Number of Grades')
ax.set_ylabel('GPA')
save_plot(fig, 'grades_vs_gpa.png')

# Correlation heatmap
corr = df[['GPA', '# Grds', 'Crd Hrs']].corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap')
save_plot(fig, 'correlation_heatmap.png')

# Predictive Analytics

# Prepare data for prediction
X = df_clean[['# Grds', 'Crd Hrs']]
y = df_clean['GPA']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', LinearRegression())
])

# Train a linear regression model
pipeline.fit(X_train, y_train)

# Make predictions
y_pred = pipeline.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse:.4f}")
print(f"R-squared Score: {r2:.4f}")

# Plot actual vs predicted GPA
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred)
ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
ax.set_xlabel('Actual GPA')
ax.set_ylabel('Predicted GPA')
ax.set_title('Actual vs Predicted GPA')
save_plot(fig, 'actual_vs_predicted_gpa.png')

# Save model results
with open(os.path.join(base_dir, 'model_results.txt'), 'w') as f:
    f.write(f"Mean Squared Error: {mse:.4f}\n")
    f.write(f"R-squared Score: {r2:.4f}\n")

print("Analysis complete. All plots have been saved in the 'gpa_analysis/images' directory.")
print("Descriptive statistics and model results have been saved in the 'gpa_analysis' directory.")

  file_path = "data\GPA\AY24prjGPA.csv"


Missing values in each column:
ID                0
LastName          0
FirstName         0
MiddleName      247
Team              0
StaffGroup        0
Cohort/Compo      0
GPA               1
# Grds            1
Crd Hrs           1
dtype: int64
Original dataset shape: (1060, 10)
Clean dataset shape: (812, 10)
                ID        Team         GPA      # Grds     Crd Hrs
count   812.000000  812.000000  812.000000  812.000000  812.000000
mean    501.045567   10.557882    3.868879   22.176108  739.905813
std     299.304577    5.499947    0.176265    2.147409   53.093749
min       1.000000    1.000000    3.270000    4.000000  150.000000
25%     241.750000    6.000000    3.750000   21.000000  730.040000
50%     486.500000   11.000000    3.880000   23.000000  754.040000
75%     758.500000   15.000000    4.000000   24.000000  754.040000
max    1057.000000   21.000000    4.280000   27.000000  922.040000
Average GPA: 3.87
Mean Squared Error: 0.0342
R-squared Score: -0.0049
Analysis complete