<a href="https://colab.research.google.com/github/Harshithmusaram/MTH522/blob/main/MTH4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import necessary libraries for data processing and modeling
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Step 1: Load the dataset
# data_path must point at the CSV inside the current runtime; adjust it if the file lives elsewhere
data_path = '/content/sample_data/Preliminary college year.csv'

# Name of the outcome column, kept in one place because it is needed several times below
TARGET_COLUMN = 'Retained F17-F18? (1=yes, 0=no)'

# Columns excluded from the predictors (demographic fields and free-text "reason" fields)
columns_to_remove = ['Federal Ethnic Group', 'Gender', 'Reason for not Completing Connect', 'Reason not Retained']

# Step 2: Initial data cleaning
# Read the file, discard rows with no recorded outcome, then discard the excluded columns.
# Each call returns a new frame, so nothing is mutated in place.
df = (
    pd.read_csv(data_path)
    .dropna(subset=[TARGET_COLUMN])
    .drop(columns=columns_to_remove)
)

# Step 3: Define features (X) and target (y)
# y is the outcome column; the predictors are every remaining column
y = df[TARGET_COLUMN]
predictor_frame = df.drop(columns=[TARGET_COLUMN])

# Step 4: Convert categorical data to numerical format using One-Hot Encoding
# drop_first=True omits the first level of each categorical column
X = pd.get_dummies(predictor_frame, drop_first=True)

# Step 5: Split the data into training and testing sets (20% test set)
# The split is done BEFORE any imputation or scaling statistics are computed.
# Fitting the imputer/scaler on the full dataset would let information from the
# test rows (column means and standard deviations) leak into training and into
# every cross-validation fold, which makes the reported scores optimistic.
# X_train / X_test therefore hold the one-hot encoded but still unscaled features.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 6: Handle any remaining missing values in the predictor data
# Missing values are filled with the column mean learned from training data only
imputer = SimpleImputer(strategy='mean')

# Step 7: Feature Scaling
# Features are standardized using statistics learned from training data only
scaler = StandardScaler()

# Step 8: Define and tune the logistic regression model
# The preprocessing steps and the classifier are chained in a Pipeline so that
# GridSearchCV refits the imputer and scaler inside each training fold.
logistic_model = LogisticRegression(max_iter=1000, random_state=42)
model_pipeline = Pipeline(steps=[
    ('imputer', imputer),
    ('scaler', scaler),
    ('clf', logistic_model),
])

# Parameters of a pipeline step are addressed as '<step name>__<parameter>'
param_grid = {
    'clf__C': [0.001, 0.01, 0.1, 1, 10],  # Regularization strengths
    'clf__penalty': ['l1', 'l2'],         # Regularization types
    'clf__solver': ['liblinear']          # Solver 'liblinear' supports both 'l1' and 'l2' penalties
}

# Perform grid search for best hyperparameters.
# GridSearchCV works on clones of model_pipeline, so the fitted steps are found on
# grid_search.best_estimator_ (a Pipeline that accepts the unscaled features), not
# on the imputer / scaler / logistic_model objects defined above.
grid_search = GridSearchCV(model_pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Step 9: Evaluate the model using the test data
# The estimator refit by the grid search with the best parameters does the predicting
best_model = grid_search.best_estimator_

# Hard class labels for the threshold-based metrics
y_pred = best_model.predict(X_test)

# Per-class probabilities; column index 1 is kept as the score used for ROC-AUC
class_probabilities = best_model.predict_proba(X_test)
y_pred_prob = class_probabilities[:, 1]

# Step 10: Calculate and display key performance metrics
accuracy = accuracy_score(y_test, y_pred)       # Share of test rows predicted correctly
precision = precision_score(y_test, y_pred)     # Share of predicted positives that are true positives
recall = recall_score(y_test, y_pred)           # Share of actual positives that were predicted positive
roc_auc = roc_auc_score(y_test, y_pred_prob)    # Area under the ROC curve, computed from the probabilities

# Build the whole report first, then emit it with a single print call
report_lines = [
    "Model Performance Metrics:",
    f"Accuracy: {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    f"ROC AUC Score: {roc_auc:.4f}",
]
print("\n".join(report_lines))

Model Performance Metrics:
Accuracy: 0.9545
Precision: 0.9412
Recall: 1.0000
ROC AUC Score: 1.0000


In [None]:
# Attach Google Drive to the Colab runtime's filesystem.
# NOTE(review): no visible cell reads from this mount point - confirm the mount is still needed.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)