<a href="https://colab.research.google.com/github/MeghanaVandana/Advanced_Mathematical_Statistics/blob/main/Project4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# âœ… FIXED: Use RAW GitHub Excel file path
file_path = "https://raw.githubusercontent.com/MeghanaVandana/Advanced_Mathematical_Statistics/main/Project4/Preliminary%20college%20year.xlsx"

# Read the Excel file properly
df = pd.read_excel(file_path, engine='openpyxl')

# Step 2: Initial data cleaning
df.dropna(subset=['Retained F17-F18? (1=yes, 0=no)'], inplace=True)

# Remove unnecessary columns
columns_to_remove = [
    'Federal Ethnic Group',
    'Gender',
    'Reason for not Completing Connect',
    'Reason not Retained'
]
df.drop(columns=columns_to_remove, axis=1, inplace=True)

# Step 3: Define X and y
X = df.drop(columns=['Retained F17-F18? (1=yes, 0=no)'])
y = df['Retained F17-F18? (1=yes, 0=no)']

# Step 4: One-hot encode categorical variables
X = pd.get_dummies(X, drop_first=True)

# Step 5: Impute missing values
imputer = SimpleImputer(strategy='mean')
X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

# Step 6: Feature scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

# Step 7: Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

# Step 8: GridSearchCV for logistic regression
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear']
}

logistic_model = LogisticRegression(max_iter=1000, random_state=42)
grid_search = GridSearchCV(logistic_model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_

# Step 9: Predictions
y_pred = best_model.predict(X_test)
y_pred_prob = best_model.predict_proba(X_test)[:, 1]

# Step 10: Performance metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_prob)

print("Model Performance Metrics:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("ROC AUC Score:", roc_auc)


Model Performance Metrics:
Accuracy: 0.9545454545454546
Precision: 0.9411764705882353
Recall: 1.0
ROC AUC Score: 1.0
