In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score

In [18]:
# Loading the dataset
Academic_Success_df = pd.read_csv(r'C:/Intro_DataScience/Week4/Academic_Success_Data.csv')

In [19]:
Academic_Success_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4424 entries, 0 to 4423
Data columns (total 37 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Marital status                                  4424 non-null   int64  
 1   Application mode                                4424 non-null   int64  
 2   Application order                               4424 non-null   int64  
 3   Course                                          4424 non-null   int64  
 4   "Daytime/evening attendance	"                   4424 non-null   int64  
 5   Previous qualification                          4424 non-null   int64  
 6   Previous qualification (grade)                  4424 non-null   float64
 7   Nacionality                                     4424 non-null   int64  
 8   Mother's qualification                          4424 non-null   int64  
 9   Father's qualification                   

In [20]:
Academic_Success_df.head()

Unnamed: 0,Marital status,Application mode,Application order,Course,"""Daytime/evening attendance\t""",Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,17,5,171,1,1,122.0,1,19,12,...,0,0,0,0,0.0,0,10.8,1.4,1.74,Dropout
1,1,15,1,9254,1,1,160.0,1,1,3,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,Graduate
2,1,1,5,9070,1,1,122.0,1,37,37,...,0,6,0,0,0.0,0,10.8,1.4,1.74,Dropout
3,1,17,2,9773,1,1,122.0,1,38,37,...,0,6,10,5,12.4,0,9.4,-0.8,-3.12,Graduate
4,2,39,1,8014,0,1,100.0,1,37,38,...,0,6,6,6,13.0,0,13.9,-0.3,0.79,Graduate


In [21]:
Academic_Success_df.describe()

Unnamed: 0,Marital status,Application mode,Application order,Course,"""Daytime/evening attendance\t""",Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP
count,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,...,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0
mean,1.178571,18.669078,1.727848,8856.642631,0.890823,4.577758,132.613314,1.873192,19.561935,22.275316,...,0.137658,0.541817,6.232143,8.063291,4.435805,10.230206,0.150316,11.566139,1.228029,0.001969
std,0.605747,17.484682,1.313793,2063.566416,0.311897,10.216592,13.188332,6.914514,15.603186,15.343108,...,0.69088,1.918546,2.195951,3.947951,3.014764,5.210808,0.753774,2.66385,1.382711,2.269935
min,1.0,1.0,0.0,33.0,0.0,1.0,95.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.6,-0.8,-4.06
25%,1.0,1.0,1.0,9085.0,1.0,1.0,125.0,1.0,2.0,3.0,...,0.0,0.0,5.0,6.0,2.0,10.75,0.0,9.4,0.3,-1.7
50%,1.0,17.0,1.0,9238.0,1.0,1.0,133.1,1.0,19.0,19.0,...,0.0,0.0,6.0,8.0,5.0,12.2,0.0,11.1,1.4,0.32
75%,1.0,39.0,2.0,9556.0,1.0,1.0,140.0,1.0,37.0,37.0,...,0.0,0.0,7.0,10.0,6.0,13.333333,0.0,13.9,2.6,1.79
max,6.0,57.0,9.0,9991.0,1.0,43.0,190.0,109.0,44.0,44.0,...,12.0,19.0,23.0,33.0,20.0,18.571429,12.0,16.2,3.7,3.51


In [6]:
# Loading and preprocessing the data
def preprocess_data(df):
    # Remove rows where Target is 'Enrolled'
    df = df[df['Target'] != 'Enrolled']
    
    # Convert Target to binary (Dropout = 1, Graduate = 0)
    df['Target'] = (df['Target'] == 'Dropout').astype(int)
    
    # Select numerical columns
    numerical_cols = ['Marital status', 'Application order', 'Previous qualification (grade)', 
                     'Admission grade', 'Age at enrollment', 'Curricular units 1st sem (credited)', 
                     'Curricular units 1st sem (enrolled)', 'Curricular units 1st sem (evaluations)', 
                     'Curricular units 1st sem (approved)', 'Curricular units 1st sem (grade)', 
                     'Curricular units 1st sem (without evaluations)', 'Curricular units 2nd sem (credited)', 
                     'Curricular units 2nd sem (enrolled)', 'Curricular units 2nd sem (evaluations)', 
                     'Curricular units 2nd sem (approved)', 'Curricular units 2nd sem (grade)', 
                     'Curricular units 2nd sem (without evaluations)', 'Unemployment rate', 
                     'Inflation rate', 'GDP']
    
    return df, numerical_cols


In [7]:
# Function to find optimal threshold
def find_optimal_threshold(y_true, y_pred_proba):
    thresholds = np.arange(0.1, 0.9, 0.05)
    best_threshold = 0.5
    best_f1 = 0
    
    for threshold in thresholds:
        y_pred = (y_pred_proba >= threshold).astype(int)
        current_f1 = f1_score(y_true, y_pred)
        if current_f1 > best_f1:
            best_f1 = current_f1
            best_threshold = threshold
    
    return best_threshold

In [22]:
# Function to evaluate model
def evaluate_model(y_true, y_pred, name):
    print(f"\nResults for {name}")
    print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")
    print(f"F1 Score (Dropout): {f1_score(y_true, y_pred):.4f}")
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_true, y_pred))
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred, target_names=['Graduate', 'Dropout']))

In [23]:
df, numerical_cols = preprocess_data(Academic_Success_df)

In [24]:
# Prepare features and target
X = Academic_Success_df[numerical_cols]
y = Academic_Success_df['Target']

In [25]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

In [26]:
# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [27]:
# Initialize models with class weights
models = {
    'Logistic Regression': LogisticRegression(class_weight=class_weights, max_iter=1000),
    'Random Forest': RandomForestClassifier(class_weight=class_weights, n_estimators=100, random_state=42),
    'XGBoost': XGBClassifier(scale_pos_weight=class_weights[1]/class_weights[0], 
                            eval_metric='logloss', random_state=42)
}

In [28]:
# Train and evaluate models
for name, model in models.items():
    print(f"\nTraining {name}...")
    
    # Train model
    model.fit(X_train_scaled, y_train)
    
    # Get probabilities and optimize threshold
    if hasattr(model, 'predict_proba'):
        y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]
        threshold = find_optimal_threshold(y_test, y_pred_proba)
        y_pred = (y_pred_proba >= threshold).astype(int)
        print(f"Optimal threshold: {threshold:.2f}")
    else:
        y_pred = model.predict(X_test_scaled)
    
    # Evaluate model
    evaluate_model(y_test, y_pred, name)
    
    # Feature importance for Random Forest and XGBoost
    if hasattr(model, 'feature_importances_'):
        importances = pd.DataFrame({
            'feature': numerical_cols,
            'importance': model.feature_importances_
        }).sort_values('importance', ascending=False)
        print(f"\nTop 10 Important Features for {name}:")
        print(importances.head(10))


Training Logistic Regression...


ValueError: The classes, ['Dropout', 'Enrolled', 'Graduate'], are not in class_weight

In [30]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score

# Loading and preprocessing the data
def preprocess_data(df):
    # Convert Target to binary (Dropout = 1, Graduate = 0)
    # First remove 'Enrolled' class
    df = df[df['Target'].isin(['Dropout', 'Graduate'])]
    # Then convert to binary
    df['Target'] = (df['Target'] == 'Dropout').astype(int)
    
    # Select numerical columns
    numerical_cols = ['Marital status', 'Application order', 'Previous qualification (grade)', 
                     'Admission grade', 'Age at enrollment', 'Curricular units 1st sem (credited)', 
                     'Curricular units 1st sem (enrolled)', 'Curricular units 1st sem (evaluations)', 
                     'Curricular units 1st sem (approved)', 'Curricular units 1st sem (grade)', 
                     'Curricular units 1st sem (without evaluations)', 'Curricular units 2nd sem (credited)', 
                     'Curricular units 2nd sem (enrolled)', 'Curricular units 2nd sem (evaluations)', 
                     'Curricular units 2nd sem (approved)', 'Curricular units 2nd sem (grade)', 
                     'Curricular units 2nd sem (without evaluations)', 'Unemployment rate', 
                     'Inflation rate', 'GDP']
    
    return df, numerical_cols

# Function to find optimal threshold
def find_optimal_threshold(y_true, y_pred_proba):
    thresholds = np.arange(0.1, 0.9, 0.05)
    best_threshold = 0.5
    best_f1 = 0
    
    for threshold in thresholds:
        y_pred = (y_pred_proba >= threshold).astype(int)
        current_f1 = f1_score(y_true, y_pred)
        if current_f1 > best_f1:
            best_f1 = current_f1
            best_threshold = threshold
    
    return best_threshold, best_f1

# Function to evaluate model
def evaluate_model(y_true, y_pred, model_name, threshold=None):
    print(f"\nResults for {model_name}")
    if threshold is not None:
        print(f"Optimal threshold: {threshold:.3f}")
    print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")
    print(f"F1 Score (Dropout): {f1_score(y_true, y_pred):.4f}")
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_true, y_pred))
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred, target_names=['Graduate', 'Dropout']))

# Load and preprocess the data
print("Loading and preprocessing data...")
Academic_Success_df = pd.read_csv(r'C:/Intro_DataScience/Week4/Academic_Success_Data.csv')
df, numerical_cols = preprocess_data(Academic_Success_df)

# Prepare features and target
X = df[numerical_cols]
y = df['Target']

# Print class distribution
print("\nClass distribution:")
print(df['Target'].value_counts(normalize=True))

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Calculate class weights
class_counts = np.bincount(y_train)
total_samples = len(y_train)
class_weights = {
    0: total_samples / (2 * class_counts[0]),  # weight for Graduate
    1: total_samples / (2 * class_counts[1])   # weight for Dropout
}

print("\nClass weights:")
print(class_weights)

# Initialize models
models = {
    'Logistic Regression': LogisticRegression(
        class_weight=class_weights,
        max_iter=1000,
        random_state=42
    ),
    'Random Forest': RandomForestClassifier(
        class_weight=class_weights,
        n_estimators=100,
        random_state=42
    ),
    'XGBoost': XGBClassifier(
        scale_pos_weight=class_weights[1]/class_weights[0],
        eval_metric='logloss',
        random_state=42
    )
}

# Train and evaluate models
results = {}
for name, model in models.items():
    print(f"\nTraining {name}...")
    
    # Train model
    model.fit(X_train_scaled, y_train)
    
    # Get predictions
    if hasattr(model, 'predict_proba'):
        y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]
        threshold, best_f1 = find_optimal_threshold(y_test, y_pred_proba)
        y_pred = (y_pred_proba >= threshold).astype(int)
        evaluate_model(y_test, y_pred, name, threshold)
    else:
        y_pred = model.predict(X_test_scaled)
        evaluate_model(y_test, y_pred, name)
    
    # Feature importance for Random Forest and XGBoost
    if hasattr(model, 'feature_importances_'):
        importances = pd.DataFrame({
            'feature': numerical_cols,
            'importance': model.feature_importances_
        }).sort_values('importance', ascending=False)
        print(f"\nTop 10 Important Features for {name}:")
        print(importances.head(10))

# Print overall summary
print("\nOverall Summary:")
for name in models.keys():
    f1 = f1_score(y_test, y_pred)
    acc = accuracy_score(y_test, y_pred)
    print(f"{name}: F1={f1:.4f}, Accuracy={acc:.4f}")

Loading and preprocessing data...

Class distribution:
Target
0    0.60854
1    0.39146
Name: proportion, dtype: float64

Class weights:
{0: 0.8217981888745148, 1: 1.2768844221105529}

Training Logistic Regression...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Target'] = (df['Target'] == 'Dropout').astype(int)



Results for Logistic Regression
Optimal threshold: 0.650
Accuracy: 0.9137
F1 Score (Dropout): 0.8848

Confusion Matrix:
[[634  29]
 [ 65 361]]

Classification Report:
              precision    recall  f1-score   support

    Graduate       0.91      0.96      0.93       663
     Dropout       0.93      0.85      0.88       426

    accuracy                           0.91      1089
   macro avg       0.92      0.90      0.91      1089
weighted avg       0.91      0.91      0.91      1089


Training Random Forest...

Results for Random Forest
Optimal threshold: 0.350
Accuracy: 0.8953
F1 Score (Dropout): 0.8705

Confusion Matrix:
[[592  71]
 [ 43 383]]

Classification Report:
              precision    recall  f1-score   support

    Graduate       0.93      0.89      0.91       663
     Dropout       0.84      0.90      0.87       426

    accuracy                           0.90      1089
   macro avg       0.89      0.90      0.89      1089
weighted avg       0.90      0.90      0.90 