We have more than 76.5K synthetic data points available for constructing a model. This model will predict whether individuals have graduated, dropped out, or are still enrolled, based on specific features.



In [None]:
# Import libraries

# data processing
import pandas as pd
import numpy as np

# Plotting and visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Statistical
from scipy.stats import chi2_contingency, f_oneway

# Model libraries
from lightgbm import LGBMClassifier
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

# Model evaluation
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay



# Locations of the competition CSV files
file_path_train = '/kaggle/input/playground-series-s4e6/train.csv'
file_path_test = '/kaggle/input/playground-series-s4e6/test.csv'

# Read both splits into DataFrames
df_train, df_test = (pd.read_csv(path) for path in (file_path_train, file_path_test))

# Pull the test ids out for the submission file (pop also removes the column
# from df_test), then discard the id column from the training frame as well
submission_id = df_test.pop('id').reset_index(drop=True)
df_train = df_train.drop(columns=['id'])

# Report the dimensions of both frames
for label, frame in (("train shape: ", df_train), ("test shape: ", df_test)):
    print(label, frame.shape)

In [None]:
# Preview the first rows of the training data
df_train.head()

In [None]:
# Summary statistics of the training data, transposed so that each column is shown as a row
df_train.describe().T

In [None]:
# Print the per-column overview (dtypes, non-null counts) of the training data
df_train.info()

In [None]:
# Correcting the datatype of the categorical data.
# The lists hold 0-based column positions; the train list has one extra
# position (36), presumably the 'Target' label column -- TODO confirm.
category_columns_train = [0,1,3,4,5,7,8,9,10,11,13,14,15,16,17,18,20,36]
category_columns_test = [0,1,3,4,5,7,8,9,10,11,13,14,15,16,17,18,20]

# Assign by column label instead of `df.iloc[:, col] = ...`: positional
# full-column assignment first tries to write into the existing column, so
# the new dtype only "sticks" through a deprecated upcasting fallback
# (FutureWarning in recent pandas) and never for columns that are already
# object dtype. Label-based assignment always replaces the column, so every
# listed column really ends up with the 'category' dtype.
for frame, positions in ((df_train, category_columns_train),
                         (df_test, category_columns_test)):
    for name in frame.columns[positions]:
        frame[name] = frame[name].astype('str').astype('category')

In [None]:
# Print the per-column overview again to check the dtypes after the conversion
df_train.info()

Let's look at each variable individually to understand the distribution of data in each column.

In [None]:
def visualize_columns(df, columns):
    """
    Visualizes summary statistics, histogram, box plot, and density plot for specified columns,
    each column in a single row of four panels.

    Columns that are not present in the DataFrame are reported with a printed
    message and skipped.

    Parameters:
    df (pd.DataFrame): DataFrame containing the data.
    columns (list of str): List of column names to visualize.
    """
    for column in columns:
        if column not in df.columns:
            print(f"Column '{column}' not found in DataFrame.")
            continue

        series = df[column]

        # One figure per column: text summary, histogram, box plot, density plot
        _fig, axs = plt.subplots(1, 4, figsize=(20, 4))

        # Panel 0: summary statistics rendered as text
        min_val = series.min()
        max_val = series.max()
        # Built-in round() works for both NumPy and plain Python floats
        mean_val = round(series.mean(), 4)
        median_val = series.median()
        mode_val = series.mode()[0]  # first mode when there are ties
        std_val = round(series.std(), 4)
        axs[0].text(0.5, 0.5, f'Min: {min_val}\nMax: {max_val}\nMean: {mean_val}\nMedian: {median_val}\nMode: {mode_val}\nSTD: {std_val}', horizontalalignment='center',
                    verticalalignment='center', fontsize=20, transform=axs[0].transAxes)
        axs[0].set_title(f'Min/Max/Mean of {column}', fontsize=15)
        axs[0].axis('off')  # Hide axes

        # Panel 1: histogram
        axs[1].hist(series, bins=30, color=plt.cm.viridis(0.9), edgecolor='black')
        axs[1].set_title(f'Histogram of {column}', fontsize=15)

        # Panel 2: box plot. A single colour is used because passing `palette`
        # without `hue` is deprecated in seaborn.
        sns.boxplot(x=series, ax=axs[2], color=plt.cm.viridis(0.5))
        axs[2].set_title(f'Box Plot of {column}', fontsize=15)

        # Panel 3: density plot
        sns.kdeplot(series, ax=axs[3], fill=True, color=plt.cm.viridis(0.1))
        axs[3].set_title(f'Density Plot of {column}', fontsize=15)

        plt.tight_layout()
        plt.show()

# Plot every int64/float64 column of the training data
numeric_column_names = df_train.select_dtypes(include=['float64', 'int64']).columns
visualize_columns(df_train, numeric_column_names)

Let's perform a spot check on a few of the features with unusual box plots to determine if additional data transformations are needed to properly interpret these features.

**Explore the relationship between categorical variable and target values.**

Let’s implement a Chi-Square test of independence as an example to check the association between each categorical feature and the target.


In [None]:
# Separate the label column from the predictors
y = df_train['Target']
X = df_train.drop(columns='Target')

# Column names grouped by dtype: categorical vs. numerical predictors
categorical_features = X.select_dtypes(include='category').columns
numerical_features = X.select_dtypes(include=['float64', 'int64']).columns

# Chi-square test of independence between each categorical feature and the
# target; element [1] of the chi2_contingency result is the p-value
p_values = {
    col: chi2_contingency(pd.crosstab(df_train[col], y))[1]
    for col in categorical_features
}

# Report the p-value of every tested feature
for col, p in p_values.items():
    print(f"P-value for {col} and target: {p}")

**Interpretation:**

Null hypothesis: There is no association between the two variables.

Low p-values (<0.05): Strong evidence against the null hypothesis, suggesting a significant association between the feature and the target.
High p-values: Weak evidence against the null hypothesis, no significant association found.

So the features with a p-value below 0.05 have a statistically significant association with the target values. This means that these features likely provide some information about the target categories:

Marital Status

Application Mode

Course

Daytime/Evening Attendance (though this is not zero, it's a very small number, indicating significance)

Previous Qualification

Nacionality

Mother's Qualification

Father's Qualification

Mother's Occupation

Father's Occupation

Displaced

Debtor

Tuition Fees Up to Date

Gender

Scholarship Holder

Below features have high p-values (greater than 0.05), suggesting that there is no significant association with the target values under the common significance level. This implies that these features do not provide reliable information about the target categories:

Educational Special Needs (p-value: 0.8900215041644881)
International (p-value: 0.7257456838845804)
Let's calculate Cramér's V, which measures the strength of association between two categorical variables. We'll use the Chi-Square statistic derived from the contingency table of each feature with the target. Cramér's V takes a value between 0 and 1, where 0 indicates no association and 1 indicates a perfect association.

In [None]:
def cramers_v(chi2, n, k, r):
    """
    Computes Cramér's V, sqrt(chi2 / n / min(k - 1, r - 1)), from a chi-square statistic.

    Parameters:
    chi2 (float): Chi-square statistic of the contingency table.
    n (int): Total number of observations in the table.
    k (int): Number of columns of the table.
    r (int): Number of rows of the table.

    Returns:
    float: Cramér's V, or NaN when the statistic is undefined (no
    observations, or one of the variables has fewer than two levels).
    """
    min_dim = min(k - 1, r - 1)
    # A table with a single row/column or no observations has no defined
    # association strength; avoid dividing by zero.
    if n <= 0 or min_dim <= 0:
        return np.nan
    return np.sqrt(chi2 / n / min_dim)

# Cramér's V between every categorical feature and the target, keyed by feature name
cramers_v_results = {}
for col in categorical_features:
    table = pd.crosstab(df_train[col], y)
    # Only the chi-square statistic (first element of the result) is needed here
    chi2 = chi2_contingency(table)[0]
    n_rows, n_cols = table.shape
    total_observations = table.to_numpy().sum()
    cramers_v_results[col] = cramers_v(chi2, total_observations, n_cols, n_rows)

# Report the association strength of each feature
for feature, v in cramers_v_results.items():
    print(f"Cramér's V for {feature} and target: {v}")

**Interpretation:** 


High Association: 
* Tuition Fees Up to Date: 0.4472 - This feature has the highest association among the features listed, making it a strong predictor.
* Scholarship Holder: 0.4064 - Also shows a strong association, suggesting it significantly influences or relates to the target.
* Course: 0.3364 - This is another strong indicator, reflecting its relevance to the target.
* Gender: 0.3302 - Significant association, useful for predicting the target.

Moderate Association: 
- Application Mode: 0.2925 - Shows a moderate relationship with the target.
- Debtor: 0.2589 - Relatively moderate, indicating some predictive power.
- Previous Qualification: 0.1916 - Offers some insight but not as strong as the top predictors.
- Mother's Qualification, Mother's Occupation, Father's Qualification, Father's Occupation: These features show similar moderate associations (ranging from 0.1636 to 0.1765), suggesting they hold some predictive value. 

**Consideration:** Focus on features with moderate to high Cramér's V values for model building, as these are likely to provide the most predictive power. Consider disregarding or deprioritizing features with very low Cramér's V values in predictive modeling, as they may not contribute significantly to model accuracy. Further investigate combinations of features or interactions that could enhance model performance, especially where moderate associations exist.

In [None]:
# One-way ANOVA of every numerical feature across the target classes
target_classes = y.unique()
correlation_results = {}
for feature in numerical_features:
    # One sample of feature values per target class
    samples = [df_train.loc[y == cls, feature].values for cls in target_classes]
    f_value, p_value = f_oneway(*samples)
    correlation_results[feature] = {'F-value': f_value, 'p-value': p_value}

# One row per feature, with the F statistic and its p-value as columns
correlation_df = pd.DataFrame.from_dict(correlation_results, orient='index')
print(correlation_df)

In [None]:
# Pairwise correlation of the numerical features, drawn as an annotated heatmap
correlation_matrix = df_train[numerical_features].corr()

fig, ax = plt.subplots(figsize=(20, 15))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Correlation Matrix Heatmap')
plt.show()

In [None]:
# Calculate the bias-corrected Cramér's V statistic for two categorical variables.
def cramers_v(x, y):
    """
    Computes a bias-corrected Cramér's V between two categorical variables.

    The chi-square statistic of the cross-tabulation of x and y is turned into
    phi^2 = chi2 / n; phi^2 and both table dimensions are then shrunk by terms
    in 1 / (n - 1) before taking the usual square-root ratio.

    Parameters:
    x (pd.Series): First categorical variable.
    y (pd.Series): Second categorical variable, aligned with x.

    Returns:
    float: The corrected Cramér's V, or NaN when it is undefined (fewer than
    two observations, a variable with a single level, or a corrected
    dimension that collapses to zero).
    """
    # Named `table` so the local does not shadow sklearn's `confusion_matrix`
    table = pd.crosstab(x, y)
    n = table.to_numpy().sum()
    r, k = table.shape

    # Degenerate tables carry no association information (and would otherwise
    # lead to divisions by zero or a chi2_contingency error on empty input).
    if n <= 1 or min(r, k) < 2:
        return np.nan

    # NOTE: chi2_contingency is called with its defaults, as before.
    chi2 = chi2_contingency(table)[0]
    phi2 = chi2 / n
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)

    denominator = min(kcorr - 1, rcorr - 1)
    # Happens when a variable has as many levels as there are observations
    if denominator <= 0:
        return np.nan
    return np.sqrt(phi2corr / denominator)

# Square table of pairwise Cramér's V values between the categorical features
cramers_v_matrix = pd.DataFrame(index=categorical_features, columns=categorical_features)

for row_name in categorical_features:
    for col_name in categorical_features:
        # A feature is perfectly associated with itself; everything else is computed
        cramers_v_matrix.loc[row_name, col_name] = (
            1.0 if row_name == col_name
            else cramers_v(df_train[row_name], df_train[col_name])
        )

# The frame starts out with object dtype; the heatmap needs floats
cramers_v_matrix = cramers_v_matrix.astype(float)

# Annotated heatmap of the association strengths
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(cramers_v_matrix, annot=True, cmap='coolwarm', center=0, ax=ax)
ax.set_title("Cramér's V Correlation Heatmap")
plt.show()

In [None]:
# Modeling (voting)

# Map class labels to numerical values, needed for XGBoost
label_mapping = {'Dropout': 0, 'Enrolled': 1, 'Graduate': 2}
yv = y.map(label_mapping)

# Names of the categorical features, needed for CatBoost
cat_columns = categorical_features.tolist()


# Hyper-parameters of the three base models
cb_params = {'learning_rate': 0.12146914273449388, 'iterations': 2656, 'depth': 3, 'l2_leaf_reg': 0.8104011685674616,
             'border_count': 298, 'random_strength': 0.9482384798609886, 'bagging_temperature': 0.0814623508183759}
lgbm_params = {'learning_rate': 0.13002653030773764, 'num_leaves': 46, 'max_depth': 45, 'min_data_in_leaf': 72, 'feature_fraction': 0.34580039317123207,
               'bagging_fraction': 0.9658924212153982}
xgb_params = {'n_estimators': 425, 'learning_rate': 0.04261325690824416, 'max_depth': 6, 'min_child_weight': 7, 'gamma': 0.04543919342670777,
              'subsample': 0.9549858251941967, 'colsample_bytree': 0.7217267304024939}

# Base models
catboost_model = CatBoostClassifier(random_state=4, cat_features=cat_columns, verbose=False, **cb_params)
lgbm_model = LGBMClassifier(verbose=0, random_state=5, **lgbm_params)
# XGBoost's logging switch is `verbosity`; `verbose` is not a constructor parameter
xgb_model = XGBClassifier(verbosity=0, random_state=2, enable_categorical=True, **xgb_params)


# Create a VotingClassifier ensemble over all three models
voting_clf = VotingClassifier(
    estimators=[('catboost', catboost_model), ('lgbm', lgbm_model), ('xgb', xgb_model)],
    voting='soft'  # 'soft' uses predicted probabilities, 'hard' uses predicted class labels
)


# Estimate the accuracy of the ensemble with stratified 5-fold cross-validation
model = voting_clf
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
scores = cross_val_score(model, X, yv, scoring='accuracy', cv=kf, n_jobs=-1)

print(f"Accuracy: {np.mean(scores)}")

In [None]:
# Evaluation: confusion matrix

# Out-of-fold true and predicted labels, collected across all folds
true_labels = []
predicted_labels = []

# Loop through each fold
for train_index, val_index in kf.split(X, y):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # Fit the model on the training part of the fold
    model.fit(X_train, y_train)

    # Predict the held-out part of the fold
    y_val_pred = model.predict(X_val)

    # Append the true and predicted labels to the lists
    true_labels.extend(y_val)
    predicted_labels.extend(y_val_pred)

# Fix the class order explicitly so the matrix rows/columns and the tick
# labels of the plot are guaranteed to agree
class_labels = sorted(set(true_labels) | set(predicted_labels))
cm = confusion_matrix(true_labels, predicted_labels, labels=class_labels)

# Normalise each column: every cell becomes the percentage of the samples
# predicted as that class. A class that was never predicted has a zero column
# total; its column is left at 0 instead of dividing by zero.
column_totals = cm.sum(axis=0)[np.newaxis, :]
cm_percentage = np.divide(
    cm.astype('float'),
    column_totals,
    out=np.zeros(cm.shape, dtype=float),
    where=column_totals != 0,
) * 100
disp = ConfusionMatrixDisplay(confusion_matrix=cm_percentage, display_labels=class_labels)

# Display the confusion matrix
disp.plot()
plt.show()

In [None]:
# Make predictions on the test dataset

# Refit the model on the complete training data, then score the test rows
predictions = model.fit(X, y).predict(df_test)

# Make sure the predictions form a 1-dimensional array
predictions_flat = predictions.flatten()

# Prepare the submission dataframe: the saved test ids next to the predicted class
submission = submission_id.to_frame(name='id').assign(Target=predictions_flat)

print('Submission head:', submission.head(10))

# Save the submission dataframe to a CSV file (without the index column)
submission.to_csv('submission.csv', index=False)