In [172]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report


In [173]:
data = pd.read_csv("student-data.csv")

In [174]:
data['failed'] = data['passed'].apply(lambda x: 1 if x == 'no' else 0)

# Drop unnecessary columns (like 'passed' since it's the target variable)
data = data.drop("passed", axis=1)

# Handle missing values (if any)
data.dropna(inplace=True)



In [175]:
# Encode binary categorical variables
label_encoder = LabelEncoder()
binary_columns = ["school", "sex", "address", "famsize", "Pstatus", "internet", "romantic"]
for column in binary_columns:
    data[column] = label_encoder.fit_transform(data[column])



In [176]:
# Encode nominal categorical variables (one-hot encoding)
nominal_columns = ["Mjob", "Fjob", "reason", "guardian"]
data = pd.get_dummies(data, columns=nominal_columns, drop_first=True)



In [177]:
print(data.dtypes)

school                int32
sex                   int32
age                   int64
address               int32
famsize               int32
Pstatus               int32
Medu                  int64
Fedu                  int64
traveltime            int64
studytime             int64
failures              int64
schoolsup            object
famsup               object
paid                 object
activities           object
nursery              object
higher               object
internet              int32
romantic              int32
famrel                int64
freetime              int64
goout                 int64
Dalc                  int64
Walc                  int64
health                int64
absences              int64
failed                int64
Mjob_health           uint8
Mjob_other            uint8
Mjob_services         uint8
Mjob_teacher          uint8
Fjob_health           uint8
Fjob_other            uint8
Fjob_services         uint8
Fjob_teacher          uint8
reason_home         

In [178]:
object_columns = ['schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet']
for column in object_columns:
    print(f"Unique values for {column}: {data[column].unique()}")

Unique values for schoolsup: ['yes' 'no']
Unique values for famsup: ['no' 'yes']
Unique values for paid: ['no' 'yes']
Unique values for activities: ['no' 'yes']
Unique values for nursery: ['yes' 'no']
Unique values for higher: ['yes' 'no']
Unique values for internet: [0 1]


In [179]:
# Encode 'yes' and 'no' in binary columns
binary_columns = ["schoolsup", "famsup", "paid", "activities", "nursery", "higher"]
for column in binary_columns:
    data[column] = label_encoder.fit_transform(data[column])

In [180]:
# Split the data into features and target variable
X = data.drop("failed", axis=1)
y = data["failed"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Choose a classification model (Random Forest as an example)
model =RandomForestClassifier()

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)



In [181]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)



Accuracy: 0.6708860759493671


In [182]:
# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.90      0.78        52
           1       0.55      0.22      0.32        27

    accuracy                           0.67        79
   macro avg       0.62      0.56      0.55        79
weighted avg       0.64      0.67      0.62        79



In [183]:
# Get feature importances
feature_importances = model.feature_importances_

# Create a DataFrame to display feature importances
feature_importance_df = pd.DataFrame({"Feature": X.columns, "Importance": feature_importances})
feature_importance_df = feature_importance_df.sort_values(by="Importance", ascending=False)

# Display the top N important features
top_features = feature_importance_df.head(10)  # Change 10 to the desired number
print("\nTop Features:")
print(top_features)


Top Features:
      Feature  Importance
25   absences    0.090627
10   failures    0.074637
21      goout    0.051141
2         age    0.049840
24     health    0.046414
6        Medu    0.041936
20   freetime    0.041206
7        Fedu    0.039606
23       Walc    0.039135
9   studytime    0.036952


In [185]:
from sklearn.ensemble import RandomForestClassifier

# Create a Random Forest model
model_rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
model_rf.fit(X_train, y_train)

# Get feature importances
feature_importances_rf = model_rf.feature_importances_

# Create a DataFrame to display feature importances
feature_importance_df_rf = pd.DataFrame({"Feature": X.columns, "Importance": feature_importances_rf})
feature_importance_df_rf = feature_importance_df_rf.sort_values(by="Importance", ascending=False)

# Display the top N important features
top_features_rf = feature_importance_df_rf.head(10)  # Change 10 to the desired number
print("\nTop Features (Random Forest):")
print(top_features_rf)



Top Features (Random Forest):
      Feature  Importance
25   absences    0.096962
10   failures    0.072243
2         age    0.056099
21      goout    0.052515
6        Medu    0.042796
7        Fedu    0.041700
24     health    0.040681
20   freetime    0.039165
19     famrel    0.038729
9   studytime    0.036242
