In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score
import warnings
# Suppress every warning category for the remainder of the session.
# NOTE(review): presumably added to hide scikit-learn metric warnings for classes
# that are never predicted — confirm, since this also hides unrelated warnings.
warnings.filterwarnings('ignore')

In [2]:
def dataframe_to_category_codes(df):
    """Return a copy of ``df`` in which every column holds integer category codes.

    Each column is cast to pandas' ``category`` dtype and then replaced by its
    ``.cat.codes``. Missing values receive the code ``-1``. The frame passed in
    is not modified, because ``astype`` returns a new object.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns should be label-encoded.

    Returns
    -------
    pandas.DataFrame
        Same index and column labels as ``df``, with integer codes as values.
    """
    encoded = df.astype('category')
    for name, categorical_column in encoded.items():
        # Swap the categorical column for its integer codes in place on the copy.
        encoded[name] = categorical_column.cat.codes
    return encoded

# 1. Predict suitable major subjects for students (based on HSC subject grades, olympiad participation, and extracurricular activities)

In [4]:
# Load the cleaned student records workbook; the path is relative to the
# notebook's working directory.
df = pd.read_excel('Student Records (Cleaned).xlsx')
# Bare last expression so Jupyter renders the frame as the cell output.
df

Unnamed: 0,ECA-Sports,ECA-Performance,ECA-Technology,ECA-Clubs,Any Participation in International Olympiad,Any Participation in National Olympiad,Grade In Bangla,Grade In English,Grade In ICT,Grade In Physics,...,Grade In Statistics,Grade In Logic,Grade In History,Grade In Economics,Grade In Geography,Grade In Civics,Grade In Psychology,Cgpa,Major Subject,Job Sector
0,No,No,No,No,No,No,A+,A+,A+,A+,...,U,U,U,U,U,U,U,,Computer Science,
1,No,No,No,No,No,No,A+,A+,A+,A+,...,U,U,U,U,U,U,U,,Electronics,
2,No,Yes,No,No,No,No,A+,A+,A+,A+,...,U,U,U,U,U,U,U,,Architecture,
3,No,No,No,No,No,No,A+,A+,A,U,...,U,U,A+,A+,A+,U,A+,3.1,Environment,Geography
4,No,Yes,No,No,No,Yes,A,A,A+,U,...,U,A+,A,A,A,U,U,2.8,English,English
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1094,No,No,No,No,No,No,A+,A+,A,U,...,U,U,A+,A+,A+,U,A+,3.1,Environment,Geography
1095,No,Yes,No,No,No,Yes,A,A,A+,U,...,U,A+,A,A,A,U,U,2.8,History,History
1096,No,Yes,No,No,No,No,A+,A+,A,U,...,U,A+,A+,A+,A+,U,U,3.0,Economics,Economics
1097,No,No,No,No,No,No,A+,A,A+,U,...,U,U,A+,A+,A,U,A+,2.8,Medicine,Medicine


In [5]:
# Label-encode every column of the full data set. Missing values (e.g. Cgpa and
# Job Sector for students without university data) come out as -1.
encoded_df = dataframe_to_category_codes(df)
# Bare last expression so Jupyter renders the encoded frame.
encoded_df

Unnamed: 0,ECA-Sports,ECA-Performance,ECA-Technology,ECA-Clubs,Any Participation in International Olympiad,Any Participation in National Olympiad,Grade In Bangla,Grade In English,Grade In ICT,Grade In Physics,...,Grade In Statistics,Grade In Logic,Grade In History,Grade In Economics,Grade In Geography,Grade In Civics,Grade In Psychology,Cgpa,Major Subject,Job Sector
0,0,0,0,0,0,0,1,1,1,1,...,4,5,5,5,5,3,3,-1,7,-1
1,0,0,0,0,0,0,1,1,1,1,...,4,5,5,5,5,3,3,-1,9,-1
2,0,1,0,0,0,0,1,1,1,1,...,4,5,5,5,5,3,3,-1,2,-1
3,0,0,0,0,0,0,1,1,0,5,...,4,5,1,1,1,3,1,10,11,14
4,0,1,0,0,0,1,0,0,1,5,...,4,1,0,0,0,3,3,7,10,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1094,0,0,0,0,0,0,1,1,0,5,...,4,5,1,1,1,3,1,10,11,14
1095,0,1,0,0,0,1,0,0,1,5,...,4,1,0,0,0,3,3,7,15,15
1096,0,1,0,0,0,0,1,1,0,5,...,4,1,1,1,1,3,3,9,8,8
1097,0,0,0,0,0,0,1,0,1,5,...,4,5,1,1,0,3,1,7,22,22


In [6]:
# Build a lookup table that pairs each major-subject name with its integer code.
# Step 1: put the original names next to the encoded values, row by row.
major_name_and_code = pd.concat(
    [df['Major Subject'], encoded_df['Major Subject']],
    axis=1,
    keys=['Major Subject', 'Code'],
)
# Step 2: keep one row per distinct (name, code) pair.
unique_major_pairs = major_name_and_code.drop_duplicates()
# Step 3: order by code and renumber the rows from zero.
coded_df = unique_major_pairs.sort_values(by=['Code']).reset_index(drop=True)

print("Major Subject Codes:")
display(coded_df)

Major Subject Codes:


Unnamed: 0,Major Subject,Code
0,Accounting,0
1,Agriculture,1
2,Architecture,2
3,Bangla,3
4,Biology,4
5,Business,5
6,Chemistry,6
7,Computer Science,7
8,Economics,8
9,Electronics,9


In [7]:
# Input columns for the major-subject model, grouped by theme and then
# concatenated in the order: extracurriculars, olympiads, grades.
eca_columns = ['ECA-Sports', 'ECA-Performance', 'ECA-Technology', 'ECA-Clubs']
olympiad_columns = [
    'Any Participation in International Olympiad',
    'Any Participation in National Olympiad',
]
grade_columns = [
    'Grade In Bangla',
    'Grade In English',
    'Grade In ICT',
    'Grade In Physics',
    'Grade In Chemistry',
    'Grade In Math',
    'Grade In Biology',
    'Grade In Accounting',
    'Grade In Finance',
    'Grade In Management',
    'Grade In Marketing',
    'Grade In Statistics',
    'Grade In Logic',
    'Grade In History',
    'Grade In Economics',
    'Grade In Geography',
    'Grade In Civics',
    'Grade In Psychology',
]
features_list = eca_columns + olympiad_columns + grade_columns

# Target column, kept as a one-element list.
label = ['Major Subject']

In [8]:
# Feature matrix and target as NumPy arrays. `label` is a list, so `y` is a
# single-column 2-D array; the model cells flatten it with .ravel() before fit().
X = encoded_df[features_list].values
y = encoded_df[label].values

# Hold out 20% of the rows for evaluation. The fixed seed keeps the split, and
# therefore every metric reported by the following cells, identical across
# Restart & Run All (same seed value the decision tree in this notebook uses).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=0
)

In [9]:
# Train k-nearest-neighbours with default settings; fit() returns the estimator,
# so construction and fitting are chained. .ravel() flattens the target column.
classifier_knn = KNeighborsClassifier().fit(X_train, y_train.ravel())

# Predict the held-out rows.
y_pred_knn = classifier_knn.predict(X_test)

# Per-class precision/recall/F1, followed by overall accuracy.
knn_report = classification_report(y_test, y_pred_knn)
print("k-Nearest Neighbors Classification Report:\n", knn_report)
knn_accuracy = accuracy_score(y_test, y_pred_knn)
print("Accuracy Score:", knn_accuracy)

k-Nearest Neighbors Classification Report:
               precision    recall  f1-score   support

           0       0.50      0.57      0.53        21
           2       0.00      0.00      0.00         3
           3       0.00      0.00      0.00         1
           4       0.53      0.69      0.60        13
           5       0.25      0.33      0.29         3
           6       0.00      0.00      0.00         1
           7       0.64      0.73      0.69        49
           8       0.69      0.69      0.69        16
           9       0.59      0.59      0.59        17
          10       0.67      0.67      0.67         3
          11       1.00      0.56      0.71         9
          12       0.50      1.00      0.67         3
          13       0.50      0.55      0.52        11
          15       0.20      0.20      0.20         5
          17       0.47      0.50      0.48        14
          18       0.00      0.00      0.00         7
          19       0.43      0.43    

In [11]:
# Train a decision tree with a fixed seed; fit() returns the estimator, so
# construction and fitting are chained. .ravel() flattens the target column.
classifier_dt = DecisionTreeClassifier(random_state=0).fit(X_train, y_train.ravel())

# Predict the held-out rows.
y_pred_dt = classifier_dt.predict(X_test)

# Per-class precision/recall/F1, followed by overall accuracy.
dt_report = classification_report(y_test, y_pred_dt)
print("Decision Tree Classification Report:\n", dt_report)
dt_accuracy = accuracy_score(y_test, y_pred_dt)
print("Accuracy Score:", dt_accuracy)

Decision Tree Classification Report:
               precision    recall  f1-score   support

           0       0.43      0.48      0.45        21
           2       0.00      0.00      0.00         3
           3       0.00      0.00      0.00         1
           4       0.88      0.54      0.67        13
           5       0.14      0.33      0.20         3
           6       0.50      1.00      0.67         1
           7       0.61      0.67      0.64        49
           8       0.68      0.81      0.74        16
           9       0.61      0.82      0.70        17
          10       0.50      0.67      0.57         3
          11       0.83      0.56      0.67         9
          12       0.75      1.00      0.86         3
          13       0.36      0.36      0.36        11
          15       0.20      0.20      0.20         5
          17       0.56      0.36      0.43        14
          18       0.00      0.00      0.00         7
          19       0.57      0.57      0.57

# 2. Predict suitable job sectors for undergraduate students (based on university grades, major subject, and extracurricular activities)

In [12]:
# Keep only rows that have no missing value in any column (dropna defaults).
# NOTE: this rebinds `df`, so any earlier cell re-run after this point will see
# the filtered frame rather than the full data set loaded from the workbook.
df = df.dropna()
# Bare last expression so Jupyter renders the filtered frame.
df

Unnamed: 0,ECA-Sports,ECA-Performance,ECA-Technology,ECA-Clubs,Any Participation in International Olympiad,Any Participation in National Olympiad,Grade In Bangla,Grade In English,Grade In ICT,Grade In Physics,...,Grade In Statistics,Grade In Logic,Grade In History,Grade In Economics,Grade In Geography,Grade In Civics,Grade In Psychology,Cgpa,Major Subject,Job Sector
3,No,No,No,No,No,No,A+,A+,A,U,...,U,U,A+,A+,A+,U,A+,3.1,Environment,Geography
4,No,Yes,No,No,No,Yes,A,A,A+,U,...,U,A+,A,A,A,U,U,2.8,English,English
5,No,Yes,No,No,No,No,A+,A+,A,U,...,U,A+,A+,A+,A+,U,U,3.0,Economics,Economics
6,No,No,No,No,No,No,A+,A,A+,U,...,U,U,A+,A+,A,U,A+,2.8,Sociology,Sociology
7,No,Yes,No,No,No,No,A,A-,A,U,...,U,A+,A-,A,A-,U,U,2.5,Environment,Geography
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1094,No,No,No,No,No,No,A+,A+,A,U,...,U,U,A+,A+,A+,U,A+,3.1,Environment,Geography
1095,No,Yes,No,No,No,Yes,A,A,A+,U,...,U,A+,A,A,A,U,U,2.8,History,History
1096,No,Yes,No,No,No,No,A+,A+,A,U,...,U,A+,A+,A+,A+,U,U,3.0,Economics,Economics
1097,No,No,No,No,No,No,A+,A,A+,U,...,U,U,A+,A+,A,U,A+,2.8,Medicine,Medicine


In [13]:
# Re-encode the filtered frame; this rebinds `encoded_df`, replacing the
# encoding of the full data set computed earlier.
# NOTE(review): codes are derived from the values present in the frame being
# encoded, so they may differ from the earlier encoding — confirm before
# comparing codes across the two sections.
encoded_df = dataframe_to_category_codes(df)
# Bare last expression so Jupyter renders the encoded frame.
encoded_df

Unnamed: 0,ECA-Sports,ECA-Performance,ECA-Technology,ECA-Clubs,Any Participation in International Olympiad,Any Participation in National Olympiad,Grade In Bangla,Grade In English,Grade In ICT,Grade In Physics,...,Grade In Statistics,Grade In Logic,Grade In History,Grade In Economics,Grade In Geography,Grade In Civics,Grade In Psychology,Cgpa,Major Subject,Job Sector
3,0,0,0,0,0,0,1,1,0,5,...,4,5,1,1,1,3,1,10,11,14
4,0,1,0,0,0,1,0,0,1,5,...,4,1,0,0,0,3,3,7,10,10
5,0,1,0,0,0,0,1,1,0,5,...,4,1,1,1,1,3,3,9,8,8
6,0,0,0,0,0,0,1,0,1,5,...,4,5,1,1,0,3,1,7,25,25
7,0,1,0,0,0,0,0,2,0,5,...,4,1,2,0,2,3,3,4,11,14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1094,0,0,0,0,0,0,1,1,0,5,...,4,5,1,1,1,3,1,10,11,14
1095,0,1,0,0,0,1,0,0,1,5,...,4,1,0,0,0,3,3,7,15,15
1096,0,1,0,0,0,0,1,1,0,5,...,4,1,1,1,1,3,3,9,8,8
1097,0,0,0,0,0,0,1,0,1,5,...,4,5,1,1,0,3,1,7,22,22


In [14]:
# Build a lookup table that pairs each job-sector name with its integer code.
# Step 1: put the original names next to the encoded values, row by row.
sector_name_and_code = pd.concat(
    [df['Job Sector'], encoded_df['Job Sector']],
    axis=1,
    keys=['Job Sector', 'Code'],
)
# Step 2: keep one row per distinct (name, code) pair.
unique_sector_pairs = sector_name_and_code.drop_duplicates()
# Step 3: order by code and renumber the rows from zero.
coded_df = unique_sector_pairs.sort_values(by=['Code']).reset_index(drop=True)

print("Job Sector Codes:")
display(coded_df)

Job Sector Codes:


Unnamed: 0,Job Sector,Code
0,Accounting,0
1,Agriculture,1
2,Architecture,2
3,Bangla,3
4,Biology,4
5,Business,5
6,Chemistry,6
7,Computer Science,7
8,Economics,8
9,Electronics,9


In [15]:
# Input columns for the job-sector model: the four extracurricular flags
# followed by the university-level columns.
eca_columns = ['ECA-Sports', 'ECA-Performance', 'ECA-Technology', 'ECA-Clubs']
university_columns = ['Cgpa', 'Major Subject']
features_list = eca_columns + university_columns

# Target column, kept as a one-element list.
label = ['Job Sector']

In [16]:
# Feature matrix and target as NumPy arrays. `label` is a list, so `y` is a
# single-column 2-D array; the model cells flatten it with .ravel() before fit().
X = encoded_df[features_list].values
y = encoded_df[label].values

# Hold out 20% of the rows for evaluation. The fixed seed keeps the split, and
# therefore every metric reported by the following cells, identical across
# Restart & Run All (same seed value the decision tree in this notebook uses).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=0
)

In [17]:
# Train k-nearest-neighbours with default settings; fit() returns the estimator,
# so construction and fitting are chained. .ravel() flattens the target column.
classifier_knn = KNeighborsClassifier().fit(X_train, y_train.ravel())

# Predict the held-out rows.
y_pred_knn = classifier_knn.predict(X_test)

# Per-class precision/recall/F1, followed by overall accuracy.
knn_report = classification_report(y_test, y_pred_knn)
print("k-Nearest Neighbors Classification Report:\n", knn_report)
knn_accuracy = accuracy_score(y_test, y_pred_knn)
print("Accuracy Score:", knn_accuracy)

k-Nearest Neighbors Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.92      0.83        13
           1       0.00      0.00      0.00         1
           2       1.00      0.67      0.80         3
           3       0.00      0.00      0.00         1
           4       0.67      0.80      0.73         5
           5       0.80      0.50      0.62         8
           7       0.72      0.94      0.82        31
           8       0.78      0.70      0.74        10
           9       0.86      0.80      0.83        15
          10       1.00      0.80      0.89         5
          11       0.87      1.00      0.93        13
          12       1.00      1.00      1.00         1
          13       0.78      0.58      0.67        12
          14       0.00      0.00      0.00         2
          15       1.00      1.00      1.00         1
          16       0.00      0.00      0.00         1
          17       0.70      0.70    

In [18]:
# Train a decision tree with a fixed seed; fit() returns the estimator, so
# construction and fitting are chained. .ravel() flattens the target column.
classifier_dt = DecisionTreeClassifier(random_state=0).fit(X_train, y_train.ravel())

# Predict the held-out rows.
y_pred_dt = classifier_dt.predict(X_test)

# Per-class precision/recall/F1, followed by overall accuracy.
dt_report = classification_report(y_test, y_pred_dt)
print("Decision Tree Classification Report:\n", dt_report)
dt_accuracy = accuracy_score(y_test, y_pred_dt)
print("Accuracy Score:", dt_accuracy)

Decision Tree Classification Report:
               precision    recall  f1-score   support

           0       0.87      1.00      0.93        13
           1       1.00      1.00      1.00         1
           2       0.75      1.00      0.86         3
           3       0.00      0.00      0.00         1
           4       0.80      0.80      0.80         5
           5       1.00      0.50      0.67         8
           7       0.81      0.94      0.87        31
           8       0.54      0.70      0.61        10
           9       1.00      0.80      0.89        15
          10       0.83      1.00      0.91         5
          11       0.87      1.00      0.93        13
          12       1.00      1.00      1.00         1
          13       0.58      0.58      0.58        12
          14       0.00      0.00      0.00         2
          15       1.00      1.00      1.00         1
          16       0.00      0.00      0.00         1
          17       0.75      0.60      0.67