## Part-02: Using CatBoost Model

In [7]:
# Importing Libraries
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

In [8]:
import catboost
# Print the CatBoost version
print("\n CatBoost version:", catboost.__version__)


 CatBoost version: 1.2.2


In [9]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from catboost import CatBoostClassifier

In [10]:
# Importing the Dataset
student_data = pd.read_csv('../Data/student_data_with_placed_colleges.csv') 

In [11]:
# Data Preparation

# Prepare features and target variables
X = student_data[['Age', 'Socioeconomic_Background', 'SSC_Marks_Percentage', 'HSC_Marks_Percentage', 'MHTCET_Scores_Percentile', 'JEE_Mains_Scores_Percentile']]
y_college = student_data['College']
y_branch = student_data['Branch']

# Encode categorical variables (Socioeconomic_Background)
le = LabelEncoder()
X['Socioeconomic_Background'] = le.fit_transform(X['Socioeconomic_Background'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Socioeconomic_Background'] = le.fit_transform(X['Socioeconomic_Background'])


In [12]:
# Train-Test Split and Scaling

# Split the data into training and testing sets
X_train, X_test, y_college_train, y_college_test, y_branch_train, y_branch_test = train_test_split(
    X, y_college, y_branch, test_size=0.2, random_state=42)

# Scale numerical features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [13]:
# Train ExtraTrees Models

college_model = ExtraTreesClassifier(random_state=42)
branch_model = ExtraTreesClassifier(random_state=42)

college_model.fit(X_train, y_college_train)
branch_model.fit(X_train, y_branch_train)

In [14]:
# Evaluate ExtraTrees Models

# Predict College and Branch for the test data
y_college_pred = college_model.predict(X_test)
y_branch_pred = branch_model.predict(X_test)

# Evaluate the models
college_accuracy = accuracy_score(y_college_test, y_college_pred)
branch_accuracy = accuracy_score(y_branch_test, y_branch_pred)

print("College Prediction Accuracy:", college_accuracy)
print("Branch Prediction Accuracy:", branch_accuracy)

# Classification Report for more detailed metrics
print("Classification Report for College:\n", classification_report(y_college_test, y_college_pred))
print("Classification Report for Branch:\n", classification_report(y_branch_test, y_branch_pred))


College Prediction Accuracy: 0.903
Branch Prediction Accuracy: 0.841
Classification Report for College:
               precision    recall  f1-score   support

        LTCE       0.36      0.10      0.16        49
Not eligible       0.00      0.00      0.00        15
         RCE       0.95      0.99      0.97       708
        TCET       0.76      0.78      0.77       103
        VJTI       0.75      0.90      0.82        20
         XIE       0.87      0.97      0.92       105

    accuracy                           0.90      1000
   macro avg       0.61      0.62      0.61      1000
weighted avg       0.87      0.90      0.88      1000

Classification Report for Branch:
                                                            precision    recall  f1-score   support

                 Artificial Intelligence and Data Science       0.33      0.27      0.30        11
                                     Chemical Engineering       1.00      0.12      0.22         8
                   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [15]:
# Cell 7: Train CatBoost Models

# Prepare features and target variables
X = student_data[['Age', 'Socioeconomic_Background', 'SSC_Marks_Percentage', 'HSC_Marks_Percentage', 'MHTCET_Scores_Percentile', 'JEE_Mains_Scores_Percentile']]
y_college = student_data['College']
y_branch = student_data['Branch']

# Split the data into training and testing sets
X_train, X_test, y_college_train, y_college_test, y_branch_train, y_branch_test = train_test_split(
    X, y_college, y_branch, test_size=0.2, random_state=42)

# Define categorical features for CatBoost
cat_features = ['Socioeconomic_Background']

college_model = CatBoostClassifier(iterations=1000, depth=6, learning_rate=0.1, loss_function='MultiClass', cat_features=cat_features, random_seed=42)
branch_model = CatBoostClassifier(iterations=1000, depth=6, learning_rate=0.1, loss_function='MultiClass', cat_features=cat_features, random_seed=42)

college_model.fit(X_train, y_college_train)
branch_model.fit(X_train, y_branch_train)


0:	learn: 1.4300569	total: 232ms	remaining: 3m 52s
1:	learn: 1.2135964	total: 282ms	remaining: 2m 20s
2:	learn: 1.0644164	total: 333ms	remaining: 1m 50s
3:	learn: 0.9456566	total: 391ms	remaining: 1m 37s
4:	learn: 0.8531083	total: 437ms	remaining: 1m 27s
5:	learn: 0.7806478	total: 475ms	remaining: 1m 18s
6:	learn: 0.7152886	total: 518ms	remaining: 1m 13s
7:	learn: 0.6577055	total: 557ms	remaining: 1m 9s
8:	learn: 0.6102666	total: 599ms	remaining: 1m 5s
9:	learn: 0.5640766	total: 635ms	remaining: 1m 2s
10:	learn: 0.5254424	total: 677ms	remaining: 1m
11:	learn: 0.4940882	total: 714ms	remaining: 58.8s
12:	learn: 0.4615440	total: 752ms	remaining: 57.1s
13:	learn: 0.4342386	total: 787ms	remaining: 55.4s
14:	learn: 0.4098884	total: 828ms	remaining: 54.3s
15:	learn: 0.3898318	total: 863ms	remaining: 53.1s
16:	learn: 0.3722677	total: 902ms	remaining: 52.2s
17:	learn: 0.3558008	total: 938ms	remaining: 51.2s
18:	learn: 0.3435164	total: 978ms	remaining: 50.5s
19:	learn: 0.3306057	total: 1.01s	rem

<catboost.core.CatBoostClassifier at 0x1ddefc446a0>

In [16]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Evaluate CatBoost Models

# Predict College and Branch for the test data
y_college_pred = college_model.predict(X_test)
y_branch_pred = branch_model.predict(X_test)

# Evaluate the models
college_accuracy = accuracy_score(y_college_test, y_college_pred)
branch_accuracy = accuracy_score(y_branch_test, y_branch_pred)

print("College Prediction Accuracy:", college_accuracy)
print("Branch Prediction Accuracy:", branch_accuracy)

# Classification Report for more detailed metrics
print("Classification Report for College:\n", classification_report(y_college_test, y_college_pred))
print("Classification Report for Branch:\n", classification_report(y_branch_test, y_branch_pred))

College Prediction Accuracy: 0.924
Branch Prediction Accuracy: 0.874
Classification Report for College:
               precision    recall  f1-score   support

        LTCE       0.59      0.47      0.52        49
Not eligible       0.20      0.13      0.16        15
         RCE       0.97      0.98      0.97       708
        TCET       0.80      0.83      0.82       103
        VJTI       1.00      1.00      1.00        20
         XIE       0.92      0.96      0.94       105

    accuracy                           0.92      1000
   macro avg       0.75      0.73      0.74      1000
weighted avg       0.92      0.92      0.92      1000

Classification Report for Branch:
                                                            precision    recall  f1-score   support

                 Artificial Intelligence and Data Science       0.45      0.45      0.45        11
                                     Chemical Engineering       0.50      0.25      0.33         8
                   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [17]:
# Assuming you have a new data point as a DataFrame
new_data_point = pd.DataFrame({
    'Age': [18],
    'Socioeconomic_Background': [1],
    'SSC_Marks_Percentage': [2],
    'HSC_Marks_Percentage': [10],
    'MHTCET_Scores_Percentile': [91.25],
    'JEE_Mains_Scores_Percentile': [99.31]
})

# Predict using the trained CatBoost model for College
pred_college = college_model.predict(new_data_point)

# If needed, you can convert the prediction to a human-readable label
# Note: CatBoost directly provides class labels as strings
predicted_college = pred_college[0]
print("Predicted College:", predicted_college)

# Predict using the trained CatBoost model for Branch
pred_branch = branch_model.predict(new_data_point)

# If needed, you can convert the prediction to a human-readable label
# Note: CatBoost directly provides class labels as strings
predicted_branch = pred_branch[0]
print("Predicted Branch:", predicted_branch)

Predicted College: ['XIE']
Predicted Branch: ['Electronics and Telecommunication Engg']


## Part-02: Using Neural Networks Model

In [18]:
# Part 1: Importing Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf

ModuleNotFoundError: No module named 'tensorflow'

In [None]:
# Print the tensorflow version
print("TensorFlow version:", tf.__version__)

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [None]:
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [None]:
# Part 2: Importing the Dataset
student_data = pd.read_csv('/content/student_data_with_placed_colleges.csv')

In [None]:
student_data.head()

In [None]:
# Part 3: Data Preparation

# Prepare features and target variables
X = pd.get_dummies(student_data[['Age', 'Socioeconomic_Background', 'SSC_Marks_Percentage', 'HSC_Marks_Percentage', 'MHTCET_Scores_Percentile', 'JEE_Mains_Scores_Percentile']])
y_college = pd.get_dummies(student_data['College'])
y_branch = pd.get_dummies(student_data['Branch'])

# Split the data into training and testing sets
X_train, X_test, y_college_train, y_college_test, y_branch_train, y_branch_test = train_test_split(
    X, y_college, y_branch, test_size=0.2, random_state=42)

# Scale numerical features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Initializing the ANN [ann_cm : College Model]
ann_cm = tf.keras.models.Sequential()

# Adding the input layer and the first hidden layer
ann_cm.add(tf.keras.layers.Dense(units=6, activation='relu'))

# Adding the second hidden layer
ann_cm.add(tf.keras.layers.Dense(units=6, activation='relu'))

# Adding the output layer for multiple classes
# Number of units should be equal to the number of classes
# Use softmax activation for multiple classes
ann_cm.add(tf.keras.layers.Dense(units=y_college_train.shape[1], activation='softmax'))

# Compiling the ANN
ann_cm.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Training the ANN on the Training set
ann_cm.fit(X_train, y_college_train, batch_size=32, epochs=100)

In [None]:
# Initializing the ANN [ann_bm : Branch Model]
ann_bm = tf.keras.models.Sequential()

# Adding the input layer and the first hidden layer
ann_bm.add(tf.keras.layers.Dense(units=6, activation='relu', input_dim=X_train.shape[1]))

# Adding the second hidden layer
ann_bm.add(tf.keras.layers.Dense(units=6, activation='relu'))

# Adding the output layer for multi-class classification
# Change the number of units to match the number of branches (12 in this case)
ann_bm.add(tf.keras.layers.Dense(units=13, activation='softmax'))

# Compiling the ANN
ann_bm.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Training the ANN on the Training set
ann_bm.fit(X_train, y_branch_train, batch_size=32, epochs=100)

In [None]:
# Evaluate ANN Models

# Evaluate the College model
college_loss, college_accuracy = ann_cm.evaluate(X_test, y_college_test)
print("College Prediction Accuracy:", college_accuracy)

In [None]:
# Assuming you have a new data point as a DataFrame
new_data_point = pd.DataFrame({
    'Age': [18],
    'Socioeconomic_Background': ['Open'],  # Provide the appropriate choice
    'SSC_Marks_Percentage': [2],
    'HSC_Marks_Percentage': [10],
    'MHTCET_Scores_Percentile': [91.25],
    'JEE_Mains_Scores_Percentile': [99.31]
})

# Encode categorical variable using one-hot encoding
new_data_point_encoded = pd.get_dummies(new_data_point, columns=['Socioeconomic_Background'], drop_first=True)

# Ensure that the columns in the new data point match the columns used during training
missing_columns = set(X.columns) - set(new_data_point_encoded.columns)
for column in missing_columns:
    new_data_point_encoded[column] = 0  # Add missing columns with default value 0

# Reorder columns to match the order during training
new_data_point_encoded = new_data_point_encoded[X.columns]

# Scale the new data point
new_data_point_scaled = scaler.transform(new_data_point_encoded)

# Predict using the trained College ANN model
pred_college_prob = ann_cm.predict(new_data_point_scaled)
predicted_college = np.argmax(pred_college_prob, axis=1)[0]
print("Predicted College:", predicted_college)


In [None]:
# Evaluate the Branch model
branch_loss, branch_accuracy = ann_bm.evaluate(X_test, y_branch_test)
print("Branch Prediction Accuracy:", branch_accuracy)

In [None]:
# Assuming you have a new data point as a DataFrame
new_data_point = pd.DataFrame({
    'Age': [18],
    'Socioeconomic_Background': ['Open'],  # Provide the appropriate choice
    'SSC_Marks_Percentage': [2],
    'HSC_Marks_Percentage': [10],
    'MHTCET_Scores_Percentile': [91.25],
    'JEE_Mains_Scores_Percentile': [99.31]
})

# Encode categorical variable using one-hot encoding
new_data_point_encoded = pd.get_dummies(new_data_point, columns=['Socioeconomic_Background'], drop_first=True)

# Ensure that the columns in the new data point match the columns used during training
missing_columns = set(X.columns) - set(new_data_point_encoded.columns)
for column in missing_columns:
    new_data_point_encoded[column] = 0  # Add missing columns with default value 0

# Reorder columns to match the order during training
new_data_point_encoded = new_data_point_encoded[X.columns]

# Scale the new data point
new_data_point_scaled = scaler.transform(new_data_point_encoded)

# Predict using the trained College ANN model
pred_branch_prob = ann_bm.predict(new_data_point_scaled)
predicted_branch = np.argmax(pred_branch_prob, axis=1)[0]
print("Predicted College:", predicted_branch)

## 3 Using Multi-linear Regression

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder

# Assuming your dataset is named 'student_data'
# Convert categorical variables to numeric using Label Encoding
label_encoder = LabelEncoder()

# Encode 'Gender', 'Socioeconomic_Background', 'Student_Location', and 'Top_Choices'
categorical_columns = ['Gender', 'Socioeconomic_Background', 'Student_Location', 'Top_Choices']
for col in categorical_columns:
    student_data[col] = label_encoder.fit_transform(student_data[col])

# Split the data into features (X) and target variables (y)
X = student_data.drop(['College', 'Branch'], axis=1)  # Features (all attributes except 'College' and 'Branch')
y_college = student_data['College']  # Target variable for College
y_branch = student_data['Branch']    # Target variable for Branch

# Split the data into training and testing sets
X_train, X_test, y_college_train, y_college_test, y_branch_train, y_branch_test = train_test_split(X, y_college, y_branch, test_size=0.2, random_state=42)

# Train linear regression models for College and Branch
college_model = LinearRegression()
college_model.fit(X_train, y_college_train)

branch_model = LinearRegression()
branch_model.fit(X_train, y_branch_train)

# Predictions on the test set
y_college_pred = college_model.predict(X_test)
y_branch_pred = branch_model.predict(X_test)

In [None]:
# Importing the Dataset
student_data = pd.read_csv('/content/student_data_with_placed_colleges.csv')

In [None]:
for column in student_data.columns:
    unique_values = student_data[column].unique()
    print(f"Unique values for {column}: {unique_values}")

**Linear Regression**

## 4

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Prepare features and target variables
X = student_data[['Age', 'Socioeconomic_Background', 'SSC_Marks_Percentage', 'HSC_Marks_Percentage', 'MHTCET_Scores_Percentile', 'JEE_Mains_Scores_Percentile']]
y_college = student_data['College']
y_branch = student_data['Branch']

# Encode categorical variables (socioeconomic_Background)
le = LabelEncoder()
X['Socioeconomic_Background'] = le.fit_transform(X['Socioeconomic_Background'])

# Split the data into training and testing sets
X_train, X_test, y_college_train, y_college_test, y_branch_train, y_branch_test = train_test_split(
    X, y_college, y_branch, test_size=0.2, random_state=42)

# Scale numerical features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train ExtraTrees models for College and Branch
college_model = ExtraTreesClassifier(random_state=42)
branch_model = ExtraTreesClassifier(random_state=42)

college_model.fit(X_train, y_college_train)
branch_model.fit(X_train, y_branch_train)

# Predict College and Branch for the test data
y_college_pred = college_model.predict(X_test)
y_branch_pred = branch_model.predict(X_test)

# Evaluate the models
college_accuracy = accuracy_score(y_college_test, y_college_pred)
branch_accuracy = accuracy_score(y_branch_test, y_branch_pred)

print("College Prediction Accuracy:", college_accuracy)
print("Branch Prediction Accuracy:", branch_accuracy)

# You can also print a classification report for more detailed metrics
print("Classification Report for College:\n", classification_report(y_college_test, y_college_pred))
print("Classification Report for Branch:\n", classification_report(y_branch_test, y_branch_pred))


In [None]:
pip install catboost

In [None]:
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Prepare features and target variables
X = student_data[['Age', 'Socioeconomic_Background', 'SSC_Marks_Percentage', 'HSC_Marks_Percentage', 'MHTCET_Scores_Percentile', 'JEE_Mains_Scores_Percentile']]
y_college = student_data['College']
y_branch = student_data['Branch']

# Split the data into training and testing sets
X_train, X_test, y_college_train, y_college_test, y_branch_train, y_branch_test = train_test_split(
    X, y_college, y_branch, test_size=0.2, random_state=42)

# Define CatBoost models for College and Branch
college_model = CatBoostClassifier(iterations=1000, depth=6, learning_rate=0.1, loss_function='MultiClass', cat_features=['Socioeconomic_Background'], random_seed=42)
branch_model = CatBoostClassifier(iterations=1000, depth=6, learning_rate=0.1, loss_function='MultiClass', cat_features=['Socioeconomic_Background'], random_seed=42)

# Train CatBoost models for College and Branch
college_model.fit(X_train, y_college_train)
branch_model.fit(X_train, y_branch_train)

# Predict College and Branch for the test data
y_college_pred = college_model.predict(X_test)
y_branch_pred = branch_model.predict(X_test)

# Evaluate the models
college_accuracy = accuracy_score(y_college_test, y_college_pred)
branch_accuracy = accuracy_score(y_branch_test, y_branch_pred)

print("College Prediction Accuracy:", college_accuracy)
print("Branch Prediction Accuracy:", branch_accuracy)

# You can also print a classification report for more detailed metrics
print("Classification Report for College:\n", classification_report(y_college_test, y_college_pred))
print("Classification Report for Branch:\n", classification_report(y_branch_test, y_branch_pred))


# Creating requirements file
