In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Load the dataset
df = pd.read_csv("/content/Data4.csv")

# Separate features (X) from the target (y); StudentID and the label column
# are excluded from the feature matrix.
X = df.drop(columns=["StudentID", "Scholarship"])
y = df["Scholarship"]

# One-hot encode categorical features. Done on the full frame so the train
# and test matrices share an identical column layout.
X_encoded = pd.get_dummies(X)

# Encode target labels as integers
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split BEFORE scaling: fitting the scaler on the full dataset would let the
# test rows' mean/variance leak into the training features.
X_train_unscaled, X_test_unscaled, y_train, y_test = train_test_split(
    X_encoded, y_encoded, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then reuse its statistics for the
# held-out rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_unscaled)
X_test = scaler.transform(X_test_unscaled)

# Full matrix scaled with the train-only statistics; kept so that any later
# cell reading X_scaled still finds it.
X_scaled = scaler.transform(X_encoded)

# Train Random Forest model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Evaluate the model (mean accuracy on each split)
train_score = rf_model.score(X_train, y_train)
test_score = rf_model.score(X_test, y_test)

print("Training accuracy:", train_score)
print("Testing accuracy:", test_score)

Training accuracy: 1.0
Testing accuracy: 0.9867998572957546


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder, StandardScaler

import joblib

# Load the dataset
df = pd.read_csv("/content/Data4.csv")

# Separate features (X) from the target (y); StudentID and the label column
# are excluded from the feature matrix.
X = df.drop(columns=["StudentID", "Scholarship"])
y = df["Scholarship"]

# One-hot encode categorical features. Done on the full frame so the train
# and test matrices share an identical column layout.
X_encoded = pd.get_dummies(X)

# Encode target labels as integers
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split BEFORE scaling: fitting the scaler on the full dataset would let the
# test rows' mean/variance leak into the training features, which inflates
# the scores of the scale-sensitive models (Logistic Regression, linear SVM).
X_train_unscaled, X_test_unscaled, y_train, y_test = train_test_split(
    X_encoded, y_encoded, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then reuse its statistics for the
# held-out rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_unscaled)
X_test = scaler.transform(X_test_unscaled)

# Full matrix scaled with the train-only statistics; kept so that any later
# cell reading X_scaled still finds it.
X_scaled = scaler.transform(X_encoded)

# Train Logistic Regression model
lr_model = LogisticRegression(max_iter=1000, random_state=42)
lr_model.fit(X_train, y_train)

# Train Random Forest model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Train Support Vector Machine (SVM) model
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train, y_train)

# Evaluate models (mean accuracy on the training and held-out splits)
lr_train_score = lr_model.score(X_train, y_train)
lr_test_score = lr_model.score(X_test, y_test)

rf_train_score = rf_model.score(X_train, y_train)
rf_test_score = rf_model.score(X_test, y_test)

svm_train_score = svm_model.score(X_train, y_train)
svm_test_score = svm_model.score(X_test, y_test)

# Print accuracy scores
print("Logistic Regression - Training accuracy:", lr_train_score)
print("Logistic Regression - Testing accuracy:", lr_test_score)
print("Random Forest - Training accuracy:", rf_train_score)
print("Random Forest - Testing accuracy:", rf_test_score)
print("SVM - Training accuracy:", svm_train_score)
print("SVM - Testing accuracy:", svm_test_score)

# Pick the model with the best testing accuracy. On a tie, max() keeps the
# earliest entry, so the list order is the tie-break order.
# NOTE(review): choosing the winner by its test score makes that score an
# optimistic estimate; consider a validation split or cross-validation for
# selection and keep the test set for the final report only.
candidates = [
    (lr_model, lr_test_score),
    (rf_model, rf_test_score),
    (svm_model, svm_test_score),
]
best_model = max(candidates, key=lambda pair: pair[1])[0]

# Save the best model to a file.
# NOTE(review): only the classifier is persisted. Whoever loads this file must
# rebuild the same get_dummies column layout, apply `scaler`, and decode
# predictions with `label_encoder` themselves.
joblib.dump(best_model, 'best_model.pkl')

Logistic Regression - Training accuracy: 0.9246966452533905
Logistic Regression - Testing accuracy: 0.9225829468426686
Random Forest - Training accuracy: 1.0
Random Forest - Testing accuracy: 0.9867998572957546
SVM - Training accuracy: 0.9750178443968593
SVM - Testing accuracy: 0.9500535140920442


['best_model.pkl']