# Load Libraries and Dataset

In [None]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import nltk
from nltk.corpus import stopwords
import string

In [None]:
# Fetch the NLTK 'stopwords' corpus so stopwords.words() can be used later on
nltk.download(info_or_id='stopwords')

In [None]:
# Load the dataset. 'phd_training.xlsx' is an Excel workbook, which
# pd.read_csv cannot parse, so it is read with pd.read_excel instead
# (pandas needs an Excel engine such as openpyxl installed for this).
data = pd.read_excel('phd_training.xlsx')

In [None]:
# Preview the first five rows to confirm the dataset loaded as expected
data.head(n=5)

# Text Preprocessing

In [None]:
# Define text preprocessing function
from functools import lru_cache

# Translation table that deletes every ASCII punctuation character in one pass
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


@lru_cache(maxsize=None)
def _english_stop_words():
    """Return the NLTK English stop-word list as a frozenset, built only once."""
    return frozenset(stopwords.words('english'))


def preprocess_text(text):
    """Normalize one description before TF-IDF vectorization.

    The text is lowercased, ASCII punctuation is removed, and the remaining
    whitespace-separated tokens are filtered against the NLTK English
    stop-word list.

    Args:
        text: The raw description. Anything that is not a ``str`` (for
            example the NaN that pandas uses for an empty cell, or None)
            is treated as missing.

    Returns:
        The cleaned tokens joined by single spaces, or ``''`` when the
        input is missing or nothing survives the filtering.
    """
    # Missing cells reach this function as float NaN / None; calling .lower()
    # on them would raise AttributeError and abort the whole .apply().
    if not isinstance(text, str):
        return ''
    # Cached lookup: the stop-word list is loaded once, not once per row.
    stop_words = _english_stop_words()
    # Convert to lowercase, then remove punctuation
    text = text.lower().translate(_PUNCTUATION_TABLE)
    # Tokenize on whitespace and remove stopwords
    return ' '.join(word for word in text.split() if word not in stop_words)

In [None]:
# Clean every description and store the result in a new 'cleaned_text' column
data['cleaned_text'] = data['Descriptions'].map(preprocess_text)

In [None]:
# Compare the raw and cleaned text side by side for the first five rows
data.head()[['Descriptions', 'cleaned_text']]

# TF-IDF Vectorization

In [None]:
# Create the TF-IDF vectorizer; the vocabulary is capped at the 5000 terms
# with the highest term frequency across the corpus
tfidf = TfidfVectorizer(
    max_features=5000,
)

In [None]:
# Fit and transform the cleaned text into TF-IDF features
X = tfidf.fit_transform(data['cleaned_text']).toarray()

# Labels (assuming the column 'phd' contains 0 for Standard and 1 for PhD/Research)
y = data['phd']

# Train-Test Split

In [None]:
# Split the dataset into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check the shape of the train and test sets
X_train.shape, X_test.shape

# Train the SVM Model

In [None]:
# Build the classifier: a support vector machine with a linear kernel
svm_model = SVC(
    kernel='linear',
)

In [None]:
# Fit the SVM on the training features and their labels
svm_model.fit(X=X_train, y=y_train)

# Model Evaluation

In [None]:
# Run the trained model over the held-out test features
y_pred = svm_model.predict(X=X_test)

In [None]:
# Share of test rows whose predicted label matches the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy:.4f}')

In [None]:
# Per-class precision, recall and F1-score on the test set
print(
    "Classification Report:",
    classification_report(y_true=y_test, y_pred=y_pred),
    sep="\n",
)

# Confusion matrix of true labels against predicted labels
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("Confusion Matrix:", conf_matrix, sep="\n")

# Save the Model

In [None]:
import joblib

# Save the trained model
joblib.dump(svm_model, 'svm_phd_offer_classifier.pkl')

# Save the fitted TF-IDF vectorizer as well: the classifier was trained on the
# feature vectors this vectorizer produces, so new text cannot be turned into
# valid model input without it.
joblib.dump(tfidf, 'tfidf_vectorizer.pkl')