In [None]:
import pandas as pd
import tensorflow as tf
from sklearn.metrics import ndcg_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense
import matplotlib.pyplot as plt

# Load the candidate table (one row per university student) and the table of
# high-school students that recommendations will be generated for.
mahasiswa_data = pd.read_csv('dataset.csv')
siswa_data = pd.read_csv('user_a.csv')

# Text that represents each candidate. Missing or non-string cells are
# normalised to strings here so that tokenization does not fail on NaN values.
mahasiswa_data['Content'] = mahasiswa_data['Program Studi'].fillna('').astype(str)

# `vocab_size` is shared by the tokenizer (`num_words`) and by the embedding
# layer's input dimension below; keep the two in sync.
vocab_size = 1000
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(mahasiswa_data['Content'])

# Convert each candidate's text to a sequence of integer token ids.
sequences = tokenizer.texts_to_sequences(mahasiswa_data['Content'])
word_index = tokenizer.word_index

# Pad every sequence to the length of the longest one so the model receives a
# fixed-size input. The same length is reused when encoding student interests.
max_sequence_length = max(len(seq) for seq in sequences)
padded_sequences = tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=max_sequence_length)

# Embedding -> Flatten -> Dense classifier with one softmax unit per candidate
# row, i.e. the model outputs a score for every row of `mahasiswa_data`.
embedding_dim = 50
model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_sequence_length),
    Flatten(),
    Dense(128, activation='relu'),
    Dense(len(mahasiswa_data), activation='softmax')
])

# Sparse categorical cross-entropy matches the integer class labels used below.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

# Each candidate row is its own class: the label is the row's index value.
# This relies on the default 0..n-1 index that `read_csv` produces when no
# index column is requested, so labels line up with the softmax units.
labels = mahasiswa_data.index.values
model.fit(padded_sequences, labels, epochs=40, batch_size=32)


def get_recommendations(student_name, student_interest, top_k=None):
    """Rank candidates for one student by the model's predicted score.

    Uses the module-level ``tokenizer``, ``max_sequence_length``, ``model``
    and ``mahasiswa_data``.

    Args:
        student_name: Value echoed back under the 'Student Name' key.
        student_interest: Interest text to encode with the fitted tokenizer.
            A missing value (``None`` or a float NaN, as produced by
            ``pd.read_csv`` for empty cells) is treated as an empty string
            instead of crashing inside the tokenizer.
        top_k: Optional maximum number of recommendations to return. The
            default ``None`` returns every candidate, ranked.

    Returns:
        A dict with 'Student Name' and 'Recommendations', the latter being a
        list of 'Nama' values ordered from highest to lowest predicted score.

    Raises:
        ValueError: If ``top_k`` is negative.
    """
    if top_k is not None and top_k < 0:
        raise ValueError("top_k must be non-negative or None")

    # Only the missing-value cases are rewritten; any other input is passed to
    # the tokenizer unchanged.
    if student_interest is None or (
        isinstance(student_interest, float) and pd.isna(student_interest)
    ):
        student_interest = ''

    # Encode the interest exactly like the training texts: same tokenizer and
    # same padded length.
    interest_sequence = tokenizer.texts_to_sequences([student_interest])
    padded_interest_sequence = tf.keras.preprocessing.sequence.pad_sequences(
        interest_sequence, maxlen=max_sequence_length)

    # One row of scores, one score per row of `mahasiswa_data`.
    prediction = model.predict(padded_interest_sequence)

    # argsort is ascending, so reverse it to get best-first row positions.
    ranked_indices = prediction[0].argsort()[::-1]
    if top_k is not None:
        ranked_indices = ranked_indices[:top_k]
    top_recommendations = mahasiswa_data.iloc[ranked_indices]['Nama']

    return {
        'Student Name': student_name,
        'Recommendations': top_recommendations.values.tolist()
    }


# Collect recommendations and NDCG for each high school student.
#
# Ground truth and predicted scores must come from different sources: if both
# are derived from the recommendation list (which contains every candidate),
# the two vectors are identical and NDCG is trivially 1.0 for every student.
#
# NOTE(review): no explicit relevance labels are used anywhere in this script,
# so a candidate is treated as relevant when its study-program text shares at
# least one token with the student's interest. Confirm that this proxy matches
# the intended evaluation, or replace it with real relevance labels.
candidate_token_sets = [
    set(seq) for seq in tokenizer.texts_to_sequences(mahasiswa_data['Content'])
]

recommendations_list = []
for index, row in siswa_data.iterrows():
    student_name = row['Nama']
    # Empty CSV cells arrive as NaN; treat them as "no stated interest".
    raw_interest = row['Minat']
    student_interest = '' if pd.isna(raw_interest) else str(raw_interest)
    recommendations = get_recommendations(student_name, student_interest)

    # Predicted scores: the model's output for this interest, one value per
    # candidate row, in the same row order as `mahasiswa_data`.
    interest_sequence = tokenizer.texts_to_sequences([student_interest])
    padded_interest = tf.keras.preprocessing.sequence.pad_sequences(
        interest_sequence, maxlen=max_sequence_length)
    # verbose=0: this extra call should not add a second progress bar per student.
    predicted_scores = model.predict(padded_interest, verbose=0)[0]

    # Ground truth: 1 when the candidate shares a token with the interest.
    # A student whose interest matches no candidate has all-zero relevance.
    interest_tokens = set(interest_sequence[0])
    true_labels = [
        0 if interest_tokens.isdisjoint(tokens) else 1
        for tokens in candidate_token_sets
    ]

    ndcg_value = ndcg_score([true_labels], [predicted_scores], k=len(mahasiswa_data))

    recommendations['NDCG'] = ndcg_value
    recommendations_list.append(recommendations)

# Convert recommendations to a DataFrame (one row per student)
recommendations_df = pd.DataFrame(recommendations_list)

# Save recommendations to a CSV file
recommendations_df.to_csv('recommendations_with_ndcg.csv', index=False)

# Get the NDCG scores from the DataFrame
ndcg_values = recommendations_df['NDCG'].values

# Plot the NDCG score per student; the x axis is the 1-based student position
plt.figure(figsize=(8, 5))
plt.plot(range(1, len(ndcg_values) + 1), ndcg_values, marker='o', linestyle='-')
plt.title('NDCG Scores for Each Student')
plt.xlabel('Student Index')
plt.ylabel('NDCG Score')
plt.grid(True)
plt.show()