In [89]:
import pandas as pd

In [90]:
df_en = pd.read_excel("Dataset_EFREI_en.xlsx")

# Pre-processing

In [91]:
import nltk
import re
import string
nltk.download('words')

words = set(nltk.corpus.words.words())


[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [92]:
df_en['question_lower'] = df_en['Question'].str.lower()
df_en['responce_lower'] = df_en['Answer'].str.lower()


# Remove URLs

In [93]:
def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)


# Remove mentions and hashtags

In [94]:
def remove_mentions_hashtags(text):
  text = re.sub('@[A-Za-z0-9_]+',"", text)
  text = re.sub("[0-9][A-Za-z0-9_]+","", text)
  text = re.sub('lax',"", text)
  text = re.sub('flight',"", text)
  return re.sub("#[A-Za-z0-9_]+","", text)


df_en['question_cleaned'] = df_en['question_lower'].apply(lambda text: remove_mentions_hashtags(text))
df_en['responce_cleaned'] = df_en['responce_lower'].apply(lambda text: remove_mentions_hashtags(text))


# Remove Punctuation

In [95]:
PUNCT_TO_REMOVE = string.punctuation

def remove_punctuation(text):
    return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))

df_en["question_punct"] = df_en["question_cleaned"].apply(lambda text: remove_punctuation(text))
df_en["responce_punct"] = df_en["responce_cleaned"].apply(lambda text: remove_punctuation(text))


# Remove Non English Words


In [96]:
def remove_non_english(text):
  return " ".join(w for w in nltk.wordpunct_tokenize(text) if w.lower() in words or not w.isalpha())

df_en['question_punct'] = df_en['question_punct'].apply(lambda text: remove_non_english(text))
df_en['responce_punct'] = df_en['responce_punct'].apply(lambda text: remove_non_english(text))




# Remove Emojis

In [97]:
def remove_emojis(string):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"
                           u"\U0001F300-\U0001F5FF"
                           u"\U0001F680-\U0001F6FF"
                           u"\U0001F1E0-\U0001F1FF"
                           u"\U00002702-\U000027B0"
                           u"\U0001F383"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)


df_en["question_cleaned"] = df_en["question_punct"].apply(lambda text: remove_emojis(text))
df_en["responce_cleaned"] = df_en["responce_punct"].apply(lambda text: remove_emojis(text))


# Word Lemmatizer

In [98]:
import nltk
nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
def lemmatize_words(text):
    return " ".join([lemmatizer.lemmatize(word,'v') for word in text.split()])


df_en["question_lemmatized"] = df_en["question_cleaned"].apply(lambda text: lemmatize_words(text))
df_en["responce_lemmatized"] = df_en["responce_cleaned"].apply(lambda text: lemmatize_words(text))

df_en['question_lemmatized'][1]

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


'degree be in and be open to international'

# Word Tokenize

In [None]:
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

def tokenizing(text):
  return word_tokenize(text)

df_en['word_question'] = df_en['question_lemmatized'].apply(lambda text: tokenizing(text))
df_en['word_responce'] = df_en['responce_lemmatized'].apply(lambda text: tokenizing(text))


df_en['word_question'].head

# Remove stop-words


In [82]:
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [83]:
def remove_stop_words(words):
  return [word for word in words if not word in stopwords.words('english')]

df_en['words_question_cleaned'] = df_en['word_question'].apply(lambda text: remove_stop_words(text))
df_en['words_responce_cleaned'] = df_en['word_responce'].apply(lambda text: remove_stop_words(text))

df_en['words_question_cleaned'][1]

['degree', 'open', 'international']

In [84]:
df_en.to_excel('data_en.xlsx')

#Doc2Vec model

In [100]:
import pandas as pd
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import re

# Load your dataset
df_en = pd.read_excel("data_en.xlsx")

# Pre-processing
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # Remove mentions and hashtags
    text = re.sub('@[A-Za-z0-9_]+', '', text)
    text = re.sub("#[A-Za-z0-9_]+", '', text)
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Stemming
    stemmer = SnowballStemmer("english")
    tokens = [stemmer.stem(word) for word in tokens]
    return tokens

# Apply preprocessing to questions and answers
df_en['question_tokens'] = df_en['Question'].apply(preprocess_text)
df_en['answer_tokens'] = df_en['Answer'].apply(preprocess_text)

# Tag documents for Doc2Vec model
tagged_data = [TaggedDocument(words=question, tags=[index])
               for index, question in enumerate(df_en['question_tokens'])]

# Train Doc2Vec model
vector_size = 100  # Adjust the vector size based on your dataset and requirements
max_epochs = 100   # Increase epochs for better training
model = Doc2Vec(vector_size=vector_size, window=2, min_count=1, workers=4, epochs=max_epochs)
model.build_vocab(tagged_data)
model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)

# Save the trained model
model.save("doc2vec_model")

# Load the trained model
model = Doc2Vec.load("doc2vec_model")



#Testing with Questions from Dataset

In [101]:
# Print similar questions for every +10 question
c = 0
for i in range(0, 250, 60):  # Loop every +10
    c += 1
    try:
        q = df_en['question_tokens'][i]
        new_question_embedding = model.infer_vector(q)

        # Use the embedding to retrieve similar questions from the training data
        similar_questions = model.docvecs.most_similar([new_question_embedding],topn=1)

        print(f"Similar questions for question {c}:")
        for index, similarity in similar_questions:
            print("Similarity:", str(round(similarity*100,1)) + "%")
            print("Question:", df_en['Question'][index])
            print("Answer:", df_en['Answer'][index])
            print()

    except Exception as e:
        print(f"Error processing question {i}: {e}")


Similar questions for question 1:
Similarity: 97.8%
Question: degree programs are offered in English and are open to international candidates ?
Answer: you will find all information regarding our undergraduate programs at https://eng.efrei.fr/graduate-programs/.
followed by a 2 year master degree program in the areas listed at https://eng.efrei.fr/graduate-programs/.

Similar questions for question 2:
Similarity: 96.3%
Question: What are the requirements and list of required admission documents for an exchange research internship ?
Answer: please find the details for your application for an exchange program or research internship at https://eng.efrei.fr/international-admission/application-for-an-exchange-program/.

Similar questions for question 3:
Similarity: 98.5%
Question: What are the deadlines to apply an exchange program/ research internship ?
Answer: fall semester: may 15
spring semester:  october 15 
nomination of exchange candidates by their home institution: at least 15 days 

  similar_questions = model.docvecs.most_similar([new_question_embedding],topn=1)


#Testing with Questions outside Dataset

In [107]:
question1 = "what are deadline ?"
q = preprocess_text(question1)
new_question_embedding = model.infer_vector(q)

similar_questions = model.dv.most_similar([new_question_embedding],topn=1)

print(f"Similar questions for question : " + question1)
for index, similarity in similar_questions:
    print("Similarity:", str(round(similarity*100,1)) + "%")
    print("Question:", df_en['Question'][index])
    print("Answer:", df_en['Answer'][index])
    print()

Similar questions for question : what are deadline ?
Similarity: 98.8%
Question: What be the deadline to apply for AN exchange program/ research internship ?
Answer: fall semester: may 15
spring semester:  october 15 
nomination of exchange candidates by their home institution: at least 15 days prior to the deadline.



#Language Detection

In [None]:
import langdetect
from langdetect import detect

text1 = "This is an example of a text in English."
text2 = "Ceci est dans une autre langue."
# Detect the language of the text
language1 = detect(text1)
language2 = detect(text2)

print("The language of the text is:", language1)
print("The language of the text is:", language2)

