# In [None]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import KFold
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer, SnowballStemmer, LancasterStemmer
from bs4 import BeautifulSoup
import re
import unicodedata
import spacy
from textblob import Word
from gensim.models import Word2Vec

# Fetch the NLTK corpora this file relies on (stopword list, punkt tokenizer
# models, WordNet) and load the shared spaCy English pipeline.
for _nltk_resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(_nltk_resource)
nlp = spacy.load('en_core_web_sm')



import os
import pandas as pd
from zipfile import ZipFile

# Kaggle credentials, read from the environment by the `kaggle` CLI.
# NOTE: these are placeholders -- fill them in locally and never commit real keys.
os.environ['KAGGLE_USERNAME'] = "your_kaggle_username"
os.environ['KAGGLE_KEY'] = "your_kaggle_api_key"

# Download dataset from Kaggle.
# The CLI is invoked through subprocess rather than the IPython-only `!`
# shell escape so this also runs as a plain Python script; check=True stops
# here on a failed download instead of failing later on a missing zip file.
import subprocess

subprocess.run(
    ['kaggle', 'datasets', 'download', '-d', 'kaggle_username/dataset_name'],
    check=True,
)

# Unzip the dataset into ./data
with ZipFile('dataset_name.zip', 'r') as zip_ref:
    zip_ref.extractall('data')

# Load data into a pandas DataFrame
df = pd.read_csv('data/job_summary.csv')


# Define methods dictionary for each stage.
# Layout: stage name -> method family -> list of candidate callables that
# evaluate_methods_cv compares against each other.
# NOTE(review): every callable listed here is defined further down in this
# file, so executing the file top to bottom raises NameError on this
# statement; the dict has to be built after those definitions -- TODO confirm
# the intended cell order.
# NOTE(review): at this point `cosine_similarity` still refers to the
# sklearn import, not the wrapper function of the same name defined later.
methods = {
    "preprocessing": {
        "remove_noise": [remove_noise_regex, remove_noise_html, remove_numbers, remove_punctuation],
        "lowercase": [to_lowercase, to_lowercase_spacy],
        "remove_stopwords": [remove_stopwords_nltk, remove_stopwords_spacy],
        "lemmatize": [lemmatize_nltk, lemmatize_spacy, lemmatize_textblob],
        "stem": [stem_porter, stem_snowball, stem_lancaster]
    },
    "feature_extraction": {
        "tf_idf": [tf_idf_feature_extraction, tf_idf_spacy],
        "bag_of_words": [bag_of_words_feature_extraction]
    },
    "similarity_detection": {
        "cosine_similarity": [cosine_similarity],
        "jaccard_similarity": [jaccard_similarity],
        "edit_distance": [edit_distance]
    }
}

# Measure time function
def measure_time(func, text_series):
    """Return the seconds taken to apply ``func`` to ``text_series``.

    Args:
        func: Callable handed to ``text_series.apply``.
        text_series: Object exposing ``apply(func)`` (e.g. a pandas Series).

    Returns:
        Elapsed time in seconds as a float.
    """
    # perf_counter is monotonic and high resolution; time.time() follows the
    # wall clock and can jump (e.g. NTP adjustments), skewing short intervals.
    started = time.perf_counter()
    text_series.apply(func)
    return time.perf_counter() - started

# Preprocessing functions
def remove_noise_regex(text):
    """Delete digits and ASCII punctuation, then squeeze whitespace runs to one space."""
    punctuation_table = str.maketrans('', '', string.punctuation)
    without_digits = re.sub(r'\d+', '', text)
    without_punctuation = without_digits.translate(punctuation_table)
    return re.sub(r'\s+', ' ', without_punctuation)

def remove_noise_html(text):
    """Drop HTML markup, NFKD-normalize the remaining text and collapse whitespace runs."""
    plain = BeautifulSoup(text, "html.parser").get_text()
    normalized = unicodedata.normalize("NFKD", plain)
    return re.sub(r'\s+', ' ', normalized)

def remove_numbers(text):
    """Return ``text`` with every run of digits deleted."""
    digit_runs = re.compile(r'\d+')
    return digit_runs.sub('', text)

def remove_punctuation(text):
    """Return ``text`` with every ``string.punctuation`` character removed."""
    # Mapping a code point to None tells str.translate to delete it.
    drop_table = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(drop_table)

def to_lowercase(text):
    """Return a lowercased copy of ``text``."""
    lowered = text.lower()
    return lowered

def to_lowercase_spacy(text):
    """Tokenize ``text`` with the shared ``nlp`` pipeline and join the lowercased tokens with spaces."""
    return ' '.join(token.text.lower() for token in nlp(text))

def remove_stopwords_nltk(text):
    """Remove English stopwords from ``text`` using NLTK.

    Args:
        text: Raw string; it is tokenized with ``word_tokenize``.

    Returns:
        The remaining tokens (original casing preserved) joined by single spaces.
    """
    stop_words = frozenset(stopwords.words('english'))
    # NLTK's English stopword list is all lowercase, so compare the lowercased
    # token; otherwise capitalized stopwords such as "The" or "And" slip through
    # whenever this runs before the lowercasing stage.
    kept = [word for word in word_tokenize(text) if word.lower() not in stop_words]
    return ' '.join(kept)

def remove_stopwords_spacy(text):
    """Return the tokens of ``text`` that spaCy does not flag as stopwords, space-joined."""
    kept_tokens = []
    for token in nlp(text):
        if token.is_stop:
            continue
        kept_tokens.append(token.text)
    return ' '.join(kept_tokens)

def lemmatize_nltk(text):
    """Lemmatize each NLTK token of ``text`` with WordNetLemmatizer and re-join with spaces."""
    tokens = word_tokenize(text)
    wordnet = WordNetLemmatizer()
    return ' '.join(map(wordnet.lemmatize, tokens))

def lemmatize_spacy(text):
    """Return the spaCy lemma of every token in ``text``, joined by spaces."""
    return ' '.join(token.lemma_ for token in nlp(text))

def lemmatize_textblob(text):
    """Lemmatize each NLTK token of ``text`` via TextBlob's ``Word.lemmatize`` and re-join."""
    lemmas = []
    for token in word_tokenize(text):
        lemmas.append(Word(token).lemmatize())
    return ' '.join(lemmas)

def stem_porter(text):
    """Stem every NLTK token of ``text`` with the Porter stemmer; return them space-joined."""
    tokens = word_tokenize(text)
    stemmer = PorterStemmer()
    return ' '.join(stemmer.stem(token) for token in tokens)

def stem_snowball(text):
    """Stem every NLTK token of ``text`` with the English Snowball stemmer; return them space-joined."""
    tokens = word_tokenize(text)
    stemmer = SnowballStemmer(language='english')
    stems = []
    for token in tokens:
        stems.append(stemmer.stem(token))
    return ' '.join(stems)

def stem_lancaster(text):
    """Stem every NLTK token of ``text`` with the Lancaster stemmer; return them space-joined."""
    tokens = word_tokenize(text)
    stemmer = LancasterStemmer()
    return ' '.join(map(stemmer.stem, tokens))

# Feature extraction functions
def tf_idf_feature_extraction(text_data):
    """Fit a default ``TfidfVectorizer`` on ``text_data``.

    Returns:
        Tuple ``(matrix, vectorizer)``: the ``fit_transform`` result and the fitted vectorizer.
    """
    tfidf = TfidfVectorizer()
    return tfidf.fit_transform(text_data), tfidf

def tf_idf_spacy(text_data):
    """Lowercase each document with ``to_lowercase_spacy``, then fit a default ``TfidfVectorizer``.

    Returns:
        Tuple ``(matrix, vectorizer)``: the ``fit_transform`` result and the fitted vectorizer.
    """
    lowered_docs = list(map(to_lowercase_spacy, text_data))
    tfidf = TfidfVectorizer()
    matrix = tfidf.fit_transform(lowered_docs)
    return matrix, tfidf

def bag_of_words_feature_extraction(text_data):
    """Fit a default ``CountVectorizer`` on ``text_data``.

    Returns:
        Tuple ``(matrix, vectorizer)``: the ``fit_transform`` result and the fitted vectorizer.
    """
    counter = CountVectorizer()
    return counter.fit_transform(text_data), counter

# Similarity detection functions
def cosine_similarity(X):
    """Return the pairwise cosine similarity between the rows of ``X``.

    Args:
        X: Feature matrix accepted by scikit-learn's ``cosine_similarity``.

    Returns:
        The similarity matrix computed by scikit-learn.
    """
    # This function has the same name as the scikit-learn function imported at
    # the top of the file, so a plain ``cosine_similarity(X)`` call here would
    # call this wrapper again and recurse until RecursionError. Import the
    # library implementation under a distinct local alias instead.
    from sklearn.metrics.pairwise import cosine_similarity as _sklearn_cosine_similarity

    return _sklearn_cosine_similarity(X)

def jaccard_similarity(list1, list2):
    """Return the Jaccard index of two collections.

    The inputs are converted to sets, so duplicates and ordering are ignored.

    Args:
        list1: Iterable of hashable items.
        list2: Iterable of hashable items.

    Returns:
        ``len(intersection) / len(union)`` as a float in [0, 1]; ``1.0`` when
        both inputs are empty.
    """
    s1, s2 = set(list1), set(list2)
    union = s1 | s2
    if not union:
        # Both inputs are empty: the ratio would be 0/0. Two empty sets are
        # identical, so report full similarity instead of ZeroDivisionError.
        return 1.0
    return len(s1 & s2) / len(union)

def edit_distance(str1, str2):
    """Return the edit distance between ``str1`` and ``str2`` as computed by ``nltk.edit_distance``."""
    distance = nltk.edit_distance(str1, str2)
    return distance

# Evaluate methods using cross-validation
def evaluate_methods_cv(df, methods_dict, stage_name, k_folds=5):
    """Select, per method family of one stage, the candidate with the best mean F1.

    Args:
        df: DataFrame with a ``job_summary`` column.
        methods_dict: Mapping stage name -> {method family -> list of callables}.
        stage_name: Key of ``methods_dict`` whose candidates are evaluated.
        k_folds: Number of KFold splits.

    Returns:
        Dict mapping each method family name to the selected callable, or
        ``None`` when no candidate's mean F1 exceeded the initial -1.

    NOTE(review): the body reads ``best_feature_methods``,
    ``best_similarity_methods`` and ``threshold``, none of which are defined
    in the visible file -- this raises NameError unless they are supplied
    elsewhere. TODO confirm.
    """
    # Fixed seed so the shuffled folds are reproducible across runs.
    kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)
    best_methods = {}

    # Local name; shadows the module-level `methods` dict inside this function.
    methods = methods_dict[stage_name]

    for method_name, funcs in methods.items():
        # -1 is below any possible F1, so the first scored candidate is accepted.
        best_f1_score = -1
        best_func = None

        for func in funcs:
            f1_scores = []

            for train_index, test_index in kf.split(df):
                train_df = df.iloc[train_index]
                test_df = df.iloc[test_index]

                # The candidate is applied to one summary string at a time.
                cleaned_texts = train_df['job_summary'].apply(func)

                # Example of feature extraction and similarity detection; adjust to the chosen method.
                # NOTE(review): both lookups use globals not defined in the visible file.
                feature_vectors, _ = best_feature_methods["feature_extraction"](cleaned_texts)
                similarity_matrix = best_similarity_methods["similarity_detection"](feature_vectors)

                # Evaluate similarity detection performance: mark pairs (i, j), i < j,
                # whose similarity reaches `threshold`.
                # NOTE(review): similarity_matrix is built from the training fold but
                # is indexed with test-fold sizes here -- looks unintended, confirm.
                labels = np.zeros((len(test_df), len(test_df)))
                for i in range(len(test_df)):
                    for j in range(i+1, len(test_df)):
                        if similarity_matrix[i][j] >= threshold:
                            labels[i][j] = 1

                # Placeholder: replace with the real duplicate labels.
                # NOTE(review): with an all-zero ground truth every candidate presumably
                # scores F1 = 0, so the strict `>` below keeps the first candidate.
                ground_truth = np.zeros((len(test_df), len(test_df)))  # Dummy for illustration, replace with actual labels

                f1 = f1_score(ground_truth.flatten(), labels.flatten())
                f1_scores.append(f1)

            avg_f1_score = np.mean(f1_scores)

            # Strict comparison: ties keep the earlier candidate.
            if avg_f1_score > best_f1_score:
                best_f1_score = avg_f1_score
                best_func = func

        best_methods[method_name] = best_func

    return best_methods

# Select best methods for each stage using cross-validation.
# NOTE(review): evaluate_methods_cv applies each candidate to one summary
# string at a time, which matches the preprocessing functions only. The
# feature-extraction candidates take a collection of documents, and the
# similarity candidates take a matrix or two arguments, so the second and
# third calls look like they cannot succeed as written -- confirm intent.
best_preprocessing_methods = evaluate_methods_cv(df, methods, "preprocessing")
best_feature_extraction_methods = evaluate_methods_cv(df, methods, "feature_extraction")
best_similarity_detection_methods = evaluate_methods_cv(df, methods, "similarity_detection")

# Apply best methods to clean job summaries
def apply_best_methods(text, methods_dict):
    """Run ``text`` through every selected function, stage by stage.

    Args:
        text: Value passed to the first function in the chain.
        methods_dict: Mapping stage name -> {method name -> callable or None}.
            Stages and methods are applied in dict iteration order.

    Returns:
        The result of chaining all callables; ``text`` unchanged when there
        are none.
    """
    for best_methods in methods_dict.values():
        for best_func in best_methods.values():
            # evaluate_methods_cv stores None for a method family when no
            # candidate was selected; skip it instead of failing with
            # "'NoneType' object is not callable".
            if best_func is None:
                continue
            text = best_func(text)
    return text

# Only the preprocessing stage maps one string to one string. The
# feature-extraction functions expect a whole corpus and return a
# (matrix, vectorizer) tuple, and the similarity functions do not take a single
# string, so chaining them onto an individual summary cannot work.
# The mapping is built once instead of once per row.
_cleaning_methods = {"preprocessing": best_preprocessing_methods}
df['cleaned_summary'] = df['job_summary'].apply(
    lambda x: apply_best_methods(x, _cleaning_methods)
)

# Report the selected callables per stage, then preview raw vs. cleaned text.
_selection_report = (
    ("Preprocessing methods:", best_preprocessing_methods),
    ("Feature extraction methods:", best_feature_extraction_methods),
    ("Similarity detection methods:", best_similarity_detection_methods),
)
print("Best methods selected using cross-validation for each stage:")
for _label, _selected in _selection_report:
    print(_label, _selected)
print()
print(df[['job_summary', 'cleaned_summary']].head())

# Example of Word2Vec embedding
def word2vec_embedding(df):
    """Train a Word2Vec model on ``df['cleaned_summary']`` and print example lookups.

    Prints the embedding of 'engineer' and its similarity to 'developer' when
    those tokens are present in the trained vocabulary; otherwise prints a
    short notice instead of raising KeyError.

    Args:
        df: DataFrame with a string column ``cleaned_summary``.
    """
    tokenized_sentences = df['cleaned_summary'].apply(word_tokenize)
    model = Word2Vec(sentences=tokenized_sentences, vector_size=100, window=5, min_count=1, workers=4)

    # The cleaned text may have been stemmed or lemmatized, so the raw example
    # words are not guaranteed to be in the vocabulary, and indexing a missing
    # key raises KeyError.
    if 'engineer' not in model.wv:
        print("'engineer' is not in the Word2Vec vocabulary; skipping examples.")
        return

    word_embedding = model.wv['engineer']  # Example of obtaining word embedding
    print("Embedding vector for 'engineer':", word_embedding)

    if 'developer' not in model.wv:
        print("'developer' is not in the Word2Vec vocabulary; skipping similarity.")
        return

    similarity = model.wv.similarity('engineer', 'developer')  # Example of calculating similarity
    print("Similarity between 'engineer' and 'developer':", similarity)

# Execute Word2Vec embedding.
# The function reads df['cleaned_summary'], so that column must already exist.
word2vec_embedding(df)
