In [None]:
import pandas as pd
from keybert import KeyBERT

pd.set_option('display.max_colwidth', None)

# python -m spacy download en_core_web_sm
# pip install keybert 

In [None]:
def remove_newlines_at_start_end(df):
    """
    Return a copy of ``df`` with leading/trailing newlines stripped from text cells.

    Only columns of dtype ``object`` are processed. Within those columns only
    ``str`` values are stripped; any other value (numbers, ``None``/NaN, ...)
    is passed through untouched instead of being turned into NaN, which is
    what the ``.str`` accessor does with non-string cells.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index as ``df``.
    """
    # Work on a copy so the caller's (possibly sliced) frame is never overwritten.
    cleaned = df.copy()
    for col in cleaned.select_dtypes(include=['object']).columns:
        cleaned[col] = cleaned[col].map(
            lambda value: value.strip('\n') if isinstance(value, str) else value
        )
    return cleaned

# Load the module catalogue into `df`.
# NOTE(review): this load sits between the helper definitions, and the same
# file is read again at the start of the next cell — it looks redundant;
# confirm nothing relies on it before removing.
df = pd.read_csv('MS_IS_all_modules.csv')

def xlsx_to_csv(xlsx_file_path, csv_file_path):
    """
    Convert an Excel workbook to a CSV file, printing a short preview.

    The workbook is read with ``pd.read_excel`` using its default arguments.
    The first rows and the ``(rows, columns)`` shape are printed as a sanity
    check before the frame is written with ``DataFrame.to_csv`` (default
    arguments, so the index is written as the first column).

    Parameters
    ----------
    xlsx_file_path : path-like
        Location of the Excel file to read.
    csv_file_path : path-like
        Location of the CSV file to write.
    """
    sheet = pd.read_excel(xlsx_file_path)
    print(sheet.head())
    # A bare `sheet.shape` expression is discarded inside a function, so print it.
    print(sheet.shape)
    sheet.to_csv(csv_file_path)

def _extract_keywords(df, source_col, target_col, kw_model=None, top_n=4):
    """
    Add ``target_col`` with comma-separated KeyBERT keyphrases of ``source_col``.

    Shared implementation behind the two public ``extract_keywords_from_*``
    functions. Newlines are stripped via ``remove_newlines_at_start_end``
    first. Cells that are not strings (e.g. NaN for a missing description) or
    that contain only whitespace yield an empty string instead of being passed
    to KeyBERT.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``source_col``.
    source_col : str
        Name of the text column to analyse.
    target_col : str
        Name of the column that receives the joined keyphrases.
    kw_model : KeyBERT, optional
        Model to reuse; a fresh ``KeyBERT()`` is created when omitted.
    top_n : int, default 4
        Number of keyphrases requested per cell.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``remove_newlines_at_start_end`` with
        ``target_col`` added.
    """
    df = remove_newlines_at_start_end(df)
    if kw_model is None:
        kw_model = KeyBERT()

    def keyphrases(text):
        # Nothing to analyse for missing or blank cells.
        if not isinstance(text, str) or not text.strip():
            return ''
        found = kw_model.extract_keywords(
            text,
            keyphrase_ngram_range=(1, 2),
            stop_words='english',
            highlight=False,
            top_n=top_n,
        )
        # Each result's first element is the phrase; only the phrases are kept.
        return ', '.join(kw[0] for kw in found)

    df[target_col] = df[source_col].apply(keyphrases)
    return df


def extract_keywords_from_content(df, kw_model=None):
    """
    Add a ``'keywords'`` column derived from the ``'Contents'`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'Contents'`` text column.
    kw_model : KeyBERT, optional
        Model to reuse across calls; a new one is created when omitted.

    Returns
    -------
    pandas.DataFrame
        Frame with newlines stripped and the ``'keywords'`` column added.
    """
    return _extract_keywords(df, 'Contents', 'keywords', kw_model)


def extract_keywords_from_intended_learning_outcomes(df, kw_model=None):
    """
    Add a ``'keywords_learning'`` column derived from ``'Intended learning outcomes'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``'Intended learning outcomes'`` text column.
    kw_model : KeyBERT, optional
        Model to reuse across calls; a new one is created when omitted.

    Returns
    -------
    pandas.DataFrame
        Frame with newlines stripped and the ``'keywords_learning'`` column added.
    """
    return _extract_keywords(
        df, 'Intended learning outcomes', 'keywords_learning', kw_model
    )


In [None]:
# Load the module catalogue and keep the first 30 rows as a working sample.
# .copy() detaches the sample from the full frame so the helpers below can
# write columns to it without pandas' chained-assignment warning.
df = pd.read_csv('MS_IS_all_modules.csv')
df = df.head(30).copy()

df = extract_keywords_from_content(df)
# A bare expression in the middle of a cell is not rendered, so the table is
# shown explicitly with IPython's built-in display().
display(df[['Module title', 'keywords', 'Contents']].head(30))

print("---------------SAME FOR INTENDED LEARNING OUTCOMES-------------------")

# Run the learning-outcome extraction exactly once; each call builds a KeyBERT
# model and processes every row.
df = extract_keywords_from_intended_learning_outcomes(df)
df[['Module title', 'keywords', 'Contents', 'keywords_learning', 'Intended learning outcomes']].head(30)

# Approach using assigned keywords (outlook --> discarded for now)

In [None]:
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def assign_categories(dataframe, min_similarity=0.000008):
    """
    Label every module with the predefined category most similar to its title.

    Each category is represented by one text document made of its name plus
    its keyword list. A TF-IDF vectorizer is fitted on those documents, every
    value of the ``"Module title"`` column is projected into the same space,
    and the category with the highest cosine similarity is chosen. Titles
    whose best similarity is below ``min_similarity`` — and titles that are
    not strings, e.g. NaN — are labelled ``"Other"``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with a ``"Module title"`` column. It is modified in place: an
        ``"Assigned Category"`` column is added (or overwritten).
    min_similarity : float, default 0.000008
        Minimum cosine similarity required to accept the best category.
        NOTE(review): with this near-zero default, effectively any token
        shared between a title and a category document (including function
        words such as "and") is enough to assign that category — confirm
        whether a stricter threshold is wanted.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object, with ``"Assigned Category"`` filled in.
    """
    # Predefined categories and the keywords that describe each of them
    categories = {
        "Artificial Intelligence": ["AI", "Machine Learning", "Deep Learning", "Neuronal Nets"],
        "Strategy": ["Business Strategy", "Market Analysis", "Competitive Intelligence"],
        "Marketing": ["Digital Marketing", "Social Media Marketing", "Market Research"],
        "Optimization": ["Operations Optimization", "Process Improvement", "Supply Chain Optimization"],
        "Data Science": ["Data Analysis", "Data Mining", "Statistical Modeling"],
        "Software Engineering": ["Software Development", "Programming", "Web Development", ""],
        "Society and Ethics": ["Ethical Issues", "Social Impact", "Sustainability", "Privacy"],
        "Finance": ["Financial Analysis", "Financial Markets", "Accounting", "Financial Reporting"],
        "Management": ["Leadership", "Project Management", "Team Management", "Change Management"],
        "Communication": ["Presentation Skills", "Negotiation", "Stakeholder Management", "Conflict Management"],
        "Entrepreneurship": ["Business Development", "Innovation", "Startups", "Venture Capital"],
        "Blockchain": ["Distributed Ledger", "Smart Contracts", "Cryptocurrencies", "Decentralized Applications"],
        "Internet of Things": ["IoT Devices", "IoT Platforms", "IoT Security", "IoT Data Management"],
    }

    category_names = list(categories.keys())

    # One document per category: its name followed by its keywords, so the
    # keyword lists actually contribute to the match. Empty placeholder
    # keywords are skipped.
    category_documents = [
        " ".join([name] + [keyword for keyword in keywords if keyword])
        for name, keywords in categories.items()
    ]

    # Fit TF-IDF on the category documents; titles are projected into the same space
    vectorizer = TfidfVectorizer()
    category_vectors = vectorizer.fit_transform(category_documents)

    # Non-string titles (e.g. NaN) become empty text, which scores 0 against
    # every category.
    titles = [
        title if isinstance(title, str) else ""
        for title in dataframe["Module title"]
    ]

    assigned_categories = []
    if titles:  # an empty frame simply gets an empty column
        # Score all titles against all categories in one batch
        similarities = cosine_similarity(vectorizer.transform(titles), category_vectors)
        for row_scores in similarities:
            max_index = row_scores.argmax()
            if row_scores[max_index] < min_similarity:
                assigned_categories.append("Other")
            else:
                assigned_categories.append(category_names[max_index])

    # Add the assigned categories to the DataFrame
    dataframe["Assigned Category"] = assigned_categories

    return dataframe

# Example usage: categorise every module and show only the two relevant columns.
# read_csv already returns a DataFrame, so it is passed on directly.
df = pd.read_csv('MS_IS_all_modules.csv')

df_with_categories = assign_categories(df)[['Module title', 'Assigned Category']]
# Last expression of the cell: rendered with the notebook's rich table display.
df_with_categories

# Approach using a zero-shot classifier


In [None]:
from transformers import pipeline
# Build the zero-shot classification pipeline once so later cells can reuse it.
# NOTE(review): no `device` argument is given here, so the pipeline runs on its
# default device — confirm whether GPU execution was intended (a later cell
# passes `device=0` when calling the classifier).
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

In [None]:
# Labels offered to the zero-shot classifier.
# NOTE(review): `assign_categories` also defines a "Blockchain" category that
# is absent from this list — confirm whether the omission is intentional.
candidate_labels = [
    "Artificial Intelligence",
    "Strategy",
    "Marketing",
    "Optimization",
    "Data Science",
    "Software Engineering",
    "Society and Ethics",
    "Finance",
    "Management",
    "Communication",
    "Entrepreneurship",
    "Internet of Things",
]

In [None]:
df = pd.read_csv('MS_IS_all_modules.csv')

# Classify a single example: the title stored under row label 5.
text = df.loc[5, 'Module title']
print(text)

# `device` is chosen when the pipeline is constructed; it is not a call-time
# option of the zero-shot pipeline, so it is not passed here.
output = classifier(text, candidate_labels)

# NOTE: `df` is rebound from the module table to the label/score table below.
df = pd.DataFrame({'label': output['labels'], 'score': output['scores']})
nr_of_results = 3
# Keep the `nr_of_results` highest-scoring labels.
df = df.sort_values(by=['score'], ascending=False).head(nr_of_results)
print(df)