# Reguły asocjacyjne - rekomendacja technologii

## Załadowanie danych z pliku CSV

In [14]:
import pandas as pd
import os

# Read jobData.csv from the parent of the current working directory.
# on_bad_lines='skip' makes pandas drop malformed rows instead of raising.
df = pd.read_csv(
    os.path.join('..', 'jobData.csv'),
    on_bad_lines='skip',
)

# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,Position,Company,Experience,Salary,Used Technologies,Optional Technologies
0,Embedded Software Engineer,Fluke Corportaion,mid,13000 18000,"{'C': 'regular', 'C++': 'regular', 'Linux': 'r...",-
1,Azure Platform Engineer (AI Department),Procter & Gamble,mid,-,"{'Microsoft Azure': 'advanced', 'Python': 'adv...",-
2,.NET Developer,UN7,mid,20000 28000,{'.Net': 'advanced'},-
3,Spec. Projektant Robotyzacji Procesów,Credit Agricole Bank Polska S.A.,mid,-,"{'C#': 'regular', 'VB.Net': 'regular', 'Java':...",-
4,HT Functional Consultant with ERP Industry Sol...,Accenture,mid,-,"{'English': 'master', 'ERP': 'advanced', 'Prob...",-


## Przygotowanie danych

In [15]:
import ast

def extract_technologies(tech_str, placeholders=('-',)):
    """Extract a list of technology names from one raw cell value.

    Two textual formats are understood:

    * a Python dict literal such as ``"{'C': 'regular', 'Linux': 'advanced'}"``,
      in which case the dict keys are returned in their original order;
    * any other string, which is treated as a comma-separated list whose
      items are stripped of surrounding whitespace (empty items are dropped).

    Args:
        tech_str: Raw cell value. Anything that is not a ``str`` (for example
            ``None`` or ``NaN``) yields an empty list.
        placeholders: Strings that stand for "no value" and must not be
            reported as a technology. Defaults to ``('-',)``, the marker that
            appears in the dataset preview for missing values.

    Returns:
        list: The extracted technology names; empty when nothing usable is
        present.
    """
    # Non-string cells cannot hold either supported format.
    if not isinstance(tech_str, str):
        return []

    text = tech_str.strip()

    # Blank cells and "no value" markers would otherwise fall through to the
    # comma-split branch and come back as a bogus technology such as ['-'].
    if not text or text in placeholders:
        return []

    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Not a Python literal; handled as a comma-separated list below.
        parsed = None

    if isinstance(parsed, dict):
        return list(parsed.keys())

    # Fallback: plain comma-separated text (also used for non-dict literals).
    return [part.strip() for part in text.split(',') if part.strip()]

# Turn every raw 'Used Technologies' cell into a list of technology names and
# store the result in a new column next to the source one.
df['Parsed Used Technologies'] = df['Used Technologies'].map(extract_technologies)

# Preview the frame to check the newly added column.
df.head()


Unnamed: 0,Position,Company,Experience,Salary,Used Technologies,Optional Technologies,Parsed Used Technologies
0,Embedded Software Engineer,Fluke Corportaion,mid,13000 18000,"{'C': 'regular', 'C++': 'regular', 'Linux': 'r...",-,"[C, C++, Linux, SVN, Yocto, Qt]"
1,Azure Platform Engineer (AI Department),Procter & Gamble,mid,-,"{'Microsoft Azure': 'advanced', 'Python': 'adv...",-,"[Microsoft Azure, Python, DevOps, CI/CD, GitHub]"
2,.NET Developer,UN7,mid,20000 28000,{'.Net': 'advanced'},-,[.Net]
3,Spec. Projektant Robotyzacji Procesów,Credit Agricole Bank Polska S.A.,mid,-,"{'C#': 'regular', 'VB.Net': 'regular', 'Java':...",-,"[C#, VB.Net, Java, VBA]"
4,HT Functional Consultant with ERP Industry Sol...,Accenture,mid,-,"{'English': 'master', 'ERP': 'advanced', 'Prob...",-,"[English, ERP, Problem Solving, Communication ..."


In [16]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder

# One-hot encode the per-row technology lists: the encoder learns the set of
# distinct items and produces a boolean matrix with one column per item.
te = TransactionEncoder()
te_ary = te.fit_transform(df['Parsed Used Technologies'])

# NOTE: `df` is rebound to the encoded frame here, so the original job columns
# are no longer reachable through `df`, and re-running this cell without
# re-running the cells above fails on the missing 'Parsed Used Technologies'
# column.
df = pd.DataFrame(te_ary, columns=te.columns_)
df.head()

Unnamed: 0,(B2),(C1),(NATIVE),.NET,.NET 7,.NET C#,.NET Core,.Net,17; Spring Boot,4G/5G,...,private cloud,proxy,pyTest,pytest,sap business one,stakeholder management,webpack,wireframing,xPON,z/OS
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


## Wyszukiwanie zbiorów częstych

In [17]:
from mlxtend.frequent_patterns import apriori, association_rules

# Mine itemsets with a support of at least 0.01 from the one-hot frame;
# use_colnames=True reports items by column name rather than column index.
frequent_itemsets = apriori(
    df,
    min_support=0.01,
    use_colnames=True,
)

# Show the itemsets with the highest support first.
display(
    frequent_itemsets
    .sort_values(by='support', ascending=False)
    .head()
)

Unnamed: 0,support,itemsets
50,0.121923,(SQL)
19,0.10551,(Git)
25,0.104338,(Java)
44,0.086753,(Python)
17,0.071512,(Docker)


## Wyszukiwanie reguł asocjacyjnych

In [18]:
# Derive rules whose confidence is at least 0.5 from the frequent itemsets and
# keep them ordered by confidence, then lift (both descending).
rules = (
    association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5)
    .sort_values(by=['confidence', 'lift'], ascending=False)
)

# The preview is ordered by support instead; `rules` itself keeps the
# confidence/lift ordering.
display(
    rules
    .sort_values(by='support', ascending=False)
    .head()
)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
13,(Kubernetes),(Docker),0.059789,0.071512,0.03517,0.588235,8.225651,0.030894,2.254899,0.934289
11,(Confluence),(Jira),0.038687,0.058617,0.031653,0.818182,13.958182,0.029385,5.177608,0.965718
12,(Jira),(Confluence),0.058617,0.038687,0.031653,0.54,13.958182,0.029385,2.089811,0.986163
7,(CSS),(HTML),0.03517,0.032825,0.029308,0.833333,25.386905,0.028154,5.803048,0.995626
8,(HTML),(CSS),0.032825,0.03517,0.029308,0.892857,25.386905,0.028154,9.00508,0.993212


## Rekomendacja

In [19]:
# Technologies the user already knows; edit this list to get other suggestions.
user_inputs = ['Docker']
known_technologies = frozenset(user_inputs)

# The 'antecedents' and 'consequents' columns hold frozensets, so set
# operations are used instead of string comparisons.

# A rule is relevant when its antecedents mention at least one known technology.
matches_input = rules['antecedents'].apply(
    lambda items: not known_technologies.isdisjoint(items)
).astype(bool)

# Skip rules that would only "recommend" technologies the user already listed
# (possible as soon as user_inputs holds more than one entry, e.g. HTML and CSS).
adds_something_new = rules['consequents'].apply(
    lambda items: not items <= known_technologies
).astype(bool)

# .astype(bool) above keeps the mask usable even when `rules` is empty.
filtered_rules = rules[matches_input & adds_something_new]

# Rank the suggested technologies (consequents) by confidence; lift breaks ties
# so the ordering is deterministic.
recommended_technologies = (
    filtered_rules[['consequents', 'confidence', 'lift']]
    .sort_values(by=['confidence', 'lift'], ascending=False)
)

display(recommended_technologies)

Unnamed: 0,consequents,confidence,lift
42,(Java),1.0,9.58427
44,(Kubernetes),0.785714,13.141457
40,(Spring Framework),0.611111,23.694444
38,(Git),0.555556,5.265432
