In [6]:
import pandas as pd
import json

# Lookup table pairing each problem theme with its single-letter class code.
categories_csv = "DATA/categories.csv"
cat = pd.read_csv(categories_csv)
cat

Unnamed: 0,Theme,Class
0,Introduction,A
1,(Computational) Geometry,B
2,String Processing,C
3,Graph,D
4,Data Structures and Libraries,E
5,Mathematics,F
6,Rare Topics,G
7,More Advanced Topics,H
8,Problem Solving Paradigms,I


In [1]:
# Parse the whole JSON document first; the file handle is only needed for that.
with open('DATA/user_categories.json', 'r') as openfile:
    json_object = json.load(openfile)

# Each entry unpacks into a (username, classes) pair; unzip the pairs into two
# parallel columns.
data = {
    "username": [user for user, _ in json_object],
    "classes": [subs for _, subs in json_object],
}

df = pd.DataFrame(data)
df

Unnamed: 0,username,classes
0,jkcc,F F I A
1,wsxmm1153,F G G D
2,ksax,B B G G I B B B B B E F F F B B D
3,oddur09,A D F F
4,harryapotter,D C F F B I B B D F D A G
...,...,...
71967,nparrado,F F G D C B D D F B H B C B D F G
71968,Shampoo,F F F F F F F F F F G G G G D D D D D D D D D ...
71969,gaston770,F D A H D D A I I I I A I I I I A B E I C C D ...
71970,mandycyt,D B I C D B


# Apriori

In [2]:
from sklearn.preprocessing import MultiLabelBinarizer

# Split space-separated classes into lists of class letters.
# Only string values are split, which makes this cell safe to re-run: calling
# `.str.split()` a second time on a column that already holds lists would
# yield NaN instead of lists and break the binarizer below.
df["classes"] = df["classes"].map(
    lambda labels: labels.split() if isinstance(labels, str) else labels
)

# Create binary columns for each class: one 0/1 indicator column per distinct
# class letter, one row per entry of `df`.
mlb = MultiLabelBinarizer()
encoded_data = mlb.fit_transform(df["classes"])
encoded_df = pd.DataFrame(encoded_data, columns=mlb.classes_)
encoded_df

Unnamed: 0,A,B,C,D,E,F,G,H,I
0,1,0,0,0,0,1,0,0,1
1,0,0,0,1,0,1,1,0,0
2,0,1,0,1,1,1,1,0,1
3,1,0,0,1,0,1,0,0,0
4,1,1,1,1,0,1,1,0,1
...,...,...,...,...,...,...,...,...,...
71967,0,1,1,1,0,1,1,1,0
71968,1,1,1,1,1,1,1,1,1
71969,1,1,1,1,1,1,1,1,1
71970,0,1,1,1,0,0,0,0,1


In [3]:
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

# Frequent itemsets: class combinations whose support is at least 0.2.
# mlxtend deprecates non-boolean input frames, so the 0/1 indicator matrix is
# cast to bool first; the mined itemsets and their supports are unchanged.
frequent_itemsets = apriori(encoded_df.astype(bool), min_support=0.2, use_colnames=True)

# Derive association rules from the itemsets, keeping those with lift >= 1.
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)



In [4]:
# Display the association rules table (antecedents, consequents and metrics).
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(B),(A),0.693659,0.452593,0.394834,0.569205,1.257654,0.080889,1.270692,0.668761
1,(A),(B),0.452593,0.693659,0.394834,0.872383,1.257654,0.080889,2.400472,0.374253
2,(C),(A),0.430112,0.452593,0.300978,0.699767,1.546131,0.106313,1.823278,0.619813
3,(A),(C),0.452593,0.430112,0.300978,0.665009,1.546131,0.106313,1.701204,0.645267
4,(D),(A),0.574446,0.452593,0.365337,0.635981,1.405195,0.105347,1.503788,0.677599
...,...,...,...,...,...,...,...,...,...,...
10143,(F),"(D, C, B, H, A, I, G)",0.661063,0.221336,0.216154,0.326979,1.477296,0.069836,1.156968,0.953237
10144,(H),"(D, C, B, F, A, I, G)",0.438573,0.229728,0.216154,0.492856,2.145387,0.115401,1.518842,0.950941
10145,(A),"(D, C, B, F, H, I, G)",0.452593,0.231521,0.216154,0.477589,2.062838,0.111369,1.471026,0.941220
10146,(I),"(D, C, B, F, H, A, G)",0.469988,0.225741,0.216154,0.459912,2.037350,0.110058,1.433581,0.960670


# WORD2VEC

In [5]:
import numpy as np
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows with a fixed seed so the split is reproducible.
# X is the username column and y the per-user class lists.
X_train, X_test, y_train, y_test = train_test_split(
    df["username"],
    df["classes"],
    test_size=0.33,
    random_state=42,
)

In [7]:
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

#"F F I A"
# Re-join and re-split every training entry so that both a token list and a
# raw space-separated string end up as a clean list of class letters.
sentences = [" ".join(s).strip() for s in y_train]

#["F", "F", "I", "A"]
tokenized_sentences = [sentence.split() for sentence in sentences]

# CBOW (sg=0) embeddings over the class letters; min_count=1 keeps every letter
# that occurs at least once in the training split.
model = Word2Vec(sentences=tokenized_sentences, vector_size=100, window=5, sg=0, min_count=1, workers=4)

# Persist the model and read it back so the lookups below run on the saved copy.
model.save("word2vec_cbow_model")

loaded_model = Word2Vec.load("word2vec_cbow_model")

# Class letter -> theme name, built once instead of re-filtering `cat` for
# every lookup. The theme is read from the first column and the first row wins
# on duplicate letters, matching the previous `cat[...].iloc[0, 0]` lookup.
theme_by_class = {}
for letter, theme in zip(cat["Class"], cat.iloc[:, 0]):
    theme_by_class.setdefault(letter, theme)

# May turn out to be useful when evaluating the scoring.
for c in cat["Class"]:
    if c not in loaded_model.wv.key_to_index:
        # A class that never occurs in the training split has no vector;
        # most_similar() would raise KeyError, so report it and move on.
        print("Similar words to {}:".format(theme_by_class[c]), "(not in training vocabulary)")
        continue
    similar_words = loaded_model.wv.most_similar(c, topn=3)
    # Fall back to the raw token if the vocabulary holds a label that is not
    # listed in the categories table.
    print("Similar words to {}:".format(theme_by_class[c]), "; ".join([theme_by_class.get(w[0], w[0]) for w in similar_words]))

Similar words to Introduction: Data Structures and Libraries; Problem Solving Paradigms; String Processing
Similar words to (Computational) Geometry: More Advanced Topics; Data Structures and Libraries; Rare Topics
Similar words to String Processing: Graph; Data Structures and Libraries; Rare Topics
Similar words to Graph: String Processing; Data Structures and Libraries; More Advanced Topics
Similar words to Data Structures and Libraries: Problem Solving Paradigms; More Advanced Topics; Mathematics
Similar words to Mathematics: Data Structures and Libraries; String Processing; Graph
Similar words to Rare Topics: String Processing; Introduction; More Advanced Topics
Similar words to More Advanced Topics: Problem Solving Paradigms; Data Structures and Libraries; (Computational) Geometry
Similar words to Problem Solving Paradigms: Data Structures and Libraries; More Advanced Topics; Introduction
