In [1]:
import pandas as pd
import pdb
from mlxtend.preprocessing import TransactionEncoder

# Load the genre lookup table (one row per category name and its id)
genre_names = pd.read_csv(
    'films_dataset/categories.csv',
    sep='|',
    header=None,
    names=['category', 'category_id'],
)

# Full column layout of films.csv: four descriptive fields, then one column per genre
genre_columns = list(genre_names['category'].values)
df_col_names = ['title', 'release_date', 'release_video_date', 'url'] + genre_columns

# Read the films and use the title column as the index
initial_df = pd.read_csv(
    'films_dataset/films.csv',
    sep='|',
    header=None,
    names=df_col_names,
).set_index("title")

# Keep only the genre columns and turn the 1/0 flags into booleans
df = initial_df[genre_columns].astype(bool)


def _active_genres(row):
    """Return the labels of the entries of a boolean row that are True."""
    return row[row].index.tolist()


# One transaction per row: the list of genres flagged on that row
transactions = df.apply(_active_genres, axis=1).tolist()

# Fit the TransactionEncoder on the transactions
encoder = TransactionEncoder()
encoder.fit(transactions)

# One-hot encode the transactions and wrap the result in a DataFrame
onehot = encoder.transform(transactions)
onehot_df = pd.DataFrame(onehot, columns=encoder.columns_)

# Support of each genre = proportion of True values in its column
support = onehot_df.mean()

# Individual supports for a handful of genres
(supportRomance, supportDrama, supportAction,
 supportAdventure, supportCrime, supportChildrens) = (
    onehot_df[genre].mean()
    for genre in ('Romance', 'Drama', 'Action', 'Adventure', 'Crime', "Children's")
)

# Show the support of every genre
print("Soporte de cada género:", support, sep="\n")

# pdb.set_trace()
#-----------------------------------------------
# total_transactions_that_appears_romance = onehot_df['Romance']



Soporte de cada género:
Action         0.149227
Adventure      0.080262
Animation      0.024970
Children's     0.072533
Comedy         0.300238
Crime          0.064804
Documentary    0.029727
Drama          0.431034
Fantasy        0.013080
Film-Noir      0.014269
Horror         0.054697
Musical        0.033294
Mystery        0.036266
Romance        0.146849
Sci-Fi         0.060048
Thriller       0.149227
War            0.042212
Western        0.016052
unknown        0.001189
dtype: float64


In [2]:
# Number of transactions (one list of genres per row of the genre frame)
len(transactions)

1682

In [3]:
# How many rows are / are not flagged as Romance in the one-hot frame
onehot_df['Romance'].value_counts()

False    1435
True      247
Name: Romance, dtype: int64

In [4]:
# Manual support computation
def calculate_support(column_name, total_transactions, data=None):
    """Return the support of an item (or itemset) in a boolean one-hot frame.

    Parameters
    ----------
    column_name : label or list/tuple of labels
        A single column label, or a list/tuple of labels. For a list/tuple a
        row is counted only when every listed column is True on that row.
    total_transactions : int
        Denominator of the support ratio.
    data : pandas.DataFrame, optional
        Frame to read the columns from. When omitted, the notebook-level
        ``onehot_df`` is used.

    Returns
    -------
    float
        Number of matching rows divided by ``total_transactions``.
    """
    frame = onehot_df if data is None else data
    if isinstance(column_name, (list, tuple)):
        # Itemset: a row matches only if all requested columns are True.
        matches = frame[list(column_name)].all(axis=1)
    else:
        matches = frame[column_name]
    # Summing the boolean mask counts the True rows and yields 0 when there
    # are none, whereas looking up the True key of value_counts() would raise.
    return int(matches.sum()) / total_transactions

In [11]:
def calculate_joint_support(columns, total_transactions, data=None):
    """Return the joint support of several columns of a boolean one-hot frame.

    Parameters
    ----------
    columns : str or iterable of str
        Column labels that must all be True on a row for it to count. A single
        string is treated as a one-element itemset.
    total_transactions : int
        Denominator of the support ratio.
    data : pandas.DataFrame, optional
        Frame to read the columns from. When omitted, the notebook-level
        ``onehot_df`` is used.

    Returns
    -------
    float
        Number of rows where every requested column is True, divided by
        ``total_transactions``.
    """
    frame = onehot_df if data is None else data
    # Always select with a list so the result is a DataFrame and
    # ``all(axis=1)`` is valid even for a single label.
    labels = [columns] if isinstance(columns, str) else list(columns)
    joint_count = frame[labels].all(axis=1).sum()
    return joint_count / total_transactions

In [12]:
def aparicion_total_transactions(transactions, X, Y):
    """Count the transactions that contain both ``X`` and ``Y``.

    Each element of ``transactions`` is tested with ``in``; the return value
    is the number of elements for which both membership tests succeed.
    """
    co_occurrences = 0
    for basket in transactions:
        if X not in basket:
            continue
        if Y in basket:
            co_occurrences += 1
    return co_occurrences

In [13]:
# Number of transactions that contain both Romance and Drama
aparicion_total_transactions(transactions, 'Romance', 'Drama')

99

In [14]:
# Joint support of {Romance, Drama}: co-occurrence count over the number of transactions
aparicion_total_transactions(transactions, 'Romance', 'Drama') / len(transactions)

0.058858501783590964

In [15]:
# Pass the column label itself: calculate_support looks up one column and
# counts its True rows (247 of 1682 for Romance).
calculate_support('Romance', len(transactions))

0.14684898929845422

In [16]:
# Example: compute the support of Romance by hand
# (count of True rows in the column divided by the column length)
total_transaction_romace = onehot_df['Romance'].value_counts().to_dict()[True]
supportR = total_transaction_romace / len(onehot_df['Romance'])
supportR

0.14684898929845422

In [17]:
# Same manual computation for Drama: True-row count over the column length.
# NOTE(review): this rebinds `support`, which the first cell assigned to the
# per-genre Series returned by onehot_df.mean(); from here on it is a float.
total_transaction_drama = onehot_df['Drama'].value_counts().to_dict()[True]
support = total_transaction_drama / len(onehot_df['Drama'])
support

0.43103448275862066

In [18]:
# Joint support counts the rows flagged with BOTH genres. Adding the two
# individual counts double-counts shared rows and yields a value larger than
# either individual support, which a joint support can never be.
support_romance_drama = (onehot_df['Romance'] & onehot_df['Drama']).sum() / len(onehot_df)
support_romance_drama

0.058858501783590964

In [21]:
# Joint support of Romance and Drama: proportion of rows where both columns are True
support_romance_drama = onehot_df[["Romance", "Drama"]].all(axis=1).mean()
support_romance_drama


0.058858501783590964

In [None]:
# Joint support of Romance and Drama through the calculate_joint_support helper
calculate_joint_support(['Romance', 'Drama'], len(transactions))

In [22]:
# Row-wise boolean mask: True where a row is flagged as both Romance and Drama
onehot_df[["Romance", "Drama"]].all(axis=1)

0       False
1       False
2       False
3       False
4       False
        ...  
1677    False
1678    False
1679     True
1680    False
1681    False
Length: 1682, dtype: bool