## Importaciones

In [137]:
import pandas as pd
import math
import numpy as np
from random import randint
import hashlib
import re

from tqdm import tqdm

## Hiperparámetros

In [153]:
shingling_size = 10
signature_size = 50
bands_nr = 10
threshold = 0.7
upper_threshold = 0.95
user_thereshold = 3

total_tweets = 0.005

## Cargar datos y sacar muestra

In [139]:
req_cols = ['id','screen_name','text']
row_num = math.ceil(total_tweets * 4594980)  # 20% de los datos

# Abrir el archivo CSV y procesarlo línea por línea
tweets_df = pd.read_csv("tweets_2022_abril_junio.csv", usecols=req_cols, nrows=row_num)

tweets_df.columns

Index(['id', 'screen_name', 'text'], dtype='object')

In [140]:
doc_nr = len(tweets_df)
doc_nr

22975

In [141]:
for index, row in tqdm(tweets_df.iterrows()):
    text = row['text'].replace('\n', '').replace('\r', '')
    text = re.sub(r'^RT\s+@\w+:\s+', '',text).lower()
    text = re.sub(r'@\w+', '', text)
    text = text.lstrip(' ').rstrip(' ')
    tweets_df.at[index, 'text'] = text

22975it [00:01, 11536.15it/s]


In [142]:
tweets_df.head()

Unnamed: 0,id,screen_name,text
0,1512186166438637582,h0l4d4ni3l4,"tras casi 50 años del golpe, la constitución s..."
1,1512186202367045642,Claudio70932894,mañana jueves a las 18hrs. comienza nuestro pr...
2,1512186287284924418,Cesar_A_RR,aquí está el aporte de con respecto a los der...
3,1512186335754301446,rosmarieher,la pelotudez no tiene limites...no tiene
4,1512186407841767424,GQuelluen,"ante la circulación de noticias falsas, les qu..."


## Obtener Shingles por tweet y todos los shingles

In [143]:
k = 5 ## Largo de los shingles
tweets_df["shingles"] = [set([tweet[i:i+k] for i in range(len(tweet) - k + 1)]) for tweet in tqdm(tweets_df["text"])]

100%|██████████| 22975/22975 [00:00<00:00, 44216.98it/s]


## Similitud de Jaccard

## Funciones de Hash

In [144]:
class hashFamily:
    def __init__(self, i):
        self.resultSize = 8 # how many bytes we want back
        self.maxLen = 20 # how long can our i be (in decimal)
        self.salt = str(i).zfill(self.maxLen)[-self.maxLen:]
        
    def get_hash_value(self, el_to_hash):
        return int(hashlib.sha1(str(el_to_hash).encode('utf-8') + self.salt.encode('utf-8')).hexdigest()[-self.resultSize:], 16)

## Calculamos el minhash de los tweets

In [145]:
class minhashSigner:
    def __init__(self, sig_size):
        self.sig_size=sig_size
        self.hash_functions = [hashFamily(randint(0,10000000000)) for i in range(0,sig_size)]
    
    def compute_set_signature(self, set_):
        set_sig = []
        for h_funct in self.hash_functions:
            min_hash = math.inf
            for el in set_:
                h = h_funct.get_hash_value(el)
                if h < min_hash:
                    min_hash = h
                
            set_sig.append(min_hash)
        
        return set_sig
    
    #return a list of lists that can be seen as the signature matrix
    def compute_signature_matrix(self, set_list):
        signatures = []
        for s in tqdm(set_list):
            signatures.append( self.compute_set_signature(s) )
            
        return signatures

In [146]:
signature_size = 50
shingling_list = tweets_df["shingles"]
signer = minhashSigner(signature_size)
signature_matrix = signer.compute_signature_matrix( shingling_list )

100%|██████████| 22975/22975 [02:18<00:00, 166.21it/s]


## Almacenamos los minhash para ahorrar tiempo

In [147]:
np.savetxt('minhash.txt', signature_matrix)

## LSH (Locality Sensitive Hashing)

In [148]:
class lsh:
    def __init__(self, threshold=0.8):
        self.threshold = threshold
        
    def get_signature_matrix_bands(self, sig_matrix, bands_nr, sign_len): 
        #bands_nr = b
        #sign_len = n
        r = int(sign_len/bands_nr) #number of rows in each band
        bands = {} # {band_nr: [col_1,col_2,...]} where col_1 is all the values of Sig(S_i) for band b.
        for i in range(0,bands_nr):
            bands[i] = []
        
        # put Subsets of the columns of signature matrix into the appropriate bucket and cosider a column 
        # as a unique block so that we can hash the entire column.
        # Basically a band is a list of element, where each element is a subset of a signature of a given set.
        for signature in sig_matrix: 
            
            for i in range(0, bands_nr):
                idx = i*r    
                bands[i].append(' '.join(str(x) for x in signature[idx:idx+r]) ) 
                    
        return bands

    #band is a list 
    # construct a dictionary {hash(band_column): doc_id that produced this hash}
    def get_band_buckets(self, band, hash_funct):
        buckets = {}
        for doc_id in range(0,len(band)):
            value = hash_funct.get_hash_value( band[doc_id] )
            if value not in buckets:
                buckets[value] = [doc_id]
            else:
                 buckets[value].append(doc_id)
                
        return buckets
    
    def get_candidates_list(self, buckets):
        candidates = set()
        # buckets is a dictionary containing key=bucket, value= list of doc_ids that hashed to bucket
        for bucket,candidate_list in buckets.items():
            if len(candidate_list) > 1:
                for i in range(0,len(candidate_list)-1):
                    for j in range(i+1,len(candidate_list)):  
                        pair = tuple(sorted( (candidate_list[i],candidate_list[j]) ))
                        candidates.add(pair)
                
        return candidates #ie a set of couples, each couple is a candidate pair
    
    def check_candidates(self, candidates_list, threshold, sigs):
        similar_docs = set() #set of tuples
        # similar_pair is a couple containing doc_ids of documents that hashed to same bucket
        for  similar_pair in candidates_list:
            #for all the pairs of document in the list check similarity of their signatures
            doc_id_1 = similar_pair[0]
            doc_id_2 = similar_pair[1]
            signature_1 = set(sigs[doc_id_1]) #get the i-th column from signature matrix where i is doc_id in the collision list
            signature_2 = set(sigs[doc_id_2])
            js = len(signature_1.intersection(signature_2)) /len(signature_1.union(signature_2))
            
            if js >= threshold and js < upper_threshold:
                similar_docs.add( tuple(sorted((doc_id_1,doc_id_2) )) )
                        
                        
        return similar_docs
    
    def get_similar_items(self, sig_matrix, bands_nr, sign_len):
        similar_docs = set()
        #divide signature matrix into bands
        bands = self.get_signature_matrix_bands(sig_matrix,bands_nr,sign_len)
        
        #for all the bands
        for band_id, elements in tqdm(bands.items()):
            #produce the buckets for the given band (band_id) with a random hash function
            buckets = self.get_band_buckets(elements, hash_funct=hashFamily(randint(0,10000000000)))
            #Get all the candidate pairs
            candidates = self.get_candidates_list(buckets)
            #Check all candidate pairs' signatures
            for sim_tuple in self.check_candidates(candidates, self.threshold, sig_matrix):
                similar_docs.add( sim_tuple)

        return similar_docs #return all the similar signatures that respect the threshold

In [149]:
lsh_instance = lsh(threshold)
lsh_similar_itemset = lsh_instance.get_similar_items(signature_matrix, bands_nr, signature_size)

100%|██████████| 10/10 [01:16<00:00,  7.61s/it]


In [150]:
user_candidates = dict()
tweets_candidates = dict()

for i in tqdm(range(len(lsh_similar_itemset))):    
    docs = lsh_similar_itemset.pop()
    tweet1_name = tweets_df.iloc[docs[0]]["screen_name"]
    tweet2_name = tweets_df.iloc[docs[1]]["screen_name"]
    tweet1_text = tweets_df.iloc[docs[0]]["text"]
    tweet2_text = tweets_df.iloc[docs[1]]["text"]
    names = tuple(sorted((tweet1_name,tweet2_name)))
    if tweet1_name != tweet2_name:
        if names not in user_candidates.keys():
            user_candidates[names] = 1
            tweets_candidates[names] = [[tweet1_text],[tweet2_text]]
        else:
            user_candidates[names] += 1
            tweets_candidates[names][0].append(tweet1_text)
            tweets_candidates[names][1].append(tweet2_text)

100%|██████████| 914/914 [00:00<00:00, 3487.13it/s]


In [151]:
print(len(user_candidates))

824


In [154]:
for i in user_candidates:
    if user_candidates[i] >= user_thereshold:
        print(f"Evaluando usuarios {i}, con {user_candidates[i]} tweets similares")
        for j in range(user_candidates[i]):
            print(f"Tweet {j + 1} del Usuario: {i[0]}")
            print(tweets_candidates[i][0][j])
            print(f"Tweet {j + 1} del Usuario: {i[1]}")
            print(tweets_candidates[i][1][j])
        print("-----------------------------------------")
        break

Evaluando usuarios ('Masryaalbi', 'libertarioconte'), con 23 tweets similares
Tweet 1 del Usuario: Masryaalbi
destruye en segundos al chanta de baradit!!! rt  https://t.co/gcxixvl…
Tweet 1 del Usuario: libertarioconte
destruye en segundos al chanta de baradit!!! rt  https://t.co/gcxixvlohy
Tweet 2 del Usuario: Masryaalbi
destruye en segundos al chanta de baradit!!! rt  https://t.co/gcxixvl…
Tweet 2 del Usuario: libertarioconte
destruye en segundos al chanta de baradit!!! rt  https://t.co/gcxixvlohy
Tweet 3 del Usuario: Masryaalbi
destruye en segundos al chanta de baradit!!! rt  https://t.co/gcxixvl…
Tweet 3 del Usuario: libertarioconte
destruye en segundos al chanta de baradit!!! rt  https://t.co/gcxixvlohy
Tweet 4 del Usuario: Masryaalbi
destruye en segundos al chanta de baradit!!! rt  https://t.co/gcxixvl…
Tweet 4 del Usuario: libertarioconte
destruye en segundos al chanta de baradit!!! rt  https://t.co/gcxixvlohy
Tweet 5 del Usuario: Masryaalbi
destruye en segundos al chanta de bara