# CONSEGNA SPAM DETECTION 

## PROGETTO SPAM DETECTION
### L'azienda ProfessionAI vuole realizzare una libreria capace di fare analisi delle email ricevute. 
### Nello specifico, il CEO ha richiesto di identificare le email di tipo SPAM sulle quali fare analisi contenutistiche.
### Il CTO nello specifico ti fornisce un dataset e ti chiede di:
### - Addestrare un classificatore per identificare SPAM
### - Individuare i Topic principali tra le email SPAM presenti nel dataset
### - Calcolare la distanza semantica tra i topics ottenuti, per dedurne l'eterogeneità.
### - Estrarre dalle mail NON SPAM le Organizzazioni presenti.

In [239]:
import string
import spacy
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, f1_score
from sklearn.neural_network import MLPClassifier
import gensim
from gensim.utils import simple_preprocess
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import gensim.corpora as corpora 
import gensim
from pprint import pprint
import gensim.downloader
from scipy import spatial
import numpy as np
import spacy
from collections import Counter

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\monte\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [37]:
df = pd.read_csv('datasets/Verifica Finale - Spam Detection/spam_dataset.csv')

In [38]:
df

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\nth...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\n( see a...",0
2,3624,ham,"Subject: neon retreat\nho ho ho , we ' re arou...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\nthis deal is to ...,0
...,...,...,...,...
5166,1518,ham,Subject: put the 10 on the ft\nthe transport v...,0
5167,404,ham,Subject: 3 / 4 / 2000 and following noms\nhpl ...,0
5168,2933,ham,Subject: calpine daily gas nomination\n>\n>\nj...,0
5169,1409,ham,Subject: industrial worksheets for august 2000...,0


# PROGETTO

### Data Quality e Analisi Esplorativa

In [39]:
#Rimuovere index dalle colonne e drop duplicati
df = df.drop(columns=['Unnamed: 0'])
df = df.drop_duplicates()


In [40]:
#Controllo per mancate valorizzazioni
null_counts = df.isnull().sum()
print(null_counts)
#Value Counts per controllare correttezza/coerenza colonne
print(df.label.value_counts())
print(df.label_num.value_counts())

label        0
text         0
label_num    0
dtype: int64
label
ham     3531
spam    1462
Name: count, dtype: int64
label_num
0    3531
1    1462
Name: count, dtype: int64


In [41]:
#Analisi esplorativa EDA - distribuzione label 
len(df)
labels = df[['label_num']]
data = df.drop(['label_num', 'label'], axis=1)
print(data.text[4]) #per leggere testi

Subject: re : indian springs
this deal is to book the teco pvr revenue . it is my understanding that teco
just sends us a check , i haven ' t received an answer as to whether there is a
predermined price associated with this deal or if teco just lets us know what
we are giving . i can continue to chase this deal down if you need .


In [42]:
#Train Test split
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.3, random_state=42)
print(len(X_train))
print(len(X_test))

#Controllare che lo split sia rappresentativo delle due classi tra train e test set
print(len(y_train[y_train['label_num']==1])/len(y_train)) 
print(len(y_test[y_test['label_num']==1])/len(y_test)) 


3495
1498
0.29871244635193134
0.27903871829105475


### Funzioni pronte per cleaning e vettorizzazione

In [270]:
#Data Cleaning
english_stopwords = stopwords.words('english')
nlp = spacy.load('en_core_web_sm')
punctuation = set(string.punctuation)

def data_cleaner(dataset):
    dataset_to_return = []
    for sentence in dataset:
        sentence = sentence.lower()
        for c in string.punctuation:
            sentence = sentence.replace(c, " ")
        document = nlp(sentence)
        sentence = ' '.join(token.lemma_ for token in document)
        sentence = ' '.join(word for word in sentence.split() if word not in english_stopwords)
        sentence = re.sub('\d', '', sentence)
        dataset_to_return.append(sentence)

    return dataset_to_return

#Text vectorization
def bow_tfidf(dataset, tfidf_vectorizer):
    if tfidf_vectorizer == None:
        tfidf_vectorizer = TfidfVectorizer()
        X = tfidf_vectorizer.fit_transform(dataset)
    else:
        X = tfidf_vectorizer.transform(dataset)
        
    return X.toarray(), tfidf_vectorizer

In [44]:
training_data_cleaned, tfidf_vectorizer = bow_tfidf(data_cleaner(X_train['text']), None)
test_data_cleaned, tfidf_vectorizer = bow_tfidf(data_cleaner(X_test['text']), tfidf_vectorizer)

In [45]:
# Verifica se le lunghezze coincidono
print(f"Lunghezza del dataset pulito: {len(training_data_cleaned)}")
print(f"Lunghezza delle etichette: {len(y_train)}")
#Verifica formato dati 
print(type(training_data_cleaned))  # array
print(type(y_train.values))         # array

Lunghezza del dataset pulito: 3495
Lunghezza delle etichette: 3495
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


### Selezione modello di classificazione

In [46]:
# Creazione di un modello di regressione logistica con cross validation
clf = LogisticRegression()
cv_scores = cross_val_score(clf, training_data_cleaned, y_train.values, cv=5, scoring=make_scorer(f1_score))
print(f"F1 Score per fold: {cv_scores}")
print(f"F1 Score medio: {cv_scores.mean()}")

#Test
clf.fit(training_data_cleaned, y_train)
test_scores = f1_score(y_test, clf.predict(test_data_cleaned))
print(f"F1 Score sul test set: {test_scores}")

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


F1 Score per fold: [0.96941176 0.9787234  0.96926714 0.96728972 0.97882353]
F1 Score medio: 0.9727031114958079


  y = column_or_1d(y, warn=True)


F1 Score sul test set: 0.9704840613931524


In [47]:
# Creazione di un Perceptron con cross validation
clf = MLPClassifier(activation='logistic',
                    hidden_layer_sizes=(100,),
                    max_iter=100,
                    solver='adam',
                    tol=0.005,
                    verbose=True)
cv_scores = cross_val_score(clf, training_data_cleaned, y_train.values, cv=5, scoring=make_scorer(f1_score))
print(f"F1 Score per fold: {cv_scores}")
print(f"F1 Score medio: {cv_scores.mean()}")

#Test
clf.fit(training_data_cleaned, y_train.values)
y_pred = clf.predict(test_data_cleaned)
f1_test = f1_score(y_test.values, y_pred)



  y = column_or_1d(y, warn=True)


Iteration 1, loss = 0.68767570
Iteration 2, loss = 0.58565099
Iteration 3, loss = 0.55979907
Iteration 4, loss = 0.53861913
Iteration 5, loss = 0.51543154
Iteration 6, loss = 0.49151955
Iteration 7, loss = 0.46557156
Iteration 8, loss = 0.43744365
Iteration 9, loss = 0.40712428
Iteration 10, loss = 0.37529915
Iteration 11, loss = 0.34276742
Iteration 12, loss = 0.31048381
Iteration 13, loss = 0.27936636
Iteration 14, loss = 0.25024234
Iteration 15, loss = 0.22357707
Iteration 16, loss = 0.19988390
Iteration 17, loss = 0.17855144
Iteration 18, loss = 0.16007808
Iteration 19, loss = 0.14385213
Iteration 20, loss = 0.12969701
Iteration 21, loss = 0.11746482
Iteration 22, loss = 0.10679181
Iteration 23, loss = 0.09741792
Iteration 24, loss = 0.08923744
Iteration 25, loss = 0.08203392
Iteration 26, loss = 0.07570760
Iteration 27, loss = 0.07016984
Iteration 28, loss = 0.06517143
Iteration 29, loss = 0.06078799
Iteration 30, loss = 0.05684209
Iteration 31, loss = 0.05329040
Iteration 32, los

  y = column_or_1d(y, warn=True)


Iteration 1, loss = 0.61799843
Iteration 2, loss = 0.57620765
Iteration 3, loss = 0.55124110
Iteration 4, loss = 0.52476434
Iteration 5, loss = 0.49630328
Iteration 6, loss = 0.46445394
Iteration 7, loss = 0.42933429
Iteration 8, loss = 0.39142772
Iteration 9, loss = 0.35190256
Iteration 10, loss = 0.31266076
Iteration 11, loss = 0.27527519
Iteration 12, loss = 0.24093548
Iteration 13, loss = 0.21057198
Iteration 14, loss = 0.18437899
Iteration 15, loss = 0.16183618
Iteration 16, loss = 0.14264011
Iteration 17, loss = 0.12648588
Iteration 18, loss = 0.11274994
Iteration 19, loss = 0.10105372
Iteration 20, loss = 0.09114583
Iteration 21, loss = 0.08248488
Iteration 22, loss = 0.07519222
Iteration 23, loss = 0.06873830
Iteration 24, loss = 0.06317739
Iteration 25, loss = 0.05834200
Iteration 26, loss = 0.05401731
Iteration 27, loss = 0.05024381
Iteration 28, loss = 0.04690919
Iteration 29, loss = 0.04389481
Iteration 30, loss = 0.04123866
Iteration 31, loss = 0.03883690
Iteration 32, los

  y = column_or_1d(y, warn=True)


Iteration 1, loss = 0.71517505
Iteration 2, loss = 0.58996497
Iteration 3, loss = 0.55553909
Iteration 4, loss = 0.53288605
Iteration 5, loss = 0.50844255
Iteration 6, loss = 0.48410315
Iteration 7, loss = 0.45847402
Iteration 8, loss = 0.43158224
Iteration 9, loss = 0.40303708
Iteration 10, loss = 0.37354561
Iteration 11, loss = 0.34348374
Iteration 12, loss = 0.31351426
Iteration 13, loss = 0.28457289
Iteration 14, loss = 0.25707743
Iteration 15, loss = 0.23154595
Iteration 16, loss = 0.20834063
Iteration 17, loss = 0.18742938
Iteration 18, loss = 0.16876944
Iteration 19, loss = 0.15238519
Iteration 20, loss = 0.13790804
Iteration 21, loss = 0.12513060
Iteration 22, loss = 0.11393933
Iteration 23, loss = 0.10416294
Iteration 24, loss = 0.09553030
Iteration 25, loss = 0.08785388
Iteration 26, loss = 0.08119355
Iteration 27, loss = 0.07523146
Iteration 28, loss = 0.06989637
Iteration 29, loss = 0.06514110
Iteration 30, loss = 0.06093274
Iteration 31, loss = 0.05714847
Iteration 32, los

  y = column_or_1d(y, warn=True)


Iteration 1, loss = 0.65877146
Iteration 2, loss = 0.58397405
Iteration 3, loss = 0.55992463
Iteration 4, loss = 0.53628644
Iteration 5, loss = 0.51290457
Iteration 6, loss = 0.48773624
Iteration 7, loss = 0.45984542
Iteration 8, loss = 0.42939764
Iteration 9, loss = 0.39697058
Iteration 10, loss = 0.36268262
Iteration 11, loss = 0.32802311
Iteration 12, loss = 0.29450692
Iteration 13, loss = 0.26239250
Iteration 14, loss = 0.23318457
Iteration 15, loss = 0.20692765
Iteration 16, loss = 0.18369109
Iteration 17, loss = 0.16351493
Iteration 18, loss = 0.14613334
Iteration 19, loss = 0.13104432
Iteration 20, loss = 0.11802336
Iteration 21, loss = 0.10676237
Iteration 22, loss = 0.09709022
Iteration 23, loss = 0.08864749
Iteration 24, loss = 0.08122620
Iteration 25, loss = 0.07474459
Iteration 26, loss = 0.06904881
Iteration 27, loss = 0.06403588
Iteration 28, loss = 0.05959096
Iteration 29, loss = 0.05558768
Iteration 30, loss = 0.05209973
Iteration 31, loss = 0.04890757
Iteration 32, los

  y = column_or_1d(y, warn=True)


Iteration 1, loss = 0.60783923
Iteration 2, loss = 0.57648366
Iteration 3, loss = 0.55012380
Iteration 4, loss = 0.52353496
Iteration 5, loss = 0.49234720
Iteration 6, loss = 0.45774123
Iteration 7, loss = 0.41957818
Iteration 8, loss = 0.37882640
Iteration 9, loss = 0.33753078
Iteration 10, loss = 0.29791476
Iteration 11, loss = 0.26039744
Iteration 12, loss = 0.22742339
Iteration 13, loss = 0.19870128
Iteration 14, loss = 0.17387967
Iteration 15, loss = 0.15305919
Iteration 16, loss = 0.13524950
Iteration 17, loss = 0.12022157
Iteration 18, loss = 0.10755760
Iteration 19, loss = 0.09664030
Iteration 20, loss = 0.08728576
Iteration 21, loss = 0.07930085
Iteration 22, loss = 0.07237349
Iteration 23, loss = 0.06638542
Iteration 24, loss = 0.06109021
Iteration 25, loss = 0.05644773
Iteration 26, loss = 0.05236703
Iteration 27, loss = 0.04877893
Iteration 28, loss = 0.04551592
Iteration 29, loss = 0.04264190
Iteration 30, loss = 0.04010699
Iteration 31, loss = 0.03778156
Iteration 32, los

  y = column_or_1d(y, warn=True)


Iteration 1, loss = 0.63928869
Iteration 2, loss = 0.56875593
Iteration 3, loss = 0.53989203
Iteration 4, loss = 0.50830801
Iteration 5, loss = 0.47499848
Iteration 6, loss = 0.43797035
Iteration 7, loss = 0.39809673
Iteration 8, loss = 0.35638179
Iteration 9, loss = 0.31452380
Iteration 10, loss = 0.27455982
Iteration 11, loss = 0.23824836
Iteration 12, loss = 0.20633821
Iteration 13, loss = 0.17899585
Iteration 14, loss = 0.15586742
Iteration 15, loss = 0.13647761
Iteration 16, loss = 0.12025210
Iteration 17, loss = 0.10643300
Iteration 18, loss = 0.09493585
Iteration 19, loss = 0.08515730
Iteration 20, loss = 0.07692992
Iteration 21, loss = 0.06978237
Iteration 22, loss = 0.06374083
Iteration 23, loss = 0.05849763
Iteration 24, loss = 0.05388716
Iteration 25, loss = 0.04993057
Iteration 26, loss = 0.04638532
Iteration 27, loss = 0.04329571
Iteration 28, loss = 0.04053447
Iteration 29, loss = 0.03808286
Iteration 30, loss = 0.03589122
Iteration 31, loss = 0.03392767
Iteration 32, los

modello migliore LogisticRegression in quanto:
F1 Score medio LogisticRegression: 0.9705 > F1 Score medio Perceptron: 0.9700 

### Risultati Spam Detection

In [221]:
# Salva gli indici originali del test set
original_indices = X_test.index

#Miglior modello trovato
clf = LogisticRegression()
clf.fit(training_data_cleaned, y_train)
test_scores = f1_score(y_test, clf.predict(test_data_cleaned))
y_pred = clf.predict(test_data_cleaned)
print(f"F1 Score sul test set: {test_scores}")

# Crea un dataframe con gli indici originali e le previsioni
predictions_with_index = pd.DataFrame({
    'index': original_indices,
    'prediction': y_pred
})

# Risultati
predictions_with_index.head(10) # Visualizza le prime righe con gli indici originali e le previsioni

  y = column_or_1d(y, warn=True)


F1 Score sul test set: 0.9704840613931524


Unnamed: 0,index,prediction
0,3252,0
1,3177,0
2,629,0
3,4155,0
4,980,0
5,811,0
6,4022,1
7,3734,0
8,1454,0
9,3782,0


In [222]:
predictions_with_index = predictions_with_index[predictions_with_index['prediction'] == 1]
spam_df = predictions_with_index.merge(X_test, left_on='index', right_index=True)

In [223]:
spam_df #db con testo originale e previsione se spam o meno

Unnamed: 0,index,prediction,text
6,4022,1,Subject: all me ^ ds here paliourg\nuser id : ...
11,5015,1,Subject: highest gains without guesswork\nwysa...
14,229,1,Subject: registration welcome email\nthank you...
35,1550,1,Subject: better than viagra\nhi !\nwe have a n...
36,4998,1,Subject: sparkasse security issue\nsehr geehrt...
...,...,...,...
1484,2661,1,Subject: re : encrypted mail\nfor more details...
1491,4571,1,Subject: hi paliourg get all pills . everythin...
1492,1159,1,Subject: welcome to the next wave in digital m...
1493,2097,1,"Subject: patricia , unthinking respect for aut..."


### Individuazione topic principali

In [193]:
#Funzioni di pulizia e preparazione
def sent_to_words(items):
    for item in items:
        # deacc=True removes punctuations
        yield(gensim.utils.simple_preprocess(str(item), deacc=True))

def remove_stopwords(texts):
    return [[word for word in words if word not in stop_words and len(word) >3] for words in texts] 

stop_words = stopwords.words('english')
stop_words.extend(['subject', 'please', 'hello', 'thanks', 'http'])

data_words = list(sent_to_words(spam_df.text))
data_words = remove_stopwords(data_words)

In [194]:
len(data_words)

429

In [195]:
# Crea dizionario
id2word = corpora.Dictionary(data_words)
id2word.filter_extremes(no_below=2, no_above=0.2) #parola deve essere presente in almeno 2 mail e non presente in oltre il 20% del dataset
# Term Document Frequency
corpus = [id2word.doc2bow(i) for i in data_words]

In [205]:
num_topics = 5

# LDA model
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics,
                                       passes = 5)
pprint(lda_model.print_topics(num_words=5))
doc_lda = lda_model[corpus]

#Identificare le parole singole senza frequenza
topics_keywords = []
for topic_id, topic in lda_model.print_topics(num_topics=num_topics, num_words=5):
    keywords = [word.split('*')[1].strip().replace('"', '') for word in topic.split('+')]
    topics_keywords.append(keywords)

doc_lda = lda_model[corpus]

[(0,
  '0.018*"company" + 0.014*"statements" + 0.009*"information" + '
  '0.008*"securities" + 0.008*"within"'),
 (1,
  '0.006*"account" + 0.005*"price" + 0.005*"online" + 0.005*"microsoft" + '
  '0.005*"windows"'),
 (2,
  '0.053*"nbsp" + 0.006*"email" + 0.005*"million" + 0.005*"company" + '
  '0.004*"voip"'),
 (3,
  '0.016*"font" + 0.012*"pills" + 0.009*"color" + 0.007*"align" + '
  '0.007*"size"'),
 (4,
  '0.009*"computron" + 0.008*"email" + 0.007*"free" + 0.007*"contact" + '
  '0.006*"epson"')]


In [206]:
topics_keywords #5 topic su cui vedere eterogeneità

[['company', 'statements', 'information', 'securities', 'within'],
 ['account', 'price', 'online', 'microsoft', 'windows'],
 ['nbsp', 'email', 'million', 'company', 'voip'],
 ['font', 'pills', 'color', 'align', 'size'],
 ['computron', 'email', 'free', 'contact', 'epson']]

### Test su spam nuovi (3 esempi)

In [207]:
unseen_document1 = "CONGRATULATION! Dear Email Owner: We happily announced that your E-mail Address has been selected among the winners of the Mercedes Benz International Online Lottery Draw 2024 promo. You are now a winner of a brand new Mercedes Benz c300 4matic 2024 and the grand prize of $1,500,000.00 USD. For easy claim of your winnings prize, you are simply advised to contact our Fiduciary Claim Agent in Texas Mr Thomas Headley ( thomasheadley208@gmail.com ) with the below details as stated. NAME: CONTACT EMAIL ADDRESS: HOME ADDRESS: PHONE NUMBER: OCCUPATION: All necessary information on what to do in receiving your winnings will be treated by our claim agent once contacted by you and Contact him by providing him with your Mercedes Benz Online Lottery Draw Reference Claim Code:(W70902039) As the subject of your email for swift response. NB : If you received this message in your SPAM/BULK folder, that is because of the restrictions implemented by your Internet Service Provider, we (MERCEDES GROUP) urge you to treat it genuinely .Delivery fee is mandatory and at your cost in claiming your win."
title = id2word.doc2bow(simple_preprocess(unseen_document1))
for index, score in lda_model[title]:
    print("TOPIC: "+str(index))
    print("SCORE:"+str(score))

TOPIC: 0
SCORE:0.09587545
TOPIC: 1
SCORE:0.35417044
TOPIC: 2
SCORE:0.053253103
TOPIC: 3
SCORE:0.3037603
TOPIC: 4
SCORE:0.19294067


In [208]:
unseen_document2 = "Customer Satisfaction Survey Thank you for signing up for a Wand AI account! We value your opinion and would like to learn more about your experience. https://campaign-image.com/zohocampaigns/174213000014649006_zc_v14_1618553690304_april16_2021_img_01.png Please take a minute to respond to this short survey.To thank you for your time, we will enter you into a draw to win a $100 Amazon gift card.Share your feedback >We will announce the winner by Oct 31st. Best regards,Wand AI team"
title = id2word.doc2bow(simple_preprocess(unseen_document2))
for index, score in lda_model[title]:
    print("TOPIC: "+str(index))
    print("SCORE:"+str(score))

TOPIC: 0
SCORE:0.124330215
TOPIC: 1
SCORE:0.6657958
TOPIC: 3
SCORE:0.1979643


In [209]:
unseen_document3 = "Stock up on our best-selling products Sustainable Hoodie - Adventures Ahead Sustainable Hoodie - Adventures Ahead £45.00 Sustainable Hoodie - Boat Life Sustainable Hoodie - Boat Life £45.00 Sustainable Hoodie - Bonfire Scene Sustainable Hoodie - Bonfire Scene £45.00 Sustainable Hoodie - Find Your Wild Sustainable Hoodie - Find Your Wild £45.00"
title = id2word.doc2bow(simple_preprocess(unseen_document3))
for index, score in lda_model[title]:
    print("TOPIC: "+str(index))
    print("SCORE:"+str(score))

TOPIC: 0
SCORE:0.44893363
TOPIC: 1
SCORE:0.5101302
TOPIC: 2
SCORE:0.013429614
TOPIC: 3
SCORE:0.013789283
TOPIC: 4
SCORE:0.013717208


### Calcolo distanza semantica tra i topic

In [89]:
#Scelta modello pre addestrato
print(list(gensim.downloader.info()['models'].keys())) 

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [71]:
vectors_model_1 = gensim.downloader.load('fasttext-wiki-news-subwords-300')
# vectors_model_2 = gensim.downloader.load('conceptnet-numberbatch-17-06-300')
# vectors_model_3 = gensim.downloader.load('word2vec-ruscorpora-300')
# vectors_model_4 = gensim.downloader.load('word2vec-google-news-300')
# vectors_model_5 = gensim.downloader.load('glove-wiki-gigaword-300')

In [210]:
#Definizione vettore
def avg_vector(model, topics_keywords):
    to_remove = 0
    vector = np.zeros(300)
    for word in topics_keywords:
        if word in model.key_to_index.keys():
            vector += model.get_vector(word)
        else:
            to_remove += 1
    if len(topics_keywords)== to_remove:
        return np.zeros(300)
        
    return vector/(len(topics_keywords)-to_remove)

In [211]:
vectors = []

for topic in topics_keywords:
    vectors.append(avg_vector(vectors_model_1,topic))

vectors

In [212]:
print(1 - spatial.distance.cosine(vectors[0], vectors[1]))
print(1 - spatial.distance.cosine(vectors[0], vectors[2]))
print(1 - spatial.distance.cosine(vectors[0], vectors[3]))
print(1 - spatial.distance.cosine(vectors[0], vectors[4]))
print(1 - spatial.distance.cosine(vectors[1], vectors[2]))
print(1 - spatial.distance.cosine(vectors[1], vectors[3]))
print(1 - spatial.distance.cosine(vectors[1], vectors[4]))
print(1 - spatial.distance.cosine(vectors[2], vectors[3]))
print(1 - spatial.distance.cosine(vectors[2], vectors[4]))
print(1 - spatial.distance.cosine(vectors[3], vectors[4]))

0.7042992042381923
0.582014002079258
0.6192279990372652
0.6087980974731353
0.6312986832418708
0.6567055594573727
0.7278595241006097
0.5049273481922427
0.6773585883638853
0.5498630079922815


I topic risultano avere un grado di similarità non troppo basso in quanto si riferiscono comunque a mail e a messaggi di spam e non troppo alto (<75%)

### Estrarre organizzazioni da non spam

In [271]:
#Recupero le mail no spam dal test set
predictions_with_index = pd.DataFrame({
    'index': original_indices,
    'prediction': y_pred
})
predictions_with_index = predictions_with_index[predictions_with_index['prediction'] == 0]
no_spam_df = predictions_with_index.merge(X_test, left_on='index', right_index=True)
no_spam_df

Unnamed: 0,index,prediction,text
0,3252,0,Subject: hpl / conoco - teco waha 03 / 23 / 01...
1,3177,0,Subject: holiday on - call data\npipeline cont...
2,629,0,Subject: gas day 2 / 08 / 01\nwe agree :\nteco...
3,4155,0,Subject: sarco lateral and crow o ' connor met...
4,980,0,Subject: natural gas nomination for 09 / 00\ne...
...,...,...,...
1489,2871,0,Subject: holiday invitation\nplease click on t...
1490,3646,0,Subject: tenaska iv 8 / 01\ndarren :\nplease z...
1494,44,0,Subject: re : potential volume list for march ...
1495,1776,0,"Subject: revision # 1 - hpl nom for sept . 8 ,..."


In [272]:
# Set di stopwords per ricerca veloce
english_stopwords = set(stop_words)

#Pulizia a posteriori
def clean_organizations(org_list):
    """
    Filtra e pulisce ulteriormente le organizzazioni estratte.
    Rimuove entità con numeri, entità troppo lunghe e altre anomalie.
    """
    cleaned_orgs = []
    for org in org_list:
        if not any(char.isdigit() for char in org) and len(org.split()) < 5:
            cleaned_orgs.append(org.strip())
    return cleaned_orgs

In [273]:
no_spam_df_cleaned = data_cleaner(no_spam_df['text'])
# type(no_spam_df_cleaned)

In [274]:
no_spam_df_cleaned

['hpl conoco teco waha    purchase daren conoco invoice hpl     pgev waha deal ticket  show   confirm price thank',
 'holiday call data pipeline contact phone fax pager black marlin blair lichentwalter       h    debbie thompson       nom due today  rd  th channel jim tobacco    gas control     open thursday nom due monday centana william spekels       donna spencer    gas control     nom due today  rd  th duke energy annette anderson       call bob moseman    thursday open tomorrow nom due thru  th lonestar gary gafford       gas control     nom due today  rd thru  th northern natural ben markey   cell       call charlie mosey   gas control  open thursday nom due thru  th east trans east texas tejas gas control    paula svehla    mickey chapman    open thursday nom due thru  th midcon  k ken nachlinger          call steven     k    gas control    nom due today  rd thru  th moss bluff current business',
 'gas day    agree teco tap nom   actual   forward melissa jones texas utility     

In [275]:
nlp = spacy.load('en_core_web_lg')

organizations = []

# Estrazione delle organizzazioni dalle email 
for email in no_spam_df_cleaned:  
    doc = nlp(email) 
    for ent in doc.ents:
        if ent.label_ == "ORG": 
            organizations.append(ent.text)
cleaned_organizations = clean_organizations(organizations)
# Conta la frequenza 
org_freq = Counter(cleaned_organizations)

org_df = pd.DataFrame(org_freq.items(), columns=['Organization', 'Frequency'])




In [280]:
org_df.sort_values(by = 'Frequency', ascending = False)

Unnamed: 0,Organization,Frequency
32,enron,203
7,xls,125
3,teco,101
10,sitara,49
15,daren j,40
...,...,...
401,dbq,1
402,nom ce,1
403,srm,1
404,nom party,1
