In [97]:
import pandas as pd
import numpy as np
# Dataset from - https://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection
df = pd.read_table('smsspamcollection/SMSSpamCollection',
                   sep='\t', 
                   header=None, 
                   names=['label', 'sms_message'])
# imprima las primeras 5 filas
df.head()
#Cambiamos las etiquetas texutales por etiquetas numericas, esto es una buena practica cuando se construyen modelo supervisados
df['label'] = df.label.map({'ham':0, 'spam':1})
print(df.shape)
df.head() # returns (rows, columns)

(5572, 2)


Unnamed: 0,label,sms_message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [113]:
#Cuenta cuantas veces la palabra es spam o ham en todo el documento
from sklearn.feature_extraction.text import CountVectorizer
count_vector = CountVectorizer(stop_words="english")

datos={"contSpam":[],"contHam":[]}
mapa=dict() #Mapa para guardar el index de la palabra que va en datos

tam=len(df)
tam75=int(tam*0.75) 

totalSpam=0
totalHam=0
for i in range(tam75):
    valSpam=0
    valHam=0
    if( df["label"][i]==1):
        valSpam=1
        totalSpam+=1
    else:
        valHam=1
        totalHam+=1 
    listWord=[]
    
    try: 
        count_vector.fit([df["sms_message"][i]])
        listWord=count_vector.get_feature_names()
    except ValueError:
        rellenar=0
        
    for word in listWord:
        if word in mapa:
            #Verifica para que la palabra tomada no este más de 1 vez en el documento
            if i not in mapa[word]["docs"]:
                mapa[word]["docs"][i]=1                
                pos=mapa[word]["pos"]
                datos["contSpam"][pos]+=valSpam
                datos["contHam"][pos]+=valHam
        else:
            pos=len(datos["contSpam"])
            mapa[word]={"pos":pos,"docs":{i:1}}
            datos["contSpam"].append(valSpam)
            datos["contHam"].append(valHam)

#Agregar número total de documentos de stop words y demás
mapa["TOTAL DOC"]=len(datos["contSpam"])
datos["contSpam"].append(totalSpam)
datos["contHam"].append(totalHam)             
newDf = pd.DataFrame(datos, columns= ['contSpam', 'contHam'], index=list(mapa))
newDf.index.name="word"
newDf.to_csv (r'conteoSpamHam.csv', index = True, header=True) #Don't forget to add '.csv' at the end of the path
newDf

Unnamed: 0_level_0,contSpam,contHam
word,Unnamed: 1_level_1,Unnamed: 2_level_1
amore,0,1
available,2,13
buffet,0,2
bugis,0,6
cine,0,7
...,...,...
florida,0,1
hidden,0,1
royal,0,1
tog,0,1


In [114]:
#Se crea el archivo probabilidades de que la palabra sea spam o ham
tamano=len(newDf)  
newDf['contSpam'] = newDf['contSpam'].astype(float)
newDf['contHam'] = newDf['contHam'].astype(float)

for indice_fila,fila  in newDf.iterrows():
    if(indice_fila!="TOTAL DOC"):
        newDf["contSpam"][indice_fila]=newDf["contSpam"][indice_fila]/newDf["contSpam"]["TOTAL DOC"]
        newDf["contHam"][indice_fila]=newDf["contHam"][indice_fila]/newDf["contHam"]["TOTAL DOC"]

totalDocs=newDf["contHam"]["TOTAL DOC"]+newDf["contSpam"]["TOTAL DOC"]
newDf["contHam"]["TOTAL DOC"]=newDf["contHam"]["TOTAL DOC"]/totalDocs
newDf["contSpam"]["TOTAL DOC"]=newDf["contSpam"]["TOTAL DOC"]/totalDocs
newDf.rename(columns={"contSpam": "probSpam", "contHam": "probHam"},inplace = True) 
newDf.to_csv (r'probSpamHam.csv',index = True, header=True) #Don't forget to add '.csv' at the end of the path

In [115]:
probDataset=pd.read_csv("probSpamHam.csv",index_col="word") 
probDataset

Unnamed: 0_level_0,probSpam,probHam
word,Unnamed: 1_level_1,Unnamed: 2_level_1
amore,0.00000,0.000277
available,0.00354,0.003597
buffet,0.00000,0.000553
bugis,0.00000,0.001660
cine,0.00000,0.001937
...,...,...
florida,0.00000,0.000277
hidden,0.00000,0.000277
royal,0.00000,0.000277
tog,0.00000,0.000277


In [117]:
#Matriz confusion  row=(Spam,Ham) col=(Spam,Ham)
matrizConfusion=[[0, 0],[0 ,0]]
for i in range(tam75,tam):
    is_Spam=False
    is_Ham=False
    if( df["label"][i]==1):
        is_Spam=True 
    else:
        is_Ham=True
    
    listWord=[]
    try: 
        count_vector.fit([df["sms_message"][i]])
        listWord=count_vector.get_feature_names()
    except ValueError:
        rellenar=0
    pSpam=None
    pHam=None
    for word in listWord:
        if word in probDataset["probSpam"]:
            if pSpam == None:
                pSpam=probDataset["probSpam"][word]
                pHam=probDataset["probHam"][word]
            else:
                pSpam*=probDataset["probSpam"][word]
                pHam*=probDataset["probHam"][word]
                
    if pSpam==None:
        pSpam=0
        pHam=0
    
   
    if pSpam != pHam:
        pSpam*=probDataset["probSpam"]["TOTAL DOC"]
        pHam*=probDataset["probHam"]["TOTAL DOC"]
        if pSpam>pHam:
            if is_Spam:
                matrizConfusion[0][0]+=1
            else:
                matrizConfusion[0][1]+=1
        else:
            if is_Ham:
                matrizConfusion[1][1]+=1
            else:
                matrizConfusion[1][0]+=1
    else:
        if is_Spam:
            matrizConfusion[0][0]+=1
        else:
            matrizConfusion[1][1]+=1
matrizConfusion    

[[173, 17], [9, 1194]]

In [118]:
dfMatrizConfunsion= pd.DataFrame(matrizConfusion , columns= ['Spam', 'Ham'], index=["Spam","Ham"])
dfMatrizConfunsion

Unnamed: 0,Spam,Ham
Spam,173,17
Ham,9,1194


In [121]:
TP=matrizConfusion[0][0]
FN=matrizConfusion[0][1]
FP=matrizConfusion[1][0]
TN=matrizConfusion[1][1]

aS_IA=0.9885139985642498
p_IA=0.9720670391061452
r_IA=0.9405405405405406
F1_IA=0.9560439560439562

aS=(TP+TN)/(TP+TN+FP+FN)
r=(TP)/(TP+FN)
p=(TP)/(TP+FP)
F1=(2*recall*precision)/(recall+precision)

matrizFinal=[[aS,aS_IA, abs(aS-aS_IA) ],[p,p_IA,abs(p-p_IA)],[r,r_IA,abs(r-r_IA)],[F1,F1_IA,abs(F1-F1_IA)]]
dfMatrizFinal= pd.DataFrame(matrizFinal , columns= ['Hecho por mi', 'scikit-learn',"Diferencia"], index=["Accurancy Score","Precision Score","Recall Score","F1 Score"])
print("Conclusión: La diferencia con respecto al F1 Score dio 2%, esto quiere decir que la eficiencia de scikit-learn es mucho mejor prediciendo debido a principalmente lo siguiente: ",
      "\n hubo un 3% de diferencia con respecto al Recall queriendo decir que no se trajo todos los relevantes y un 2% en la precisión. "
      ,"\n "
      ,"1. Cuando la probabilidad era igual a que  fuera SPAM o HAM ")
dfMatrizFinal

Conclusión: La diferencia con respecto al F1 Score dio 2%, esto quiere decir que la eficiencia de scikit-learn es mucho mejor prediciendo debido a principalmente lo siguiente:  
 hubo un 3% de diferencia con respecto al Recall queriendose decir que no se trajo todos los relevantes y un 2% en la precisión.  
  1. Cuando la probabilidad era igual a que  fuera SPAM o HAM 


Unnamed: 0,Hecho por mi,scikit-learn,Diferencia
Accurancy Score,0.981335,0.988514,0.007179
Precision Score,0.950549,0.972067,0.021518
Recall Score,0.910526,0.940541,0.030014
F1 Score,0.930108,0.956044,0.025936


In [103]:
print(tam75)

4179
