In [53]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
import matplotlib.pyplot as plt
import seaborn as sn

In [45]:
PATH = r'C:\Users\maxje\OneDrive\Pulpit\python\zad nlp\SMSSpamCollection.txt'

columnes = ['Category', 'Text']
df = pd.read_csv(PATH, sep='\t', names=columnes, header=None)
df.dtypes

Category    object
Text        object
dtype: object

# Data cleaning

In [46]:
def clean_text(x):
    x = x.lower()
    x = x.replace('.','')
    x = x.replace(',','')
    x = x.replace(':','')
    x = x.replace(';','')
    x = x.replace('!','')
    x = x.replace('?','')
    x = x.replace('<br>','')
    x = x.replace('<br />','')
    x = ' '.join([word for word in x.split() if word.isalpha()])
    return x

def binary(y):
    y = str(y)
    if y == "spam":
        return 1
    else:
        return 0
    
df['Text'] = df['Text'].apply(clean_text)
df['Category'] = df['Category'].apply(binary)

df

Unnamed: 0,Category,Text
0,0,go until jurong point crazy available only in ...
1,0,ok lar joking wif u oni
2,1,free entry in a wkly comp to win fa cup final ...
3,0,u dun say so early hor u c already then say
4,0,nah i think he goes to usf he lives around her...
...,...,...
5567,1,this is the time we have tried contact u u hav...
5568,0,will ü b going to esplanade fr home
5569,0,pity was in mood for that soany other suggestions
5570,0,the guy did some bitching but i acted like be ...


# Model learning

In [47]:
tfidf = TfidfVectorizer(min_df=10)
inputs= tfidf.fit_transform(df['Text'])
target = df['Category']

train_input,test_input,train_target,test_target = train_test_split(inputs,target, test_size=0.25)

In [48]:
model = LogisticRegression()
model.fit(train_input,train_target)

LogisticRegression()

# Results

In [49]:
predictions = model.predict(test_input)

print("Is spam precision:",precision_score(test_target,predictions))
print("Is spam recall:",recall_score(test_target,predictions))
print("Not spam precision:",precision_score(test_target,predictions,pos_label = 0))
print("Not spam recall:",recall_score(test_target,predictions,pos_label = 0))
print("Classification accuracy:",accuracy_score(test_target,predictions))

Is spam precision: 0.9933333333333333
Is spam recall: 0.8010752688172043
Not spam precision: 0.9702333065164923
Not spam recall: 0.9991714995857498
Classification accuracy: 0.9727207465900933


**10 most positive sentiment words**

In [51]:
weights = list(zip(tfidf.get_feature_names(),model.coef_[0]))

weights.sort(key = lambda x:x[1])
print(weights[:10])

[('me', -2.246574617629148), ('my', -1.9688982986576993), ('but', -1.5062502125172539), ('come', -1.4783813411534878), ('later', -1.447887375296455), ('ok', -1.3432156846093644), ('that', -1.244549613259049), ('do', -1.1706469937576673), ('am', -1.1181200614176106), ('sorry', -1.1057575343467512)]


**10 most negative sentiment words**

In [52]:
weights.sort(key = lambda x:x[1],reverse = True)
print(weights[:10])

[('txt', 4.884225139101268), ('call', 4.161147239559108), ('text', 3.5544625324746546), ('stop', 3.1960449961816697), ('free', 3.1215738181762855), ('to', 3.0488898045629114), ('reply', 3.0191690475771047), ('claim', 2.947874350925159), ('mobile', 2.799268192017337), ('from', 2.635901672370731)]
