# TF-IDF

In [1]:
# pip install "numpy<2" --upgrade

In [2]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

In [3]:
# Relative location of the spam dataset CSV read by the next cell; a relative
# path is resolved against the notebook's current working directory.
path = '../All_data_sets/nlp/spam.csv'

In [4]:
# Load the raw CSV with explicit column names: the label, the message body,
# and three placeholder columns ('0', '1', '2').
# NOTE(review): because `names` is supplied without `header=0`, the file's own
# header line ('v1', 'v2') is read as an ordinary data row (row 0 in the preview).
df = pd.read_csv(
    path,
    encoding='iso-8859-1',
    names=['target', 'text', '0', '1', '2'],
)
df.head()

Unnamed: 0,target,text,0,1,2
0,v1,v2,,,
1,ham,"Go until jurong point, crazy.. Available only ...",,,
2,ham,Ok lar... Joking wif u oni...,,,
3,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
4,ham,U dun say so early hor... U c already then say...,,,


In [5]:
df.shape

(5573, 5)

In [6]:
drop_columns = ['0', '1', '2']

In [7]:
# Drop the placeholder columns by reassignment instead of `inplace=True`, and
# tolerate columns that are already gone, so re-running this cell does not
# raise a KeyError.
df = df.drop(columns=drop_columns, errors='ignore')

In [8]:
df.head()

Unnamed: 0,target,text
0,v1,v2
1,ham,"Go until jurong point, crazy.. Available only ..."
2,ham,Ok lar... Joking wif u oni...
3,spam,Free entry in 2 a wkly comp to win FA Cup fina...
4,ham,U dun say so early hor... U c already then say...


In [9]:
# The CSV's own header line ('v1', 'v2') was loaded as a data row. Remove it by
# value rather than by position: a positional `iloc[1:]` would discard one more
# real message every time this cell is re-run.
is_header_row = (df['target'] == 'v1') & (df['text'] == 'v2')

# `.copy()` gives an independent frame, so adding columns later does not write
# into a slice of the original.
df = df.loc[~is_header_row].copy()

df

Unnamed: 0,target,text
1,ham,"Go until jurong point, crazy.. Available only ..."
2,ham,Ok lar... Joking wif u oni...
3,spam,Free entry in 2 a wkly comp to win FA Cup fina...
4,ham,U dun say so early hor... U c already then say...
5,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5568,spam,This is the 2nd time we have tried 2 contact u...
5569,ham,Will Ì_ b going to esplanade fr home?
5570,ham,"Pity, * was in mood for that. So...any other s..."
5571,ham,The guy did some bitching but I acted like i'd...


## Data cleaning and preprocessing

In [10]:
import nltk
import re

# remove the stop words
from nltk.corpus import stopwords

# reduce the words to its root word
from nltk.stem import WordNetLemmatizer

In [11]:
lemma = WordNetLemmatizer()

In [12]:
# NLTK corpora are not bundled with the package. If the stop-word list is
# missing locally, NLTK raises LookupError; download it once and retry so the
# notebook also runs on a fresh environment.
try:
    stop_words = stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    stop_words = stopwords.words('english')

In [13]:
def preprocessing( message ):
    """Normalise one raw message into a space-joined string of lemmatised tokens.

    Steps: replace every character outside ``a-zA-Z`` with a space, lower-case
    the text, split on whitespace, drop tokens found in the module-level
    ``stop_words``, and pass each remaining token through ``lemma.lemmatize``.

    Parameters
    ----------
    message : str
        Raw message text. Non-string values (e.g. ``None`` or a NaN coming
        from a missing CSV cell) are treated as an empty message.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; ``''`` when nothing is left.
    """
    # missing values cannot be handed to re.sub; treat them as empty text
    if not isinstance(message, str):
        return ''

    # build the lookup set once per call instead of once per token
    stop_set = set(stop_words)

    # keep letters only, lower-case, then tokenise on whitespace
    tokens = re.sub('[^a-zA-Z]', ' ', message).lower().split()

    # remove stop words and lemmatise (not stem) what remains
    kept = [lemma.lemmatize(token) for token in tokens if token not in stop_set]

    # join the tokens back into a single sentence
    return ' '.join(kept)

In [14]:
# Clean every message. The function is passed directly: wrapping it in a
# lambda only adds an extra call layer per row.
df['cleaned_text'] = df['text'].apply(preprocessing)

In [15]:
df

Unnamed: 0,target,text,cleaned_text
1,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
2,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
3,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry wkly comp win fa cup final tkts st ...
4,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say
5,ham,"Nah I don't think he goes to usf, he lives aro...",nah think go usf life around though
...,...,...,...
5568,spam,This is the 2nd time we have tried 2 contact u...,nd time tried contact u u pound prize claim ea...
5569,ham,Will Ì_ b going to esplanade fr home?,b going esplanade fr home
5570,ham,"Pity, * was in mood for that. So...any other s...",pity mood suggestion
5571,ham,The guy did some bitching but I acted like i'd...,guy bitching acted like interested buying some...


In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

### For the top 200 features/words

In [17]:
# Cap the vocabulary at 200 features, fit on the cleaned messages, and turn the
# resulting sparse TF-IDF matrix into a dense NumPy array.
tfidf = TfidfVectorizer(max_features=200)
X = tfidf.fit_transform(df['cleaned_text'])
X = X.toarray()

In [18]:
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [19]:
X[0]

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.43445002, 0.        , 0.        , 0.        , 0.        ,
       0.46150602, 0.54380143, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     

In [20]:
tfidf.get_feature_names_out()

array(['already', 'also', 'always', 'amp', 'anything', 'around', 'ask',
       'babe', 'back', 'best', 'bit', 'box', 'buy', 'call', 'cant', 'car',
       'care', 'cash', 'chat', 'claim', 'class', 'co', 'com', 'come',
       'coming', 'contact', 'cost', 'could', 'customer', 'da', 'day',
       'dear', 'done', 'dont', 'draw', 'dun', 'end', 'even', 'every',
       'feel', 'find', 'fine', 'finish', 'first', 'free', 'friend', 'get',
       'getting', 'girl', 'give', 'go', 'god', 'going', 'gonna', 'good',
       'got', 'great', 'gt', 'guaranteed', 'gud', 'guy', 'haha', 'happy',
       'heart', 'hello', 'help', 'hey', 'hi', 'holiday', 'home', 'hope',
       'hour', 'hr', 'im', 'job', 'keep', 'know', 'last', 'late', 'later',
       'leave', 'let', 'life', 'like', 'line', 'live', 'lol', 'lor',
       'lot', 'love', 'lt', 'lunch', 'make', 'man', 'many', 'may', 'mean',
       'meet', 'message', 'min', 'minute', 'miss', 'mobile', 'money',
       'month', 'morning', 'msg', 'much', 'name', 'need', '

In [21]:
len(tfidf.get_feature_names_out())

200

### For n-grams (bigrams only)

In [22]:
# Rebuild the vectoriser on bigrams only (ngram_range=(2, 2)), again capped at
# 200 features. This rebinds both `tfidf` and `X`, replacing the unigram
# versions created earlier; the cells below therefore work on bigram features.
tfidf = TfidfVectorizer(ngram_range=(2, 2), max_features=200)
X = tfidf.fit_transform(df['cleaned_text'])
X = X.toarray()

In [23]:
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [24]:
X[0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [25]:
len(tfidf.get_feature_names_out())

200

In [26]:
tfidf.get_feature_names_out()

array(['account statement', 'across sea', 'anytime network',
       'attempt contact', 'await collection', 'awarded bonus',
       'bonus caller', 'bt national', 'call claim', 'call customer',
       'call free', 'call identifier', 'call land', 'call landline',
       'call later', 'call min', 'call mobile', 'call mobileupd',
       'call optout', 'call per', 'call reply', 'caller prize',
       'camcorder reply', 'camera phone', 'cant pick', 'cash await',
       'cash prize', 'chance win', 'claim call', 'claim code', 'claim ur',
       'claim valid', 'co uk', 'code expires', 'collection sae',
       'come back', 'come home', 'come tomorrow', 'contact today',
       'customer service', 'dating service', 'decimal gt', 'dont know',
       'double min', 'draw show', 'draw txt', 'dun wan', 'easy call',
       'entry weekly', 'every week', 'every wk', 'feel like',
       'first time', 'free call', 'free entry', 'free st', 'free text',
       'fullonsms com', 'get back', 'getzed co', 'gift v

In [27]:
y = df['target']

In [28]:
from sklearn.model_selection import train_test_split

In [29]:
# Hold out 25% of the rows for evaluation; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [30]:
from sklearn.neighbors import KNeighborsClassifier

In [31]:
knn = KNeighborsClassifier()

In [32]:
knn.fit( X_train,y_train )

0,1,2
,n_neighbors,5
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


In [33]:
y_pred = knn.predict( X_test )

In [34]:
y_pred_train = knn.predict(X_train)

In [35]:
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report

## Training dataset report

In [36]:
confusion_matrix(y_train , y_pred_train)

array([[3614,    9],
       [ 168,  388]], dtype=int64)

In [37]:
print( classification_report(y_train , y_pred_train) )

              precision    recall  f1-score   support

         ham       0.96      1.00      0.98      3623
        spam       0.98      0.70      0.81       556

    accuracy                           0.96      4179
   macro avg       0.97      0.85      0.90      4179
weighted avg       0.96      0.96      0.95      4179



In [38]:
accuracy_score( y_train , y_pred_train )

0.9576453697056713

## Test dataset report

In [39]:
confusion_matrix(y_test , y_pred)

array([[1195,    7],
       [  68,  123]], dtype=int64)

In [40]:
print( classification_report(y_test , y_pred) )

              precision    recall  f1-score   support

         ham       0.95      0.99      0.97      1202
        spam       0.95      0.64      0.77       191

    accuracy                           0.95      1393
   macro avg       0.95      0.82      0.87      1393
weighted avg       0.95      0.95      0.94      1393



In [41]:
accuracy_score(y_test , y_pred)

0.9461593682699211