In [24]:
import pandas as pd

In [25]:
ds=pd.read_csv(r'D:\Road to ML\datasets\SMS Spam Dataset.csv')

In [26]:
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

def Clean_text(text):
    # Convert text to lowercase
    text = text.lower()
    
    # Remove non-alphabetic characters except spaces
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Remove URLs (if starting with 'http')
    text = re.sub(r'http\S+', '', text)
    
    # Tokenize the text
    words = word_tokenize(text)

    stopword=set(stopwords.words('english'))
    filtered_words = [word for word in words if word not in stopword] 
    stemmer= PorterStemmer()
    stemmed_words= [stemmer.stem(word) for word in filtered_words]

    Clean_text_ = ' '.join(stemmed_words)
    
    return Clean_text_

# Example usage
text = r"This is my new skill eating 123 @%^^% https\qasda.com"
print(Clean_text(text))


new skill eat


In [27]:
ds['Clean_text']= ds['sms'].apply(lambda x: Clean_text(x))

In [28]:
ds.head()

Unnamed: 0,sms,label,Clean_text
0,"Go until jurong point, crazy.. Available only ...",0,go jurong point crazi avail bugi n great world...
1,Ok lar... Joking wif u oni...\n,0,ok lar joke wif u oni
2,Free entry in 2 a wkly comp to win FA Cup fina...,1,free entri wkli comp win fa cup final tkt st m...
3,U dun say so early hor... U c already then say...,0,u dun say earli hor u c alreadi say
4,"Nah I don't think he goes to usf, he lives aro...",0,nah dont think goe usf live around though


In [29]:
ds['label'].value_counts()

label
0    4827
1     747
Name: count, dtype: int64

In [30]:
from imblearn.over_sampling import RandomOverSampler

x=ds.drop('label',axis=1)
y=ds['label']
OS= RandomOverSampler()
x_resample,y_resample= OS.fit_resample(x,y)

ds_balanced=pd.DataFrame(x_resample,columns=x.columns)
ds_balanced['label']=y_resample

ds_balanced['label'].value_counts()

label
0    4827
1    4827
Name: count, dtype: int64

In [31]:
from sklearn.model_selection  import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

x_train,x_test,y_train,y_test=train_test_split(ds_balanced['Clean_text'],ds_balanced['label'],test_size=0.2,random_state=42)

tfid= TfidfVectorizer()
x_train_tfidf=tfid.fit_transform(x_train)
x_test_tfidf=tfid.transform(x_test)

In [32]:
x_train_tfidf.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [33]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,classification_report

rf=RandomForestClassifier(random_state=42)
rf.fit(x_train_tfidf,y_train)
y_rf_pred=rf.predict(x_test_tfidf)

CM=confusion_matrix(y_test,y_rf_pred)
CR=classification_report(y_test,y_rf_pred)

print('Confusion Matrix')
print(CM)
print('Classification Report')
print(CR)

Confusion Matrix
[[992   2]
 [  5 932]]
Classification Report
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       994
           1       1.00      0.99      1.00       937

    accuracy                           1.00      1931
   macro avg       1.00      1.00      1.00      1931
weighted avg       1.00      1.00      1.00      1931



In [34]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,classification_report

lr=LogisticRegression()
lr.fit(x_train_tfidf,y_train)
y_lr_pred=lr.predict(x_test_tfidf)

CM=confusion_matrix(y_test,y_lr_pred)
CR=classification_report(y_test,y_lr_pred)

print('Confusion Matrix')
print(CM)
print('Classification Report')
print(CR)

Confusion Matrix
[[982  12]
 [ 11 926]]
Classification Report
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       994
           1       0.99      0.99      0.99       937

    accuracy                           0.99      1931
   macro avg       0.99      0.99      0.99      1931
weighted avg       0.99      0.99      0.99      1931



In [35]:
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix,classification_report

svc=SVC(kernel='linear',random_state=42)
svc.fit(x_train_tfidf,y_train)
y_svc_pred=svc.predict(x_test_tfidf)

CM=confusion_matrix(y_test,y_svc_pred)
CR=classification_report(y_test,y_svc_pred)

print('Confusion Matrix')
print(CM)
print('Classification Report')
print(CR)

Confusion Matrix
[[992   2]
 [  3 934]]
Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       994
           1       1.00      1.00      1.00       937

    accuracy                           1.00      1931
   macro avg       1.00      1.00      1.00      1931
weighted avg       1.00      1.00      1.00      1931



In [36]:
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import confusion_matrix,classification_report
EC=VotingClassifier(estimators=[
    ('RandomForest',rf),
    ('Logistic Regression ',lr),
    ('SVM',svc),
],voting='hard')


EC.fit(x_train_tfidf,y_train)
y_EC_pred=svc.predict(x_test_tfidf)

CM=confusion_matrix(y_test,y_EC_pred)
CR=classification_report(y_test,y_EC_pred)

print('Confusion Matrix')
print(CM)
print('Classification Report')
print(CR)

Confusion Matrix
[[992   2]
 [  3 934]]
Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       994
           1       1.00      1.00      1.00       937

    accuracy                           1.00      1931
   macro avg       1.00      1.00      1.00      1931
weighted avg       1.00      1.00      1.00      1931



#save files 

In [37]:
import pickle
pickle.dump(rf,open('Randomforest.pkl','wb'))
pickle.dump(EC,open('ensemble.pkl','wb'))

with open('Randomforest.pkl','rb') as file:
    lr=pickle.load(file)

with open('ensemble.pkl','rb') as file:
    EC=pickle.load(file)

# Detechtion System

In [43]:
def real_or_fake(text):
    ct=Clean_text(text)
    vt=tfid.transform([ct])
    result=lr.predict(vt)
    return result
    

In [47]:
input_text="8007 25p 4 Alfie Moon's Children in Need song on ur mob. Tell ur m8s. Txt TONE CHARITY to 8007 for nokias or POLY CHARITY for polys :zed 08701417012 profit 2 charity "

ans=real_or_fake(input_text)
print(ans)

[1]
