In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import string
from nltk.corpus import stopwords
import re
from nltk.tokenize import RegexpTokenizer
from nltk import WordNetLemmatizer, PorterStemmer
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
sms_data=pd.read_csv('/content/sms.csv')
sms_data.head()

Unnamed: 0,sms,label
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...\n,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [4]:
sms_data.isnull().sum()

sms      0
label    0
dtype: int64

In [5]:
sms_data['label'].value_counts()

0    4827
1     747
Name: label, dtype: int64

In [6]:
x=sms_data['sms']

In [7]:
y=sms_data['label']

In [8]:
y.head()

0    0
1    0
2    1
3    0
4    0
Name: label, dtype: int64

In [9]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.3, random_state = 42)

## **Text preprocessing:**

In [10]:
# converting text to lower case
def convert_to_lowercase(text):
    return text.str.lower()

In [11]:
# removing punctuations
def remove_punctuations(text):
    eng_punctuation = string.punctuation
    translator = str.maketrans('','', eng_punctuation)
    return str(text).translate(translator)

In [12]:
# removing stop words(ex:is,an,the....)
def remove_stopwords(text):
    from nltk.corpus import stopwords
    stop_words = set(stopwords.words('english'))
    return " ".join([word for word in str(text).split() if word not in stop_words])

In [13]:
def remove_repeating_characters(text):
    return re.sub(r'(.)1+', r'1', text)

In [14]:
#removing any numerical value
def remove_numeric(text):
    return re.sub('[0-9]+', '', text)

In [15]:
def tokenize_text(text):
    tokenizer = RegexpTokenizer('\w+')
    text = text.apply(tokenizer.tokenize)
    return text

In [16]:
#Converting some of the words to their root form
def text_lematization(text):
    lm = WordNetLemmatizer()
    text = [lm.lemmatize(word) for word in text]
    return text

In [17]:
#applying all previous functions
def preprocess(text):
    text = convert_to_lowercase(text)
    text = text.apply(lambda x : remove_punctuations(x))
    text = text.apply(lambda x : remove_stopwords(x))
    text = text.apply(lambda x : remove_repeating_characters(x))
    text = text.apply(lambda x : remove_numeric(x))
    text = tokenize_text(text)
    text = text.apply(lambda x : text_lematization(x))
    text = text.apply(lambda x: " ".join(x))
    return text

In [18]:
x_train = preprocess(x_train)

In [19]:
x_test = preprocess(x_test)

# converting text data into numerical representations suitable for model training using text vectorization:

In [20]:
vectorizer = TfidfVectorizer()
vectorizer.fit(x_train)

In [21]:
x_train = vectorizer.transform(x_train)

In [22]:
x_test = vectorizer.transform(x_test)

# **Models:**

In [23]:
# logistic regression model
lr_model = LogisticRegression()
lr_model.fit(x_train, y_train)

In [24]:
lr_pred = lr_model.predict(x_test)

In [25]:
print(classification_report(y_test, lr_pred))

              precision    recall  f1-score   support

           0       0.96      0.99      0.98      1447
           1       0.94      0.75      0.84       226

    accuracy                           0.96      1673
   macro avg       0.95      0.87      0.91      1673
weighted avg       0.96      0.96      0.96      1673



In [26]:
train_lr_pred = lr_model.predict(x_train)

In [27]:
print(classification_report(y_train, train_lr_pred)) #no overfitting found

              precision    recall  f1-score   support

           0       0.96      1.00      0.98      3380
           1       0.99      0.73      0.84       521

    accuracy                           0.96      3901
   macro avg       0.98      0.87      0.91      3901
weighted avg       0.96      0.96      0.96      3901



In [28]:
#KNeighborsClassifier model
knn_model = KNeighborsClassifier()
knn_model.fit(x_train, y_train)

In [29]:
knn_pred = knn_model.predict(x_test)

In [30]:
print(classification_report(y_test, knn_pred))

              precision    recall  f1-score   support

           0       0.90      1.00      0.95      1447
           1       1.00      0.31      0.48       226

    accuracy                           0.91      1673
   macro avg       0.95      0.66      0.71      1673
weighted avg       0.92      0.91      0.89      1673



In [31]:
train_knn_pred = knn_model.predict(x_train)
print(classification_report(y_train, train_knn_pred)) # no overfitting found(but not a preferred model)

              precision    recall  f1-score   support

           0       0.91      1.00      0.95      3380
           1       1.00      0.37      0.54       521

    accuracy                           0.92      3901
   macro avg       0.96      0.69      0.75      3901
weighted avg       0.92      0.92      0.90      3901



In [32]:
x_train_array=x_train.toarray()
x_test_array=x_test.toarray()
nb_model = GaussianNB()
nb_model.fit(x_train_array, y_train)
nb_pred = nb_model.predict(x_test_array)

In [33]:
print(classification_report(y_test, nb_pred))

              precision    recall  f1-score   support

           0       0.97      0.89      0.93      1447
           1       0.54      0.85      0.66       226

    accuracy                           0.88      1673
   macro avg       0.76      0.87      0.79      1673
weighted avg       0.92      0.88      0.89      1673



In [34]:
train_nb_pred = nb_model.predict(x_train_array)
print(classification_report(y_train, train_nb_pred)) # no overfitting but not a prefered model

              precision    recall  f1-score   support

           0       1.00      0.94      0.97      3380
           1       0.72      1.00      0.84       521

    accuracy                           0.95      3901
   macro avg       0.86      0.97      0.91      3901
weighted avg       0.96      0.95      0.95      3901



In [35]:
svm_model = SVC()
svm_model.fit(x_train, y_train)

In [None]:
svm_pred = svm_model.predict(x_test)

In [None]:
print(classification_report(y_test, svm_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1447
           1       0.97      0.86      0.91       226

    accuracy                           0.98      1673
   macro avg       0.98      0.93      0.95      1673
weighted avg       0.98      0.98      0.98      1673



In [None]:
train_svm_pred = svm_model.predict(x_train)
print(classification_report(y_train, train_svm_pred)) # no overfitting and appropriate

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3380
           1       1.00      0.99      0.99       521

    accuracy                           1.00      3901
   macro avg       1.00      0.99      1.00      3901
weighted avg       1.00      1.00      1.00      3901



In [None]:
dt_model = tree.DecisionTreeClassifier()
dt_model.fit(x_train, y_train)

In [None]:
dt_pred = dt_model.predict(x_test)

In [None]:
print(classification_report(y_test, dt_pred))

              precision    recall  f1-score   support

           0       0.97      0.98      0.97      1447
           1       0.87      0.78      0.82       226

    accuracy                           0.95      1673
   macro avg       0.92      0.88      0.90      1673
weighted avg       0.95      0.95      0.95      1673



In [None]:
train_dt_pred = dt_model.predict(x_train)
print(classification_report(y_train, train_dt_pred)) #no overfitting

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3380
           1       1.00      1.00      1.00       521

    accuracy                           1.00      3901
   macro avg       1.00      1.00      1.00      3901
weighted avg       1.00      1.00      1.00      3901



In [None]:
rf_model = RandomForestClassifier()
rf_model.fit(x_train, y_train)

In [None]:
rf_pred = rf_model.predict(x_test)

In [None]:
print(classification_report(y_test, rf_pred))

              precision    recall  f1-score   support

           0       0.97      1.00      0.99      1447
           1       0.99      0.83      0.90       226

    accuracy                           0.98      1673
   macro avg       0.98      0.91      0.94      1673
weighted avg       0.98      0.98      0.97      1673



In [None]:
train_rf_pred = rf_model.predict(x_train)
print(classification_report(y_train, train_rf_pred)) #no overfitting and appropriate

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3380
           1       1.00      1.00      1.00       521

    accuracy                           1.00      3901
   macro avg       1.00      1.00      1.00      3901
weighted avg       1.00      1.00      1.00      3901



In [None]:
xg_model = XGBClassifier()
xg_model.fit(x_train, y_train)

In [None]:
xg_pred = xg_model.predict(x_test)
print(classification_report(y_test, xg_pred))

              precision    recall  f1-score   support

           0       0.97      0.99      0.98      1447
           1       0.95      0.81      0.88       226

    accuracy                           0.97      1673
   macro avg       0.96      0.90      0.93      1673
weighted avg       0.97      0.97      0.97      1673



In [None]:
train_xg_pred = xg_model.predict(x_train)
print(classification_report(y_train, train_xg_pred)) #no overfitting and appropriate

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      3380
           1       1.00      0.92      0.96       521

    accuracy                           0.99      3901
   macro avg       0.99      0.96      0.98      3901
weighted avg       0.99      0.99      0.99      3901

