<a href="https://colab.research.google.com/github/Gzaborey/python_applications_classifier/blob/main/Applications_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import string
import re
import sklearn.model_selection as sk_ms
import sklearn.preprocessing as sk_preprocessing
import sklearn.metrics as sk_metrics
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

In [None]:
import nltk
# Fetch the stopword corpus that process_text reads through stopwords.words(...).
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Defining Functions

_STOPWORDS_CACHE = {}


def _combined_stopwords():
  """Returns the union of NLTK's Russian and English stopwords, loaded only once."""
  if 'words' not in _STOPWORDS_CACHE:
    # stopwords.words() re-reads the corpus file on every call, so keep the result.
    _STOPWORDS_CACHE['words'] = (frozenset(stopwords.words('russian'))
                                 | frozenset(stopwords.words('english')))
  return _STOPWORDS_CACHE['words']


def process_text(text):
  """Tokenizes a message into stemmed words.

  Punctuation characters are stripped, the text is split on whitespace and a
  word is kept only if, once lower-cased, it
    * consists of alphabetic characters only (so no digits),
    * is longer than 4 characters,
    * is not a Russian or English NLTK stopword, and
    * is not written purely in Latin letters (a-z).

  Args:
    text: The raw message string.

  Returns:
    List of the surviving words, lower-cased and stemmed with the Russian
    Snowball stemmer, in their original order.
  """
  stemmer = SnowballStemmer('russian')
  latin_only = re.compile(r'[a-zA-Z]+')
  ignored_words = _combined_stopwords()

  nopunc = ''.join(char for char in text if char not in string.punctuation)

  processed_words = []
  for word in nopunc.split():
    lowered = word.lower()
    # Cheap structural checks first, set/regex lookups second.
    if len(lowered) <= 4 or not lowered.isalpha():
      continue
    if latin_only.fullmatch(lowered) or lowered in ignored_words:
      continue
    processed_words.append(stemmer.stem(lowered))
  return processed_words

def validate_name(string_to_validate, name_data):
  """Flags whether a string contains at least one known name.

  The string is lower-cased and split on whitespace; each resulting token is
  looked up in `name_data` with the `in` operator.

  Args:
    string_to_validate: Text to inspect, e.g. the "name" field of an application.
    name_data: Container of lower-case names supporting `in`.

  Returns:
    1 if any token is found in `name_data`, otherwise 0.
  """
  tokens = string_to_validate.lower().split()
  return int(any(token in name_data for token in tokens))

def validate_phone_number(phone_number):
  """Checks a phone number against the prefix/operator-code layout accepted here.

  Punctuation and spaces are removed first. What remains must be 9 to 12 digits
  laid out as

      [prefix][2-digit operator code][7-digit subscriber number]

  where the prefix is '380', '0' or empty and the operator code is one of the
  codes listed below.
  NOTE(review): the '380' prefix suggests Ukrainian mobile numbers - confirm.

  Args:
    phone_number: The phone number as a string.

  Returns:
    1 if the number matches the layout, otherwise 0.
  """
  valid_prefixes = ('380', '0', '')
  operator_codes = {'39', '67', '68', '96', '97', '98', '50', '66',
                    '95', '99', '63', '93', '91', '92', '94'}
  subscriber_length = 7

  # Compare with `!=`: `is not ' '` tests object identity, which only works by
  # accident of string interning and triggers a SyntaxWarning.
  cleaned = ''.join(char for char in phone_number
                    if char not in string.punctuation and char != ' ')
  if not 9 <= len(cleaned) <= 12:
    return 0
  # Anything left that is not an ASCII digit (letters, tabs, ...) is not a number.
  if any(char not in string.digits for char in cleaned):
    return 0

  # Everything in front of the subscriber number: optional prefix + operator code.
  head = cleaned[:-subscriber_length]
  prefix, operator_code = head[:-2], head[-2:]
  if prefix in valid_prefixes and operator_code in operator_codes:
    return 1
  return 0

# **Preparing the data**

## Loading the data and constructing features

In [None]:
# Loading Data

# Read the raw sheet, drop unused columns and duplicate rows, rename the 'add'
# column to 'message', shuffle and cast messages to str.
# The shuffle is seeded so the row order (and every preview below) is reproducible.
email_data = (
    pd.read_excel('https://github.com/Gzaborey/python_applications_classifier/blob/main/data/email_data.xlsx?raw=true')
    .drop(['viber', 'telegram', 'datetime', 'age', 'Unnamed: 0'], axis=1, errors='ignore')
    .drop_duplicates()
    .rename(columns={'add': 'message'})
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
    .astype({'message': 'str'})
)

In [None]:
# Preview the first rows of the loaded data.
email_data.head()

Unnamed: 0,name,phone,email,message,is_spam
0,contttjeb,83341271966,fedonovandronja1988@rambler.ru,,1
1,samuelMus,88952436844,c.o.o.p.e.r.a.le.xa.n.drhome.work.848@gmail.com,,1
2,Пищита Андрей,675323672,dronn9853@gmail.com,"Здравствуйте, хочу попасть в сферу it, потом с...",0
3,Laurachifs,86731724951,mosip2020@yandex.ru,,1
4,"483___*** <p style=""text-align:center""><a href...",83818865456,5sentry441@mail.ru,Hello,1


In [None]:
# Observing Data

# Class balance, missing values per column and column dtypes, one blank line apart.
print(
    email_data['is_spam'].value_counts(),
    email_data.isna().sum(),
    email_data.dtypes,
    sep='\n\n',
)

1    342
0    214
Name: is_spam, dtype: int64

name        1
phone       1
email      36
message     0
is_spam     0
dtype: int64

name       object
phone      object
email      object
message    object
is_spam     int64
dtype: object


In [None]:
# Filling Missing Values

# Replace real NaNs with '' everywhere, then blank the literal 'nan' strings
# left in `message` by the earlier cast to str.
email_data = email_data.fillna('').assign(
    message=lambda frame: frame['message'].replace('nan', ''))
print(email_data.isna().sum(), end='\n\n')

name       0
phone      0
email      0
message    0
is_spam    0
dtype: int64



In [None]:
# Preview the first rows again after the missing values were blanked.
email_data.head()

Unnamed: 0,name,phone,email,message,is_spam
0,contttjeb,83341271966,fedonovandronja1988@rambler.ru,,1
1,samuelMus,88952436844,c.o.o.p.e.r.a.le.xa.n.drhome.work.848@gmail.com,,1
2,Пищита Андрей,675323672,dronn9853@gmail.com,"Здравствуйте, хочу попасть в сферу it, потом с...",0
3,Laurachifs,86731724951,mosip2020@yandex.ru,,1
4,"483___*** <p style=""text-align:center""><a href...",83818865456,5sentry441@mail.ru,Hello,1


In [None]:
# Loading Dataframe Of Common Names

names_data = pd.read_csv(
    'https://raw.githubusercontent.com/Gzaborey/python_applications_classifier/main/data/names_dataframe.csv',
    index_col='Unnamed: 0',
)
# Lower-cased string form of every value in the last column.
names_list = names_data.iloc[:, -1].map(lambda name: str(name).lower()).tolist()

In [None]:
# Number of entries in the names lookup list.
len(names_list)

97974

In [None]:
# Constructing Features

# validate_name tests every word with `in`; on the ~98k-element names_list that
# is a linear scan per word, so look names up in a hash set instead.
known_names = frozenset(names_list)

# Remove previously derived columns so the cell can be re-run: insert() raises
# if the column already exists.
email_data = email_data.drop(
    columns=['email_domain', 'valid_first_name', 'valid_phone_number'], errors='ignore')

# Everything after the last '@', lower-cased ('' when the e-mail is empty).
email_data.insert(loc=3, column='email_domain',
                  value=email_data['email'].apply(lambda x: x.lower().split('@')[-1].strip()))

# 1 if any word of the name is a known name, else 0.
email_data.insert(loc=1, column='valid_first_name',
                  value=email_data['name'].astype('str')
                  .apply(lambda x: validate_name(x, known_names)))

# 1 if the phone number passes validate_phone_number, else 0.
email_data.insert(loc=3, column='valid_phone_number',
                  value=email_data['phone'].astype('str').apply(validate_phone_number))

## Dividing the data into test and train sets

In [None]:
# Shuffling DataFrame Rows

# Seeded so the row order, and therefore the preview below, is reproducible.
email_data = email_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Features are every column except the target; the target is `is_spam`.
X = email_data.drop('is_spam', axis=1)
y = email_data.is_spam

email_data.head(4)

Unnamed: 0,name,valid_first_name,phone,valid_phone_number,email,email_domain,message,is_spam
0,Heathervoxia,0,86518782113,0,weckspec89@yandex.com,yandex.com,"XEvil 5.0 решает любую капчу, включая Google R...",1
1,Варсобин Александр,1,380679000000,1,varsobin5@gmail.com,gmail.com,,0
2,Olga Shalaieva,1,934437534,1,megapover17@ukr.net,ukr.net,когда,0
3,tulttaedvs,0,12134251453,0,f58c212a7422f9b9ce584fae9d34f26f.roopert@ssema...,ssemarket,Muchas gracias. ?Como puedo iniciar sesion?,1


In [None]:
# Splitting Data Into Train And Test Sets

# Stratified 80/20 split; the fixed random_state makes the split (and every
# metric computed from it) reproducible across runs.
X_train, X_test, y_train, y_test = sk_ms.train_test_split(
    X, y, test_size=0.2, shuffle=True, stratify=y, random_state=42)

# Give every part a fresh 0..n-1 index so later positional joins line up.
X_train, X_test, y_train, y_test = (
    part.reset_index(drop=True) for part in (X_train, X_test, y_train, y_test))

X_train.head(10)

Unnamed: 0,name,valid_first_name,phone,valid_phone_number,email,email_domain,message
0,samuelMus,0,89317595981,0,co.operal.exa.n.dr.h.omewo.rk.848@gmail.com,gmail.com,
1,Александр Медведенко,1,380676000000,1,buromaxod@gmail.com,gmail.com,Свяжитесь со мной пожалуйста..
2,Ирина Бойко,1,380673000000,1,irinaboiko47@gmail.com,gmail.com,
3,Заикин Андрей Викторович,1,672949941,1,masterkrovlia2017@gmail.com,gmail.com,
4,uyuducolo,0,84574274532,0,axojif@otnis.jonjamail.com,otnis.jonjamail.com,
5,Fomin Aleksandr Alekseevich,1,674812290,1,aleksandr.fomin@gmail.com,gmail.com,wew
6,lidasrudova,0,82724636596,0,lidasrudova@yandex.ru,yandex.ru,
7,RonaldSic,0,82495957796,0,7vseaez@goposts.site,goposts.site,
8,Таран Александр,1,967260412,1,,,
9,Fomin Aleksandr Alekseevich,1,674812290,1,aleksandr.fomin@gmail.com,gmail.com,Тест


# **Building and training the model**

In [None]:
# Fit both vocabularies on the training rows only, so no information from the
# test rows reaches the models; tokens unseen at fit time are ignored by transform().
email_dom_vectorizer = CountVectorizer()
email_dom_vectorized_train = email_dom_vectorizer.fit_transform(X_train.email_domain)

# Messages are tokenized by process_text instead of the default analyzer.
email_message_vectorizer = CountVectorizer(analyzer=process_text)
email_message_vectorized_train = email_message_vectorizer.fit_transform(X_train.message).toarray()

email_message_vectorized_train.shape

(444, 1385)

In [None]:
 #email_message_vectorizer.get_feature_names()

In [None]:
# Naive Bayes model For Email Domain Classification

# fit() returns the estimator itself, so construction and training can be chained.
NB_model = MultinomialNB().fit(email_dom_vectorized_train, y_train)

# Hard labels for the report below.
NB_stats = NB_model.predict(email_dom_vectorized_train)

# Second predict_proba column, used later as an input of the final classifier.
NB_predictions = NB_model.predict_proba(email_dom_vectorized_train)[:, 1]

In [None]:
# Naive Bayes quality on the training set, followed by a blank line.
print(sk_metrics.classification_report(y_train, NB_stats))
print("Accuracy score:", sk_metrics.accuracy_score(y_train, NB_stats), end='\n\n')

              precision    recall  f1-score   support

           0       0.74      0.84      0.79       171
           1       0.89      0.82      0.85       273

    accuracy                           0.82       444
   macro avg       0.81      0.83      0.82       444
weighted avg       0.83      0.82      0.83       444

Accuracy score: 0.8243243243243243



In [None]:
# ANN for Message Content Classification

# Stack of ReLU layers narrowing from 128 to 8 units, with 20% dropout after the
# 64- and 16-unit layers, ending in a single sigmoid output unit.
ANN_model = Sequential([
    Dense(128, input_dim=email_message_vectorized_train.shape[1], activation='relu'),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dense(16, activation='relu'),
    Dropout(0.2),
    Dense(8, activation='relu'),
    Dense(1, activation='sigmoid'),
])

In [None]:
# Binary cross-entropy with Adam. fit() is the last expression of the cell, so
# the History object it returns is what the cell displays.
ANN_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
ANN_model.fit(x=email_message_vectorized_train, y=y_train,
              batch_size=20, epochs=100, verbose=0)

<keras.callbacks.History at 0x7f521f5cb990>

In [None]:
# Run the network once and derive the hard labels from the same probabilities
# (previously predict() was executed twice on the identical matrix).
ANN_predictions_proba = ANN_model.predict(email_message_vectorized_train)
ANN_predictions = (ANN_predictions_proba > 0.5).astype("int32")
print(sk_metrics.classification_report(y_train, ANN_predictions))
print("Accuracy score:", sk_metrics.accuracy_score(y_train, ANN_predictions))

              precision    recall  f1-score   support

           0       1.00      0.36      0.53       171
           1       0.71      1.00      0.83       273

    accuracy                           0.75       444
   macro avg       0.86      0.68      0.68       444
weighted avg       0.82      0.75      0.72       444

Accuracy score: 0.7545045045045045


In [None]:
# Final Logistic Regression Classifier

# Trained below on the name/phone flags plus the ANN and Naive Bayes probabilities.
LR_model = LogisticRegression(solver='liblinear')

In [None]:
# Data for LR

# Combine the two engineered flags with the ANN and Naive Bayes probabilities.
# The probability columns get explicit string names: concatenating unnamed Series
# labels both of them with the integer 0, which yields duplicate, mixed-type
# column labels that scikit-learn >= 1.2 rejects as feature names.
temp_df = X_train.loc[:, ['valid_first_name', 'valid_phone_number']].assign(
    message_spam_proba=ANN_predictions_proba.ravel(),
    domain_spam_proba=NB_predictions,
)

temp_df.head()

Unnamed: 0,valid_first_name,valid_phone_number,0,0.1
0,0,0,0.666321,0.190832
1,1,1,4.1e-05,0.190832
2,1,1,0.666321,0.190832
3,1,1,0.666321,0.190832
4,0,0,0.666321,0.985355


In [None]:
# Train the final classifier on the combined training features.
LR_model.fit(temp_df, y_train)



LogisticRegression(solver='liblinear')

In [None]:
# Final classifier quality on the training set.
LR_predictions = LR_model.predict(temp_df)

train_report = sk_metrics.classification_report(y_train, LR_predictions)
train_accuracy = sk_metrics.accuracy_score(y_train, LR_predictions)
print(train_report)
print("Accuracy score:", train_accuracy)

              precision    recall  f1-score   support

           0       1.00      0.99      0.99       171
           1       0.99      1.00      1.00       273

    accuracy                           1.00       444
   macro avg       1.00      0.99      1.00       444
weighted avg       1.00      1.00      1.00       444

Accuracy score: 0.9954954954954955




In [None]:
# Training rows next to their predicted label; the column is named explicitly
# instead of appearing as `0`.
inspect_df = X_train.assign(predicted_is_spam=LR_predictions)
inspect_df.head()

Unnamed: 0,name,valid_first_name,phone,valid_phone_number,email,email_domain,message,0
0,samuelMus,0,89317595981,0,co.operal.exa.n.dr.h.omewo.rk.848@gmail.com,gmail.com,,1
1,Александр Медведенко,1,380676000000,1,buromaxod@gmail.com,gmail.com,Свяжитесь со мной пожалуйста..,0
2,Ирина Бойко,1,380673000000,1,irinaboiko47@gmail.com,gmail.com,,0
3,Заикин Андрей Викторович,1,672949941,1,masterkrovlia2017@gmail.com,gmail.com,,0
4,uyuducolo,0,84574274532,0,axojif@otnis.jonjamail.com,otnis.jonjamail.com,,1


# **Testing the model on test data set**

In [None]:
#Transforming test data

# Reuse the already fitted vectorizers; only transform() is called on test rows.
email_dom_vectorized_test = email_dom_vectorizer.transform(X_test.email_domain)

email_message_vectorized_test = email_message_vectorizer.transform(X_test.message).toarray()

# Show the shape of the test matrix built in this cell (the train matrix shape
# was displayed here by mistake).
email_message_vectorized_test.shape

(444, 1385)

In [None]:
# Naive Bayes model For Email Domain Classification

# Hard labels and second-column probabilities for the test rows; both names are
# reused from the training stage and overwritten here.
NB_stats, NB_predictions = (
    NB_model.predict(email_dom_vectorized_test),
    NB_model.predict_proba(email_dom_vectorized_test)[:, 1],
)

In [None]:
# Naive Bayes quality on the test set, followed by a blank line.
NB_stats = NB_model.predict(email_dom_vectorized_test)
print(sk_metrics.classification_report(y_test, NB_stats))
print("Accuracy score:", sk_metrics.accuracy_score(y_test, NB_stats), end='\n\n')

              precision    recall  f1-score   support

           0       0.70      0.81      0.75        43
           1       0.87      0.78      0.82        69

    accuracy                           0.79       112
   macro avg       0.79      0.80      0.79       112
weighted avg       0.81      0.79      0.80       112

Accuracy score: 0.7946428571428571



In [None]:
# Run the network once on the test matrix and threshold the same probabilities
# (previously predict() was executed twice on the identical matrix).
ANN_predictions_proba = ANN_model.predict(email_message_vectorized_test)
ANN_predictions = (ANN_predictions_proba > 0.5).astype("int32")
print(sk_metrics.classification_report(y_test, ANN_predictions))
print("Accuracy score:", sk_metrics.accuracy_score(y_test, ANN_predictions))

              precision    recall  f1-score   support

           0       1.00      0.33      0.49        43
           1       0.70      1.00      0.83        69

    accuracy                           0.74       112
   macro avg       0.85      0.66      0.66       112
weighted avg       0.82      0.74      0.70       112

Accuracy score: 0.7410714285714286


In [None]:
# Data for LR

# Combine the two engineered flags with the ANN and Naive Bayes probabilities.
# The probability columns get explicit string names: concatenating unnamed Series
# labels both of them with the integer 0, which yields duplicate, mixed-type
# column labels that scikit-learn >= 1.2 rejects as feature names.
temp_df = (
    X_test.loc[:, ['valid_first_name', 'valid_phone_number']]
    .reset_index(drop=True)
    .assign(
        message_spam_proba=ANN_predictions_proba.ravel(),
        domain_spam_proba=NB_predictions,
    )
)

temp_df.head()

Unnamed: 0,valid_first_name,valid_phone_number,0,0.1
0,1,1,0.666321,0.190832
1,0,0,0.666321,0.993602
2,1,1,0.666321,0.190832
3,1,1,0.666321,0.016648
4,1,1,0.666321,0.190832


In [None]:
# Final classifier quality on the held-out test set.
LR_predictions = LR_model.predict(temp_df)

test_report = sk_metrics.classification_report(y_test, LR_predictions)
test_accuracy = sk_metrics.accuracy_score(y_test, LR_predictions)
print(test_report)
print("Accuracy score:", test_accuracy)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        43
           1       1.00      1.00      1.00        69

    accuracy                           1.00       112
   macro avg       1.00      1.00      1.00       112
weighted avg       1.00      1.00      1.00       112

Accuracy score: 1.0




In [None]:
# Test rows next to their predicted label; the column is named explicitly
# instead of appearing as `0`.
inspect_df = X_test.reset_index(drop=True).assign(predicted_is_spam=LR_predictions)
inspect_df.head()

Unnamed: 0,name,valid_first_name,phone,valid_phone_number,email,email_domain,message,0
0,Ковалевская Оксана,1,682657858,1,oksana.kovalevskaya.1976@gmail.com,gmail.com,,0
1,Rebeccateemi,0,83441659161,0,akovchenko@yandex.ru,yandex.ru,,1
2,Надежда Ирина,1,688183578,1,gnadyezhda@gmail.com,gmail.com,,0
3,Бодоса Ольга,1,668210251,1,beladonna195515@ukr.net,ukr.net,,0
4,Будько Андрей,1,635024068,1,budko16103@gmail.com,gmail.com,,0


# **Testing the model on hand-crafted data**

In [None]:
# One tuple per hand-written application: (name, phone, email, message).
user_rows = [
    ('Bob', '', 'jeka@gmail.com',
     'увеличение размера органов купите купите купите реклама'),
    ('Михаил', '', 'miha@gmail.com',
     'хочу записаться на курсы'),
    ('влоаідоів', '0968378580', '',
     'купите трансплантация уеуеуеуеуеу'),
    ('Jack', '', 'blabla@gmail.com',
     'купите бойлер установки лучшее предложение'),
    ('', '', '',
     'быстрый заработок нужно только ничего не делать и перейти по ссылке'),
    ('Тоня', '0968467563', '',
     ''),
]
user_columns = ('name', 'phone', 'email', 'message')

# Transpose the rows into the column -> list-of-values mapping.
user_dict = {column: list(values)
             for column, values in zip(user_columns, zip(*user_rows))}

user_df = pd.DataFrame(user_dict)
user_df

Unnamed: 0,name,phone,email,message
0,Bob,,jeka@gmail.com,увеличение размера органов купите купите купит...
1,Михаил,,miha@gmail.com,хочу записаться на курсы
2,влоаідоів,968378580.0,,купите трансплантация уеуеуеуеуеу
3,Jack,,blabla@gmail.com,купите бойлер установки лучшее предложение
4,,,,быстрый заработок нужно только ничего не делат...
5,Тоня,968467563.0,,


In [None]:
# Constructing Features

# validate_name tests every word with `in`; on the ~98k-element names_list that
# is a linear scan per word, so look names up in a hash set instead.
known_names = frozenset(names_list)

# Remove previously derived columns so the cell can be re-run: insert() raises
# if the column already exists.
user_df = user_df.drop(
    columns=['email_domain', 'valid_first_name', 'valid_phone_number'], errors='ignore')

# Everything after the last '@', lower-cased ('' when the e-mail is empty).
user_df.insert(loc=3, column='email_domain',
               value=user_df['email'].apply(lambda x: x.lower().split('@')[-1].strip()))

# 1 if any word of the name is a known name, else 0.
user_df.insert(loc=1, column='valid_first_name',
               value=user_df['name'].astype('str').apply(lambda x: validate_name(x, known_names)))

# 1 if the phone number passes validate_phone_number, else 0.
user_df.insert(loc=3, column='valid_phone_number',
               value=user_df['phone'].astype('str').apply(validate_phone_number))

In [None]:
# Display the hand-crafted rows together with the inserted feature columns.
user_df

Unnamed: 0,name,valid_first_name,phone,valid_phone_number,email,email_domain,message
0,Bob,1,,0,jeka@gmail.com,gmail.com,увеличение размера органов купите купите купит...
1,Михаил,1,,0,miha@gmail.com,gmail.com,хочу записаться на курсы
2,влоаідоів,0,968378580.0,1,,,купите трансплантация уеуеуеуеуеу
3,Jack,1,,0,blabla@gmail.com,gmail.com,купите бойлер установки лучшее предложение
4,,0,,0,,,быстрый заработок нужно только ничего не делат...
5,Тоня,1,968467563.0,1,,,


In [None]:
# Encode the hand-crafted rows with the vectorizers fitted earlier.
email_dom_vectorized_prod = email_dom_vectorizer.transform(user_df['email_domain'])

email_message_counts_prod = email_message_vectorizer.transform(user_df['message'])
email_message_vectorized_prod = email_message_counts_prod.toarray()

In [None]:
# Naive Bayes model For Email Domain Classification

# Second predict_proba column for each hand-crafted row; overwrites the test-stage values.
NB_predictions = NB_model.predict_proba(email_dom_vectorized_prod)[:, 1]

In [None]:
# Raw sigmoid outputs of the network; left un-thresholded because they are fed
# to the final classifier as a feature.
ANN_predictions = ANN_model.predict(email_message_vectorized_prod)



In [None]:
# Data for LR

# Combine the two engineered flags with the ANN and Naive Bayes probabilities.
# The probability columns get explicit string names: concatenating unnamed Series
# labels both of them with the integer 0, which yields duplicate, mixed-type
# column labels that scikit-learn >= 1.2 rejects as feature names.
temp_df = (
    user_df.loc[:, ['valid_first_name', 'valid_phone_number']]
    .reset_index(drop=True)
    .assign(
        message_spam_proba=ANN_predictions.ravel(),
        domain_spam_proba=NB_predictions,
    )
)

temp_df.head()

Unnamed: 0,valid_first_name,valid_phone_number,0,0.1
0,1,0,1.0,0.190832
1,1,0,1.570005e-07,0.190832
2,0,1,0.987552,0.614865
3,1,0,0.9998811,0.190832
4,0,0,0.9330406,0.614865


In [None]:
# Hard class labels from the final classifier for the hand-crafted rows.
LR_predictions = LR_model.predict(temp_df)



In [None]:
# Second predict_proba column of the final classifier for the same rows.
LR_predictions_probabilities = LR_model.predict_proba(temp_df)[:, 1]



In [None]:
# Append the predicted label and its probability under explicit names; unnamed
# Series would add two more columns labelled 0, and assign() simply overwrites
# the columns if the cell is run again.
temp_df = temp_df.assign(
    predicted_is_spam=LR_predictions,
    spam_probability=LR_predictions_probabilities,
)

In [None]:
# Display the classifier inputs together with the appended prediction columns.
temp_df

Unnamed: 0,valid_first_name,valid_phone_number,0,0.1,0.2,0.3
0,1,0,1.0,0.190832,1,0.580216
1,1,0,1.570005e-07,0.190832,0,0.110358
2,0,1,0.987552,0.614865,1,0.710814
3,1,0,0.9998811,0.190832,1,0.580146
4,0,0,0.9330406,0.614865,1,0.985976
5,1,1,0.6663214,0.614865,0,0.04277
