<a href="https://colab.research.google.com/github/Gzaborey/python_applications_classifier/blob/main/%22Application_Classifier%22.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import string
import re
import sklearn.model_selection as sk_ms
import sklearn.preprocessing as sk_preprocessing
import sklearn.metrics as sk_metrics
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

In [2]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

In [3]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
# Defining Functions

def process_text(text):
  """Removes punctuation, whitespases, stopwords, english words, short words and digits.
     Returns list of words."""

  stemmer = SnowballStemmer('russian')
  eng_regex = re.compile(r'[a-zA-Z]')

  nopunc = [char for char in text if char not in string.punctuation]
  nopunc = ''.join(nopunc)

  processed_words = [stemmer.stem(word.lower()) for word in nopunc.split()\
                      if word.lower() not in stopwords.words('russian')\
                      and word.lower() not in stopwords.words('english')\
                      and word.lower().isalpha()\
                      and len(word.lower()) > 4\
                      and word.lower() not in ''.join(eng_regex.findall(word.lower()))]
  return processed_words 

def validate_name(string_to_validate, name_data):
  name_validity = 0
  for word in string_to_validate.lower().split():
    if word in name_data:
      name_validity = 1
      break
    else:
      name_validity = 0
  return name_validity

def validate_phone_number(phone_number):
  temp1 = [char for char in phone_number if (char not in string.punctuation \
                                             and char is not ' ')]
  temp2 = ''.join(temp1)
  if len(temp2) > 12 or len(temp2) < 9:
    return 0
  else:
    for char in range(7):
      temp1.pop()
    temp2 = ''.join(temp1)
    pattern1 = re.compile(r'(380|0|)')
    first_condition = temp2[-3::-1][::-1] in pattern1.findall(temp2[-3::-1][::-1])
    pattern2 = re.compile(r'(39|67|68|96|97|98|50|66|95|99|63|93|91|92|94)')
    second_condition = temp2[-1:-3:-1][::-1] in pattern2.findall(temp2[-1:-3:-1][::-1])
    if first_condition and second_condition:
      return 1
    else:
      return 0

In [5]:
email_data = pd.read_excel('https://github.com/Gzaborey/python_applications_classifier/blob/main/email_data.xlsx?raw=true')
email_data.drop(['Unnamed: 0'], axis=1, inplace=True)
email_data = email_data.sample(frac=1)
email_data = email_data.reset_index(drop=True)
email_data.head(10)

Unnamed: 0,name,phone,email,add,is_spam
0,Русланова Валерия,506842677,valeri_999@ukr.net,,0
1,lakishark16,85915812931,feliciajz5@akira31.forcemix.online,,1
2,Fomin Aleksandr Alekseevich,674812290,aleksandr.fomin@gmail.com,Тест,0
3,conttzetm,87353865148,kovapevaelja1994@rambler.ru,Здравствуйте! \n \nРазошлём ваши коммерческие ...,1
4,vselediNoimi,83849421382,vitri.com.ua@yandex.ru,,1
5,RubenMew,85385837717,bbbbbbbbbbbbbb@box.it,:::::::::::::::: ONLY THE BEST :::::::::::::::...,1
6,NatalieVaw,82892593229,ksolvary@bk.ru,,1
7,Бабаян Олександр,380955000000,pengvinthik@gmail.com,Хочу записаться,0
8,Янчук Наталия,+38 (095) 337-22-90,n.2015@ukr.net,,0
9,Мирослава Білінська,380680000000,murosia220294@gmail.com,,0


# **Preparing the data**

## Loading the data and constructing features

In [8]:
# Loading Data

email_data = pd.read_excel('https://github.com/Gzaborey/python_applications_classifier/blob/main/email_data.xlsx?raw=true')
email_data = email_data.drop(['viber', 'telegram', 'datetime', 'age', 'Unnamed: 0'], axis=1, errors='ignore')
email_data = email_data.drop_duplicates()
email_data = email_data.reset_index(drop=True)
email_data = email_data.rename(columns={'add': 'message'})
email_data['message'] = email_data['message'].astype('str')

In [9]:
email_data.head()

Unnamed: 0,name,phone,email,message,is_spam
0,Виктор Сперанский,380936000000,speranskiyva@ukr.net,Проверка,0
1,11,111,1111@gmail.com,,1
2,Fomin Aleksandr Alekseevich,674812290,aleksandr.fomin@gmail.com,111,0
3,Fomin,674812290,aleksandr.fomin@gmail.com,111,0
4,3,5,as@as.as,,1


In [10]:
# Observing Data

print(email_data['is_spam'].value_counts())
print()
print(email_data.isna().sum())
print()
print(email_data.dtypes)

1    342
0    214
Name: is_spam, dtype: int64

name        1
phone       1
email      36
message     0
is_spam     0
dtype: int64

name       object
phone      object
email      object
message    object
is_spam     int64
dtype: object


In [11]:
# Filling Missing Values

email_data = email_data.fillna('')
email_data['message'] = email_data['message'].replace('nan', '')
print(email_data.isna().sum())
print()

name       0
phone      0
email      0
message    0
is_spam    0
dtype: int64



In [12]:
email_data.head()

Unnamed: 0,name,phone,email,message,is_spam
0,Виктор Сперанский,380936000000,speranskiyva@ukr.net,Проверка,0
1,11,111,1111@gmail.com,,1
2,Fomin Aleksandr Alekseevich,674812290,aleksandr.fomin@gmail.com,111,0
3,Fomin,674812290,aleksandr.fomin@gmail.com,111,0
4,3,5,as@as.as,,1


In [14]:
# Loading Dataframe Of Common Names

names_data = pd.read_csv('https://raw.githubusercontent.com/Gzaborey/python_applications_classifier/main/names_dataframe.csv', index_col='Unnamed: 0')
names_list = [str(name).lower() for name in names_data.iloc[:, -1]]

In [None]:
len(names_list)

97974

In [15]:
# Constructing Features

email_data.insert(loc=3, column='email_domain', 
                               value=email_data['email'].apply(lambda x: x.lower().split('@')[-1].strip()))

email_data.insert(loc=1, column='valid_first_name', 
                               value=email_data.name.astype('str')\
                              .apply(lambda x: validate_name(x, names_list)))

email_data.insert(loc=3, column='valid_phone_number', 
                               value=email_data.phone.astype('str').apply(lambda x: validate_phone_number(x)))

## Dividing the data into test and train sets

In [16]:
# Shuffling DataFrame Rows 

email_data = email_data.sample(frac=1)
email_data = email_data.reset_index(drop=True)

X = email_data.drop('is_spam', axis=1)
y = email_data.is_spam

email_data.head(4)

Unnamed: 0,name,valid_first_name,phone,valid_phone_number,email,email_domain,message,is_spam
0,DonaldNit,0,85963869562,0,uasports@rambler.ru,rambler.ru,"Спасидо, +",1
1,RobertFEw,0,83734362685,0,g.nalivkin@max.enersets.com,max.enersets.com,,1
2,Fomin Aleksandr Alekseevich,1,674812290,1,aleksandr.fomin@gmail.com,gmail.com,TEST2,0
3,Дожа Ирина,1,637877096,1,dozhairene2008@i.ua,i.ua,,0


In [17]:
# Spliting Data Into Train And Test Sets

X_train, X_test, y_train, y_test = sk_ms.train_test_split(X, y, test_size=0.2, shuffle=True, stratify=y)

X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

X_train.head(10)

Unnamed: 0,name,valid_first_name,phone,valid_phone_number,email,email_domain,message
0,eqifiqzucepe,0,88694529469,0,agjabug@fhfsd.popmile45.com,fhfsd.popmile45.com,
1,foligaa,0,81345913666,0,smsuhka@mail.ru,mail.ru,"Заказать seo поисковую оптимизацию сайта, Зака..."
2,monyya,0,88658147571,0,shisha71@list.ru,list.ru,"Заказать seo поисковую оптимизацию сайта, Зака..."
3,hydra-Orems,0,88893667789,0,3.6.487djv.h@gmail.com,gmail.com,site-hydra.net
4,iyaloratebi,0,84848467549,0,unkusayi@pazew.fodiscomail.com,pazew.fodiscomail.com,
5,Hrxokhv,0,81445143366,0,ztry@realogy.com\n,realogy.com,Jhdahyh \n \nsr@crb-us.com\n :
6,olyshka,0,88151848742,0,32galy@mail.ru,mail.ru,
7,SSavv,0,82819984212,0,prokkha@mail.ru,mail.ru,
8,Спирина Юлия,1,+38(098) 245-23-44,1,spirina.yulia2712@gmail.com,gmail.com,
9,Александр Гандзий,1,+38(097)-85-52-237,1,aleksandergandziy1990@gmail.com,gmail.com,


# **Building and training the model**

In [18]:
email_dom_vectorizer = CountVectorizer()
email_dom_vectorizer.fit(email_data.email_domain)
email_dom_vectorized_train = email_dom_vectorizer.transform(X_train.email_domain)

email_message_vectorizer = CountVectorizer(analyzer=process_text)
email_message_vectorizer.fit(email_data.message)
email_message_vectorized_train = email_message_vectorizer.transform(X_train.message).toarray()

email_message_vectorized_train.shape

(444, 1385)

In [None]:
 #email_message_vectorizer.get_feature_names()

In [19]:
# Naive Bayes model For Email Domain Classification

NB_model = MultinomialNB()
NB_model.fit(email_dom_vectorized_train, y_train)

NB_stats = NB_model.predict(email_dom_vectorized_train)

NB_predictions = NB_model.predict_proba(email_dom_vectorized_train)[:, 1]

In [20]:
print(sk_metrics.classification_report(y_train, NB_stats))
print("Accuracy score:", sk_metrics.accuracy_score(y_train, NB_stats))
print()

              precision    recall  f1-score   support

           0       0.75      0.83      0.79       171
           1       0.89      0.83      0.86       273

    accuracy                           0.83       444
   macro avg       0.82      0.83      0.82       444
weighted avg       0.83      0.83      0.83       444

Accuracy score: 0.8288288288288288



In [21]:
# ANN for Message Content Classification

ANN_model = Sequential()
ANN_model.add(Dense(128, input_dim=email_message_vectorized_train.shape[1], activation='relu'))
ANN_model.add(Dense(64, activation='relu'))
ANN_model.add(Dropout(0.2))
ANN_model.add(Dense(32, activation='relu'))
ANN_model.add(Dense(16, activation='relu'))
ANN_model.add(Dropout(0.2))
ANN_model.add(Dense(8, activation='relu'))
ANN_model.add(Dense(1, activation='sigmoid'))

In [22]:
ANN_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
ANN_model.fit(email_message_vectorized_train, y_train, epochs=100, batch_size=20, verbose=0)

<keras.callbacks.History at 0x7f521e4f8e10>

In [23]:
ANN_predictions = (ANN_model.predict(email_message_vectorized_train) > 0.5).astype("int32")
ANN_predictions_proba = ANN_model.predict(email_message_vectorized_train)
print(sk_metrics.classification_report(y_train, ANN_predictions))
print("Accuracy score:", sk_metrics.accuracy_score(y_train, ANN_predictions))

              precision    recall  f1-score   support

           0       1.00      0.39      0.56       171
           1       0.72      1.00      0.84       273

    accuracy                           0.77       444
   macro avg       0.86      0.70      0.70       444
weighted avg       0.83      0.77      0.73       444

Accuracy score: 0.7657657657657657


In [24]:
# Final Logistic Regression Classifier

LR_model = LogisticRegression(solver='liblinear')

In [25]:
# Data for LR

temp_df = X_train.loc[:, ['valid_first_name', 'valid_phone_number']]
temp_df = pd.concat([temp_df, pd.Series(ANN_predictions_proba.reshape(ANN_predictions_proba.shape[0],))], axis=1)
temp_df = pd.concat([temp_df, pd.Series(NB_predictions.reshape(NB_predictions.shape[0],))], axis=1)

temp_df.head()

Unnamed: 0,valid_first_name,valid_phone_number,0,0.1
0,0,0,0.676354,0.837082
1,0,0,1.0,0.993987
2,0,0,1.0,0.991818
3,0,0,0.676354,0.184309
4,0,0,0.676354,0.793965


In [26]:
LR_model.fit(temp_df, y_train)



LogisticRegression(solver='liblinear')

In [27]:
LR_predictions = LR_model.predict(temp_df)

print(sk_metrics.classification_report(y_train, LR_predictions))
print("Accuracy score:", sk_metrics.accuracy_score(y_train, LR_predictions))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       171
           1       0.99      1.00      0.99       273

    accuracy                           0.99       444
   macro avg       0.99      0.99      0.99       444
weighted avg       0.99      0.99      0.99       444

Accuracy score: 0.9932432432432432




In [28]:
inspect_df = pd.concat([X_train, pd.Series(LR_predictions)], axis=1)
inspect_df.head()

Unnamed: 0,name,valid_first_name,phone,valid_phone_number,email,email_domain,message,0
0,eqifiqzucepe,0,88694529469,0,agjabug@fhfsd.popmile45.com,fhfsd.popmile45.com,,1
1,foligaa,0,81345913666,0,smsuhka@mail.ru,mail.ru,"Заказать seo поисковую оптимизацию сайта, Зака...",1
2,monyya,0,88658147571,0,shisha71@list.ru,list.ru,"Заказать seo поисковую оптимизацию сайта, Зака...",1
3,hydra-Orems,0,88893667789,0,3.6.487djv.h@gmail.com,gmail.com,site-hydra.net,1
4,iyaloratebi,0,84848467549,0,unkusayi@pazew.fodiscomail.com,pazew.fodiscomail.com,,1


# **Testing model on test data set**

In [31]:
#Transforming test data

email_dom_vectorized_test = email_dom_vectorizer.transform(X_test.email_domain)

email_message_vectorized_test = email_message_vectorizer.transform(X_test.message).toarray()

email_message_vectorized_train.shape

(444, 1385)

In [32]:
# Naive Bayes model For Email Domain Classification

NB_stats = NB_model.predict(email_dom_vectorized_test)

NB_predictions = NB_model.predict_proba(email_dom_vectorized_test)[:, 1]

In [33]:
NB_stats = NB_model.predict(email_dom_vectorized_test)
print(sk_metrics.classification_report(y_test, NB_stats))
print("Accuracy score:", sk_metrics.accuracy_score(y_test, NB_stats))
print()

              precision    recall  f1-score   support

           0       0.65      0.84      0.73        43
           1       0.88      0.72      0.79        69

    accuracy                           0.77       112
   macro avg       0.77      0.78      0.76       112
weighted avg       0.79      0.77      0.77       112

Accuracy score: 0.7678571428571429



In [34]:
ANN_predictions = (ANN_model.predict(email_message_vectorized_test) > 0.5).astype("int32")
ANN_predictions_proba = ANN_model.predict(email_message_vectorized_test)
print(sk_metrics.classification_report(y_test, ANN_predictions))
print("Accuracy score:", sk_metrics.accuracy_score(y_test, ANN_predictions))

              precision    recall  f1-score   support

           0       0.71      0.23      0.35        43
           1       0.66      0.94      0.78        69

    accuracy                           0.67       112
   macro avg       0.69      0.59      0.56       112
weighted avg       0.68      0.67      0.61       112

Accuracy score: 0.6696428571428571


In [35]:
# Data for LR

temp_df = X_test.loc[:, ['valid_first_name', 'valid_phone_number']]
temp_df = temp_df.reset_index(drop=True)
temp_df = pd.concat([temp_df, pd.Series(ANN_predictions_proba.reshape(ANN_predictions_proba.shape[0],))], axis=1)
temp_df = pd.concat([temp_df, pd.Series(NB_predictions.reshape(NB_predictions.shape[0],))], axis=1)

temp_df.head()

Unnamed: 0,valid_first_name,valid_phone_number,0,0.1
0,0,0,0.676354,0.184309
1,0,0,0.676354,0.986105
2,1,1,0.676354,0.013729
3,1,1,0.676354,0.184309
4,0,0,0.676354,0.818083


In [36]:
LR_predictions = LR_model.predict(temp_df)

print(sk_metrics.classification_report(y_test, LR_predictions))
print("Accuracy score:", sk_metrics.accuracy_score(y_test, LR_predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        43
           1       1.00      1.00      1.00        69

    accuracy                           1.00       112
   macro avg       1.00      1.00      1.00       112
weighted avg       1.00      1.00      1.00       112

Accuracy score: 1.0




In [37]:
inspect_df = pd.concat([X_test.reset_index(drop=True), pd.Series(LR_predictions)], axis=1)
inspect_df.head()

Unnamed: 0,name,valid_first_name,phone,valid_phone_number,email,email_domain,message,0
0,samuelMus,0,88454453434,0,m.ich.e.l.du.sten.wall.st.r.ee.t.7.43.@gmail.com,gmail.com,,1
1,alixuli,0,88139113122,0,oguvapume@otnis.jonjamail.com,otnis.jonjamail.com,,1
2,Малиновский Александр,1,934437534,1,megapover17@ukr.net,ukr.net,,0
3,Кучеренко Руслан,1,669087208,1,kmr200116@gmail.com,gmail.com,,0
4,tulttaedvs,0,12134251453,0,f58c212a7422f9b9ce584fae9d34f26f.roopert@ssema...,ssemarket,Muchas gracias. ?Como puedo iniciar sesion?,1


# **Testing on hand-crafted data**

In [38]:
user_dict = {'name': ['Bob', 'Михаил', 'влоаідоів', 'Jack', '', 'Тоня'],	
             'phone': ['', "", "0968378580", '','', '0968467563'],	
             'email': ['jeka@gmail.com', 'miha@gmail.com', '', 'blabla@gmail.com', '', ''],
             'message': ['увеличение размера органов купите купите купите реклама', 
                         'хочу записаться на курсы', 'купите трансплантация уеуеуеуеуеу',
                         'купите бойлер установки лучшее предложение',
                         'быстрый заработок нужно только ничего не делать и перейти по ссылке',
                         '']}

user_df = pd.DataFrame(user_dict)
user_df

Unnamed: 0,name,phone,email,message
0,Bob,,jeka@gmail.com,увеличение размера органов купите купите купит...
1,Михаил,,miha@gmail.com,хочу записаться на курсы
2,влоаідоів,968378580.0,,купите трансплантация уеуеуеуеуеу
3,Jack,,blabla@gmail.com,купите бойлер установки лучшее предложение
4,,,,быстрый заработок нужно только ничего не делат...
5,Тоня,968467563.0,,


In [39]:
# Constructing Features

user_df.insert(loc=3, column='email_domain', 
                               value=user_df['email'].apply(lambda x: x.lower().split('@')[-1].strip()))

user_df.insert(loc=1, column='valid_first_name', 
                               value=user_df.name.astype('str').apply(lambda x: validate_name(x, names_list)))

user_df.insert(loc=3, column='valid_phone_number', 
                               value=user_df.phone.astype('str').apply(lambda x: validate_phone_number(x)))

In [40]:
user_df

Unnamed: 0,name,valid_first_name,phone,valid_phone_number,email,email_domain,message
0,Bob,1,,0,jeka@gmail.com,gmail.com,увеличение размера органов купите купите купит...
1,Михаил,1,,0,miha@gmail.com,gmail.com,хочу записаться на курсы
2,влоаідоів,0,968378580.0,1,,,купите трансплантация уеуеуеуеуеу
3,Jack,1,,0,blabla@gmail.com,gmail.com,купите бойлер установки лучшее предложение
4,,0,,0,,,быстрый заработок нужно только ничего не делат...
5,Тоня,1,968467563.0,1,,,


In [41]:
email_dom_vectorized_prod = email_dom_vectorizer.transform(user_df.email_domain)

email_message_vectorized_prod = email_message_vectorizer.transform(user_df.message).toarray()

In [42]:
# Naive Bayes model For Email Domain Classification

NB_predictions = NB_model.predict_proba(email_dom_vectorized_prod)[:, 1]

In [43]:
ANN_predictions = ANN_model.predict(email_message_vectorized_prod)



In [44]:
# Data for LR

temp_df = user_df.loc[:, ['valid_first_name', 'valid_phone_number']]
temp_df = temp_df.reset_index(drop=True)
temp_df = pd.concat([temp_df, pd.Series(ANN_predictions.reshape(ANN_predictions.shape[0],))], axis=1)
temp_df = pd.concat([temp_df, pd.Series(NB_predictions.reshape(NB_predictions.shape[0],))], axis=1)

temp_df.head()

Unnamed: 0,valid_first_name,valid_phone_number,0,0.1
0,1,0,0.9999959,0.184309
1,1,0,9.390141e-11,0.184309
2,0,1,0.983516,0.614865
3,1,0,0.9994766,0.184309
4,0,0,0.05964617,0.614865


In [45]:
LR_predictions = LR_model.predict(temp_df)



In [46]:
LR_predictions

array([1, 0, 1, 1, 1, 0])

In [47]:
LR_model.predict_proba(temp_df)[:, 1]



array([0.55324156, 0.09869133, 0.69165496, 0.55293018, 0.8876849 ,
       0.04087804])