In [1]:
from time import perf_counter

import pandas as pd

import import_data
import preprocessing
import feature_extraction
import dimension_reduction
import classification

[nltk_data] Downloading package wordnet to /home/makus/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/makus/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Load dataset

In [2]:
# Location of the messages CSV.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere until it is pointed at a local copy of the file.
messages_csv = '/home/makus/Documents/Semester_7_WIN2022/Bachelorarbeit/ba_code/datasets/messages.csv'

# Load the CSV through the project's import helper; the bare name on the last
# line makes the notebook render the loaded data as the cell output.
lingspam = import_data.create_df_from_csv(messages_csv)
lingspam

Unnamed: 0,text,label
0,job posting - apple-iss research centercontent...,0
1,"Subjectlang classification grimes , joseph e ....",0
2,query : letter frequencies for text identifica...,0
3,riska colleague and i are researching the diff...,0
4,request book informationearlier this morning i...,0
...,...,...
2888,love your profile - ysuolvpvhello thanks for s...,1
2889,you have been asked to join kiddinthe list own...,1
2890,anglicization of composers ' namesjudging from...,0
2891,"re : 6 . 797 , comparative method : n - ary co...",0


# Preprocessing

To illustrate the effect of our preprocessing, we keep the raw text column alongside a preprocessed version of it and compare the two later on.

In [3]:
# Column of the loaded data that holds the mail text.
text_column = 'text'

# Raw mail texts, kept for the later raw-vs-preprocessed comparison.
lingspam_mails = lingspam[text_column]

# Preprocessed mail texts, produced by the project's preprocessing helper.
lingspam_mails_clean = preprocessing.preprocess_only_text_format(lingspam[text_column])

[nltk_data] Downloading package wordnet to /home/makus/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/makus/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


It took 31.020194115000777 second(s) to finish preprocessing.


# Feature Extraction with Doc2Vec

In [4]:
# Second argument passed to doc2vec_vectorize for both runs; the printed
# shapes have this many columns.
doc2vec_dim = 100

# Doc2Vec features for the preprocessed mails.
print('Preprocessed data:')
dv_mails = feature_extraction.doc2vec_vectorize(lingspam_mails_clean, doc2vec_dim)
print(dv_mails.shape)

# Doc2Vec features for the raw mails.
print("Raw data:")
dv_raw_mails = feature_extraction.doc2vec_vectorize(lingspam_mails, doc2vec_dim)
print(dv_raw_mails.shape)

Preprocessed data:
It took 0.8734075510001276 second(s) to create the model and build the vocabulary.
It took 108.20169981502113 second(s) to finish doc2vec.
(2893, 100)
Raw data:
It took 1.733174824010348 second(s) to create the model and build the vocabulary.
It took 198.54579203401227 second(s) to finish doc2vec.
(2893, 100)


Because the Doc2Vec vectors already have a predefined, low dimension (100), we don't need dimension reduction here (empirically tested: more dimensions do not increase the performance of the experiment).

# Feature Extraction with TF-IDF

In [5]:
# TF-IDF features for both text variants.
# Each shape is printed explicitly: a bare `.shape` expression in the middle
# of a cell is evaluated and discarded (only a cell's last expression is
# displayed), so the preprocessed shape would otherwise never be shown.
print("Preprocessed data:")
tfidf_mails = feature_extraction.tfidf_vectorize(lingspam_mails_clean)
print(tfidf_mails.shape)

print("Raw data:")
tfidf_raw_mails = feature_extraction.tfidf_vectorize(lingspam_mails)
print(tfidf_raw_mails.shape)

Preprocessed data:
It took 1.158311904000584 second(s) to finish TF-IDF feature extraction.
Raw data:
It took 1.6696016189816874 second(s) to finish TF-IDF feature extraction.


(2893, 29059)

# Dimension Reduction with the TF-IDF vectorized mails

## Random Projection

In [6]:
# Sparse random projection of both TF-IDF matrices.
# Each shape is printed explicitly: a bare `.shape` expression in the middle
# of a cell is evaluated and discarded (only a cell's last expression is
# displayed), so the preprocessed shape would otherwise never be shown.
print("Preprocessed data:")
sparse_tfidf_mails = dimension_reduction.sparse_random_projection(tfidf_mails)
print(sparse_tfidf_mails.shape)

print("Raw data:")
sparse_raw_tfidf_mails = dimension_reduction.sparse_random_projection(tfidf_raw_mails)
print(sparse_raw_tfidf_mails.shape)

Preprocessed data:
It took 5.190513406996615 second(s) to finish dimension reduction with Sparse Random Projection.
Raw data:
It took 5.676838886021869 second(s) to finish dimension reduction with Sparse Random Projection.


(2893, 6831)

## IPCA

Reduce the mail set to 100 dimensions so that it has the same dimensionality as the Doc2Vec features.

In [7]:
# IPCA on both randomly projected matrices; 100 is passed as the second
# argument to match the Doc2Vec feature dimension.
# Each shape is printed explicitly: a bare `.shape` expression in the middle
# of a cell is evaluated and discarded (only a cell's last expression is
# displayed), so the preprocessed shape would otherwise never be shown.
print("Preprocessed data:")
very_sparse_tfidf_mails = dimension_reduction.ipca(sparse_tfidf_mails, 100)
print(very_sparse_tfidf_mails.shape)

print("Raw data:")
very_sparse_raw_tfidf_mails = dimension_reduction.ipca(sparse_raw_tfidf_mails, 100)
print(very_sparse_raw_tfidf_mails.shape)

Preprocessed data:
It took 20.05183771700831 second(s) to finish dimension reduction with IPCA.
Raw data:
It took 27.459661603992572 second(s) to finish dimension reduction with IPCA.


(2893, 100)

Comment: Sparse random projection yields the same dimension for both inputs, but the resulting matrices are not equal.
TF-IDF + dimension reduction is faster than Doc2Vec in both cases, i.e. for the preprocessed and for the non-preprocessed dataset.
The question is: which procedure performs better on classification?

# SVM classification for comparison

In [8]:
# Target labels shared by every experiment.
y = lingspam['label']

# Feature matrices to compare. The order matters: the evaluation loop numbers
# the sets 1..6 in exactly this sequence.
experiment_sets = [
    # Doc2Vec features
    dv_mails,                     # preprocessed data
    dv_raw_mails,                 # not preprocessed data
    # TF-IDF + sparse random projection
    sparse_tfidf_mails,           # preprocessed data
    sparse_raw_tfidf_mails,       # not preprocessed data
    # TF-IDF + sparse random projection + IPCA
    very_sparse_tfidf_mails,      # preprocessed data
    very_sparse_raw_tfidf_mails,  # not preprocessed data
]

In [9]:
# Run the SVM once per feature set and collect accuracy, F1 score and
# wall-clock time of the classification call into one report table.
report_columns = ['Set Index', 'Accuracy', 'F_one_Score', 'Time']
report_rows = []

# enumerate(start=1) supplies the 1-based set index. The loop variable is
# called `features` so that the builtin `set` is not shadowed (and left
# rebound to a feature matrix after the loop).
for set_index, features in enumerate(experiment_sets, start=1):
    start = perf_counter()
    report, X_test, y_test, y_pred = classification.svm(features, y, noisy=False)
    elapsed = perf_counter() - start  # timing covers only the svm() call

    report_rows.append({
        'Set Index': set_index,
        # NOTE(review): presumably `report` is a table in which the accuracy
        # value is repeated and [0] picks the first entry — verify against
        # classification.svm.
        'Accuracy': report["accuracy"][0],
        # F1 score of the class labelled "0"; None if that class is absent.
        # NOTE(review): confirm that "0" is the class the F1 should refer to.
        'F_one_Score': report.get("0", {}).get("f1-score"),
        'Time': elapsed,
    })

# Build the frame once from the collected rows instead of enlarging it row by
# row; this also keeps "Set Index" an integer column. Passing `columns`
# preserves the column layout even when there are no rows.
feature_extraction_report = pd.DataFrame(report_rows, columns=report_columns)
feature_extraction_report

Unnamed: 0,Set Index,Accuracy,F_one_Score,Time
0,1.0,0.73057,0.841463,0.51385
1,2.0,0.699482,0.819502,0.497961
2,3.0,0.943005,0.967294,71.801906
3,4.0,0.956822,0.975025,72.599672
4,5.0,0.941278,0.966337,0.349729
5,6.0,0.955095,0.974052,0.28634


Conclusion:
For our intended use the following method is recommended:

TF-IDF -> Random Projection -> Incremental PCA without preprocessing

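It produces the best results once computation speed is taken into account: its accuracy is within about 0.2 percentage points of the best set (TF-IDF -> Random Projection without IPCA), while the classification takes a fraction of a second instead of more than a minute.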
For the rest of our experiment, we only use this method.