# Import libraries

In [None]:
import os
import pandas as pd
import numpy as np
import collections
import re
import string
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.utils import get_tmpfile
from nltk.stem import WordNetLemmatizer
import nltk
from sklearn.model_selection import train_test_split
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

nltk and gensim are two famous libraries that are used in Natural Language Processing (NLP). nltk library has been used to get the stop words of English language and to lemmatize words. Also gensim library has been used to load the Doc2vec model

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
proj_dir='/content/drive/MyDrive/Colab Notebooks/doc2vec/' #give your project directory here. data sets should be in this location

# Load and clean the data

before converting into vectors text should be cleaned.

* Lower the english latters
* Remove headers
* Drop all digits
* Remove URLs and Emails
* Drop all punctuation from our text
* Drop stop words
* Lemmatize words


first, we convert all the letters into lowercase letters in order to avoid duplicating the same word ('Dog' and 'dog' are the same words. but the program will take these as two words. Hence we convert 'Dog' as 'dog' by doing letters lowercase)

After that we drop all the headers and numbers.Because headers are unwanted data and numbers don't have any meaning when we take them as a single word.

if any word contains '@','.com' or 'http' we drop those words. also we remove all punctuations and stop words. The words which are generally filtered out before processing a natural language are called stop words. These are actually the most common words such as “the”, “a”, “an”, “so”, “what” in English language. By removing these words, we remove the low-level information from our text in order to give more focus to the important information.

finally we lemmatize words. Lemmatization is the process wherein the context is used to convert a word to its meaningful base or root form. ex:- driving--> drive , dogs-->dog. To perform Lemmatization with Natural Language Tool Kit (NLTK), “WordNetLemmatizer()” method has been used

In [None]:
'''this function is used to read text files.'''
def read_txt_file(file_name):
    with open(file_name,encoding="utf8", errors='ignore') as f:
        ###extract the boady of the text###
        line = f.readline()
        txt=''
        txt=txt+' '+line
        while line:
            line = f.readline()
            txt=txt+' '+line
        ###################################
    f.close()
    return(txt)

In [None]:
stop_words=set(stopwords.words('english'))#load stop words
punctuations=string.punctuation #get punctuations
lemmatizer = WordNetLemmatizer()
'''this function is used to clean text'''
def clean_txt(txt):
    txt=txt.lower() #set all characters to lowercase
    sentences=txt.split('\n')
    txt = ' '.join([i for i in sentences if not ':' in i])#remove headers
    txt = ''.join([i for i in txt if not i.isdigit()])#remove numbers

    ###remove urls and emails###
    words=txt.split()
    txt = ' '.join([i for i in words if not '@' in i and not '.com' in i and not  'http:' in i])
    #######################################

    ###remove punctuations###
    for character in punctuations:
        txt = txt.replace(character, '')
    #########################################
    
    ###remove stop words and lemmatize###
    words=txt.split()
    filtered_txt = ' '.join([lemmatizer.lemmatize(i) for i in words if not i in stop_words])
    #####################################
    
    return(filtered_txt)

In [None]:
def load_and_clean_data(location):    
    y=os.listdir(location)#get the list of folder
    txts=[]
    txts_cleaned=[]
    folder_array=[]
    file_array=[]
    for i in range(len(y)):
        text_file_names=os.listdir(location+'/'+y[i]) #get the list of files
        for text_file_name in text_file_names:
                file_array.append(text_file_name)
                txt=read_txt_file(location+'/'+y[i]+'/'+text_file_name) #read the text file
                txts.append(txt)
                txts_cleaned.append(clean_txt(txt)) #clean the text
                folder_array.append(y[i])

    ###create a data frame###
    df=pd.DataFrame()
    df['texts']=txts
    df['text cleaned']=txts_cleaned
    df['folder name']=folder_array
    df['file name']=file_array
    ########################
    return (df)



In [None]:
df_train=load_and_clean_data(proj_dir+'20news-bydate-train')
df_test=load_and_clean_data(proj_dir+'20news-bydate-test')

KeyboardInterrupt: ignored

In [None]:
df_train.head()

In [None]:
df_test.head()

# Convert to vectors

document is tokenized into words and applied doc2vec model. doc2vec is a method to represent list of words using a vector. it is used to create a vectorised representation of a group of words taken collectively as a single unit.In gensim the model will always be trained on a word per word basis. Therefore  we split the document into an array of words using split(). In order to train the model, tagged documents are needed. it can be created by using models.doc2vec.TaggedDcument(). then finally we train the doc2vec model

In [None]:
'''this function is used to do tokenization'''
def tokenizer(txt):  
    tokens=txt.split(' ')
    unique_tokens=np.unique(np.array(tokens)) #get unique tokens
    ###create a dictonary of tokens###
    tokens_dict={}
    for indx in range(len(unique_tokens)):
        tokens_dict[unique_tokens[indx]]=indx
    return(tokens_dict,tokens)

In [None]:
#tokanize train and test data
words_list=[]
for i in range(len(df_train)):
    _,words=tokenizer(df_train['text cleaned'][i]) 
    words_list.append(words)
for i in range(len(df_test)):
    _,words=tokenizer(df_test['text cleaned'][i])
    words_list.append(words)
####################################

In [None]:
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(words_list)]
model = Doc2Vec(vector_size=40, min_count=2, epochs=30) #fit the Doc2Vec model
model.build_vocab(documents)
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)

In [None]:
###get train vectors##
train_vectors=[]
for i in range(len(df_train)):
    train_vectors.append(model.infer_vector(list(df_train['text cleaned'][i].split(' '))))
##########################

In [None]:
##get test vectors##
test_vectors=[]
for i in range(len(df_test)):
    test_vectors.append(model.infer_vector(list(df_test['text cleaned'][i].split(' '))))
############################

In [None]:
train=pd.DataFrame(train_vectors)
train['folder name']=df_train['folder name']
train['file name']=df_train['file name']
test=pd.DataFrame(test_vectors)
test['folder name']=df_test['folder name']
test['file name']=df_test['file name']

In [None]:
train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,32,33,34,35,36,37,38,39,folder name,file name
0,-0.933205,-0.178977,-0.261009,0.522588,-1.079097,0.142576,0.162591,-0.868961,0.303258,0.526063,...,-0.548832,0.160508,-1.081549,-0.092147,-0.188266,0.078702,0.110594,0.436555,comp.windows.x,66939
1,-0.943828,-0.387731,0.812329,0.069872,-0.979679,-0.230455,0.789759,-0.558625,0.368101,0.619312,...,0.841802,-0.288762,-0.511047,0.315134,0.98187,-0.017787,-1.30945,0.418268,comp.windows.x,67117
2,-0.946725,0.264787,-0.093094,0.776043,-0.100243,-0.666881,0.893518,-0.509928,0.30841,0.140922,...,-0.38882,0.519086,-0.018635,0.497458,0.166446,0.774613,-0.303735,-0.210747,comp.windows.x,66947
3,-0.825486,-0.158539,-0.047129,0.33653,-0.584075,0.214128,0.320981,-0.165183,0.238848,0.1138,...,-0.228607,0.207944,-0.01802,0.05475,-0.249475,0.195008,-0.359494,-0.102604,comp.windows.x,67279
4,-1.24521,0.604609,0.218405,0.284372,-1.356997,-0.581402,0.701923,-0.781762,0.198173,0.474908,...,0.317477,-1.141635,-1.544908,1.008704,0.396541,-0.674398,-0.704394,0.311889,comp.windows.x,67410


In [None]:
test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,32,33,34,35,36,37,38,39,folder name,file name
0,-0.550795,-0.185601,0.179597,0.160102,-0.774041,0.026048,0.101547,-0.618309,0.498451,0.205104,...,-0.200693,0.058638,-0.457904,-0.095728,0.491058,0.255946,-0.680001,0.517784,comp.windows.x,67551
1,-0.284087,-0.115187,0.030702,0.28132,-0.407325,-0.114323,0.454607,0.235487,-0.010821,-0.014563,...,0.268885,0.061176,-0.044732,-0.282029,0.264577,0.162512,-0.086966,0.012738,comp.windows.x,67566
2,-0.75304,-0.321237,0.587022,0.506401,-1.301483,-0.676634,0.877525,-1.028694,0.880936,0.481587,...,0.720471,-0.234179,-0.35636,-0.316403,0.765838,-0.33701,-0.725563,-0.029713,comp.windows.x,68196
3,-2.778311,1.280611,-1.136019,1.160854,-2.688332,-8.161446,2.877449,-2.769453,-1.838291,1.585129,...,0.298002,-1.080549,-3.076638,4.455657,0.344549,-1.086496,-1.242209,-1.996426,comp.windows.x,68332.eml
4,-0.911315,0.11535,-0.346644,0.170139,-2.200856,-0.641439,0.945493,0.113511,0.979531,0.814421,...,-0.274061,-0.27223,-0.439384,1.056096,0.299681,-1.047593,-0.787903,0.248227,comp.windows.x,67972


In [None]:
train_,validation=train_test_split( train, test_size=0.33, random_state=42) #split data into train and validation sets

In [None]:
len(train_),len(test),len(validation)

(7580, 7537, 3734)

In [None]:
#save data sets
train_.to_csv(proj_dir+'train_data.csv')
test.to_csv(proj_dir+'test_data.csv')
validation.to_csv(proj_dir+'validation_data.csv')

# Define independent variables and taget variable

In [2]:
import pandas as pd

# load data
train = pd.read_csv("train_data.csv", index_col=[0])
validation = pd.read_csv("validation_data.csv", index_col=[0])
test = pd.read_csv("test_data.csv", index_col=[0])

# combine validation and training sets to do grid search
train_val = pd.concat([train, validation], axis = 0)
train_val = train_val.reset_index(drop = True)
X_train_val = train_val.iloc[:,0:40]
y_train_val = train_val["folder name"]

# specify x and y
X_train = train.iloc[:,0:40]
y_train = train["folder name"]
X_test = test.iloc[:,0:40]
y_test = test["folder name"]
X_val = validation.iloc[:,0:40]
y_val = validation["folder name"]

# Naive Bayes

In [23]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV, PredefinedSplit

In [13]:
# Create a list where train data indices are -1 and validation data indices are 0
split_index = [-1] * len(X_train) + [0] * len(X_val)
pds = PredefinedSplit(test_fold = split_index)

gnb_clf = GaussianNB()
parameters = {
    'var_smoothing': [1e-1,1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7, 1e-8, 1e-9, 1e-10, 1e-11, 1e-12, 1e-13, 1e-14, 1e-15]
}
clf = GridSearchCV(gnb_clf, parameters, cv=pds, verbose=1, n_jobs=1) #it is hold-out validation here
clf.fit(X_train_val, y_train_val)

print('Best parameter:',clf.best_params_) #best performance on training set and its parameter
print('Best train accuracy score:', clf.best_score_)

Fitting 1 folds for each of 15 candidates, totalling 15 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Best parameter: {'var_smoothing': 0.0001}
Best train accuracy score: 0.386448848419925


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:    0.5s finished


In [14]:
#grid search performance
pd.concat([pd.DataFrame(clf.cv_results_["params"]),pd.DataFrame(clf.cv_results_["mean_test_score"], columns=["Accuracy"])],axis=1)

Unnamed: 0,var_smoothing,Accuracy
0,0.1,0.376272
1,0.01,0.385645
2,0.001,0.386181
3,0.0001,0.386449
4,1e-05,0.386449
5,1e-06,0.386449
6,1e-07,0.386449
7,1e-08,0.386449
8,1e-09,0.386449
9,1e-10,0.386449


In [24]:
#performance on test set
nb_pred = clf.best_estimator_.predict(X_test)
print('Test accuracy score:', accuracy_score(y_test, nb_pred))\

macro_averaged_precision = precision_score(y_test, nb_pred, average = 'macro')
print('Test macro averaged precision score:', macro_averaged_precision)

recall = recall_score(y_test, nb_pred, average = 'macro')
print('Test recall score:', recall)

macro_averaged_f1 = f1_score(y_test, nb_pred, average = 'macro')
print('Test macro averaged f1 score:', macro_averaged_f1)

Test accuracy score: 0.3830436513201539
Test macro averaged precision score: 0.4689854187857151
Test recall score: 0.37356957129115104
Test macro averaged f1 score: 0.38210340584727925


Macro averaged precision: calculate precision for all classes individually and then average them.

# KNN

In [25]:
from sklearn.neighbors import KNeighborsClassifier

In [26]:
k_range = list(range(1, 31)) # possible k

param_grid = dict(n_neighbors=k_range)

knn = KNeighborsClassifier()
clf = GridSearchCV(knn, param_grid, cv=pds, verbose=1, n_jobs=1)
  
# fitting the model for grid search
clf =clf.fit(X_train_val, y_train_val)
print('Best parameter:',clf.best_params_) #best performance on training set and its parameter
print('Best train accuracy score:', clf.best_score_)

Fitting 1 folds for each of 30 candidates, totalling 30 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Best parameter: {'n_neighbors': 19}
Best train accuracy score: 0.5543652919121586


[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  1.2min finished


In [27]:
#grid search performance
pd.concat([pd.DataFrame(clf.cv_results_["params"]),pd.DataFrame(clf.cv_results_["mean_test_score"], columns=["Accuracy"])],axis=1)

Unnamed: 0,n_neighbors,Accuracy
0,1,0.545795
1,2,0.491966
2,3,0.518211
3,4,0.526245
4,5,0.535351
5,6,0.535351
6,7,0.539636
7,8,0.538297
8,9,0.544189
9,10,0.549009


In [28]:
#performance on test set
knn_pred = clf.best_estimator_.predict(X_test)
print('Test accuracy score:', accuracy_score(y_test, knn_pred))\

macro_averaged_precision = precision_score(y_test, knn_pred, average = 'macro')
print('Test macro averaged precision score:', macro_averaged_precision)

recall = recall_score(y_test, knn_pred, average = 'macro')
print('Test recall score:', recall)

macro_averaged_f1 = f1_score(y_test, knn_pred, average = 'macro')
print('Test macro averaged f1 score:', macro_averaged_f1)

Test accuracy score: 0.5194374419530318
Test macro averaged precision score: 0.5628429437813727
Test recall score: 0.5073400289862244
Test macro averaged f1 score: 0.5143767216038445
