# **Train Data Analysis**

In [None]:
import os
import random
import re
import string

import nltk
import numpy as np
import pandas as pd

from gensim.models import Word2Vec

from nltk import word_tokenize
from nltk.corpus import stopwords

from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_samples, silhouette_score

# Fetch the NLTK stop-word lists used later to filter tokens.
nltk.download("stopwords")

# Seed Python's and NumPy's global random generators with one shared value.
SEED = 42
random.seed(SEED)
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# setting it here presumably only affects child processes — confirm.
os.environ["PYTHONHASHSEED"] = str(SEED)
np.random.seed(SEED)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


***Clean and Tokenize Train Data***

In [None]:
def clean_text(text, tokenizer, stopwords):
    """Normalise a raw document and split it into filtered tokens.

    Args:
        text: Raw text; it is coerced to ``str`` before processing.
        tokenizer: Callable turning the cleaned string into a list of tokens.
        stopwords: Collection of tokens to discard.

    Returns:
        List of tokens that are not stopwords, are not purely numeric and are
        longer than one character.
    """
    # Ordered (pattern, replacement) pairs. Order matters because each rule
    # works on the output of the previous one.
    substitutions = (
        (r"\[(.*?)\]", ""),  # drop bracketed spans such as "[+XYZ chars]"
        (r"\s+", " "),  # collapse whitespace runs into a single space
        (r"\w+…|…", ""),  # drop ellipses together with the truncated word
        (r"(?<=\w)-(?=\w)", " "),  # split hyphenated words
        (f"[{re.escape(string.punctuation)}]", ""),  # strip punctuation
    )

    cleaned = str(text).lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return [
        token
        for token in tokenizer(cleaned)
        if token not in stopwords and not token.isdigit() and len(token) > 1
    ]

**Downloading the Training Dataset**

In [None]:
# Shell commands: download the dataset archive, then unpack it into the
# current working directory.
!wget https://s3-ap-southeast-1.amazonaws.com/he-public-data/dataset6f31ddd.zip
!unzip dataset6f31ddd.zip

In [None]:
### Upload local files into the Google Colab session; `uploaded` keeps
### whatever files.upload() returns.

from google.colab import files    
uploaded = files.upload()

Saving test.csv to test.csv


In [None]:
# Load train.csv and preview its first rows.
train_data = pd.read_csv('train.csv')
train_data.head()

Unnamed: 0,File_ID,Is_Mixedup,Topic
0,0x20de0,NO,Animal Species
1,0x23264,NO,Plant Species
2,0x22e2e,NO,Animal Species
3,0x223b5,YES,Not Applicable
4,0x22bd4,NO,Animal Species


In [None]:
# Work on an independent copy: a plain assignment would only create a second
# name for the same DataFrame, so columns added to `train_data_v1` later would
# silently show up on `train_data` as well.
train_data_v1 = train_data.copy()
train_data_v1.head(10)

Unnamed: 0,File_ID,Is_Mixedup,Topic
0,0x20de0,NO,Animal Species
1,0x23264,NO,Plant Species
2,0x22e2e,NO,Animal Species
3,0x223b5,YES,Not Applicable
4,0x22bd4,NO,Animal Species
5,0x239af,NO,Plant Species
6,0x226d4,NO,Animal Species
7,0x22d33,NO,Plant Species
8,0x23e2c,NO,Animal Species
9,0x2277e,YES,Not Applicable


In [None]:
# Count missing values per column.
train_data.isnull().sum()

File_ID       0
Is_Mixedup    0
Topic         0
dtype: int64

In [None]:
# Number of rows in the training frame.
train_data.shape[0]

12000

**Read the contents of the PDF files for each File ID and store them in the 'content' column of train_data_v1**

In [None]:
!pip install PyPDF2
import PyPDF2
import warnings
!pip install textract
import textract 
warnings.filterwarnings("ignore",category=DeprecationWarning)
# Extract the text of every training PDF, in the same order as the rows of
# `train_data_v1`, and store it in a new 'content' column.
# NOTE(review): PdfFileReader / numPages / getPage / extractText are the legacy
# PyPDF2 names — confirm they exist in the PyPDF2 version that gets installed.
text_list = []
for file_id in train_data_v1['File_ID']:
  path = '/content/dataset/train/' + file_id + '.pdf'

  # Read the embedded text layer page by page. The `with` block closes the
  # file handle even when PyPDF2 raises on a malformed document.
  with open(path, 'rb') as pdf_file:
    pdf_reader = PyPDF2.PdfFileReader(pdf_file)
    text = ''.join(
      pdf_reader.getPage(page_no).extractText()
      for page_no in range(pdf_reader.numPages)
    )

  if text == '':
    # No extractable text: fall back to OCR. textract expects a file *path*
    # and returns bytes, so the result has to be decoded and kept.
    text = textract.process(
      path, method='tesseract', encoding='utf-8', language='eng'
    ).decode('utf-8')

  text_list.append(text)

train_data_v1['content'] = np.array(text_list)

In [None]:
# Preview the frame, now including the extracted 'content' column.
print(train_data_v1.head())

   File_ID Is_Mixedup           Topic  \
0  0x20de0         NO  Animal Species   
1  0x23264         NO   Plant Species   
2  0x22e2e         NO  Animal Species   
3  0x223b5        YES  Not Applicable   
4  0x22bd4         NO  Animal Species   

                                             content  
0  Biological Research Labs\n Greens often combin...  
1  Biological Research Labs\n Attempts to remove ...  
2  Biological Research Labs\nA domestic or domest...  
3  Biological Research Labs\n Hover flies are als...  
4  Biological Research Labs\n However, he clearly...  


In [None]:
# Show the current value of `text`, i.e. what the extraction loop left in it
# for the last PDF it processed.
print(text)

***Train Data Cleaning and preprocessing***


In [None]:
import nltk
nltk.download('punkt')  # tokenizer data, presumably required by word_tokenize

# NLTK's English stop words plus a few extra words to discard.
custom_stopwords = set(stopwords.words("english") + ["news", "new", "top"])
text_columns = ["content"]

df_raw = train_data_v1
# astype returns a new frame, so `df_raw` is left untouched while every text
# column is cast to str.
df = df_raw.astype(dict.fromkeys(text_columns, str))

# Join all text columns into a single string per row, then tokenize it.
df["text"] = df[text_columns].apply(" | ".join, axis=1)
df["tokens"] = df["text"].apply(clean_text, args=(word_tokenize, custom_stopwords))

# Keep one row per distinct token list. `idx` holds the position of the first
# occurrence of each distinct list, so the rows also come out reordered.
_, idx = np.unique(df["tokens"], return_index=True)
df = df.iloc[idx]

# Drop documents that ended up with no tokens and keep the relevant columns.
df = df.loc[df["tokens"].map(len) > 0, ["File_ID", "text", "tokens"]]
docs = df["text"].values
tokenized_docs = df["tokens"].values

print(f"Original dataframe: {df_raw.shape}")
print(f"Pre-processed dataframe: {df.shape}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Original dataframe: (8000, 4)
Pre-processed dataframe: (7750, 3)


In [None]:
# Preview the de-duplicated, tokenized training frame.
df.head()

Unnamed: 0,File_ID,text,tokens
5873,0x23376,Biological Research Labs\n1016/j cub 2007 03 0...,"[biological, research, labs, 1016j, cub, pmid,..."
3100,0x21cda,Biological Research Labs\n1111/j 1096-0031 200...,"[biological, research, labs, 1111j, pdf, fullt..."
915,0x22428,Biological Research Labs\n Until the 13th cent...,"[biological, research, labs, 13th, century, to..."
5777,0x2105a,Biological Research Labs\n Until the 13th cent...,"[biological, research, labs, 13th, century, to..."
461,0x217e0,Biological Research Labs\n Until the 13th cent...,"[biological, research, labs, 13th, century, to..."


In [None]:
# Print the array of token lists (one list per remaining document).
print(tokenized_docs)

***Generate Document Vectors***
After we've cleaned and tokenized the text, we'll use the documents' tokens to create vectors using Word2Vec. This process consists of two steps:

1. Train a Word2Vec model using the tokens you generated earlier. 
2. Generate a vector per document based on its individual word vectors.

In [None]:
#1. Train a Word2Vec model using the tokens you generated earlier.
# A single worker and a fixed seed are passed, presumably to make training
# repeatable between runs.
model = Word2Vec(sentences=tokenized_docs, workers=1, seed=SEED)


*Create Document Vectors from Word Embedding*

In [None]:
#2. Generate a vector per document based on its individual word vectors
def vectorize(list_of_docs, model):
    """Average word embeddings to obtain one vector per document.

    Args:
        list_of_docs: Iterable of token lists, one per document.
        model: Embedding model exposing ``wv`` (token lookup) and
            ``vector_size``.

    Returns:
        List with one vector per document: the mean of the vectors of the
        tokens found in ``model.wv``, or a zero vector of length
        ``model.vector_size`` when none of the tokens is found.
    """
    doc_vectors = []
    for doc_tokens in list_of_docs:
        known_vectors = []
        for tok in doc_tokens:
            if tok not in model.wv:
                continue  # out-of-vocabulary tokens do not contribute
            try:
                known_vectors.append(model.wv[tok])
            except KeyError:
                pass  # lookup failed despite the membership test: skip token

        if known_vectors:
            doc_vectors.append(np.asarray(known_vectors).mean(axis=0))
        else:
            doc_vectors.append(np.zeros(model.vector_size))
    return doc_vectors
    
# Embed every training document, then show (number of documents, vector length).
vectorized_docs = vectorize(tokenized_docs, model=model)
len(vectorized_docs), len(vectorized_docs[0])

(7750, 100)

In [None]:
# Inspect the vector of the first document.
print(vectorized_docs[0])

***Cluster Documents Using (Mini-batches) K-means***


In [None]:
def mbkmeans_clusters(
    X,
    k,
    mb,
    print_silhouette_values,
    random_state=None,
):
    """Fit MiniBatchKMeans on X and print Silhouette metrics.

    Args:
        X: Matrix of features.
        k: Number of clusters.
        mb: Size of mini-batches.
        print_silhouette_values: If true, also print size and
            average/min/max silhouette value of every cluster, best average
            first.
        random_state: Forwarded to ``MiniBatchKMeans``. The default ``None``
            keeps the previous behaviour; pass an int for a seeded estimator.

    Returns:
        Tuple ``(km, labels)`` with the fitted clustering model and the
        cluster label assigned to every row of X.
    """
    km = MiniBatchKMeans(
        n_clusters=k, batch_size=mb, random_state=random_state
    ).fit(X)
    labels = km.labels_

    print(f"For n_clusters = {k}")
    print(f"Silhouette coefficient: {silhouette_score(X, labels):0.2f}")
    print(f"Inertia:{km.inertia_}")

    if print_silhouette_values:
        sample_silhouette_values = silhouette_samples(X, labels)
        print("Silhouette values:")

        per_cluster = []
        for cluster_id in range(k):
            values = sample_silhouette_values[labels == cluster_id]
            if values.size == 0:
                # .min()/.max() raise on an empty array, so report a cluster
                # without members explicitly instead of crashing.
                nan = float("nan")
                per_cluster.append((cluster_id, 0, nan, nan, nan))
                continue
            per_cluster.append(
                (
                    cluster_id,
                    values.shape[0],
                    values.mean(),
                    values.min(),
                    values.max(),
                )
            )

        # Best average silhouette first; empty clusters (if any) go last.
        per_cluster.sort(key=lambda stats: (stats[1] > 0, stats[2]), reverse=True)
        for cluster_id, size, avg, low, high in per_cluster:
            print(
                f"    Cluster {cluster_id}: Size:{size} | Avg:{avg:.2f} | Min:{low:.2f} | Max: {high:.2f}"
            )
    return km, labels

**Defining Clusters:**
Execute `mbkmeans_clusters`, providing it with the vectorized documents and the number of clusters.

In [None]:
# Cluster the training document vectors into three groups.
clustering, cluster_labels = mbkmeans_clusters(
    X=vectorized_docs, k=3, mb=500, print_silhouette_values=True
)

# One row per document: file id, raw text, space-joined tokens, cluster label.
df_clusters = pd.DataFrame(
    {
        "File_ID": df["File_ID"].values,
        "text": docs,
        "tokens": list(map(" ".join, tokenized_docs)),
        "cluster": cluster_labels,
    }
)

For n_clusters = 3
Silhouette coefficient: 0.08
Inertia:49304.69780434224
Silhouette values:
    Cluster 0: Size:2742 | Avg:0.11 | Min:0.00 | Max: 0.24
    Cluster 1: Size:2695 | Avg:0.07 | Min:-0.08 | Max: 0.26
    Cluster 2: Size:2313 | Avg:0.07 | Min:-0.04 | Max: 0.21


In [None]:
# Preview the per-document cluster assignments.
df_clusters.head()

Unnamed: 0,File_ID,text,tokens,cluster
0,0x23376,Biological Research Labs\n1016/j cub 2007 03 0...,biological research labs 1016j cub pmid clutto...,2
1,0x21cda,Biological Research Labs\n1111/j 1096-0031 200...,biological research labs 1111j pdf fulltext dy...,2
2,0x22428,Biological Research Labs\n Until the 13th cent...,biological research labs 13th century tops tow...,2
3,0x2105a,Biological Research Labs\n Until the 13th cent...,biological research labs 13th century tops tow...,1
4,0x217e0,Biological Research Labs\n Until the 13th cent...,biological research labs 13th century tops tow...,0


In [None]:
print("Most representative terms per cluster (based on centroids):")
for i in range(3):
    # Vocabulary entries most similar to this cluster's centroid.
    most_representative = model.wv.most_similar(
        positive=[clustering.cluster_centers_[i]], topn=10
    )
    # Every term is followed by a single space, including the last one.
    tokens_per_cluster = "".join(f"{entry[0]} " for entry in most_representative)
    print(f"Cluster {i}: {tokens_per_cluster}")

Most representative terms per cluster (based on centroids):
Cluster 0: adults species juveniles individuals predators calves seahorses individual populations captivity 
Cluster 1: agriculturecork hennessey bawden newell defenceless descendantsfady mamatas vallelonga setta stahl 
Cluster 2: adults shedding species neutrally mouthparts seed juveniles soil culms happens 


# **Test Data Analysis**


In [None]:
# Load test.csv, then print a preview and the frame's shape.
test_data = pd.read_csv('test.csv')
print(test_data.head())
print(test_data.shape)

   File_ID
0  0x23bee
1  0x22f33
2  0x22011
3  0x22f38
4  0x22673
(3000, 1)


***Reading the Test Data PDF Files***

In [None]:
!pip install PyPDF2
import PyPDF2
import warnings
!pip install textract
import textract 
warnings.filterwarnings("ignore",category=DeprecationWarning)
# Extract the text of every *test* PDF, in the same order as the rows of
# `test_data`, and store it in a new 'content' column. The file ids must come
# from `test_data`; taking them from the training frame would attach training
# documents to the test rows.
# NOTE(review): PdfFileReader / numPages / getPage / extractText are the legacy
# PyPDF2 names — confirm they exist in the PyPDF2 version that gets installed.
text_list = []
for file_id in test_data['File_ID']:
  # NOTE(review): assumes the archive unpacks the test PDFs into
  # /content/dataset/test/ next to the training folder — confirm.
  path = '/content/dataset/test/' + file_id + '.pdf'

  # Read the embedded text layer page by page. The `with` block closes the
  # file handle even when PyPDF2 raises on a malformed document.
  with open(path, 'rb') as pdf_file:
    pdf_reader = PyPDF2.PdfFileReader(pdf_file)
    text = ''.join(
      pdf_reader.getPage(page_no).extractText()
      for page_no in range(pdf_reader.numPages)
    )

  if text == '':
    # No extractable text: fall back to OCR. textract expects a file *path*
    # and returns bytes, so the result has to be decoded and kept.
    text = textract.process(
      path, method='tesseract', encoding='utf-8', language='eng'
    ).decode('utf-8')

  text_list.append(text)

test_data['content'] = np.array(text_list)

In [None]:
# Preview the test frame with its extracted 'content' column.
test_data.head(3)

Unnamed: 0,File_ID,content
0,0x23bee,Biological Research Labs\n Greens often combin...
1,0x22f33,Biological Research Labs\n Attempts to remove ...
2,0x22011,Biological Research Labs\nA domestic or domest...


***Cleaning and Pre-processing the test data***

In [None]:
# Make sure NLTK's 'punkt' data is available (presumably needed by
# word_tokenize).
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# NLTK's English stop words plus a few extra words to discard.
custom_stopwords = set(stopwords.words("english") + ["news", "new", "top"])
text_columns = ["content"]

df_raw_1 = test_data
# astype returns a new frame, so `df_raw_1` is left untouched while every text
# column is cast to str.
df_test = df_raw_1.astype(dict.fromkeys(text_columns, str))

# Join all text columns into a single string per row, then tokenize it.
df_test["text"] = df_test[text_columns].apply(" | ".join, axis=1)
df_test["tokens"] = df_test["text"].apply(
    clean_text, args=(word_tokenize, custom_stopwords)
)

# No de-duplication or empty-row filtering here, so every input row is kept;
# only the raw 'content' column is dropped.
df_test = df_test.drop(columns="content")
# NOTE: `docs` and `tokenized_docs` are rebound to the test documents here.
docs = df_test["text"].values
tokenized_docs = df_test["tokens"].values

print(f"Original dataframe: {df_raw_1.shape}")
print(f"Pre-processed dataframe: {df_test.shape}")
print(df_test.head(3))
print(df_test.shape)

Original dataframe: (3000, 2)
Pre-processed dataframe: (3000, 3)
   File_ID                                               text  \
0  0x23bee  Biological Research Labs\n Greens often combin...   
1  0x22f33  Biological Research Labs\n Attempts to remove ...   
2  0x22011  Biological Research Labs\nA domestic or domest...   

                                              tokens  
0  [biological, research, labs, greens, often, co...  
1  [biological, research, labs, attempts, remove,...  
2  [biological, research, labs, domestic, domesti...  
(3000, 3)


In [None]:
# (rows, columns) of the pre-processed test frame.
df_test.shape

(3000, 3)

***Generate Document Vectors of test data***

In [None]:
# Train a new Word2Vec model on the current `tokenized_docs`.
# NOTE(review): this rebinds `model`, discarding the embedding trained
# earlier in the notebook, so vectors produced from here on are not in the
# same space as the ones the earlier clustering was fitted on — confirm this
# is intended.
model = Word2Vec(sentences=tokenized_docs, workers=1, seed=SEED)

In [None]:
def vectorize(list_of_docs, model):
    """Turn each tokenized document into the mean of its word vectors.

    Args:
        list_of_docs: Iterable of token lists, one per document.
        model: Embedding model exposing ``wv`` (token lookup) and
            ``vector_size``.

    Returns:
        List with one vector per document. Documents without any token in
        ``model.wv`` are represented by a zero vector of length
        ``model.vector_size``.
    """
    averaged = []
    for doc_tokens in list_of_docs:
        embeddings = []
        for word in doc_tokens:
            if word not in model.wv:
                continue  # unknown words are ignored
            try:
                embeddings.append(model.wv[word])
            except KeyError:
                pass  # lookup failed despite the membership test: skip word

        if not embeddings:
            averaged.append(np.zeros(model.vector_size))
            continue
        averaged.append(np.asarray(embeddings).mean(axis=0))
    return averaged
    
# Embed every test document, then show (number of documents, vector length).
vectorized_docs_test = vectorize(tokenized_docs, model=model)
len(vectorized_docs_test), len(vectorized_docs_test[0])

(3000, 100)

***Cluster Prediction Using the Test Data***

In [None]:
# NOTE(review): fit_predict re-fits `clustering` on the test vectors before
# labelling them, so the centroids learned earlier are replaced rather than
# reused — confirm this is intended (predict() would keep them).
prediction = clustering.fit_predict(vectorized_docs_test)

# One row per test document: file id, raw text, space-joined tokens, cluster.
df_predict = pd.DataFrame({
    "File_ID":df_test["File_ID"].values,
    "text": docs,
    "tokens": [" ".join(text) for text in tokenized_docs],
    "cluster": prediction
})

df_predict.head()

In [None]:
# (rows, columns) of the prediction frame.
df_predict.shape

(3000, 4)

In [None]:
print("Most representative terms per cluster (based on centroids):")
for i in range(3):
    # Vocabulary entries most similar to this cluster's centroid.
    most_representative = model.wv.most_similar(
        positive=[clustering.cluster_centers_[i]], topn=20
    )
    # Every term is followed by a single space, including the last one.
    tokens_per_cluster = "".join(f"{entry[0]} " for entry in most_representative)
    print(f"Cluster {i}: {tokens_per_cluster}")

Most representative terms per cluster (based on centroids):
Cluster 0: sewed namibiensis defoliations cryptophaps worsened renegotiated yearsuntil nanakshahi pitts kawhi indefensible lopholaimus allonursing naltar calabrese waterhouse disprove compelled bolton locales 
Cluster 1: waterhouse dietthe theaters kawhi evidently scats leyva scutigeromorph sika myrmeleontinae taphonomic craterostigmomorph nonvenomous tapanuli pitts mesechinus bovinae stuntman mitigating teinolophos 
Cluster 2: stuntman kawhi foodin markedly abundantly hissar geophilomorphs putris sagarmatha dietthe evidently waterhouse motile har reproductively neutrally bolton cephalophinae productionthe poker 


***Cluster Tagging***

In [None]:
#approach 1
# Translate numeric cluster ids into topic names in a single pass. Mapping the
# whole column at once avoids writing strings into an integer column through
# .loc, which relies on silent dtype upcasting. Values that are not a known
# cluster id are kept unchanged, so re-running the cell is harmless.
# NOTE(review): the id -> topic assignment is hard-coded; nothing in this
# notebook ties a given cluster id to a given topic — confirm the mapping
# against the fitted clusters.
cluster_to_topic = {0: "Plant Species", 1: "Animal Species", 2: "Not Applicable"}
df_predict["cluster"] = df_predict["cluster"].map(
    lambda label: cluster_to_topic.get(label, label)
)
df_predict.head(10)

In [None]:
# Documents tagged "Not Applicable" are flagged as mixed up; all others are not.
is_not_applicable = df_predict["cluster"].eq("Not Applicable")
df_predict["Is_Mixedup"] = np.where(is_not_applicable, "YES", "NO")
df_predict.head()

Unnamed: 0,File_ID,text,tokens,cluster,Is_Mixedup
0,0x23bee,Biological Research Labs\n Greens often combin...,biological research labs greens often combine ...,Plant Species,NO
1,0x22f33,Biological Research Labs\n Attempts to remove ...,biological research labs attempts remove seeds...,Not Applicable,YES
2,0x22011,Biological Research Labs\nA domestic or domest...,biological research labs domestic domesticated...,Animal Species,NO
3,0x22f38,Biological Research Labs\n Hover flies are als...,biological research labs hover flies also comm...,Animal Species,NO
4,0x22673,"Biological Research Labs\n However, he clearly...",biological research labs however clearly shows...,Plant Species,NO


*Creating the Submission csv*

In [None]:
# Keep only the submission columns, exposing the cluster tag as 'Topic'.
df_submit = (
    df_predict[["File_ID", "Is_Mixedup", "cluster"]]
    .rename(columns={"cluster": "Topic"})
    .reset_index(drop=True)
)
print(df_submit.head())
print("The dimension of predict data:",df_submit.shape)

   File_ID Is_Mixedup           Topic
0  0x23bee         NO   Plant Species
1  0x22f33        YES  Not Applicable
2  0x22011         NO  Animal Species
3  0x22f38         NO  Animal Species
4  0x22673         NO   Plant Species
The dimension of predict data: (3000, 3)


In [None]:
# Export the submission data to a new CSV file, without the index column.
df_submit.to_csv('Clustering_word2vec_appr1_v1.csv',index=False)