# Importing necessary libraries and modules



This block imports the libraries and modules required for data processing and others.


* listdir and isfile from the os module
* csv
* matplotlib
* numpy
* pandas
* sklearn (for machine learning)
* Word2Vec from the gensim module.

<p>
Initializing data structures: This block initializes two empty lists rowsx and yx.
</p>

In [1]:
from os import listdir
from os.path import isfile, join
import csv
import matplotlib.pyplot as plt
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from os import walk
import pandas as pd
from gensim.models import Word2Vec
from sklearn.preprocessing import scale
from collections import Counter
rowsx = []
yx = []



<p>
<h2>Reading data from the training dataset:</h2> 

* This block opens the train.csv file and reads each row. 
* It extracts the first column as yx and the rest of the columns as rows1. 
* It splits each column on the newline character (\n) and appends the resulting strings to rows1.
* It then appends rows1 to rowsx.
</p>

In [2]:
with open("./data/train.csv", 'r', encoding='latin1') as csv1:
    # creating a csv reader object
    csvreader1 = csv.reader(csv1)
    # extracting each data row one by one
    for row in csvreader1:
        rows1 = []
        for i in range(1, len(row)-15):
            for j in row[i].split("\n"):
                    rows1.append(j)
        yx.append(row[0])
        del (row[0])
        rowsx.append(rows1)

print("done")


done


# Word2Vec Embeddings: 

* This block uses the Word2Vec class from the gensim module to create word embeddings for each sentence in rowsx.
* It sets the embedding dimension to 200 and the minimum count to 3, and trains the embeddings using the sentences in rowsx.

# TF-IDF Vectorization: 

* This block uses the TfidfVectorizer class from the sklearn module to generate a TF-IDF (term frequency-inverse document frequency) matrix for the sentences in rowsx.
* It sets the minimum document frequency to 3 and fits the vectorizer to the sentences in rowsx. 
* It also creates a dictionary tfidf_map that maps each word to its corresponding IDF value.



In [3]:
embeddings = Word2Vec(vector_size=200, min_count=3)
embeddings.build_vocab([sentence for sentence in rowsx])
embeddings.train([sentence for sentence in rowsx],
                 total_examples=embeddings.corpus_count,
                 epochs=embeddings.epochs)

gen_tfidf = TfidfVectorizer(analyzer=lambda x: x, min_df=3)
matrix = gen_tfidf.fit_transform([sentence   for sentence in rowsx])
tfidf_map = dict(zip(gen_tfidf.get_feature_names(), gen_tfidf.idf_))

print(len(tfidf_map))

16537




# Sentence Encoding: 

* This block defines a function encode_sentence that takes in a list of tokens (words) and the embedding size (200 in this case), and encodes the sentence using the Word2Vec embeddings and the TF-IDF values for each word.
* If a word is not present in the embeddings, it is ignored. 
* The function returns a vector representation of the sentence.

In [4]:
def encode_sentence(tokens, emb_size):
    _vector = np.zeros((1, emb_size))
    length = 0
    for word in tokens:
        try:
            _vector += embeddings.wv[word].reshape((1, emb_size)) * tfidf_map[word]
            length += 1
        except KeyError:
            continue
        break

    if length > 0:
        _vector /= length

    return _vector


# Encoding the Training Data: 


* This block encodes each sentence in rowsx using the encode_sentence function and concatenates the resulting vectors into a single matrix x_train.
* It then standardizes the matrix using the scale function from sklearn.

In [5]:
x_train = scale(np.concatenate([encode_sentence(ele, 200) for ele in map(lambda x: x, rowsx)]))

print(x_train.shape)

(474824, 200)


# K-Nearest Neighbors (KNN) Classifier:

* This block creates a KNN classifier using the KNeighborsClassifier class from sklearn, with n_neighbors set to 30. 
* It trains the classifier on the standardized training data (x_train) and the corresponding labels (yx), and prints the message "done" when the operation is completed.

In [6]:
modelknn = KNeighborsClassifier(n_neighbors=30)
modelknn.fit(x_train,yx)
print("done")

done


# Reading data from the test dataset: 

This block reads the sample_test.csv file and processes it in the same way as the training data (excluding the first column, which contains the labels).

# Encoding the Test Data: 

This block encodes each sentence in rowsx1 using the encode_sentence function and concatenates the resulting vectors into a single matrix x_test. It then standardizes the matrix using the scale function from sklearn.

# Predicting labels:

* This block predicts class labels for test data using KNN, counts the number of predictions for each class label, and sorts the counts in descending order.
* It stores the predicted labels and count of each label in predicted_labels_knn and counts respectively. 
* The sorted list of counts, including the label and count, is stored in listcount.
* Finally, it prints the counts dictionary.

In [7]:
rowsx1 = []
with open("./data/sample_test.csv", 'r', encoding='latin1') as csv1:
    csvreader1 = csv.reader(csv1)
    # extracting each data row one by one
    for row in csvreader1:
        rows1 = []
        for i in range(1, len(row)-15):
            for j in row[i].split("\n"):
                rows1.append(j)
        del (row[0])
        rowsx1.append(rows1)
x_test = scale(np.concatenate([encode_sentence(ele, 200) for ele in map(lambda x: x, rowsx1)]))
predicted_labels_knn = modelknn.predict(x_test)
counts = Counter(predicted_labels_knn)
listcount = [(l, k) for k, l in sorted([(j, i) for i, j in counts.items()], reverse=True)]
print(counts)

Counter({'2391': 29, '1026541': 10, '2554': 7, '772386': 7, '12189': 6, '1159221': 5, '27778221': 5, '42902132': 5, '10350': 5, '10297': 5, '3359851': 4, '1075': 4, '14227633': 4, '12111': 4, '1273081': 4, '12211342': 4, '985': 4, '611823': 4, '1520661': 3, '10021642': 3, '246': 3, '39501385': 3, '48763': 3, '1373201': 3, '14080395': 3, '730073': 3, '3840': 3, '3011091': 3, '50518441': 3, '6327412': 3, '10487982': 3, '12389252': 3, '3452911': 3, '12765532': 3, '40956484': 3, '11470302': 3, '14237044': 3, '1814': 3, '784492': 3, '14072428': 3, '4737': 3, '235': 3, '14105596': 3, '12336822': 3, '813286': 3, '13050': 3, '11994112': 2, '30332372': 2, '1285021': 2, '14522707': 2, '62763': 2, '5434332': 2, '1797691': 2, '3043001': 2, '11537': 2, '1119751': 2, '6144362': 2, '1621271': 2, '365': 2, '1469': 2, '14052380': 2, '2053': 2, '1618221': 2, '1085': 2, '8291': 2, '11078052': 2, '5307': 2, '2774': 2, '24053': 2, '31253': 2, '10283882': 2, '54433665': 2, '44539471': 2, '1319451': 2, '1668

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
