In [None]:
#Import the libraries for classifiers
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
#Import the libraries for feature extraction
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
#import libraries for generating report
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.metrics import precision_score, recall_score, f1_score
#Import Pandas
import pandas as pd
#Import numpy
import numpy as np
# Import tqdm to see the progress bar
from tqdm import tqdm
#import lirbraries for cross validation
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score

 **Mount the Google drive**

In [None]:
#Mount the google drive
from google.colab import drive
drive.mount('/content/drive')

MessageError: ignored

**Read the training data**

In [None]:
# Read the training data from the drive
train = pd.read_csv('/content/drive/MyDrive/HASOC-IndoAryan/Task1/en_Hasoc2021_train.csv',',', names=['sno','id','text','task_1','task_2'])


FileNotFoundError: ignored

**Remove special characters from the training data**

In [None]:
# Remove the special characters from the training data
text, y_train = train['text'], train['task_1']
spec_chars = ["!",'"',"#","%","&amp","'","(",")",
              "*","+",",","-",".","/",":",";","<",
              "=",">","?","@","[","\\","]","^","_",
              "`","{","|","}","~","–"]
for char in spec_chars:
    train['text'] = train['text'].str.replace(char, ' ')

**Extract the text and the corresponding label**

In [None]:
#Extract the text and the corresponding label
train,y_train=train['text'],train['task_1']

In [None]:
# Print the training text and number of samples
print(train)
print(train.size)

0                                                    text
1        wealth if you made it through this      were ...
2       Technically that s still turning back the cloc...
3        VMBJP  BJP4Bengal  BJP4India  narendramodi  J...
4        krtoprak yigit Soldier of Japan Who has dick ...
                              ...                        
3839     BBCNews Let the dog deal with the wanker once...
3840    India has suffered a lot  That Chinese bastard...
3841    People didn t give 300  seats majority to BJP ...
3842     KanganaTeam This is such a vile  xenophobic a...
3843     30iPpgStmILw0SI  ChinaDaily  ChineseVirus  Wu...
Name: text, Length: 3844, dtype: object
3844


In [None]:
#Print the number of samples in each class
print(y_train.value_counts())

HOF       2501
NOT       1342
task_1       1
Name: task_1, dtype: int64


**Text feature extraction** is the process of taking out a list of words from the text data and then transforming them into a feature set which is usable by a classifier

1.   Count vectorizer
2.   Term frequency- inverse document frequency
3. Sentence transformer

**Count Vectorizer** - is a way to convert a given set of strings into a frequency representation.

Count Vectors can be helpful in understanding the type of text by the frequency of words in it.

**Disadvantages:**

* Its inability in identifying more important and less important words for 
analysis.
* It will just consider words that are abundant in a corpus as the most statistically significant word.
* It also doesn't identify the relationships between words such as linguistic similarity between words.








# **Extract count vectorizer**

In [None]:
# Extract count vectorizer 
vectorizer = CountVectorizer(analyzer='char', ngram_range=(1,3))
X_train = vectorizer.fit_transform(train)

# **Split the data into train and test**

In [None]:
from sklearn import model_selection
test_size = 0.20
seed = 7
X_train, X_test, y_train, y_test = model_selection.train_test_split(X_train, y_train, test_size = test_size, random_state = seed)

# **Train the model using SVM Classifier**

In [None]:
from sklearn import svm
model = svm.SVC(kernel='linear', C=0.01)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

#Summary of the predictions made by the classifier
print("SVM Algorithm")
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
#Accuracy score
from sklearn.metrics import accuracy_score
print("ACC: ",accuracy_score(y_pred,y_test))

SVM Algorithm
              precision    recall  f1-score   support

         HOF       0.79      0.86      0.83       486
         NOT       0.72      0.62      0.67       283

    accuracy                           0.77       769
   macro avg       0.76      0.74      0.75       769
weighted avg       0.77      0.77      0.77       769

[[418  68]
 [108 175]]
ACC:  0.7711313394018205


**Cross validation:** is a technique in which we train our model using the subset of the data-set and then evaluate using the complementary subset of the data-set.

The three steps involved in cross-validation are as follows :

* Reserve some portion of sample data-set.
* Using the rest data-set train the model.
* Test the model using the reserve portion of the data-set.

**Cross validation using MLP classifer**


In [None]:
clf = MLPClassifier(n_iter_no_change=10, max_iter=5,hidden_layer_sizes=(128, ))
res = cross_validate(clf, X_train, y_train, cv=5,  return_train_score=True) 
scores = cross_val_score(clf,X_train, y_train, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))



Accuracy: 0.77 (+/- 0.01)




# **Cross validation using K-nearest neighbors**

In [None]:
clf = KNeighborsClassifier(n_neighbors=3)
res = cross_validate(clf, X_train, y_train, cv=5,  return_train_score=True) 
scores = cross_val_score(clf,X_train, y_train, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))



Accuracy: 0.70 (+/- 0.01)


**Cross validation using Random forest classifier**

In [None]:
clf = RandomForestClassifier(n_estimators=1000,verbose=True, n_jobs=-1)
res = cross_validate(clf, X_train, y_train, cv=5,  return_train_score=True) 
scores = cross_val_score(clf,X_train, y_train, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 122 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done 722 tasks      | elapsed:   17.5s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   23.8s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    0.3s
[Parallel(n_jobs=2)]: Done 796 tasks      | elapsed:    0.6s
[Parallel(n_jobs=2)]: Done 1000 out of 1000 | elapsed:    0.7s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.4s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    1.0s
[Parallel(n_jobs=2)]: Done 796 tasks      | elapsed:    1.7s
[Parall

Accuracy: 0.75 (+/- 0.01)


**Extract Term frequency inverse document frequency (Tf-idf)** - 

Tf-idf is the metric to determine how significant a term is to a text in a  corpus. tf-idf is a weighting system that assigns a weight to each word in a document based on its term frequency (tf) and the reciprocal document frequency (tf) (idf). The words with higher scores of weight are deemed to be more significant.

**fit_transform():** It returns an array of terms along with tf-idf values.

In [None]:
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1,3))
X_train = vectorizer.fit_transform(train)

**Split the data into training an testing**

In [None]:
from sklearn import model_selection
test_size = 0.20
seed = 7
X_train, X_test, y_train, y_test = model_selection.train_test_split(X_train, y_train, test_size = test_size, random_state = seed)

**SVM Classifier**

In [None]:
from sklearn import svm
model = svm.SVC(kernel='linear', C=0.01)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

#Summary of the predictions made by the classifier
print("SVM Algorithm")
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
#Accuracy score
from sklearn.metrics import accuracy_score
print("ACC: ",accuracy_score(y_pred,y_test))

SVM Algorithm
              precision    recall  f1-score   support

         HOF       0.63      1.00      0.77       486
         NOT       0.00      0.00      0.00       283

    accuracy                           0.63       769
   macro avg       0.32      0.50      0.39       769
weighted avg       0.40      0.63      0.49       769

[[486   0]
 [283   0]]
ACC:  0.6319895968790638


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


**MLP classifier**

In [None]:
clf = MLPClassifier(n_iter_no_change=10, max_iter=10,hidden_layer_sizes=(128, ))
res = cross_validate(clf, X_train, y_train, cv=5,  return_train_score=True) 
scores = cross_val_score(clf,X_train, y_train, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))



Accuracy: 0.77 (+/- 0.01)




**K-nearest neighbors classifier**

In [None]:
clf = KNeighborsClassifier(n_neighbors=3)
res = cross_validate(clf, X_train, y_train, cv=5,  return_train_score=True) 
scores = cross_val_score(clf,X_train, y_train, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))



Accuracy: 0.71 (+/- 0.01)


**Install sentence transformers**

SentenceTransformers is a Python framework for state-of-the-art sentence, text and image embeddings.

Pre-trained models
https://www.sbert.net/docs/pretrained_models.html

In [None]:
pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-2.1.0.tar.gz (78 kB)
[K     |████████████████████████████████| 78 kB 3.3 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.15.0-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 10.8 MB/s 
[?25hCollecting tokenizers>=0.10.3
  Downloading tokenizers-0.11.4-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 47.8 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 48.0 MB/s 
[?25hCollecting huggingface-hub
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 5.2 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 56.6 MB/s 
[?2

**Use Indo-Aryan-XLM-R-Base - transformer model**

In [None]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('ashwani-tanwar/Indo-Aryan-XLM-R-Base')
X_train = model.encode(text, batch_size=50,show_progress_bar=True)

Downloading:   0%|          | 0.00/345 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.28k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/486 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Some weights of the model checkpoint at /root/.cache/torch/sentence_transformers/ashwani-tanwar_Indo-Aryan-XLM-R-Base were not used when initializing XLMRobertaModel: ['lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Batches:   0%|          | 0/77 [00:00<?, ?it/s]

# **MLP classifier**

In [None]:
clf = MLPClassifier(n_iter_no_change=10, max_iter=100,hidden_layer_sizes=(512, ))
res = cross_validate(clf, X_train, y_train, cv=5,  return_train_score=True) 
scores = cross_val_score(clf,X_train, y_train, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))

# **K-nearest neighbors classifier**

In [None]:
clf = KNeighborsClassifier(n_neighbors=3)
res = cross_validate(clf, X_train, y_train, cv=5,  return_train_score=True) 
scores = cross_val_score(clf,X_train, y_train, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))



Accuracy: 0.70 (+/- 0.01)


# **Use distilbert-base-uncased-finetuned-sst-2-english**

In [None]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('distilbert-base-uncased-finetuned-sst-2-english')
X_train = model.encode(text, batch_size=50,show_progress_bar=True)

Some weights of the model checkpoint at /root/.cache/torch/sentence_transformers/distilbert-base-uncased-finetuned-sst-2-english were not used when initializing DistilBertModel: ['pre_classifier.bias', 'pre_classifier.weight', 'classifier.bias', 'classifier.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Batches:   0%|          | 0/77 [00:00<?, ?it/s]

**SVM classifier**

In [None]:
clf = SVC(kernel='linear')
#svclassifier.fit(X_train, y_train)
res = cross_validate(clf, X_train, y_train, cv=5,  return_train_score=True) 
scores = cross_val_score(clf,X_train, y_train, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))



Accuracy: 0.75 (+/- 0.01)


**average_word_embeddings_glove.6B.300d** - compute the average word embedding for some well-known word embedding methods. Their computation speed is much higher than the transformer based models

In [None]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('average_word_embeddings_glove.6B.300d')
X_train = model.encode(text, batch_size=50,show_progress_bar=True)

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.15k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/248 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/480M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.61M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/164 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/77 [00:00<?, ?it/s]

In [None]:
clf = SVC(kernel='linear')
#svclassifier.fit(X_train, y_train)
res = cross_validate(clf, X_train, y_train, cv=5,  return_train_score=True) 
scores = cross_val_score(clf,X_train, y_train, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))



Accuracy: 0.74 (+/- 0.01)
